In [62]:
#Import Necessary libraries
suppressWarnings({
library('tidyverse')
library('dplyr')
source('functions.R')})

In [63]:
# Load the wrangled training data, then derive the reduced-predictor version
# with feature_selection() (defined outside this file, presumably functions.R)
train <- readRDS("04a-wrangledTrain.rds")
train_subset <- feature_selection(train)

# 05e Linear Regression with interactions
This file performs k-fold cross-validation on two linear regression models with interaction terms: one fit on the entire predictor set, the other on a subset of the predictor variables. Interaction terms between age and other features (fuel, drive, type and cylinders), chosen by the feature selection technique, were added.

In [108]:
#' @description
#' Fit the linear regression model with age interactions: log(price) is
#' regressed on every other column of the data plus the interaction terms
#' age:fuel, age:drive, age:type and age:cylinders
#'
#' @param data - dataset used to fit the model; it must contain the columns
#'   price, age, fuel, drive, type and cylinders
#'
#' @return model - the fitted lm object
myTrain <- function(data) {
    lm(
        log(price) ~ . + age:fuel + age:drive + age:type + age:cylinders,
        data = data
    )
}

In [109]:
#' @description
#' Function to get predictions for new data based on the input model, together
#' with a prediction interval (80% by default). The model is expected to have
#' been fit on log(price), so the results are exponentiated back to the
#' price scale.
#'
#' @param model - trained model
#' @param newdata - the data for which the predictions are to be made
#' @param level - coverage level of the prediction interval, default 0.8
#'
#' @return the exponentiated output of predict(); for an lm model this is a
#'   matrix with columns fit, lwr and upr (predicted price and interval bounds)
myPredict <- function(model, newdata, level = 0.8) {
    # predict() works on the log scale; exp() maps fit and bounds back to price
    log_price_predict <- predict(
        model,
        newdata = newdata,
        interval = "prediction",
        level = level
    )
    exp(log_price_predict)
}

In [110]:
#' @description
#' Function to produce interval scores for given predictions and true values.
#' Delegates to intervalScore(), which is defined outside this file
#' (presumably in functions.R).
#'
#' @param predict = predicted values with their prediction interval
#' @param data = dataset which contains price - the true value
#' @param level = coverage level the prediction interval was built with,
#'   default 0.8; it should match the level used in myPredict()
#'
#' @return interval score outcomes as returned by intervalScore() = summary of
#'   level, average length of interval, interval score and coverage
myScore <- function(predict, data, level = 0.8) {
    intervalScore(predict, data$price, level = level)
}

In [111]:
### Cross-validate the model on the full predictor set with the 80% versions of
### myPredict()/myScore() defined above. kFold() is defined outside this file
### (presumably functions.R); the arguments 3 and 123 look like the number of
### folds and the random seed -- TODO confirm against its definition.
### Below is the output for each fold. Each fold (1, 2, 3) has its label followed
### by its summary of level, avglength, IS and cover, in that order.
kFold(3, 123, train)

In [112]:
### Same cross-validation as for the full predictor set, but on the
### feature-selected subset of predictors (train_subset).
### Below is the output for each fold. Each fold (1, 2, 3) has its label followed
### by its summary of level, avglength, IS and cover, in that order.
kFold(3, 123, train_subset)

The myPredict() and myScore() functions are now redefined for 50% prediction intervals

In [114]:
#' @description
#' Function to get predictions for new data based on the input model, together
#' with a prediction interval (50% by default). The model is expected to have
#' been fit on log(price), so the results are exponentiated back to the
#' price scale.
#'
#' @param model - trained model
#' @param newdata - the data for which the predictions are to be made
#' @param level - coverage level of the prediction interval, default 0.5
#'
#' @return the exponentiated output of predict(); for an lm model this is a
#'   matrix with columns fit, lwr and upr (predicted price and interval bounds)
myPredict <- function(model, newdata, level = 0.5) {
    # predict() works on the log scale; exp() maps fit and bounds back to price
    log_price_predict <- predict(
        model,
        newdata = newdata,
        interval = "prediction",
        level = level
    )
    exp(log_price_predict)
}

In [115]:
#' @description
#' Function to produce interval scores for given predictions and true values.
#' Delegates to intervalScore(), which is defined outside this file
#' (presumably in functions.R).
#'
#' @param predict = predicted values with their prediction interval
#' @param data = dataset which contains price - the true value
#' @param level = coverage level the prediction interval was built with,
#'   default 0.5; it should match the level used in myPredict()
#'
#' @return interval score outcomes as returned by intervalScore() = summary of
#'   level, average length of interval, interval score and coverage
myScore <- function(predict, data, level = 0.5) {
    intervalScore(predict, data$price, level = level)
}

In [116]:
# Repeat the cross-validation on the full predictor set, now with the 50%
# versions of myPredict()/myScore() that were just defined.
kFold(3, 123, train)

In [117]:
# Repeat the cross-validation on the feature-selected subset, again with the
# 50% versions of myPredict()/myScore().
kFold(3, 123, train_subset)

### Training the models
Here we train the models and save them for comparison with other methods on the holdout.

In [118]:
# Fit both models on the complete training data ...
model_full <- myTrain(train)
model_subset <- myTrain(train_subset)

# ... and write them to disk for the holdout comparison with other methods
saveRDS(model_full, file = "05e-lmInteractions.rds")
saveRDS(model_subset, file = "05e-subsetlmInteractions.rds")

In [38]:
# Load the wrangled holdout set and apply the same feature selection that was
# used on the training data
holdo <- readRDS("04b-wrangledHoldout.rds")
holdo_subset <- feature_selection(holdo)

In [39]:
# Refit the full and subset models on the training data for holdout evaluation
model_full <- myTrain(train)
model_subset <- myTrain(train_subset)

In [40]:
#' @description
#' Function to get predictions for new data based on the input model, together
#' with a prediction interval (50% by default). The model is expected to have
#' been fit on log(price), so the results are exponentiated back to the
#' price scale.
#'
#' @param model - trained model
#' @param newdata - the data for which the predictions are to be made
#' @param level - coverage level of the prediction interval, default 0.5
#'
#' @return the exponentiated output of predict(); for an lm model this is a
#'   matrix with columns fit, lwr and upr (predicted price and interval bounds)
myPredict <- function(model, newdata, level = 0.5) {
    # predict() works on the log scale; exp() maps fit and bounds back to price
    log_price_predict <- predict(
        model,
        newdata = newdata,
        interval = "prediction",
        level = level
    )
    exp(log_price_predict)
}

In [41]:
#' @description
#' Function to produce interval scores for given predictions and true values.
#' Delegates to intervalScore(), which is defined outside this file
#' (presumably in functions.R).
#'
#' @param predict = predicted values with their prediction interval
#' @param data = dataset which contains price - the true value
#' @param level = coverage level the prediction interval was built with,
#'   default 0.5; it should match the level used in myPredict()
#'
#' @return interval score outcomes as returned by intervalScore()
myScore <- function(predict, data, level = 0.5) {
    intervalScore(predict, data$price, level = level)
}

In [42]:
# Predict holdout prices (with prediction intervals) from each fitted model
prediction_full <- myPredict(model_full, newdata = holdo)
predict_subset <- myPredict(model_subset, newdata = holdo_subset)

"prediction from a rank-deficient fit may be misleading"


In [43]:
# Interval score summary of the full model's prediction intervals on the holdout
myScore(prediction_full, holdo)

In [44]:
# Interval score summary of the subset model's prediction intervals on the
# feature-selected holdout
myScore(predict_subset, holdo_subset)