In [2]:
#Import Necessary libraries
suppressWarnings({
library('tidyverse')
library('dplyr')
source('functions.R')})

In [3]:
# Load the wrangled training set and derive its reduced-predictor counterpart.
# feature_selection() is not defined in this notebook (presumably it comes
# from functions.R - confirm).
train <- readRDS("04a-wrangledTrain.rds")
train_subset <- feature_selection(train)

# 05a Linear Regression
This file performs k-fold cross-validation on two linear regression models, both fitted without weights or interaction terms: one uses the entire predictor set, the other uses a subset of the predictor variables.

## Unique Functions for Linear Regression 
We define the myTrain(), myPredict() and myScore() functions required by kFold(), customized to fit, predict with and score the linear model.

In [4]:
#' @description
#' Fit the baseline linear regression (no weights, no interaction terms):
#' log(price) is regressed on every other column of the supplied data.
#'
#' @param data - dataset used to train the model; must contain a `price` column
#'
#' @return model - the fitted `lm` object
myTrain <- function(data) {
    lm(log(price) ~ ., data = data)
}

In [5]:
#' @description
#' Function to get predictions for new data from a fitted model, together with
#' a prediction interval (80% by default), back-transformed from the log scale.
#'
#' @param model - trained model, fitted with log(price) as the response
#' @param newdata - the data for which the predictions are to be made
#' @param level - coverage level of the prediction interval (default 0.8)
#'
#' @return a matrix with columns fit, lwr and upr: the predicted price (in USD)
#'   and the lower/upper bounds of its prediction interval
myPredict <- function(model, newdata, level = 0.8) {
    log_price_predict <- predict(model, newdata = newdata,
                                 interval = "prediction", level = level)
    # The model predicts log(price); exponentiate to get back to price units.
    exp(log_price_predict)
}

In [6]:
#' @description
#' Function to produce interval scores for given predictions and true values
#'
#' @param predict = predicted values with their prediction interval, as
#'   returned by myPredict()
#' @param data = dataset which contains price - the true value
#' @param level = nominal coverage level of the interval in `predict`
#'   (default 0.8); it must match the level used to build the interval
#'
#' @return interval score outcomes = summary of level, average length of interval, interval score and coverage
myScore <- function(predict, data, level = 0.8) {
    # intervalScore() is not defined in this notebook (presumably it comes
    # from functions.R - confirm).
    intervalScore(predict, data$price, level = level)
}

In [7]:
### 3-fold cross-validation of the linear model on the full training set,
### scored with the 80% prediction intervals defined above.
### The output below lists each fold (1, 2, 3) by its label, followed by that
### fold's summary in the order: level, avglength, IS, cover.
### NOTE(review): 123 is presumably the random seed for the fold split - confirm in kFold().
kFold(3, 123, train)

In [8]:
### 3-fold cross-validation of the linear model on the feature-selected
### subset of the training set, scored with the 80% prediction intervals.
### The output below lists each fold (1, 2, 3) by its label, followed by that
### fold's summary in the order: level, avglength, IS, cover.
### NOTE(review): 123 is presumably the random seed for the fold split - confirm in kFold().
kFold(3, 123, feature_selection(train))

The myPredict() and myScore() functions are now redefined for 50% prediction intervals, and the cross-validation is repeated.

In [9]:
#' @description
#' Function to get predictions for new data from a fitted model, together with
#' a prediction interval (50% by default), back-transformed from the log scale.
#'
#' @param model - trained model, fitted with log(price) as the response
#' @param newdata - the data for which the predictions are to be made
#' @param level - coverage level of the prediction interval (default 0.5)
#'
#' @return a matrix with columns fit, lwr and upr: the predicted price (in USD)
#'   and the lower/upper bounds of its prediction interval
myPredict <- function(model, newdata, level = 0.5) {
    log_price_predict <- predict(model, newdata = newdata,
                                 interval = "prediction", level = level)
    # The model predicts log(price); exponentiate to get back to price units.
    exp(log_price_predict)
}

In [10]:
#' @description
#' Function to produce interval scores for given predictions and true values
#'
#' @param predict = predicted values with their prediction interval, as
#'   returned by myPredict()
#' @param data = dataset which contains price - the true value
#' @param level = nominal coverage level of the interval in `predict`
#'   (default 0.5); it must match the level used to build the interval
#'
#' @return interval score outcomes = summary of level, average length of interval, interval score and coverage
myScore <- function(predict, data, level = 0.5) {
    # intervalScore() is not defined in this notebook (presumably it comes
    # from functions.R - confirm).
    intervalScore(predict, data$price, level = level)
}

In [11]:
### 3-fold cross-validation of the linear model on the full training set,
### now scored with the 50% prediction intervals defined above.
### The output below lists each fold (1, 2, 3) by its label, followed by that
### fold's summary in the order: level, avglength, IS, cover.
### NOTE(review): 123 is presumably the random seed for the fold split - confirm in kFold().
kFold(3, 123, train)

In [12]:
### 3-fold cross-validation of the linear model on the feature-selected
### subset of the training set, now scored with the 50% prediction intervals.
### The output below lists each fold (1, 2, 3) by its label, followed by that
### fold's summary in the order: level, avglength, IS, cover.
### NOTE(review): 123 is presumably the random seed for the fold split - confirm in kFold().
kFold(3, 123, feature_selection(train))

### Training the models
Here we train the models and save them for comparison with other methods on the holdout.

In [13]:
# Fit both models on the complete training data (no folds held out) ...
model_full <- myTrain(train)
model_subset <- myTrain(train_subset)

# ... and persist them so other notebooks can load the fitted objects.
saveRDS(model_full, file = "05a-lm.rds")
saveRDS(model_subset, file = "05a-subsetlm.rds")

In [None]:
# Load the wrangled holdout set and apply the same feature selection that was
# used to build train_subset.
holdo <- readRDS("04b-wrangledHoldout.rds")
holdo_subset <- feature_selection(holdo)

In [None]:
# NOTE(review): this refits the same two models as the "Training the models"
# cell; the cell looks redundant unless train / train_subset changed in between.
model_full <- myTrain(train)
model_subset <- myTrain(train_subset)

In [None]:
#' @description
#' Function to get predictions for new data from a fitted model, together with
#' a prediction interval (50% by default), back-transformed from the log scale.
#' Used here to predict on the holdout set.
#'
#' @param model - trained model, fitted with log(price) as the response
#' @param newdata - the data for which the predictions are to be made
#' @param level - coverage level of the prediction interval (default 0.5)
#'
#' @return a matrix with columns fit, lwr and upr: the predicted price and the
#'   lower/upper bounds of its prediction interval, on the price scale
myPredict <- function(model, newdata, level = 0.5) {
    log_price_predict <- predict(model, newdata = newdata,
                                 interval = "prediction", level = level)
    # The model predicts log(price); exponentiate to get back to price units.
    exp(log_price_predict)
}

In [None]:
#' @description
#' Function to produce interval scores for given predictions and true values
#'
#' @param predict = predicted values with their prediction interval, as
#'   returned by myPredict()
#' @param data = dataset which contains price - the true value
#' @param level = nominal coverage level of the interval in `predict`
#'   (default 0.5); it must match the level used to build the interval
#'
#' @return whatever intervalScore() returns for these inputs
myScore <- function(predict, data, level = 0.5) {
    # intervalScore() is not defined in this notebook (presumably it comes
    # from functions.R - confirm).
    intervalScore(predict, data$price, level = level)
}

In [None]:
# NOTE(review): another refit of the same two models on the same training
# data; kept so this cell can be run on its own, but it looks redundant.
model_full <- myTrain(train)
model_subset <- myTrain(train_subset)

In [None]:
# Predict holdout prices with each model; the subset model is given the
# feature-selected holdout data so its columns match what it was trained on.
prediction_full <- myPredict(model_full, holdo)
predict_subset <- myPredict(model_subset, holdo_subset)

In [None]:
# Interval-score summary for the full-predictor model on the holdout set
# (50% prediction intervals, per the myPredict()/myScore() definitions above).
myScore(prediction_full, holdo)

In [None]:
# Interval-score summary for the feature-subset model on the holdout set
# (50% prediction intervals, per the myPredict()/myScore() definitions above).
myScore(predict_subset, holdo_subset)