# Weighted Least Squares Methods

This document fits a weighted least squares model and discusses the intuition behind how the weights are constructed. 

### Load Libraries + training and holdout data

In [34]:
#Import Necessary libraries
suppressWarnings({
library('tidyverse')
library('dplyr')
source('functions.R')})

In [3]:
# Load the wrangled training data.
train <- readRDS("04a-wrangledTrain.rds")
# NOTE(review): row 84211 is dropped by position with no stated reason --
# confirm why, and consider filtering on a condition instead of an index.
train <- train[-84211, ]

#### Finding the weights

In [9]:
## Share of training rows per manufacturer (basis for the initial weights)
train %>%
    group_by(manufacturer) %>%
    summarize(w = n() / nrow(train))

manufacturer,w
<fct>,<dbl>
acura,0.01491313
alfa-romeo,0.001963468
aston-martin,6.664514e-05
audi,0.0203524
bmw,0.0355116
buick,0.01272922
cadillac,0.01644084
chevrolet,0.1197408
chrysler,0.01566161
dodge,0.03103613


In [10]:
## Look up the hard-coded share associated with a manufacturer
# The values appear to be each manufacturer's share of the training rows
# (compare with the group_by/summarize table computed above).
# NOTE(review): "aston-martin" and "land rover" are 0 here, while the table
# above reports a small non-zero share for aston-martin -- confirm intended.
# @param manufacturer character (or factor): manufacturer name(s); "missing"
#   is the key used for rows without a manufacturer
# @return numeric vector, one unnamed share per input element; NA (with a
#   warning) for names that are not in the table
weight_fn <- function(manufacturer) {
    shares <- c(
        "missing" = 0.0309899878500792,
        "acura" = 0.0149131306295915,
        "alfa-romeo" = 0.0019634682128338,
        "aston-martin" = 0,
        "audi" = 0.0203523989685384,
        "bmw" = 0.0355116039433414,
        "buick" = 0.012729220815839,
        "cadillac" = 0.0164408421894465,
        "chevrolet" = 0.119740801689711,
        "chrysler" = 0.0156616067629433,
        "dodge" = 0.0310361267898064,
        "ferrari" = 0.000179429210050086,
        "fiat" = 0.00206599919000528,
        "ford" = 0.159384404013062,
        "gmc" = 0.0375007049004681,
        "harley-davidson" = 0.000446009750695929,
        "honda" = 0.0573353224342905,
        "hyundai" = 0.0257967938563438,
        "infiniti" = 0.0124267544331831,
        "jaguar" = 0.00479844973162517,
        "jeep" = 0.0426067475636077,
        "kia" = 0.0211060016507487,
        "land rover" = 0,
        "lexus" = 0.0209265724406986,
        "lincoln" = 0.0101351870934006,
        "mazda" = 0.0135494686332108,
        "mercedes-benz" = 0.0264017266216556,
        "mercury" = 0.00247099654983262,
        "mini" = 0.0066183745764189,
        "mitsubishi" = 0.00794102418193097,
        "nissan" = 0.0488355044267749,
        "pontiac" = 0.00493174000194809,
        "porsche" = 0.00315282754802295,
        "ram" = 0.0381466500566484,
        "rover" = 0.00554179931611838,
        "saturn" = 0.00291700630052855,
        "subaru" = 0.0266990664554529,
        "tesla" = 0.00207112573886385,
        "toyota" = 0.0845880561664693,
        "volkswagen" = 0.0235564920051471,
        "volvo" = 0.00842804632349549
    )
    # as.character() so a factor is matched by label, not by its integer code
    key <- as.character(manufacturer)
    w <- unname(shares[key])
    unknown <- is.na(w)
    if (any(unknown)) {
        # Stay type-stable (numeric NA) instead of silently returning NULL
        warning(
            "no weight defined for manufacturer(s): ",
            paste(unique(key[unknown]), collapse = ", "),
            call. = FALSE
        )
    }
    w
}

In [11]:
## Function to attach weighted-least-squares observation weights to a dataframe
# Each row gets (1 - w) / sum(1 - w) * nrow(data), where w is weight_fn() of
# the row's manufacturer, so the weights sum to nrow(data).
# @param data dataframe: must have a column called 'manufacturer'; empty
#   strings in it are recoded to "missing"
# @return dataframe: a copy of data with 'manufacturer' converted to character
#   and a new numeric column 'weights'
weight_transform <- function(data) {
    if (!("manufacturer" %in% names(data))) {
        stop("`data` must contain a 'manufacturer' column", call. = FALSE)
    }
    makes <- as.character(data$manufacturer)
    makes[!is.na(makes) & makes == ""] <- "missing"
    data$manufacturer <- makes

    # Look each distinct manufacturer up once, then broadcast back to the rows
    distinct_makes <- unique(makes)
    distinct_w <- vapply(
        distinct_makes,
        function(make) {
            share <- weight_fn(make)
            # weight_fn may return NULL for a name it does not know
            if (is.null(share) || length(share) != 1L) NA_real_ else as.numeric(share)
        },
        numeric(1),
        USE.NAMES = FALSE
    )
    unknown <- distinct_makes[is.na(distinct_w)]
    if (length(unknown) > 0) {
        stop(
            "no weight defined for manufacturer(s): ",
            paste(unknown, collapse = ", "),
            call. = FALSE
        )
    }
    w <- distinct_w[match(makes, distinct_makes)]

    data$weights <- (1 - w) / sum(1 - w) * nrow(data)
    data
}

In [13]:
# Add the per-row WLS weights (column `weights`) to the training data.
train <- weight_transform(train)

In [14]:
# Sanity check: weight_transform scales the weights as
# ((1 - w) / sum(1 - w)) * nrow(data), so this total should equal nrow(train).
sum(train$weights)

In [15]:
## Fit the weighted least squares model for log(price)
# Regresses log(price) on every column of `data` except `manufacturer` and
# `weights`. `weights` must be removed from the formula explicitly: `.`
# expands to all remaining columns, so otherwise the observation weights
# would also enter the model as a predictor.
# @param data dataframe: training data with columns `price`, `weights` and
#   the predictors
# @return model lm: the fitted weighted linear model
myTrain <- function(data) {
    lm(
        log(price) ~ . - manufacturer - weights,
        data = data,
        weights = data$weights
    )
}

In [16]:
## 50% prediction intervals on the price scale
# Calls predict() with interval = "prediction" and level = 0.5 on the
# log-price model, then exponentiates the result back to the price scale.
# NOTE(review): the notebook output shows predict() warning "Assuming constant
# prediction variance even though model fit is weighted" -- no prediction
# weights are passed here; confirm that is intended.
# @param model the fitted model (as returned by myTrain)
# @param newdata dataframe: rows to predict for
# @return exp() of whatever predict() returns for these arguments
myPredict <- function(model, newdata) {
    exp(
        predict(model, newdata = newdata, interval = "prediction", level = 0.5)
    )
}

In [17]:
## Score prediction intervals against the observed prices
# @param predict the predictions to score (as returned by myPredict)
# @param data dataframe: must contain the observed `price` column
# @return the value of intervalScore() at level 0.5
myScore <- function(predict, data) {
    observed_price <- data$price
    intervalScore(predict, observed_price, level = 0.5)
}

In [18]:
# Cross-validate on the full training data.
# NOTE(review): kFold is not defined in this notebook (presumably it comes from
# functions.R); the arguments look like (folds, seed, data) -- confirm.
kFold(3, 123, train)

"Assuming constant prediction variance even though model fit is weighted
"
"Assuming constant prediction variance even though model fit is weighted
"
"Assuming constant prediction variance even though model fit is weighted
"


In [19]:
# Same cross-validation on the reduced feature set.
# NOTE(review): feature_selection is only defined in a later cell (In [26]);
# unless functions.R also provides it, a top-to-bottom run fails here.
kFold(3, 123, feature_selection(train))

In [31]:
## 50% prediction intervals on the price scale
# Same definition as the earlier myPredict cell, repeated here.
# @param model the fitted model (as returned by myTrain)
# @param newdata dataframe: rows to predict for
# @return exp() of whatever predict() returns for these arguments
myPredict <- function(model, newdata) {
    exp(
        predict(model, newdata = newdata, interval = "prediction", level = 0.5)
    )
}

In [32]:
## Score prediction intervals against the observed prices
# Same definition as the earlier myScore cell, repeated here.
# @param predict the predictions to score (as returned by myPredict)
# @param data dataframe: must contain the observed `price` column
# @return the value of intervalScore() at level 0.5
myScore <- function(predict, data) {
    observed_price <- data$price
    intervalScore(predict, observed_price, level = 0.5)
}

In [22]:
# Re-run of the full-data cross-validation with the same arguments as the
# earlier kFold cell.
kFold(3, 123, train)

"Assuming constant prediction variance even though model fit is weighted
"
"Assuming constant prediction variance even though model fit is weighted
"
"Assuming constant prediction variance even though model fit is weighted
"


In [23]:
# Re-run of the reduced-feature cross-validation with the same arguments as
# the earlier kFold cell.
kFold(3, 123, feature_selection(train))

In [24]:
# Load the holdout data and attach weights computed from the holdout rows.
holdo <- weight_transform(readRDS("04b-wrangledHoldout.rds"))
# Reduced-feature copy of the holdout data.
# NOTE(review): feature_selection already keeps the `weights` column, so the
# second weight_transform looks redundant -- confirm before removing.
holdo_subset <- weight_transform(feature_selection(holdo))

In [25]:
## Score prediction intervals against the observed prices
# Same definition as the earlier myScore cells, repeated here.
# @param predict the predictions to score (as returned by myPredict)
# @param data dataframe: must contain the observed `price` column
# @return the value of intervalScore() at level 0.5
myScore <- function(predict, data) {
    observed_price <- data$price
    intervalScore(predict, observed_price, level = 0.5)
}

In [26]:
## Keep only the columns used by the reduced model
# @param data dataframe: must contain price, age, fuel, drive, type,
#   manufacturer and weights
# @return dataframe: `data` restricted to those seven columns, in that order
feature_selection <- function(data) {
    subset(
        data,
        select = c(price, age, fuel, drive, type, manufacturer, weights)
    )
}

In [27]:
# Fit the final models on the whole training set: one on all columns and one
# on the reduced feature set.
train_subset <- feature_selection(train)
model_full <- myTrain(train)
model_subset <- myTrain(train_subset)

In [28]:
# Prediction intervals for the holdout rows from each fitted model.
prediction_full <- myPredict(model_full, holdo)
predict_subset <- myPredict(model_subset, holdo_subset)

"Assuming constant prediction variance even though model fit is weighted
"
"Assuming constant prediction variance even though model fit is weighted
"


In [33]:
# Interval score of the full-model intervals on the holdout data
# (myScore calls intervalScore at level 0.5).
myScore(prediction_full, holdo)

In [30]:
# Interval score of the reduced-model intervals on the holdout data
# (myScore calls intervalScore at level 0.5).
myScore(predict_subset, holdo_subset)