In [1]:
#Import Necessary libraries
library('tidyverse')
library('dplyr')
source('functions.R')

-- [1mAttaching packages[22m ------------------------------------------------------------------------------- tidyverse 1.3.2 --
[32mv[39m [34mggplot2[39m 3.3.6      [32mv[39m [34mpurrr  [39m 0.3.5 
[32mv[39m [34mtibble [39m 3.1.8      [32mv[39m [34mdplyr  [39m 1.0.10
[32mv[39m [34mtidyr  [39m 1.2.1      [32mv[39m [34mstringr[39m 1.4.1 
[32mv[39m [34mreadr  [39m 2.1.3      [32mv[39m [34mforcats[39m 0.5.2 
-- [1mConflicts[22m ---------------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


In [2]:
# Load the wrangled training set that was saved as an RDS file.
train <- readRDS("04a-wrangledTrain.rds")
# Drop row 84211 by position.
# NOTE(review): the reason this row is removed is not recorded here — confirm.
train <- train[-84211, ]

In [4]:
# List the column names available in the training data.
colnames(train)

In [20]:
# Share of training rows per manufacturer (group count / total row count),
# flattened with unlist() so the values can be read off as one vector.
train %>%
    group_by(manufacturer) %>%
    summarize(w = (n() / nrow(train))) %>%
    unlist()

In [21]:
## Function to look up the regression weight assigned to a manufacturer
# The weights are hard-coded constants; they appear to be the per-manufacturer
# share of training rows (n() / nrow(train)) — TODO confirm against that summary.
# The lookup is vectorised: a character vector (or factor) of manufacturers
# returns one weight per element.
# @param manufacturer string: a car's manufacturing company (use "missing" for blank manufacturers)
# @return w numeric: the manufacturer's weight, without names; NA for a manufacturer that is not in the table
weight_fn <- function(manufacturer) {
    # NOTE(review): "toyota" and "volkswagen" carry the identical value, and the
    # entries from "mogran" through "toyota" look shifted by one position
    # (e.g. "tesla" is larger than "toyota") — verify against the summary output.
    manufacturer_weights <- c(
        "missing" = 0.0309899878500792,
        "acura" = 0.0149131306295915,
        "alfa-romeo" = 0.0019634682128338,
        "aston-martin" = 0,
        "audi" = 0.0203523989685384,
        "bmw" = 0.0355116039433414,
        "buick" = 0.012729220815839,
        "cadillac" = 0.0164408421894465,
        "chevrolet" = 0.119740801689711,
        "chrysler" = 0.0156616067629433,
        "dodge" = 0.0310361267898064,
        "ferrari" = 0.000179429210050086,
        "fiat" = 0.00206599919000528,
        "ford" = 0.159384404013062,
        "gmc" = 0.0375007049004681,
        "harley-davidson" = 0.000446009750695929,
        "honda" = 0.0573353224342905,
        "hyundai" = 0.0257967938563438,
        "infiniti" = 0.0124267544331831,
        "jaguar" = 0.00479844973162517,
        "jeep" = 0.0426067475636077,
        "kia" = 0.0211060016507487,
        "land rover" = 0,
        "lexus" = 0.0209265724406986,
        "lincoln" = 0.0101351870934006,
        "mazda" = 0.0135494686332108,
        "mercedes-benz" = 0.0264017266216556,
        "mercury" = 0.00247099654983262,
        "mini" = 0.0066183745764189,
        "mitsubishi" = 0.00794102418193097,
        # "mogran" is the original (misspelled) key, kept for backward
        # compatibility; "morgan" is the correctly spelled alias.
        "mogran" = 0.0488355044267749,
        "morgan" = 0.0488355044267749,
        "nissan" = 0.00493174000194809,
        "pontiac" = 0.00315282754802295,
        "porsche" = 0.0381466500566484,
        "ram" = 0.00554179931611838,
        "rover" = 0.00291700630052855,
        "saturn" = 0.0266990664554529,
        "subaru" = 0.00207112573886385,
        "tesla" = 0.0845880561664693,
        "toyota" = 0.0235564920051471,
        "volkswagen" = 0.0235564920051471,
        "volvo" = 0.00842804632349549
    )
    # Character indexing yields NA for names that are not in the table, so an
    # unexpected manufacturer gives NA instead of NULL.
    unname(manufacturer_weights[as.character(manufacturer)])
}

In [24]:
## Function to apply weight_fn to a dataframe
# @param data dataframe: the data to weight, must have a column called 'manufacturer'
# @return new_data dataframe: a copy of the original dataframe in which
#   'manufacturer' is a character column (blank values replaced by "missing")
#   and a new numeric column 'weights' holds each row's manufacturer weight
weight_transform <- function(data) {
    if (!"manufacturer" %in% names(data)) {
        stop("`data` must have a column called 'manufacturer'", call. = FALSE)
    }
    makes <- as.character(data$manufacturer)
    # Blank manufacturers are looked up under the "missing" key; NA is left as is.
    makes[!is.na(makes) & makes == ""] <- "missing"
    data$manufacturer <- makes
    # vapply enforces exactly one numeric weight per row, so an unexpected
    # weight_fn result fails loudly instead of silently producing a list column.
    data$weights <- vapply(
        makes,
        function(make) weight_fn(make),
        numeric(1),
        USE.NAMES = FALSE
    )
    data
}

In [25]:
# Preview the first rows of the training data with the `weights` column added.
# NOTE(review): the result is only printed, not assigned, so this cell leaves
# `train` itself without a `weights` column.
head(weight_transform(train))

Unnamed: 0_level_0,price,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,paint_color,state,type,countryOrigin,isLuxury,age,weights
Unnamed: 0_level_1,<dbl>,<chr>,<fct>,<fct>,<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<dbl>
1,33590,gmc,good,8 cylinders,gas,57923,clean,other,,,white,al,pickup,USA,0,8,0.0375007
2,22590,chevrolet,good,8 cylinders,gas,71229,clean,other,,,blue,al,pickup,USA,0,12,0.1197408
3,39590,chevrolet,good,8 cylinders,gas,19160,clean,other,,,red,al,pickup,USA,0,2,0.1197408
4,30990,toyota,good,8 cylinders,gas,41124,clean,other,,,red,al,pickup,Japan,0,5,0.02355649
5,15000,ford,excellent,6 cylinders,gas,128000,clean,automatic,rwd,full-size,black,al,truck,USA,0,9,0.1593844
6,27990,gmc,good,8 cylinders,gas,68696,clean,other,4wd,,black,al,pickup,USA,0,10,0.0375007


In [26]:
## Function to fit the (optionally weighted) linear model for log(price)
# Every column except 'manufacturer' and 'weights' is used as a predictor.
# 'weights' is used only as case weights: leaving it inside `data` would make
# the `.` in the formula pick it up as a regressor as well.
# @param data dataframe: training data with 'price' and 'manufacturer' columns;
#   an optional numeric 'weights' column supplies the case weights
#   (when it is absent the model is fitted unweighted)
# @return model lm: the fitted linear model
myTrain <- function(data) {
    # `[[` avoids the partial name matching of `$` and gives NULL when absent.
    case_weights <- data[["weights"]]
    model_data <- data[, setdiff(names(data), "weights"), drop = FALSE]
    model <- lm(
        log(price) ~ . - manufacturer,
        data = model_data,
        weights = case_weights
    )
    model
}

In [27]:
# Fit the model on the training data and print its coefficient table.
# NOTE(review): no visible cell stores weight_transform()'s result back into
# `train`; if `train` has no `weights` column, data$weights is NULL and lm()
# fits unweighted — confirm.
summary(myTrain(train))


Call:
lm(formula = log(price) ~ . - manufacturer, data = data, weights = data$weights)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.7801 -0.2050  0.0087  0.2221  3.4769 

Coefficients:
                           Estimate Std. Error  t value Pr(>|t|)    
(Intercept)               1.051e+01  2.325e-02  452.107  < 2e-16 ***
conditionexcellent       -2.169e-02  2.737e-03   -7.926 2.27e-15 ***
conditionfair            -6.044e-01  8.199e-03  -73.726  < 2e-16 ***
conditiongood            -1.237e-01  2.957e-03  -41.847  < 2e-16 ***
conditionlike new        -2.491e-02  4.328e-03   -5.757 8.59e-09 ***
conditionnew             -4.862e-02  1.674e-02   -2.905 0.003677 ** 
conditionsalvage         -5.867e-01  2.715e-02  -21.614  < 2e-16 ***
cylinders10 cylinders     5.912e-01  1.665e-02   35.520  < 2e-16 ***
cylinders12 cylinders     1.031e+00  4.847e-02   21.260  < 2e-16 ***
cylinders3 cylinders     -4.409e-01  2.496e-02  -17.663  < 2e-16 ***
cylinders4 cylinders     -1.785e-01  3.211e-

In [36]:
## Function to compute price prediction intervals from a log-price model
# @param model lm: a fitted model whose response is on the log scale (as fitted by myTrain)
# @param newdata dataframe: the rows to predict for
# @param level numeric: coverage level of the prediction interval, defaults to 0.5
# @return matrix with columns fit, lwr and upr, back-transformed with exp()
myPredict <- function(model, newdata, level = 0.5) {
    log_price_predict <- predict(
        model,
        newdata = newdata,
        interval = "prediction",
        level = level
    )
    # The model predicts log(price); exp() maps point and bounds back to price.
    exp(log_price_predict)
}

In [37]:
## Function to score prediction intervals against the observed prices
# @param predict the prediction intervals to score (e.g. the output of myPredict)
# @param data dataframe: the data the predictions were made for, must have a 'price' column
# @param level numeric: coverage level passed on to intervalScore, defaults to 0.5;
#   it should match the level the intervals were built with
# @return whatever intervalScore returns (intervalScore is defined outside this file)
myScore <- function(predict, data, level = 0.5) {
    intervalScore(predict, data$price, level = level)
}

In [38]:
# Cross-validate on the full training data with the 50% interval definitions above.
# kFold is not defined in this file (presumably functions.R); the arguments are
# presumably 3 folds and seed 123 — TODO confirm.
kFold(3, 123, train)

In [39]:
# Same cross-validation on the output of feature_selection(train)
# (feature_selection is not defined in this file).
# NOTE(review): the second argument is 447 here but 123 in the other kFold
# calls; if it is a seed, the runs use different splits — confirm this is intended.
kFold(3, 447, feature_selection(train))

In [40]:
## Function to compute price prediction intervals from a log-price model
# @param model lm: a fitted model whose response is on the log scale (as fitted by myTrain)
# @param newdata dataframe: the rows to predict for
# @param level numeric: coverage level of the prediction interval, defaults to 0.8
# @return matrix with columns fit, lwr and upr, back-transformed with exp()
myPredict <- function(model, newdata, level = 0.8) {
    log_price_predict <- predict(
        model,
        newdata = newdata,
        interval = "prediction",
        level = level
    )
    # The model predicts log(price); exp() maps point and bounds back to price.
    exp(log_price_predict)
}

In [41]:
## Function to score prediction intervals against the observed prices
# @param predict the prediction intervals to score (e.g. the output of myPredict)
# @param data dataframe: the data the predictions were made for, must have a 'price' column
# @param level numeric: coverage level passed on to intervalScore, defaults to 0.8;
#   it should match the level the intervals were built with
# @return whatever intervalScore returns (intervalScore is defined outside this file)
myScore <- function(predict, data, level = 0.8) {
    intervalScore(predict, data$price, level = level)
}

In [42]:
# Cross-validate on the full training data with the 80% interval definitions above.
# kFold is not defined in this file (presumably functions.R); the arguments are
# presumably 3 folds and seed 123 — TODO confirm.
kFold(3, 123, train)

In [43]:
# Same cross-validation on the output of feature_selection(train)
# (feature_selection is not defined in this file).
kFold(3, 123, feature_selection(train))