## Linear Regression

- prepare data
- train model (a learner)
- test / evaluate model

In [1]:
## preview the first six rows of the built-in mtcars dataset
head(mtcars, n = 6L)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


In [2]:
## we need to split data into two sets
## train vs. test (hold-out split)
set.seed(99) ## fix the RNG state so the split is reproducible

## 90% of the rows go to training; nrow(mtcars) * 0.9 is not a whole number,
## so round down explicitly instead of relying on sample() to truncate `size`
n_train <- floor(nrow(mtcars) * 0.9)
id <- sample(nrow(mtcars), size = n_train, replace = FALSE)

train_df <- mtcars[id, ]
test_df <- mtcars[-id, ] ## hold-out set: every row not drawn into `id`

In [3]:
## report (rows, columns) for the training set, then for the hold-out set
print(c(nrow(train_df), ncol(train_df)))
print(c(nrow(test_df), ncol(test_df)))

[1] 28 11
[1]  4 11


In [4]:
## fit a linear regression of mpg on wt and hp, using the training rows only
lm_model <- lm(formula = mpg ~ wt + hp, data = train_df)
print(lm_model)


Call:
lm(formula = mpg ~ wt + hp, data = train_df)

Coefficients:
(Intercept)           wt           hp  
   36.10333     -3.65468     -0.03045  



In [5]:
## training error: mean of the squared residuals (train MSE)
train_mse <- mean(residuals(lm_model)^2)
print(train_mse)

[1] 5.283601


In [6]:
## score the hold-out rows with the fitted model
p <- predict(object = lm_model, newdata = test_df)
print("Predictions")
print(p)

[1] "Predictions"
        Merc 280      Merc 450SLC   Toyota Corolla Pontiac Firebird 
        19.78635         16.80833         27.41799         16.72301 


In [7]:
## hold-out error: mean squared difference between predicted and actual mpg
test_mse <- mean((p - test_df[["mpg"]])^2)
print(test_mse)

[1] 12.77062


## Let's Build Another Model

- we can improve the model by adding more variables
- our goal is to reduce test MSE (not train MSE)

In [8]:
## second model: same response, now with 5 predictors
lm_model_v2 <- lm(
  formula = mpg ~ wt + hp + qsec + disp + am,
  data = train_df
)
print(lm_model_v2)

## train MSE (mean squared residual on the training rows)
print(mean(residuals(lm_model_v2)^2))

## test MSE (mean squared prediction error on the hold-out rows)
p2 <- predict(object = lm_model_v2, newdata = test_df)
test_mse_v2 <- mean((test_df[["mpg"]] - p2)^2)
print(test_mse_v2)


Call:
lm(formula = mpg ~ wt + hp + qsec + disp + am, data = train_df)

Coefficients:
(Intercept)           wt           hp         qsec         disp           am  
  17.774102    -3.393671    -0.019445     0.774800     0.003606     2.911812  

[1] 4.367178
[1] 9.40383


In [9]:
## full model: `.` regresses mpg on every other column of train_df
lm_model_v3 <- lm(
  formula = mpg ~ .,
  data = train_df
)
print(lm_model_v3)

## train MSE (mean squared residual on the training rows)
print(mean(residuals(lm_model_v3)^2))

## test MSE (mean squared prediction error on the hold-out rows)
p3 <- predict(object = lm_model_v3, newdata = test_df)
test_mse_v3 <- mean((test_df[["mpg"]] - p3)^2)
print(test_mse_v3)


Call:
lm(formula = mpg ~ ., data = train_df)

Coefficients:
(Intercept)          cyl         disp           hp         drat           wt  
  21.731821    -0.418306     0.005115    -0.019240     0.546789    -2.710774  
       qsec           vs           am         gear         carb  
   0.345890     0.428050     1.531849     0.774156    -0.270280  

[1] 4.080218
[1] 10.7737
