In [3]:
#importing necessary libraries
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

In [4]:
#things we have already done in previous notebook
cars = pd.read_csv("Car-mpg- Dataset.csv")  
cars = cars.drop('car_name', axis=1)
cars['origin'] = cars['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
cars = pd.get_dummies(cars, columns=['origin'])
cars = cars.replace('?', np.nan)
cars = cars.apply(lambda x: x.fillna(x.median()),axis=0)

In [5]:
#splitting data into dependent and independent
X = cars.drop('mpg', axis=1)

y = cars[['mpg']]

In [6]:
#scaling the data

from sklearn import preprocessing

X_scaled = preprocessing.scale(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,1.498191,1.090604,0.673118,0.630870,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
1,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
2,1.498191,1.196232,1.197027,0.550470,-1.658577,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
3,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
4,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
...,...,...,...,...,...,...,...,...,...,...
393,-0.856321,-0.513026,-0.479482,-0.213324,0.011586,1.621983,0.941412,0.773559,-0.497643,-0.461968
394,-0.856321,-0.925936,-1.370127,-0.993671,3.279296,1.621983,0.941412,-1.292726,-0.497643,2.164651
395,-0.856321,-0.561039,-0.531873,-0.798585,-1.440730,1.621983,0.941412,0.773559,-0.497643,-0.461968
396,-0.856321,-0.705077,-0.662850,-0.408411,1.100822,1.621983,0.941412,0.773559,-0.497643,-0.461968


In [7]:
y_scaled = preprocessing.scale(y)
y_scaled = pd.DataFrame(y_scaled, columns=y.columns) 
y_scaled

Unnamed: 0,mpg
0,-0.706439
1,-1.090751
2,-0.706439
3,-0.962647
4,-0.834543
...,...
393,0.446497
394,2.624265
395,1.087017
396,0.574601


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size = 0.3, random_state = 1)

In [9]:
#fitting a simple linear model

reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

for idx, col_name in enumerate(X_train.columns):
  print(f"The coeffiecint for {col_name} is {reg_model.coef_[0][idx]}")

The coeffiecint for cyl is 0.32102238569161046
The coeffiecint for disp is 0.3248343091848398
The coeffiecint for hp is -0.22916950059437655
The coeffiecint for wt is -0.7112101905072297
The coeffiecint for acc is 0.014713682764191029
The coeffiecint for yr is 0.37558119495107434
The coeffiecint for car_type is 0.38147694842331054
The coeffiecint for origin_america is -0.07472247547584163
The coeffiecint for origin_asia is 0.044515252035678465
The coeffiecint for origin_europe is 0.04834854953945399


In [10]:
#building a regularised ridge model
ridge = Ridge(alpha = .3)
ridge.fit(X_train, y_train)
print(f"The coefficients are {ridge.coef_}")

The coefficients are [[ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
   0.37725608 -0.07423624  0.04441039  0.04784031]]


In [11]:
#building a regularised lasso model
lasso = Lasso(alpha = .1)
lasso.fit(X_train, y_train)
print(f"The coefficients are {lasso.coef_}")

The coefficients are [-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]


Comaring the scores of all the Regression models we built (Simple Linear regression, Ridge regression, Lasso Regresssion)

In [12]:
print(reg_model.score(X_train, y_train))
print(reg_model.score(X_test, y_test))

0.8343770256960538
0.8513421387780067


In [13]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.8343617931312616
0.8518882171608504


In [14]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.7938010766228453
0.8375229615977083


In [15]:
#The results are quite similar in both the regularised and non regularised models. But the regularised models are quite simpler and they are number of dimensions is less in Lasso model

**We will build a Polynomial model now to capture the non linear relationship between the features**

In [16]:
from sklearn.preprocessing import PolynomialFeatures

In [17]:
poly = PolynomialFeatures(degree = 2, interaction_only = True)

In [18]:
X_poly = poly.fit_transform(X_scaled)
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(X_poly, y, test_size = 0.3, random_state = 1)
X_train_poly.shape

(278, 56)

In [19]:
reg_model_poly = LinearRegression()
reg_model_poly.fit(X_train_poly, y_train_poly)
reg_model_poly.coef_[0]

array([-9.67853872e-13, -3.45886171e+10, -4.44319332e+00, -2.20713292e+00,
       -2.95250387e+00, -1.53883744e+00,  3.01473348e+00, -4.60540335e+10,
       -2.07900267e+10,  1.09400877e+11, -8.68487255e+10, -1.26832814e+00,
       -1.16659123e+00, -1.35749094e-01,  2.81359972e+00, -1.97650100e+00,
       -1.10816273e+11, -1.93916592e+11, -7.12573640e+10, -6.80153296e+10,
        3.85037368e-01,  1.71579269e-01, -5.25571363e-01,  3.49994333e+00,
       -2.04658007e+00, -2.62191546e+10, -2.16090047e+10, -2.06258483e+10,
        1.83546538e-01, -6.24509796e-01, -1.89501969e+00, -5.63179799e-01,
       -1.08850675e+10, -8.97113113e+09, -8.56296678e+09, -1.93246841e-01,
        5.19730568e-01, -3.53975296e+00,  4.92858486e+10,  4.06198503e+10,
        3.87717472e+10,  5.21789551e-01,  1.75518036e+00,  9.36501148e+09,
        7.71834867e+09,  7.36718284e+09,  3.74290466e-01,  6.87454454e+09,
        5.66578394e+09,  5.40800475e+09, -2.72196349e+10,  2.96019913e+10,
        2.82551736e+10,  

In [20]:
ridge_poly = Ridge(.3)
ridge_poly.fit(X_train_poly, y_train_poly)
ridge_poly.coef_

array([[ 0.        ,  3.73512981, -2.93500874, -2.13974194, -3.56547812,
        -1.28898893,  3.01290805,  2.04739082,  0.0786974 ,  0.21972225,
        -0.3302341 , -1.46231096, -1.17221896,  0.00856067,  2.48054694,
        -1.67596093,  0.99537516, -2.29024279,  4.7699338 , -2.08598898,
         0.34009408,  0.35024058, -0.41761834,  3.06970569, -2.21649433,
         1.86339518, -2.62934278,  0.38596397,  0.12088534, -0.53440382,
        -1.88265835, -0.7675926 , -0.90146842,  0.52416091,  0.59678246,
        -0.26349448,  0.5827378 , -3.02842915, -0.36548074,  0.5956112 ,
        -0.15941014,  0.49168856,  1.45652375, -0.43819158, -0.20964198,
         0.77665496,  0.36489921, -0.4750838 ,  0.3551047 ,  0.23188557,
        -1.42941282,  2.06831543, -0.34986402, -0.32320394,  0.39054656,
         0.06283411]])

In [21]:
lasso_poly = Lasso(.1)
lasso_poly.fit(X_train_poly, y_train_poly)
lasso_poly.coef_

array([ 0.        , -0.        , -0.        , -1.59613165, -5.22452383,
       -0.        ,  2.86907439,  0.03030592, -0.10514919,  0.        ,
        0.        , -0.        , -0.        ,  0.        ,  0.28971732,
       -0.        ,  0.        , -0.        ,  0.11457443, -0.        ,
        0.        ,  1.15720495,  0.        ,  0.        , -0.        ,
        0.        ,  0.        , -0.        ,  0.04724906,  0.        ,
       -0.6925298 , -0.        ,  0.        ,  0.        , -0.        ,
       -0.        , -0.        , -0.67082659,  0.        , -0.        ,
       -0.        ,  0.16918498, -0.        , -0.61771612,  0.        ,
        0.36046427,  0.        , -0.37086554,  0.        ,  0.        ,
       -0.        , -0.        ,  0.18165859, -0.        , -0.        ,
       -0.        ])

In [22]:
#Checking the scores of the Simple, Ridge, Lasso regressions with polynomial features
#Simple linear regression
print(reg_model_poly.score(X_train_poly, y_train_poly))
print(reg_model_poly.score(X_test_poly, y_test_poly))

0.9150553186137941
0.8587339450004422


In [23]:
#Ridge regression
print(ridge_poly.score(X_train_poly, y_train_poly))
print(ridge_poly.score(X_test_poly, y_test_poly))

0.9143225702003367
0.861339805369855


In [24]:
#Lasso regression
print(lasso_poly.score(X_train_poly, y_train_poly))
print(lasso_poly.score(X_test_poly, y_test_poly))

0.8900519684208551
0.880222844847697


The accuracy is a bit better than earlier and if we compare the accuracies of the earlier linear versions. For training data, polynomial linear regression, poynomial ridge regression is doing quite good. But to generalize the model and predict the scores for test data lasso model and ridge model are doing really well. We can get a competitive scores using very less dimensions and simpler models using regularization methods.