In [2]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing plotting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [3]:
# Load the car-mpg data set and prepare an all-numeric frame for modelling.
# '?' is the file's missing-value marker; declaring it at read time lets pandas
# parse the affected column(s) as floats instead of leaving them as strings,
# so the median imputation below operates on real numbers (taking the median
# of a string column raises TypeError on recent pandas versions).
mpg_df = (
    pd.read_csv("car-mpg.csv", na_values="?")
    .drop(columns="car_name")  # not used for modelling
    # decode the numeric origin codes so the dummy columns get readable names
    .assign(origin=lambda df: df["origin"].replace({1: "america", 2: "europe", 3: "asia"}))
    .pipe(pd.get_dummies, columns=["origin"])
)

# Impute the remaining gaps with each column's own median.
mpg_df = mpg_df.fillna(mpg_df.median(numeric_only=True))

# separate independent and dependent variables

In [4]:
# Target: keep 'mpg' as a single-column DataFrame (double brackets), not a Series.
y = mpg_df[['mpg']]

# Predictors: every remaining column of mpg_df.
X = mpg_df.drop(columns=['mpg'])


In [5]:
from sklearn import preprocessing

# Standardise every predictor column. preprocessing.scale returns a NumPy array,
# so wrap the result back into a DataFrame to keep the column names.
# NOTE(review): scaling is done on the full data set before the train/test split;
# ideally the scaling statistics would come from the training rows only.
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)

# Same treatment for the single-column target.
y_scaled = pd.DataFrame(preprocessing.scale(y), columns=y.columns)

In [6]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [7]:
# Ordinary least-squares fit on the training split (fit returns the estimator itself).
regression_model = LinearRegression().fit(X_train, y_train)

# y_train is a single-column frame, so coef_ is 2-D; row 0 holds one weight per predictor.
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

The coefficient for cyl is 0.3210223856916102
The coefficient for disp is 0.3248343091848389
The coefficient for hp is -0.2291695005943763
The coefficient for wt is -0.7112101905072293
The coefficient for acc is 0.014713682764191417
The coefficient for yr is 0.3755811949510745
The coefficient for car_type is 0.38147694842331037
The coefficient for origin_america is -0.07472247547584179
The coefficient for origin_asia is 0.04451525203567797
The coefficient for origin_europe is 0.048348549539453875


In [8]:
# intercept_ is indexed with [0] to pull the scalar out of the fitted model's intercept array.
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.01928411610363971


# Create a regularized RIDGE model and note the coefficients

In [9]:
# Ridge (L2-penalised) regression on the training split; alpha sets the penalty strength.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)
    

Ridge model: [[ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
   0.37725608 -0.07423624  0.04441039  0.04784031]]


# Create a regularized LASSO model and note the coefficients

In [10]:
# Lasso (L1-penalised) regression on the training split; alpha sets the penalty strength.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# Observe: many of the coefficients are exactly 0, i.e. Lasso has dropped those dimensions from the model.

Lasso model: [-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]


## Let us compare their scores

In [11]:
# R^2 of the unregularized model: training split first, then the held-out split.
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)


0.8343770256960538
0.8513421387780067


In [12]:
# R^2 of the ridge model: training split first, then the held-out split.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)

0.8343617931312616
0.8518882171608506


In [13]:
# R^2 of the lasso model: training split first, then the held-out split.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)

0.7938010766228453
0.8375229615977083


In [14]:
# The regularized models give more or less similar results with less complex models. Complexity is a function of the number of variables and the size of their coefficients.
# Note: with Lasso we get a comparably good result on the test set, though not on the training set. Further, the Lasso model
# uses far fewer dimensions than the ridge or un-regularized model.

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [15]:
from sklearn.preprocessing import PolynomialFeatures

In [16]:
# Degree-2 expansion restricted to interaction terms (products of distinct features).
# Alternative that also adds the squared terms: PolynomialFeatures(2)
poly = PolynomialFeatures(degree=2, interaction_only=True)

In [17]:
# Expand the scaled predictors with the interaction-only polynomial transformer.
X_poly = poly.fit_transform(X_scaled)

# NOTE(review): the target here is the unscaled `y`, whereas the earlier split used
# `y_scaled` - confirm this is intended.
# Rebinding X_train/X_test/y_train/y_test replaces the earlier split of the plain features.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.30,
    random_state=1,
)
X_train.shape

(278, 56)

# Fit a simple non-regularized linear model on the polynomial features

In [18]:
# Refit the existing LinearRegression estimator on the polynomial features; this
# overwrites the coefficients learned from the earlier fit. fit() returns the
# estimator, so the coefficient row can be printed straight from the call.
print(regression_model.fit(X_train, y_train).coef_[0])


[-9.67853872e-13 -1.06354274e+13 -4.20438564e+00 -1.84192048e+00
 -2.72638006e+00 -1.26167775e+00  3.11178320e+00 -1.41608532e+13
 -1.75748808e+14 -1.00552702e+14 -1.67776805e+14 -2.24530835e+00
  1.72983592e+00 -1.43166279e+00  5.23339190e+00 -2.40839721e+00
 -3.40741702e+13 -1.33084675e+13  1.62631150e+13  1.55231834e+13
  3.94181666e-02 -1.20973679e-01 -1.74919750e+00  2.64875467e+00
 -1.96915695e+00 -1.01987477e+13 -8.40548787e+12 -8.02305890e+12
 -5.16931724e-01 -3.96352229e-01 -1.62429132e+00  1.23706751e+00
  1.05966794e+13  8.73345073e+12  8.33610026e+12 -2.50000000e-01
  3.00781250e-01 -5.60937500e+00 -1.49082566e+13 -1.22869174e+13
 -1.17278929e+13 -1.71875000e-01  3.98437500e+00  7.41223011e+12
  6.10892753e+12  5.83098639e+12  8.59375000e-02 -2.52292482e+13
 -2.07931549e+13 -1.98471176e+13  1.63473760e+13  2.94730682e+13
  2.81321163e+13  1.08552233e+14  5.18717282e+13 -8.66275359e+13]


In [19]:
# Fresh Ridge (L2-penalised) model on the current training split; alpha sets the penalty strength.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [20]:
# R^2 of the ridge model: training split first, then the held-out split.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)


0.9143225702003364
0.8613398053698547


In [21]:
# Fresh Lasso (L1-penalised) model on the current training split; alpha sets the penalty strength.
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)


Lasso model: [ 0.          0.52263805 -0.5402102  -1.99423315 -4.55360385 -0.85285179
  2.99044036  0.00711821 -0.          0.76073274 -0.         -0.
 -0.19736449  0.          2.04221833 -1.00014513  0.         -0.
  4.28412669 -0.          0.          0.31442062 -0.          2.13894094
 -1.06760107  0.         -0.          0.          0.         -0.44991392
 -1.55885506 -0.         -0.68837902  0.          0.17455864 -0.34653644
  0.3313704  -2.84931966  0.         -0.34340563  0.00815105  0.47019445
  1.25759712 -0.69634581  0.          0.55528147  0.2948979  -0.67289549
  0.06490671  0.         -1.19639935  1.06711702  0.         -0.88034391
  0.         -0.        ]


In [22]:
# R^2 of the lasso model: training split first, then the held-out split.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)


0.9098286193898272
0.8695296858772456
