# lasso, ridge - regularization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
data = pd.read_csv(r"C:\Users\Parva\OneDrive\Desktop\naresh it\17th\17th\TASK-22_LASSO,RIDGE\car-mpg.csv")
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [3]:
data = data.drop(['car_name'], axis = 1)
data['origin'] = data['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
data = pd.get_dummies(data,columns = ['origin'])
data = data.replace('?', np.nan)
data = data.apply(lambda x: x.fillna(x.median()), axis = 0)

In [4]:
data

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,1,0,0
394,44.0,4,97.0,52,2130,24.6,82,1,0,0,1
395,32.0,4,135.0,84,2295,11.6,82,1,1,0,0
396,28.0,4,120.0,79,2625,18.6,82,1,1,0,0


In [5]:
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,0,1,0,0


# model buliding

In [6]:
x = data.drop(['mpg'],axis=1)  # independent variable
x

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,8,307.0,130,3504,12.0,70,0,1,0,0
1,8,350.0,165,3693,11.5,70,0,1,0,0
2,8,318.0,150,3436,11.0,70,0,1,0,0
3,8,304.0,150,3433,12.0,70,0,1,0,0
4,8,302.0,140,3449,10.5,70,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
393,4,140.0,86,2790,15.6,82,1,1,0,0
394,4,97.0,52,2130,24.6,82,1,0,0,1
395,4,135.0,84,2295,11.6,82,1,1,0,0
396,4,120.0,79,2625,18.6,82,1,1,0,0


In [7]:
y = data[['mpg']]    # dependent varible
y

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


In [8]:
# scaling the data

x_s = preprocessing.scale(x)
x_s = pd.DataFrame(x_s,columns=x.columns)
x_s

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,1.498191,1.090604,0.673118,0.630870,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
1,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
2,1.498191,1.196232,1.197027,0.550470,-1.658577,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
3,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
4,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
...,...,...,...,...,...,...,...,...,...,...
393,-0.856321,-0.513026,-0.479482,-0.213324,0.011586,1.621983,0.941412,0.773559,-0.497643,-0.461968
394,-0.856321,-0.925936,-1.370127,-0.993671,3.279296,1.621983,0.941412,-1.292726,-0.497643,2.164651
395,-0.856321,-0.561039,-0.531873,-0.798585,-1.440730,1.621983,0.941412,0.773559,-0.497643,-0.461968
396,-0.856321,-0.705077,-0.662850,-0.408411,1.100822,1.621983,0.941412,0.773559,-0.497643,-0.461968


In [9]:
y_s = preprocessing.scale(y)
y_s = pd.DataFrame(y_s, columns = y.columns) #ideally train, test data should be in columns
y_s

Unnamed: 0,mpg
0,-0.706439
1,-1.090751
2,-0.706439
3,-0.962647
4,-0.834543
...,...
393,0.446497
394,2.624265
395,1.087017
396,0.574601


In [10]:
x_train , x_test , y_train , y_test = train_test_split(x_s , y_s , test_size=0.30,random_state=0)
x_train.shape

(278, 10)

In [11]:
x_test.shape

(120, 10)

# simple linear model

In [12]:
# fit simple linear model and find coefficents
regression_model = LinearRegression()
regression_model.fit(x_train,y_train)

In [13]:
for idx, col_name in enumerate(x_train.columns):
    print('The coefficeint for {} is {}'.format(col_name,regression_model.coef_[0][idx]))

The coefficeint for cyl is 0.24744479758946683
The coefficeint for disp is 0.2883821544609883
The coefficeint for hp is -0.18990342687152895
The coefficeint for wt is -0.6732229065111781
The coefficeint for acc is 0.06754501540688176
The coefficeint for yr is 0.3446364072117277
The coefficeint for car_type is 0.3149149154003768
The coefficeint for origin_america is -0.07682943694882889
The coefficeint for origin_asia is 0.0633604889661996
The coefficeint for origin_europe is 0.0312833573514752


In [14]:
intercept = regression_model.intercept_[0]
print('the intercept is {}'.format(intercept))

the intercept is -0.01950046762401742


# Regularized Ridge Regression

In [15]:
ridge_model = Ridge(alpha = 0.3)
ridge_model.fit(x_train, y_train)

print('Ridge model coef: {}'.format(ridge_model.coef_))
#As the data has 10 columns hence 10 coefficients appear here    

Ridge model coef: [[ 0.24424435  0.27853222 -0.18980689 -0.66458446  0.06588077  0.34396213
   0.31169746 -0.07642734  0.06333336  0.03080065]]


# Regularized Lasso Regression

In [23]:
lasso_model = Lasso(alpha = 0.1)
lasso_model.fit(x_train,y_train)

print('Lasso model coef:{}'.format(lasso_model.coef_))

Lasso model coef:[-0.         -0.         -0.06203044 -0.48363379  0.          0.27163751
  0.09620861 -0.03490256  0.          0.        ]


# Score comparision

In [24]:
#Model score - r^2 or coeff of determinant
#r^2 = 1-(RSS/TSS) = Regression error/TSS 

#Simple Linear Model

print(regression_model.score(x_train, y_train))
print(regression_model.score(x_test,y_test))


0.836163800114943
0.8439452810748138


In [25]:
# Ridge
print(ridge_model.score(x_train,y_train))
print(ridge_model.score(x_test,y_test))

0.8361520170844985
0.8437853815947186


In [27]:
# lasso
print(lasso_model.score(x_train, y_train))
print(lasso_model.score(x_test, y_test))

0.7994535676270828
0.81026554865651
