## Model selection and tuning

In [32]:
# Import numerical libraries
import pandas as pd
import numpy as np

# Import graphical plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import Linear Regression Machine Learning Libraries
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

In [33]:
# Pandas dataframe
data_cars = pd.read_csv(r'C:\Users\sruja\OneDrive\Learning\DS and AI\3. Jan\6th - Regularization\7th- l1, l2, scaling\lasso, ridge, elastic net\TASK-22_LASSO,RIDGE\car-mpg.csv')

In [34]:
# Seeing the first five rows to get a glimpse of the data
data_cars.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [35]:
data_cars = data_cars.drop(['car_name'], axis = 1)

In [36]:
data_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    object 
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 28.1+ KB


In [38]:
data_cars['origin'] = data_cars['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

In [39]:
data_cars = pd.get_dummies(data_cars,columns = ['origin'])
data_cars = data_cars.replace('?', np.nan)

In [41]:
data_cars['hp']

0      130
1      165
2      150
3      150
4      140
      ... 
393     86
394     52
395     84
396     79
397     82
Name: hp, Length: 398, dtype: object

In [43]:
# Changing the datatype from object to float so that lambda function can be used in the next step (it works only on numeric data)
data_cars['hp'] = data_cars['hp'].astype(float)

In [44]:
data_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cyl             398 non-null    int64  
 2   disp            398 non-null    float64
 3   hp              392 non-null    float64
 4   wt              398 non-null    int64  
 5   acc             398 non-null    float64
 6   yr              398 non-null    int64  
 7   car_type        398 non-null    int64  
 8   origin_america  398 non-null    bool   
 9   origin_asia     398 non-null    bool   
 10  origin_europe   398 non-null    bool   
dtypes: bool(3), float64(4), int64(4)
memory usage: 26.2 KB


In [45]:
data_cars = data_cars.apply(lambda x: x.fillna(x.median()), axis = 0)  # This code throws an error if all the datatypes are not numeric
# data_cars.fillna(data_cars.median(numeric_only = True), inplace = True)

In [47]:
data_cars.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504,12.0,70,0,True,False,False
1,15.0,8,350.0,165.0,3693,11.5,70,0,True,False,False
2,18.0,8,318.0,150.0,3436,11.0,70,0,True,False,False
3,16.0,8,304.0,150.0,3433,12.0,70,0,True,False,False
4,17.0,8,302.0,140.0,3449,10.5,70,0,True,False,False


In [48]:
data_cars['origin_america'] = data_cars['origin_america'].astype(int)
data_cars['origin_asia'] = data_cars['origin_asia'].astype(int)
data_cars['origin_europe'] = data_cars['origin_europe'].astype(int)

In [49]:
data_cars.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165.0,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150.0,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150.0,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140.0,3449,10.5,70,0,1,0,0


### Build a machine learning model

In [63]:
X = data_cars.drop(['mpg'], axis = 1) 
y = data_cars[['mpg']]  # Mileage per gallon

In [64]:
X

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,8,307.0,130.0,3504,12.0,70,0,1,0,0
1,8,350.0,165.0,3693,11.5,70,0,1,0,0
2,8,318.0,150.0,3436,11.0,70,0,1,0,0
3,8,304.0,150.0,3433,12.0,70,0,1,0,0
4,8,302.0,140.0,3449,10.5,70,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1,1,0,0
394,4,97.0,52.0,2130,24.6,82,1,0,0,1
395,4,135.0,84.0,2295,11.6,82,1,1,0,0
396,4,120.0,79.0,2625,18.6,82,1,1,0,0


In [65]:
# Scaling
X_s = preprocessing.scale(X)
X_s = pd.DataFrame(X_s, columns = X.columns) # converting scaled data into dataframe

y_s = preprocessing.scale(y)
y_s = pd.DataFrame(y_s, columns = y.columns) # ideally train, test data should be in columns

In [66]:
X_s

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,1.498191,1.090604,0.673118,0.630870,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
1,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
2,1.498191,1.196232,1.197027,0.550470,-1.658577,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
3,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
4,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
...,...,...,...,...,...,...,...,...,...,...
393,-0.856321,-0.513026,-0.479482,-0.213324,0.011586,1.621983,0.941412,0.773559,-0.497643,-0.461968
394,-0.856321,-0.925936,-1.370127,-0.993671,3.279296,1.621983,0.941412,-1.292726,-0.497643,2.164651
395,-0.856321,-0.561039,-0.531873,-0.798585,-1.440730,1.621983,0.941412,0.773559,-0.497643,-0.461968
396,-0.856321,-0.705077,-0.662850,-0.408411,1.100822,1.621983,0.941412,0.773559,-0.497643,-0.461968


In [67]:
y_s

Unnamed: 0,mpg
0,-0.706439
1,-1.090751
2,-0.706439
3,-0.962647
4,-0.834543
...,...
393,0.446497
394,2.624265
395,1.087017
396,0.574601


In [62]:
data_cars.shape

(398, 11)

In [68]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size = 0.20, random_state = 0)
X_train.shape

(318, 10)

### Simple Linear Model

In [70]:
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

for idx, col_name in enumerate(X_train.columns):
    print('The coefficient for {} is {}'.format(col_name, linear_reg_model.coef_[0][idx]))    
intercept = linear_reg_model.intercept_[0]
print('The intercept is {}'.format(intercept))

The coefficient for cyl is 0.24297168181227313
The coefficient for disp is 0.2923555012814415
The coefficient for hp is -0.18342828140772874
The coefficient for wt is -0.6656888297642692
The coefficient for acc is 0.06522198269426219
The coefficient for yr is 0.3476276950232327
The coefficient for car_type is 0.3364881665039492
The coefficient for origin_america is 7464399952891.158
The coefficient for origin_asia is 6151924272936.219
The coefficient for origin_europe is 5872026888334.256
The intercept is -0.018203868513746675


### L1 or Lasso regularization

In [71]:
lasso_model = Lasso(alpha = 0.1)
lasso_model.fit(X_train, y_train)

print('Lasso model coef: {}'.format(lasso_model.coef_))   

Lasso model coef: [-0.         -0.         -0.07247557 -0.45867691  0.          0.2698134
  0.11341188 -0.04988145  0.          0.        ]


It can be seen that some of the coefficients have been reduced to zero, which is why this model is used for feature elimination