In [1]:
# IMPORTING THE LIBRARIES
import numpy as np  # Numerical lib

import pandas as pd # Data Handling lib

import matplotlib.pyplot as plt  # Plotting lib
%matplotlib inline

import seaborn as sns # Statistical plots lib

from sklearn.linear_model import LinearRegression, Ridge, Lasso # Linear regression ML lib

from sklearn.model_selection import train_test_split # for splitting data into training and testing parts

from sklearn import preprocessing # for scaling all variables similarly

In [2]:
# Load the raw car-mpg dataset.
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
csv_path = 'C://Users/syeda/Downloads/car-mpg.csv'
car_data = pd.read_csv(csv_path)

In [3]:
car_data.head(n=5)  # first five rows: confirm the file was read and preview the columns

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


### Pre-Processing or Cleaning the data

In [4]:
car_data.shape  # (number of rows, number of columns) of the raw frame

(398, 10)

In [5]:
car_data.isna().sum()  # number of missing values in each column

mpg         0
cyl         0
disp        0
hp          0
wt          0
acc         0
yr          0
origin      0
car_type    0
car_name    0
dtype: int64

In [6]:
car_data.info()  # per-column dtype and non-null count; a numeric-looking column shown as `object` needs cleaning

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    object 
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
 9   car_name  398 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 31.2+ KB


- Thus there are no null values in any column.
- However, a column like `car_name` should have no effect on the target variable `mpg`, so we can drop it. It is a good idea to make a copy of the original data first and drop the column from the copy.
- `hp` looks numeric but is stored as `object`. This is probably due to unexpected non-numeric values, so we should find and clean them.
- Also, `origin` is really a categorical variable, so we convert its codes to labels and apply one-hot encoding.

In [7]:
# Work on an independent copy so the raw frame stays available unchanged.
car_data_copy = car_data.copy(deep=True)

In [8]:
# `car_name` is a free-text label, not a predictor, so remove it from the working copy.
car_data_copy = car_data_copy.drop(columns=['car_name'])

#### Cleaning `hp`

In [9]:
# Show the rows whose `hp` entry is not made only of digits; these are what turn the column into `object`.
# The column has no missing values (see the null check above), so the mask is purely boolean.
hp_is_digits = car_data_copy['hp'].str.isdigit()
car_data_copy[~hp_is_digits]

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
32,25.0,4,98.0,?,2046,19.0,71,1,1
126,21.0,6,200.0,?,2875,17.0,74,1,0
330,40.9,4,85.0,?,1835,17.3,80,2,1
336,23.6,4,140.0,?,2905,14.3,80,1,1
354,34.5,4,100.0,?,2320,15.8,81,2,1
374,23.0,4,151.0,?,3035,20.5,82,1,1


In [10]:
# Replace the '?' placeholders in `hp` with the column median.
#
# The cleaned Series is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on `car_data_copy['hp']`: that form mutates an
# intermediate Series, which pandas warns about and which, under Copy-on-Write,
# leaves `car_data_copy` unchanged (the later `astype('float64')` would then
# fail on the remaining '?').
hp_numeric = car_data_copy['hp'].replace('?', np.nan).astype('float64')

# The median is computed on the known values only (NaN is skipped by default).
car_data_copy['hp'] = hp_numeric.fillna(hp_numeric.median())

In [11]:
# Turn the numeric `origin` codes into labels, then one-hot encode them.
#
# The relabelled Series is assigned back explicitly; calling
# `replace(..., inplace=True)` on `car_data_copy['origin']` mutates an
# intermediate Series and, under pandas Copy-on-Write, would leave the frame
# with the numeric codes (producing `origin_1/2/3` dummy columns instead of
# the named ones used later in the notebook).
origin_labels = {1: 'america', 2: 'europe', 3: 'asia'}
car_data_copy['origin'] = car_data_copy['origin'].replace(origin_labels)

# Creates one `origin_<label>` indicator column per label and drops `origin`.
car_data_copy = pd.get_dummies(data=car_data_copy, columns=['origin'])

### Splitting data into X and y

In [12]:
# Independent variables: everything except the target `mpg`.
# One dummy column (`origin_europe`) is also dropped because the remaining
# origin dummies already determine it.
non_feature_columns = ['mpg', 'origin_europe']
X = car_data_copy.drop(columns=non_feature_columns)

In [13]:
# Dependent (target) variable, kept as a single-column DataFrame.
y = car_data_copy.loc[:, ['mpg']]

### Scaling all columns

In [14]:
# Standardise every feature column; `preprocessing.scale` returns a NumPy array,
# so wrap it back into a DataFrame that carries the original column names.
# NOTE(review): the scaling statistics are computed on all rows, before the
# train/test split performed later, so the test rows influence the scaling.
X_scaled = pd.DataFrame(data=preprocessing.scale(X), columns=X.columns)

In [15]:
# Standardise the target the same way and keep it as a labelled single-column DataFrame.
y_scaled = pd.DataFrame(data=preprocessing.scale(y), columns=y.columns)

### Splitting into Training and Testing data

In [16]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.30,
    random_state=1,
)

### Fit the Linear Model

In [17]:
# Ordinary least-squares baseline on the scaled training data.
linearReg = LinearRegression()
linearReg.fit(X=X_train, y=y_train)

LinearRegression()

In [18]:
linearReg.coef_ # fitted coefficients, one per feature column of X_train (2-D because y is a single-column frame)

array([[ 0.32102239,  0.32483431, -0.2291695 , -0.71121019,  0.01471368,
         0.37558119,  0.38147695, -0.13618216, -0.00613789]])

In [19]:
linearReg.intercept_ # fitted intercept term (a one-element array for the single target column)

array([0.01928412])

In [20]:
# Pair every feature name with its fitted coefficient for easier reading.
for col_name, coefficient in zip(X_train.columns, linearReg.coef_[0]):
    print(f'The coefficient of {col_name} is {coefficient}')

The coefficient of cyl is 0.3210223856916109
The coefficient of disp is 0.32483430918483924
The coefficient of hp is -0.2291695005943764
The coefficient of wt is -0.7112101905072297
The coefficient of acc is 0.014713682764191237
The coefficient of yr is 0.3755811949510749
The coefficient of car_type is 0.3814769484233108
The coefficient of origin_america is -0.13618215843840353
The coefficient of origin_asia is -0.006137890589388725


### RIDGE Model

In [21]:
# Ridge (L2-penalised) regression; `alpha` sets the strength of the penalty.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)

print('Ridge Model Coeff:', ridge.coef_)
print('Ridge Model Intercept:', ridge.intercept_)

Ridge Model Coeff: [[ 0.31658439  0.31300635 -0.22875871 -0.70101302  0.01295503  0.37442624
   0.37733935 -0.13479863 -0.00552806]]
Ridge Model Intercept: [0.01918841]


### LASSO Model

In [22]:
# Lasso (L1-penalised) regression; `alpha` sets the strength of the penalty.
lasso = Lasso(alpha=0.2).fit(X_train, y_train)

print('Lasso Model Coeff:', lasso.coef_)
print('Lasso Model Intercept:', lasso.intercept_)

Lasso Model Coeff: [-0.         -0.         -0.         -0.49040652  0.          0.20770417
  0.09573255 -0.          0.        ]
Lasso Model Intercept: [0.00845078]


With `Lasso`, many coefficients are shrunk to exactly zero, whereas `Ridge` only shrinks them slightly toward zero.

#### Comparison of Scores of models

In [23]:
# R^2 of the plain linear model: training split first, then testing split.
# All feature columns contribute to this score.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(linearReg.score(features, target))

0.8343770256960538
0.8513421387780066


In [24]:
# R^2 of the Ridge model: training split first, then testing split.
# All feature columns contribute to this score.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

0.8343615817491262
0.851903014380742


In [25]:
# R^2 of the Lasso model: training split first, then testing split.
# Fewer dimensions contribute here because shrinkage set several coefficients
# to zero, which makes the model less complex.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))

0.7449291598497929
0.7889774437561077


In [26]:
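# Linear and Ridge give more or less the same scores; Ridge only shrinks the coefficients slightly.
# Model complexity is a function of the number of variables and the size of their coefficients.
# Note - Lasso scores lower on both sets (about 0.74 train / 0.79 test versus about 0.83 / 0.85),
# but it gets there with a much simpler model: only three of its coefficients are non-zero.
# So the number of dimensions is much lower in the Lasso model than in the Ridge or un-regularized model.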

## Generating new dimensions using `Polynomial Features`

In [27]:
# Expand the scaled features with pairwise interaction terms
# (`interaction_only=True` leaves out pure powers such as x**2).
poly = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=True,
)
X_poly = poly.fit_transform(X_scaled)

# Re-split with the same seed and ratio as before.
# Note that this rebinds X_train / X_test / y_train / y_test to the expanded data.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y_scaled,
    test_size=0.30,
    random_state=1,
)

In [28]:
X_train.shape  # the column count is now larger: the expansion added interaction columns

(278, 46)

### Simple Linear Model

In [29]:
# Refit the un-regularised linear model on the expanded feature set
# (this overwrites the earlier fit stored in `linearReg`).
linearReg.fit(X_train, y_train)

# With many more dimensions and no penalty, nothing restrains the size of the
# coefficients; very large values indicate an unstable fit with sharp peaks and valleys.
print('Simple Linear Reg Coeff', linearReg.coef_[0], end='\n\n')
print('Simple Linear Reg Intercept', linearReg.intercept_, end='\n\n')

# A training score well above the testing score is the sign of over-fitting.
train_score = linearReg.score(X_train, y_train)
test_score = linearReg.score(X_test, y_test)
print('Training Score', train_score)
print('Testing Score', test_score)

Simple Linear Reg Coeff [ 5.10550450e-14 -2.92688588e+11 -5.61248779e-01 -2.89993286e-01
 -3.73680115e-01 -1.96283340e-01  3.85353088e-01 -3.89708847e+11
 -9.80241969e+10  4.61517748e+11 -1.55683517e-01 -1.65529251e-01
 -1.05104446e-02  3.51501465e-01 -2.55874634e-01 -9.37726375e+11
 -9.09299408e+11  1.29382324e+00  5.14817238e-02  2.40907669e-02
 -6.41250610e-02  4.50363159e-01 -2.60421515e-01  1.92657471e-01
 -7.80395508e-01  2.39706039e-02 -8.48464966e-02 -2.47711182e-01
 -8.01219940e-02 -2.26280212e-01 -1.70745850e-02 -2.34565735e-02
  6.74896240e-02 -4.36374426e-01 -2.19116211e-02  2.55073547e-01
  6.21833801e-02  2.18288422e-01 -1.77679181e-01 -1.67973518e-01
  4.79278564e-02 -1.00662231e-01  1.28707886e-02 -5.34265029e+11
  5.26367188e-01  3.57011222e+11]

Simple Linear Reg Intercept [-4.10035849e+11]

Training Score 0.915008095303072
Testing Score 0.858609120302925


### Ridge Model

In [30]:
# Refit Ridge on the expanded feature set (this overwrites the earlier fit stored in `ridge`).
ridge.fit(X_train, y_train)

# The L2 penalty keeps the coefficients small, which gives a smoother fitted surface.
print('Ridge Reg Coeff', ridge.coef_[0], end='\n\n')
print('Ridge Reg Intercept', ridge.intercept_[0], end='\n\n')

# Compare the training and testing scores to see how well the fit generalises.
train_score = ridge.score(X_train, y_train)
test_score = ridge.score(X_test, y_test)
print('Training Score', train_score)
print('Testing Score', test_score)

Ridge Reg Coeff [ 0.          0.46885355 -0.37402127 -0.27089854 -0.46317828 -0.16311881
  0.38622902  0.23163975  0.05750483  0.05631796 -0.18330425 -0.14913042
 -0.00190077  0.31347509 -0.21325107  0.02784534 -0.15822531  0.76956105
  0.04194593  0.03727214 -0.05285657  0.39111239 -0.28346519  0.21829204
 -0.32564817  0.02123812 -0.06697644 -0.23952141 -0.09061334 -0.20705001
 -0.00695606 -0.03625414  0.07317645 -0.39531717 -0.03127763  0.07598792
  0.06354046  0.18321052 -0.18031527 -0.12535362  0.04665224 -0.09707233
  0.01599254 -0.23507189  0.24895732 -0.10142059]

Ridge Reg Intercept -0.5025097833156755

Training Score 0.9141778746823359
Testing Score 0.8603085263110439


### Lasso Model

In [31]:
# Refit Lasso on the expanded feature set (this overwrites the earlier fit stored in `lasso`).
lasso.fit(X_train, y_train)

# The L1 penalty sets many coefficients to exactly zero, so the corresponding
# features are effectively dropped and the model uses far fewer dimensions.
print('Lasso Reg Coeff', lasso.coef_, end='\n\n')
print('Lasso Reg Intercept', lasso.intercept_[0], end='\n\n')

# Compare the training and testing scores to see how well the fit generalises.
train_score = lasso.score(X_train, y_train)
test_score = lasso.score(X_test, y_test)
print('Training Score', train_score)
print('Testing Score', test_score)

Lasso Reg Coeff [ 0.         -0.         -0.         -0.         -0.49040652  0.
  0.20770417  0.09573255 -0.          0.         -0.          0.
  0.          0.         -0.          0.          0.         -0.
  0.          0.         -0.         -0.         -0.          0.
 -0.          0.         -0.         -0.         -0.          0.
 -0.         -0.         -0.         -0.          0.         -0.
  0.          0.         -0.          0.          0.         -0.
  0.         -0.          0.         -0.        ]

Lasso Reg Intercept 0.008450775157924207

Training Score 0.7449291598497929
Testing Score 0.7889774437561077
