# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

# Load Dataset

In [2]:
# Location of the car MPG dataset (CSV).
# NOTE(review): absolute, machine-specific Windows path — the notebook only
# runs where the file exists at exactly this location.
DATA_PATH = r'E:\PYTHONCLASSJUPYTER\PrakashSenapati\car_mpg.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [4]:
data.shape

(398, 10)

In [5]:
# Drop the car name column; every other column is kept.
data = data.drop(columns=['car_name'])
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,1,0
1,15.0,8,350.0,165,3693,11.5,70,1,0
2,18.0,8,318.0,150,3436,11.0,70,1,0
3,16.0,8,304.0,150,3433,12.0,70,1,0
4,17.0,8,302.0,140,3449,10.5,70,1,0


# EDA

In [6]:
# Replace the numeric origin codes with region labels.
origin_labels = {1: 'america', 2: 'europe', 3: 'asia'}
data['origin'] = data['origin'].replace(origin_labels)
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,america,0
1,15.0,8,350.0,165,3693,11.5,70,america,0
2,18.0,8,318.0,150,3436,11.0,70,america,0
3,16.0,8,304.0,150,3433,12.0,70,america,0
4,17.0,8,302.0,140,3449,10.5,70,america,0


In [7]:
# One-hot encode 'origin': one indicator column per region label.
categorical_columns = ['origin']
data = pd.get_dummies(data, columns=categorical_columns)
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,0,True,False,False
1,15.0,8,350.0,165,3693,11.5,70,0,True,False,False
2,18.0,8,318.0,150,3436,11.0,70,0,True,False,False
3,16.0,8,304.0,150,3433,12.0,70,0,True,False,False
4,17.0,8,302.0,140,3449,10.5,70,0,True,False,False


In [8]:
# Number of missing (NaN) entries in each column.
data.isna().sum()

mpg               0
cyl               0
disp              0
hp                0
wt                0
acc               0
yr                0
car_type          0
origin_america    0
origin_asia       0
origin_europe     0
dtype: int64

In [11]:
# Disabled: replacement of '?' placeholders with NaN.
# NOTE(review): the execution counters around this cell are non-consecutive
# (8 -> 11 -> 17), so this line may have been executed before it was commented
# out — confirm the notebook's behavior on a fresh kernel.
# data = data.replace('?', np.nan)

In [17]:
data.dtypes

mpg               float64
cyl                 int64
disp              float64
hp                 object
wt                  int64
acc               float64
yr                  int64
car_type            int64
origin_america       bool
origin_asia          bool
origin_europe        bool
dtype: object

In [21]:
# Convert 'hp' (read from the CSV as object dtype) to a numeric column.
# errors='coerce' turns any entry that cannot be parsed as a number (for
# example a '?' placeholder) into NaN instead of raising ValueError, so this
# cell does not depend on an earlier '?' -> NaN replacement having been run.
# The resulting NaN values are left for the imputation step to fill.
data['hp'] = pd.to_numeric(data['hp'], errors='coerce')

In [22]:
data.dtypes

mpg               float64
cyl                 int64
disp              float64
hp                float64
wt                  int64
acc               float64
yr                  int64
car_type            int64
origin_america       bool
origin_asia          bool
origin_europe        bool
dtype: object

In [23]:
def fill_with_median(column):
    """Return `column` with its missing values replaced by the column's median."""
    return column.fillna(column.median())


# Apply the imputation column by column over the whole frame.
data = data.apply(fill_with_median, axis=0)

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cyl             398 non-null    int64  
 2   disp            398 non-null    float64
 3   hp              398 non-null    float64
 4   wt              398 non-null    int64  
 5   acc             398 non-null    float64
 6   yr              398 non-null    int64  
 7   car_type        398 non-null    int64  
 8   origin_america  398 non-null    bool   
 9   origin_asia     398 non-null    bool   
 10  origin_europe   398 non-null    bool   
dtypes: bool(3), float64(4), int64(4)
memory usage: 26.2 KB


# Model Building

## Divide EDA_Cleaned Data

* x is the DataFrame of independent variables (features)
* y is the DataFrame holding the dependent variable (target, `mpg`)

In [27]:
# Target: 'mpg', kept as a one-column DataFrame (double brackets).
y = data[['mpg']]
# Features: every remaining column.
x = data.drop(columns=['mpg'])

In [29]:
x.head()

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,8,307.0,130.0,3504,12.0,70,0,True,False,False
1,8,350.0,165.0,3693,11.5,70,0,True,False,False
2,8,318.0,150.0,3436,11.0,70,0,True,False,False
3,8,304.0,150.0,3433,12.0,70,0,True,False,False
4,8,302.0,140.0,3449,10.5,70,0,True,False,False


In [34]:
x.shape

(398, 10)

In [30]:
y.head()

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


In [35]:
y.shape

(398, 1)

## Scale Data

- **x_s is scaled data prepared from x**

In [36]:
# Standardize every feature column; the keyword arguments are scale()'s
# defaults, written out explicitly.
# NOTE(review): the statistics are computed over all rows, before the
# train/test split made later in the notebook, so test rows influence the
# scaling — consider fitting the scaler on the training rows only.
x_s = preprocessing.scale(x, axis=0, with_mean=True, with_std=True)
x_s.shape

(398, 10)

In [39]:
print(x_s)

[[ 1.49819126  1.0906037   0.67311762 ...  0.77355903 -0.49764335
  -0.46196822]
 [ 1.49819126  1.5035143   1.58995818 ...  0.77355903 -0.49764335
  -0.46196822]
 [ 1.49819126  1.19623199  1.19702651 ...  0.77355903 -0.49764335
  -0.46196822]
 ...
 [-0.85632057 -0.56103873 -0.53187283 ...  0.77355903 -0.49764335
  -0.46196822]
 [-0.85632057 -0.70507731 -0.66285006 ...  0.77355903 -0.49764335
  -0.46196822]
 [-0.85632057 -0.71467988 -0.58426372 ...  0.77355903 -0.49764335
  -0.46196822]]


- **Convert scaled data (x_s) into Pandas DataFrame**

In [40]:
# Wrap the scaled array in a DataFrame, reusing the column names of x.
feature_names = x.columns
x_s = pd.DataFrame(data=x_s, columns=feature_names)
x_s

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,1.498191,1.090604,0.673118,0.630870,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
1,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
2,1.498191,1.196232,1.197027,0.550470,-1.658577,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
3,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
4,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
...,...,...,...,...,...,...,...,...,...,...
393,-0.856321,-0.513026,-0.479482,-0.213324,0.011586,1.621983,0.941412,0.773559,-0.497643,-0.461968
394,-0.856321,-0.925936,-1.370127,-0.993671,3.279296,1.621983,0.941412,-1.292726,-0.497643,2.164651
395,-0.856321,-0.561039,-0.531873,-0.798585,-1.440730,1.621983,0.941412,0.773559,-0.497643,-0.461968
396,-0.856321,-0.705077,-0.662850,-0.408411,1.100822,1.621983,0.941412,0.773559,-0.497643,-0.461968


In [43]:
# Standardize the target column (scale()'s defaults written out explicitly)
# and show the first ten scaled values.
y_s = preprocessing.scale(y, axis=0, with_mean=True, with_std=True)
print(y_s[:10])

[[-0.7064387 ]
 [-1.09075062]
 [-0.7064387 ]
 [-0.96264665]
 [-0.83454267]
 [-1.09075062]
 [-1.2188546 ]
 [-1.2188546 ]
 [-1.2188546 ]
 [-1.09075062]]


In [46]:
# Wrap the scaled target array in a DataFrame with y's column name.
target_names = y.columns
y_s = pd.DataFrame(data=y_s, columns=target_names)
y_s.head(n=10)

Unnamed: 0,mpg
0,-0.706439
1,-1.090751
2,-0.706439
3,-0.962647
4,-0.834543
5,-1.090751
6,-1.218855
7,-1.218855
8,-1.218855
9,-1.090751


## Split into Train & Test Data

In [47]:
# 30% of the rows are held out for testing; the fixed seed keeps the
# partition identical between runs.
TEST_FRACTION = 0.30
SPLIT_SEED = 1
x_train, x_test, y_train, y_test = train_test_split(
    x_s, y_s, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [48]:
x_train.head(5)

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
350,-0.856321,-0.849116,-1.081977,-0.893172,-0.24257,1.351199,0.941412,0.773559,-0.497643,-0.461968
59,-0.856321,-0.925936,-1.317736,-0.847061,2.879909,-1.085858,0.941412,-1.292726,-0.497643,2.164651
120,-0.856321,-0.695475,0.2016,-0.121101,-0.024722,-0.815074,0.941412,-1.292726,-0.497643,2.164651
12,1.498191,1.983643,1.197027,0.934732,-2.203196,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
349,-0.856321,-0.983552,-0.951,-1.165111,0.156817,1.351199,0.941412,-1.292726,2.009471,-0.461968


In [49]:
x_train.shape

(278, 10)

In [50]:
x_test.head(5)

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
174,0.320935,-0.215346,-0.191332,0.016051,-0.387801,-0.273506,-1.062235,0.773559,-0.497643,-0.461968
359,-0.856321,-0.503423,-0.636655,0.306908,1.754364,1.351199,0.941412,-1.292726,-0.497643,2.164651
250,1.498191,1.196232,0.935072,0.903991,-0.859804,0.538847,-1.062235,0.773559,-0.497643,-0.461968
274,-0.267693,-0.599449,-0.034159,-0.16603,0.120509,0.538847,0.941412,-1.292726,-0.497643,2.164651
283,0.320935,0.370411,-0.3747,0.34829,0.955591,0.809631,-1.062235,0.773559,-0.497643,-0.461968


In [51]:
x_test.shape

(120, 10)

In [52]:
y_train.head()

Unnamed: 0,mpg
350,1.432898
59,-0.065919
120,-0.578335
12,-1.090751
349,1.356035


In [53]:
y_train.shape

(278, 1)

In [54]:
y_test.head()

Unnamed: 0,mpg
174,-0.706439
359,0.587411
250,-0.527093
274,-0.4118
283,-0.42461


In [55]:
y_test.shape

(120, 1)

## Simple Linear Model

In [56]:
# Unregularized linear regression fitted on the training split.
regression_model = LinearRegression()
regression_model.fit(X=x_train, y=y_train)

In [57]:
# Print each training feature next to its positional index.
for position, feature in enumerate(x_train.columns):
    print(f"{position} {feature}")

0 cyl
1 disp
2 hp
3 wt
4 acc
5 yr
6 car_type
7 origin_america
8 origin_asia
9 origin_europe


In [63]:
# Pair every feature name with its fitted weight; coef_[0] is the row of
# weights for the single target column.
for feature, weight in zip(x_train.columns, regression_model.coef_[0]):
    print(f"The coefficient for ['{feature}'] = {weight}")

The coefficient for ['cyl'] = 0.32102238569161057
The coefficient for ['disp'] = 0.324834309184838
The coefficient for ['hp'] = -0.22916950059437718
The coefficient for ['wt'] = -0.7112101905072288
The coefficient for ['acc'] = 0.014713682764190883
The coefficient for ['yr'] = 0.3755811949510743
The coefficient for ['car_type'] = 0.3814769484233099
The coefficient for ['origin_america'] = -0.07472247547584182
The coefficient for ['origin_asia'] = 0.044515252035678216
The coefficient for ['origin_europe'] = 0.04834854953945382


In [64]:
# Intercept of the fitted linear model (first and only target).
intercept = regression_model.intercept_[0]
print(f'The intercept is {intercept}')

The intercept is 0.0192841161036397


## Regularized Ridge Regression

In [68]:
# Ridge (L2-penalized) regression on the training split.
RIDGE_ALPHA = 0.3
ridge_model = Ridge(alpha=RIDGE_ALPHA)
ridge_model.fit(X=x_train, y=y_train)

In [69]:
print( ridge_model.coef_ )

[[ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
   0.37725608 -0.07423624  0.04441039  0.04784031]]


## Regularized Lasso Regression

In [65]:
# Lasso (L1-penalized) regression on the training split.
LASSO_ALPHA = 0.1
lasso_model = Lasso(alpha=LASSO_ALPHA)
lasso_model.fit(X=x_train, y=y_train)

In [67]:
print( lasso_model.coef_ )

[-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]


## Score Comparison

In [70]:
# Linear model score on the training split, then on the test split.
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(regression_model.score(features, target))

0.8343770256960538
0.8513421387780067


In [71]:
# Ridge model score on the training split, then on the test split.
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(ridge_model.score(features, target))

0.8343617931312616
0.8518882171608508


In [72]:
# Lasso model score on the training split, then on the test split.
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(lasso_model.score(features, target))

0.7938010766228453
0.8375229615977083


## Polynomial Features