In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [145]:
automobile_df = pd.read_csv('datasets/auto-processed.csv')
automobile_df.head()

Unnamed: 0,wheel-base,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,...,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,114.2,3485,152,3.7,3.52,21.0,95.0,4150.0,25,25,...,0,0,0,0,0,1,0,0,0,0
1,101.2,2395,108,3.5,2.8,8.8,101.0,5800.0,23,29,...,1,0,0,0,0,0,0,1,0,0
2,98.4,2536,146,3.62,3.5,9.3,116.0,4800.0,24,30,...,1,0,0,0,0,0,0,1,0,0
3,110.0,3505,209,3.62,3.39,8.0,182.0,5400.0,15,20,...,1,0,0,0,0,0,0,1,0,0
4,96.3,2328,122,3.35,3.46,8.5,88.0,5000.0,25,32,...,1,0,0,0,1,0,0,0,0,0


In [146]:
target_city = 'city-mpg'
target_highway = 'highway-mpg'
features = ['wheel-base', 'curb-weight', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'price', 'num-of-cylinders', 'fuel-type_diesel',
       'fuel-type_gas', 'aspiration_std', 'aspiration_turbo',
       'drive-wheels_4wd', 'drive-wheels_fwd', 'drive-wheels_rwd',
       'engine-location_front', 'engine-location_rear', 'engine-type_dohc',
       'engine-type_l', 'engine-type_ohc', 'engine-type_ohcf',
       'engine-type_ohcv', 'fuel-system_1bbl', 'fuel-system_2bbl',
       'fuel-system_idi', 'fuel-system_mfi', 'fuel-system_mpfi',
       'fuel-system_spdi', 'fuel-system_spfi']
scaler = StandardScaler()
data = pd.DataFrame(scaler.fit_transform(automobile_df[features]), columns=features)

#### fit_transform is applied in train data.
#### transform is applied to test data. It uses th parameters learned in train data.

In [147]:
data.head()

Unnamed: 0,wheel-base,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,price,num-of-cylinders,...,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,2.502101,1.769307,0.582216,1.366739,0.860789,2.666801,-0.21899,-2.032862,0.476251,-0.408778,...,-1.658312,-0.288675,-0.267261,-0.244505,-0.698963,2.95804,-0.071796,-0.916309,-0.219971,-0.071796
1,0.376632,-0.313355,-0.48245,0.629188,-1.437268,-0.344295,-0.060144,1.499805,0.395984,-0.408778,...,0.603023,-0.288675,-0.267261,-0.244505,-0.698963,-0.338062,-0.071796,1.091335,-0.219971,-0.071796
2,-0.081162,-0.043946,0.437034,1.071718,0.796955,-0.22089,0.33697,-0.641205,-0.449126,-0.408778,...,0.603023,-0.288675,-0.267261,-0.244505,-0.698963,-0.338062,-0.071796,1.091335,-0.219971,-0.071796
3,1.815411,1.807521,1.961441,1.071718,0.445862,-0.541744,2.084273,0.643401,2.940894,1.559413,...,0.603023,-0.288675,-0.267261,-0.244505,-0.698963,-0.338062,-0.071796,1.091335,-0.219971,-0.071796
4,-0.424507,-0.441371,-0.143693,0.076024,0.669285,-0.418339,-0.40431,-0.213003,-0.590994,-0.408778,...,0.603023,-0.288675,-0.267261,-0.244505,1.43069,-0.338062,-0.071796,-0.916309,-0.219971,-0.071796


In [148]:
data.describe()

Unnamed: 0,wheel-base,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,price,num-of-cylinders,...,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,-3.34718e-15,0.0,4.554761e-18,2.7328570000000003e-17,1.293552e-15,-1.821904e-16,1.229786e-16,-4.053737e-16,3.1883330000000004e-17,-3.05169e-16,...,1.093143e-16,-7.059880000000001e-17,2.2773810000000003e-17,3.6438090000000005e-17,9.792736e-17,4.0992850000000006e-17,-2.2773810000000003e-17,9.223391000000001e-17,-1.13869e-16,-1.821904e-17
std,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,...,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574,1.002574
min,-2.010434,-2.046359,-1.619706,-2.911057,-3.767243,-0.7885555,-1.463281,-2.032862,-1.011744,-1.392874,...,-1.658312,-0.2886751,-0.2672612,-0.2445048,-0.6989632,-0.3380617,-0.07179582,-0.9163092,-0.2199707,-0.07179582
25%,-0.7188029,-0.791029,-0.7244192,-0.6615265,-0.4478266,-0.4183387,-0.880847,-0.6412052,-0.6833944,-0.4087782,...,-1.658312,-0.2886751,-0.2672612,-0.2445048,-0.6989632,-0.3380617,-0.07179582,-0.9163092,-0.2199707,-0.07179582
50%,-0.3100587,-0.277051,-0.1920865,-0.07148571,0.1266878,-0.2949331,-0.2189898,0.001097954,-0.3737117,-0.4087782,...,0.6030227,-0.2886751,-0.2672612,-0.2445048,-0.6989632,-0.3380617,-0.07179582,-0.9163092,-0.2199707,-0.07179582
75%,0.5728288,0.734664,0.4249355,0.9610856,0.5096974,-0.1962086,0.3369701,0.8575022,0.4058149,-0.4087782,...,0.6030227,-0.2886751,-0.2672612,-0.2445048,1.43069,-0.3380617,-0.07179582,1.091335,-0.2199707,-0.07179582
max,3.597536,2.879424,4.792483,2.2518,2.935425,3.160424,4.202216,3.212614,4.00117,7.463988,...,0.6030227,3.464102,3.741657,4.089899,1.43069,2.95804,13.92839,1.091335,4.546061,13.92839


In [149]:
X = data
Y = automobile_df['city-mpg']

In [150]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [151]:
from sklearn.model_selection import GridSearchCV 

parameters = {'alpha': [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]}
grid_search = GridSearchCV(Lasso(), parameters, cv=3, return_train_score=True)
#GS builds different models one for each of parameter. 7 models here.
# uses 3 fold cross validation to find best model.

grid_search.fit(x_train, y_train)

grid_search.best_params_

{'alpha': 0.2}

In [152]:
for i in range(len(parameters['alpha'])):
    print(f'parameters : {grid_search.cv_results_["params"][i]}')
    print(f'Mean Test Score: {grid_search.cv_results_["mean_test_score"][i]}')
    print(f'Rank : {grid_search.cv_results_["rank_test_score"][i]} ')

parameters : {'alpha': 0.2}
Mean Test Score: 0.7938187518110844
Rank : 1 
parameters : {'alpha': 0.4}
Mean Test Score: 0.783449057659834
Rank : 2 
parameters : {'alpha': 0.6}
Mean Test Score: 0.7736823392816229
Rank : 3 
parameters : {'alpha': 0.7}
Mean Test Score: 0.7669447110491836
Rank : 4 
parameters : {'alpha': 0.8}
Mean Test Score: 0.7586298953434908
Rank : 5 
parameters : {'alpha': 0.9}
Mean Test Score: 0.7487920632453484
Rank : 6 
parameters : {'alpha': 1.0}
Mean Test Score: 0.7374279189726695
Rank : 7 


In [153]:
lasso_reg = Lasso(alpha=grid_search.best_params_['alpha']).fit(x_train,y_train)

y_pred = lasso_reg.predict(x_test)

print('Training score: ', lasso_reg.score(x_train, y_train))
print('Test score :', r2_score(y_test, y_pred))

Training score:  0.8544942345785331
Test score : 0.8407826915062744


In [154]:
parameters = {'max_depth': [1, 2, 3, 4, 5, 7, 8]}

grid_search = GridSearchCV(DecisionTreeRegressor(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'max_depth': 5}

In [155]:
dtr = DecisionTreeRegressor(max_depth=grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [156]:
y_pred = dtr.predict(x_test)

In [157]:
print('Training score: ', dtr.score(x_train, y_train))
print('Test score :', r2_score(y_test, y_pred))

Training score:  0.9712270756535439
Test score : 0.9081676578720991


In [158]:
parameters = {
    'epsilon': [0.5, 0.1, 0.2, 0.3],
    'C': [0.2, 0.3]
}
# total 2 * 4 = 8 models

grid_search = GridSearchCV(SVR(kernel='linear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)


grid_search.best_params_

{'C': 0.2, 'epsilon': 0.5}

In [159]:
grid_search.cv_results_

{'mean_fit_time': array([0.00435408, 0.00323494, 0.00325942, 0.00294344, 0.00270971,
        0.00293533, 0.0029374 , 0.00285109]),
 'std_fit_time': array([9.67181424e-04, 1.76887984e-04, 3.67167140e-04, 2.72529598e-04,
        1.16411491e-04, 1.46874273e-04, 6.87324922e-05, 1.93552654e-04]),
 'mean_score_time': array([0.00244554, 0.00205207, 0.00252787, 0.00193191, 0.00177836,
        0.00225329, 0.00184727, 0.00188049]),
 'std_score_time': array([6.01126648e-04, 7.49400834e-05, 3.99006882e-04, 1.48644886e-04,
        6.92976906e-05, 4.07039521e-04, 5.38217291e-05, 1.53214234e-04]),
 'param_C': masked_array(data=[0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_epsilon': masked_array(data=[0.5, 0.1, 0.2, 0.3, 0.5, 0.1, 0.2, 0.3],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 '

In [160]:
for epsilon in parameters['epsilon']:
    for C in parameters['C']:
        idx = grid_search.cv_results_['params'].index({'C': C, 'epsilon': epsilon})
        mean_test_score = grid_search.cv_results_['mean_test_score'][idx]
        rank_test_score = grid_search.cv_results_['rank_test_score'][idx]

        print(f'Parameters: epsilon={epsilon}, C={C}')
        print(f'Mean Test Score: {mean_test_score}')
        print(f'Rank: {rank_test_score}')

Parameters: epsilon=0.5, C=0.2
Mean Test Score: 0.7717486477585161
Rank: 1
Parameters: epsilon=0.5, C=0.3
Mean Test Score: 0.7681412149970713
Rank: 2
Parameters: epsilon=0.1, C=0.2
Mean Test Score: 0.7557336399070657
Rank: 7
Parameters: epsilon=0.1, C=0.3
Mean Test Score: 0.7527054459598763
Rank: 8
Parameters: epsilon=0.2, C=0.2
Mean Test Score: 0.7590744724075238
Rank: 5
Parameters: epsilon=0.2, C=0.3
Mean Test Score: 0.7584744074675789
Rank: 6
Parameters: epsilon=0.3, C=0.2
Mean Test Score: 0.7624915253847583
Rank: 4
Parameters: epsilon=0.3, C=0.3
Mean Test Score: 0.7660146266291706
Rank: 3


In [161]:
grid_search.best_params_['C']

0.2

In [162]:
svr = SVR(kernel='linear', epsilon=grid_search.best_params_['epsilon'], C=grid_search.best_params_['C']).fit(x_train, y_train)

In [163]:
y_pred = svr.predict(x_test)

In [164]:
print('Training score : ', svr.score(x_train, y_train))
print('Test score : ', r2_score(y_test, y_pred))

Training score :  0.8330533332696215
Test score :  0.8441533964682544
