In [70]:
# Sumit Poojary

In [71]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, mean_squared_error

from dmba import regressionSummary, adjusted_r2_score

In [72]:
vehicle_df = pd.read_csv('ToyotaCorolla.csv')

## Data Analysis

In [73]:
print(vehicle_df.dtypes)

Id                 float64
Price              float64
Age_08_22          float64
KM                 float64
Fuel_Type           object
HP                 float64
Color               object
Automatic          float64
CC                 float64
Doors              float64
Cylinders          float64
Gears              float64
Mfr_Guarantee      float64
ABS                float64
Airbag_1           float64
Airbag_2           float64
Airco              float64
CD_Player          float64
Powered_Windows    float64
Power_Steering     float64
Radio              float64
Mistlamps          float64
Sport_Model        float64
Metallic_Rim       float64
dtype: object


In [74]:
print(vehicle_df.shape)
print(len(vehicle_df))
print(len(vehicle_df.columns))
vehicle_df.head()

(2872, 24)
2872
24


Unnamed: 0,Id,Price,Age_08_22,KM,Fuel_Type,HP,Color,Automatic,CC,Doors,...,Airbag_1,Airbag_2,Airco,CD_Player,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Metallic_Rim
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,13500.0,23.0,46986.0,Diesel,90.0,Blue,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,,,,,,,,,,,...,,,,,,,,,,
3,2.0,13750.0,23.0,72937.0,Diesel,90.0,Silver,0.0,2000.0,3.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,,,,,,,,,,,...,,,,,,,,,,


In [75]:
vehicle_df.describe()

Unnamed: 0,Id,Price,Age_08_22,KM,HP,Automatic,CC,Doors,Cylinders,Gears,...,Airbag_1,Airbag_2,Airco,CD_Player,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Metallic_Rim
count,1436.0,1435.0,1436.0,1436.0,1436.0,1436.0,1431.0,1436.0,1436.0,1436.0,...,1436.0,1436.0,1435.0,1436.0,1436.0,1436.0,1436.0,401.0,1436.0,1436.0
mean,721.555014,10726.978397,55.947075,68533.259749,101.502089,0.05571,1576.355695,4.033426,4.0,5.026462,...,0.970752,0.722841,0.508014,0.218663,0.561978,0.977716,0.14624,0.296758,0.300139,0.204735
std,416.47689,3625.298424,18.599988,37506.448872,14.98108,0.229441,424.806511,0.952677,0.0,0.18851,...,0.168559,0.447751,0.50011,0.413483,0.496317,0.147657,0.353469,0.4574,0.458478,0.403649
min,1.0,4350.0,1.0,1.0,69.0,0.0,1300.0,2.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,361.75,8450.0,44.0,43000.0,90.0,0.0,1400.0,3.0,4.0,5.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,721.5,9900.0,61.0,63389.5,110.0,0.0,1600.0,4.0,4.0,5.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,1081.25,11950.0,70.0,87020.75,110.0,0.0,1600.0,5.0,4.0,5.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,1442.0,32500.0,80.0,243000.0,192.0,1.0,16000.0,5.0,4.0,6.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Dealing with NA values (Pre-Processing)

In [76]:
vehicle_df.isnull().sum()

Id                 1436
Price              1437
Age_08_22          1436
KM                 1436
Fuel_Type          1436
HP                 1436
Color              1445
Automatic          1436
CC                 1441
Doors              1436
Cylinders          1436
Gears              1436
Mfr_Guarantee      1437
ABS                1436
Airbag_1           1436
Airbag_2           1436
Airco              1437
CD_Player          1436
Powered_Windows    1436
Power_Steering     1436
Radio              1436
Mistlamps          2471
Sport_Model        1436
Metallic_Rim       1436
dtype: int64

In [77]:
vehicle_df = vehicle_df.dropna(axis = 0,how = 'all', subset = 'Id')
print(vehicle_df.shape)
vehicle_df.isnull().sum()

(1436, 24)


Id                    0
Price                 1
Age_08_22             0
KM                    0
Fuel_Type             0
HP                    0
Color                 9
Automatic             0
CC                    5
Doors                 0
Cylinders             0
Gears                 0
Mfr_Guarantee         1
ABS                   0
Airbag_1              0
Airbag_2              0
Airco                 1
CD_Player             0
Powered_Windows       0
Power_Steering        0
Radio                 0
Mistlamps          1035
Sport_Model           0
Metallic_Rim          0
dtype: int64

In [78]:
print((vehicle_df['Color'].value_counts()),(vehicle_df['CC'].value_counts()))

Grey      297
Blue      281
Red       278
Green     218
Black     190
Silver    122
White      31
Violet      4
Yellow      3
Beige       3
Name: Color, dtype: int64 1600.0     843
1300.0     248
1400.0     163
2000.0     117
1900.0      30
1800.0      14
1598.0       4
1587.0       4
1995.0       2
1398.0       2
1332.0       2
16000.0      1
1975.0       1
Name: CC, dtype: int64


In [79]:
vehicle_df1 = vehicle_df.drop(['Mistlamps'], axis = 1)
vehicle_df1 = vehicle_df1.fillna(value = {"Price": vehicle_df1['Price'].mean()})
vehicle_df1=vehicle_df1.dropna(axis=0, how='any').copy()
vehicle_df1.isnull().sum()

Id                 0
Price              0
Age_08_22          0
KM                 0
Fuel_Type          0
HP                 0
Color              0
Automatic          0
CC                 0
Doors              0
Cylinders          0
Gears              0
Mfr_Guarantee      0
ABS                0
Airbag_1           0
Airbag_2           0
Airco              0
CD_Player          0
Powered_Windows    0
Power_Steering     0
Radio              0
Sport_Model        0
Metallic_Rim       0
dtype: int64

In [80]:
predictors_df = vehicle_df1.loc[:,'Age_08_22':]
response_df = vehicle_df1['Price']
predictors_df = pd.get_dummies(predictors_df)
predictors_df.corr()


Unnamed: 0,Age_08_22,KM,HP,Automatic,CC,Doors,Cylinders,Gears,Mfr_Guarantee,ABS,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
Age_08_22,1.0,0.503409,-0.155399,0.043938,-0.097031,-0.134296,,-0.008352,-0.169863,-0.411503,...,0.022601,-0.025161,-0.03455,0.103698,-0.132176,0.096634,-0.024513,0.017174,0.048865,-0.042608
KM,0.503409,1.0,-0.337213,-0.080379,0.103932,-0.031547,,0.014077,-0.212777,-0.174699,...,-0.007075,0.037741,-0.005769,-0.018651,-0.106211,0.049862,0.00597,0.018506,0.129874,-0.038475
HP,-0.155399,-0.337213,1.0,0.009298,0.035207,0.089355,,0.213181,0.146042,0.056625,...,0.026545,-0.001527,-0.027814,0.010505,0.02657,0.014446,0.016478,-0.012305,-0.094651,-0.00032
Automatic,0.043938,-0.080379,0.009298,1.0,0.067871,-0.035417,,-0.099463,0.021756,-0.01961,...,-0.011017,-0.039303,-0.017063,0.053301,0.007692,-0.02331,0.050231,-0.012726,-0.035771,-0.011017
CC,-0.097031,0.103932,0.035207,0.067871,1.0,0.078571,,0.014832,-0.05721,0.037295,...,0.002558,-0.007612,0.030797,-0.011232,-0.008253,-0.010153,-0.003629,-0.015759,0.027538,-0.011843
Doors,-0.134296,-0.031547,0.089355,-0.035417,0.078571,1.0,,-0.159696,0.040705,0.059387,...,-0.033376,-0.098964,-0.045926,0.061104,0.058227,0.000732,0.02693,0.012619,-0.008819,0.014952
Cylinders,,,,,,,,,,,...,,,,,,,,,,
Gears,-0.008352,0.014077,0.213181,-0.099463,0.014832,-0.159696,,1.0,0.010771,0.087115,...,0.074446,0.130081,-0.023287,-0.049836,-0.026515,0.033944,-0.069633,-0.007506,-0.021099,-0.006498
Mfr_Guarantee,-0.169863,-0.212777,0.146042,0.021756,-0.05721,0.040705,,0.010771,1.0,0.120183,...,0.024026,0.008945,-0.02793,0.034362,0.010912,0.00318,-0.006091,0.036759,-0.095104,0.055212
ABS,-0.411503,-0.174699,0.056625,-0.01961,0.037295,0.059387,,0.087115,0.120183,1.0,...,-0.017004,-0.03775,0.003823,-0.024245,0.060669,0.014054,-0.02169,-0.008322,-0.02647,0.022193


In [81]:
predictors_df = predictors_df.drop(["Fuel_Type_Diesel"], axis = 1)
predictors_df.corr()

Unnamed: 0,Age_08_22,KM,HP,Automatic,CC,Doors,Cylinders,Gears,Mfr_Guarantee,ABS,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
Age_08_22,1.0,0.503409,-0.155399,0.043938,-0.097031,-0.134296,,-0.008352,-0.169863,-0.411503,...,0.022601,-0.025161,-0.03455,0.103698,-0.132176,0.096634,-0.024513,0.017174,0.048865,-0.042608
KM,0.503409,1.0,-0.337213,-0.080379,0.103932,-0.031547,,0.014077,-0.212777,-0.174699,...,-0.007075,0.037741,-0.005769,-0.018651,-0.106211,0.049862,0.00597,0.018506,0.129874,-0.038475
HP,-0.155399,-0.337213,1.0,0.009298,0.035207,0.089355,,0.213181,0.146042,0.056625,...,0.026545,-0.001527,-0.027814,0.010505,0.02657,0.014446,0.016478,-0.012305,-0.094651,-0.00032
Automatic,0.043938,-0.080379,0.009298,1.0,0.067871,-0.035417,,-0.099463,0.021756,-0.01961,...,-0.011017,-0.039303,-0.017063,0.053301,0.007692,-0.02331,0.050231,-0.012726,-0.035771,-0.011017
CC,-0.097031,0.103932,0.035207,0.067871,1.0,0.078571,,0.014832,-0.05721,0.037295,...,0.002558,-0.007612,0.030797,-0.011232,-0.008253,-0.010153,-0.003629,-0.015759,0.027538,-0.011843
Doors,-0.134296,-0.031547,0.089355,-0.035417,0.078571,1.0,,-0.159696,0.040705,0.059387,...,-0.033376,-0.098964,-0.045926,0.061104,0.058227,0.000732,0.02693,0.012619,-0.008819,0.014952
Cylinders,,,,,,,,,,,...,,,,,,,,,,
Gears,-0.008352,0.014077,0.213181,-0.099463,0.014832,-0.159696,,1.0,0.010771,0.087115,...,0.074446,0.130081,-0.023287,-0.049836,-0.026515,0.033944,-0.069633,-0.007506,-0.021099,-0.006498
Mfr_Guarantee,-0.169863,-0.212777,0.146042,0.021756,-0.05721,0.040705,,0.010771,1.0,0.120183,...,0.024026,0.008945,-0.02793,0.034362,0.010912,0.00318,-0.006091,0.036759,-0.095104,0.055212
ABS,-0.411503,-0.174699,0.056625,-0.01961,0.037295,0.059387,,0.087115,0.120183,1.0,...,-0.017004,-0.03775,0.003823,-0.024245,0.060669,0.014054,-0.02169,-0.008322,-0.02647,0.022193


## Scaling of predictors

In [82]:
print(predictors_df.shape)
z_score_norm = preprocessing.StandardScaler()
predictor_df_normalized = z_score_norm.fit_transform(predictors_df)
predictor_df_normalized = pd.DataFrame(predictor_df_normalized, columns = predictors_df.columns)
predictor_df_normalized.head(10)

(1420, 31)


Unnamed: 0,Age_08_22,KM,HP,Automatic,CC,Doors,Cylinders,Gears,Mfr_Guarantee,ABS,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
0,-1.81631,-0.580895,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,-0.833373,0.482327,...,-0.046012,-0.393029,2.017778,-0.425869,-0.512076,-0.491181,-0.303822,-0.053149,-0.149393,-0.046012
1,-1.81631,0.110129,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,-0.833373,0.482327,...,-0.046012,-0.393029,-0.495595,-0.425869,-0.512076,-0.491181,3.291403,-0.053149,-0.149393,-0.046012
2,-1.761802,-0.721358,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,1.199943,0.482327,...,-0.046012,-0.393029,2.017778,-0.425869,-0.512076,-0.491181,-0.303822,-0.053149,-0.149393,-0.046012
3,-1.652786,-0.553894,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,1.199943,0.482327,...,-0.046012,2.544344,-0.495595,-0.425869,-0.512076,-0.491181,-0.303822,-0.053149,-0.149393,-0.046012
4,-1.434753,-0.806861,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,1.199943,0.482327,...,-0.046012,2.544344,-0.495595,-0.425869,-0.512076,-0.491181,-0.303822,-0.053149,-0.149393,-0.046012
5,-1.325737,-0.20773,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,-0.833373,0.482327,...,-0.046012,-0.393029,-0.495595,-0.425869,-0.512076,-0.491181,-0.303822,-0.053149,6.693762,-0.046012
6,-1.598278,0.687292,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,-0.833373,0.482327,...,-0.046012,-0.393029,-0.495595,-0.425869,1.952834,-0.491181,-0.303822,-0.053149,-0.149393,-0.046012
7,-1.434753,0.188735,-0.770488,-0.239446,0.994558,-1.075473,0.0,-0.14123,1.199943,0.482327,...,-0.046012,-0.393029,-0.495595,-0.425869,1.952834,-0.491181,-0.303822,-0.053149,-0.149393,-0.046012
8,-1.598278,-1.307468,6.101279,-0.239446,0.52508,-1.075473,0.0,-0.14123,-0.833373,0.482327,...,-0.046012,-0.393029,-0.495595,-0.425869,-0.512076,2.03591,-0.303822,-0.053149,-0.149393,-0.046012
9,-1.81631,0.062225,-2.185264,-0.239446,0.759819,-1.075473,0.0,-0.14123,-0.833373,0.482327,...,-0.046012,-0.393029,2.017778,-0.425869,-0.512076,-0.491181,-0.303822,-0.053149,-0.149393,-0.046012


In [83]:
X = predictor_df_normalized
y = response_df
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=80)

## Linear Regression

In [84]:
linear_model = LinearRegression()
linear_model = linear_model.fit(train_X, train_y)
print('intercept ', linear_model.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': linear_model.coef_}))

intercept  10644.93237408189
           Predictor   coefficient
0          Age_08_22 -2.369248e+03
1                 KM -7.667532e+02
2                 HP  7.435012e+02
3          Automatic  1.603914e+02
4                 CC  1.281452e+01
5              Doors  2.351295e+02
6          Cylinders -2.218901e+14
7              Gears  8.060273e+01
8      Mfr_Guarantee  1.465843e+02
9                ABS -2.175677e+02
10          Airbag_1  5.135793e+01
11          Airbag_2 -1.273545e+02
12             Airco  1.474752e+02
13         CD_Player  1.282325e+02
14   Powered_Windows  1.635893e+02
15    Power_Steering -3.194742e+01
16             Radio -6.547587e+01
17       Sport_Model  3.328699e+02
18      Metallic_Rim  7.105915e+01
19     Fuel_Type_CNG -2.751942e+02
20  Fuel_Type_Petrol -7.405925e+02
21       Color_Beige  2.485025e+14
22       Color_Black  1.842530e+15
23        Color_Blue  2.153360e+15
24       Color_Green  1.951038e+15
25        Color_Grey  2.195697e+15
26         Color_Red  2.14

In [85]:
predicted_y_training = linear_model.predict(train_X)
regressionSummary(train_y, predicted_y_training)



Regression statistics

                      Mean Error (ME) : -0.1264
       Root Mean Squared Error (RMSE) : 1360.6345
            Mean Absolute Error (MAE) : 999.1738
          Mean Percentage Error (MPE) : -1.1311
Mean Absolute Percentage Error (MAPE) : 9.9923


In [86]:
print(adjusted_r2_score(train_y, predicted_y_training, linear_model))

0.8523683510087645


In [87]:
predicted_y_test = linear_model.predict(test_X)
regressionSummary(test_y, predicted_y_test)
print(adjusted_r2_score(test_y, predicted_y_test, linear_model))


Regression statistics

                      Mean Error (ME) : -2.0895
       Root Mean Squared Error (RMSE) : 1334.8030
            Mean Absolute Error (MAE) : 1027.5944
          Mean Percentage Error (MPE) : -0.8271
Mean Absolute Percentage Error (MAPE) : 10.1820
0.8303392882616415


## KNN

In [88]:
knn_p = KNeighborsRegressor(n_neighbors=4).fit(train_X, train_y)
predicted_y_training1 = knn_p.predict(train_X)
print(adjusted_r2_score(train_y, predicted_y_training1,linear_model))

regressionSummary(train_y, predicted_y_training1)

0.8580035786101238

Regression statistics

                      Mean Error (ME) : 16.0941
       Root Mean Squared Error (RMSE) : 1334.4135
            Mean Absolute Error (MAE) : 991.5176
          Mean Percentage Error (MPE) : -1.6306
Mean Absolute Percentage Error (MAPE) : 9.6866


In [89]:
predicted_y_test1 = knn_p.predict(test_X)
regressionSummary(test_y, predicted_y_test1)


Regression statistics

                      Mean Error (ME) : 5.6021
       Root Mean Squared Error (RMSE) : 1741.5521
            Mean Absolute Error (MAE) : 1297.9416
          Mean Percentage Error (MPE) : -2.6639
Mean Absolute Percentage Error (MAPE) : 12.7532


In [90]:
results = []
for k in range(1, 20):
    knn_p2 = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)
    results.append({
        'k': k,
        'RMSE': round(mean_squared_error(test_y, knn_p2.predict(test_X)) ** 0.5, 4)
    })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

     k       RMSE
0    1  2232.7257
1    2  1878.7516
2    3  1752.2874
3    4  1741.5521
4    5  1744.1680
5    6  1715.8608
6    7  1743.3521
7    8  1753.8618
8    9  1775.2487
9   10  1773.6956
10  11  1787.9010
11  12  1796.5473
12  13  1791.1307
13  14  1788.4815
14  15  1787.6594
15  16  1798.1879
16  17  1798.6808
17  18  1816.2045
18  19  1818.3701


## Random Forest

In [93]:
DT = DecisionTreeRegressor(max_depth= 5, random_state=600).fit(train_X, train_y)

predicted_y_train2 = DT.predict(train_X)
regressionSummary(train_y, predicted_y_train2)
DT.get_params()
#print(adjusted_r2_score(train_y, predicted_y_training, DT))


Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 1057.2079
            Mean Absolute Error (MAE) : 804.5752
          Mean Percentage Error (MPE) : -1.1286
Mean Absolute Percentage Error (MAPE) : 8.1678


{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 600,
 'splitter': 'best'}

In [28]:
predicted_y_test2 =  DT.predict(test_X)
regressionSummary(test_y, predicted_y_test2)


Regression statistics

                      Mean Error (ME) : -27.8537
       Root Mean Squared Error (RMSE) : 1235.1141
            Mean Absolute Error (MAE) : 921.4796
          Mean Percentage Error (MPE) : -1.5146
Mean Absolute Percentage Error (MAPE) : 9.0789


In [26]:
results1 = []
for k in range(1, 20):
    DT_p2 = DecisionTreeRegressor(max_depth= k, random_state=600).fit(train_X, train_y)
    results1.append({
        'k': k,
        'RMSE': round(mean_squared_error(test_y, DT_p2.predict(test_X)) ** 0.5, 4)
    })

# Convert results to a pandas data frame
results1 = pd.DataFrame(results1)
print(results1)

     k       RMSE
0    1  2017.9202
1    2  1602.0656
2    3  1365.5409
3    4  1244.8260
4    5  1235.1141
5    6  1249.9049
6    7  1271.7131
7    8  1309.0680
8    9  1351.2990
9   10  1545.3154
10  11  1512.3580
11  12  1604.7472
12  13  1562.5670
13  14  1501.0869
14  15  1585.4744
15  16  1508.4799
16  17  1542.9553
17  18  1635.3137
18  19  1562.0804
