In [1]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('cleaned_car_data.csv')
data.head()

Unnamed: 0,name,manufacturer,year,age,mileage,engine,transmission,price
0,Mazda MX5,Mazda,2007,14,63131,Petrol,Manual,7499
1,Jaguar XF,Jaguar,2010,11,61890,Petrol,Automatic,7775
2,Audi A6,Audi,2012,9,129170,Diesel,Automatic,6950
3,Nissan Qashqai,Nissan,2013,8,44900,Petrol,Automatic,7790
4,MINI Mini,Mini,2017,4,32012,Petrol,Manual,15999


In [3]:
#drop the name and year columns because it is irrelevant in our model building
data = data.drop(['name', 'year'], axis=1)

In [4]:
data.head()

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,Mazda,14,63131,Petrol,Manual,7499
1,Jaguar,11,61890,Petrol,Automatic,7775
2,Audi,9,129170,Diesel,Automatic,6950
3,Nissan,8,44900,Petrol,Automatic,7790
4,Mini,4,32012,Petrol,Manual,15999


In [5]:
# get dummny data
data = pd.get_dummies(data)
data

Unnamed: 0,age,mileage,price,manufacturer_Abarth,manufacturer_Alfa-Romero,manufacturer_Audi,manufacturer_BMW,manufacturer_Bentley,manufacturer_Chevrolet,manufacturer_Chrysler,...,manufacturer_Volkswagen,manufacturer_Volvo,engine_Diesel,engine_Electric,engine_Hybrid,engine_Petrol,engine_Plug_in_hybrid,transmission_Automatic,transmission_Manual,transmission_Semiautomatic
0,14,63131,7499,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,11,61890,7775,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,9,129170,6950,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,8,44900,7790,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,4,32012,15999,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,1,10290,22000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2982,1,16193,27000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2983,4,59926,16000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2984,1,12355,30000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [6]:
# creating X and y variables
X = data.drop('price', axis=1)

# log transform the price column
y = np.log(data['price'])

In [7]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2090, 50), (896, 50), (2090,), (896,))

In [9]:
# feature scale the X_train and X_test values

norm = MinMaxScaler().fit(X_train)

# transform training data
X_train = norm.transform(X_train)

# transform testing data
X_test = norm.transform(X_test)

print(X_train)
print('\n')
print(X_test)

[[0.16666667 0.13713793 0.         ... 1.         0.         0.        ]
 [0.04166667 0.0308516  0.         ... 1.         0.         0.        ]
 [0.         0.03725164 0.         ... 0.         0.         1.        ]
 ...
 [0.41666667 0.28336733 0.         ... 1.         0.         0.        ]
 [0.20833333 0.22733844 0.         ... 0.         1.         0.        ]
 [0.375      0.16930954 0.         ... 1.         0.         0.        ]]


[[0.04166667 0.03048589 0.         ... 0.         1.         0.        ]
 [0.125      0.07322899 0.         ... 0.         0.         1.        ]
 [0.04166667 0.02873159 0.         ... 0.         0.         1.        ]
 ...
 [0.58333333 0.7828559  0.         ... 1.         0.         0.        ]
 [0.33333333 0.29713884 0.         ... 1.         0.         0.        ]
 [0.04166667 0.03975451 0.         ... 0.         1.         0.        ]]


In [10]:
# Fitting extra trees regressor

etr = ExtraTreesRegressor(random_state = 123 , max_depth = 45  , n_estimators = 400)
etr.fit(X_train,y_train)

ExtraTreesRegressor(max_depth=45, n_estimators=400, random_state=123)

In [11]:
#Fitting light gbm model on the train data

lgbm = LGBMRegressor(random_state = 123 ,  num_leaves = 750 , learning_rate = 0.01, max_bin = 1200 , n_estimators = 1000)
lgbm.fit(X_train,y_train)

LGBMRegressor(learning_rate=0.01, max_bin=1200, n_estimators=1000,
              num_leaves=750, random_state=123)

In [12]:
#Fitting xgboost regressor model on the train data

xgb = XGBRegressor(random_state = 123 , max_depth = 7 , learning_rate = 0.2 , n_estimators = 1500)
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1500, n_jobs=4, num_parallel_tree=1, random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
#Fitting random forest regressor model on the train data

rf = RandomForestRegressor(random_state = 123 , max_depth = 45 , n_estimators = 600)
rf.fit(X_train,y_train)

RandomForestRegressor(max_depth=45, n_estimators=600, random_state=123)

In [14]:
#Fitting catboost regressor model on the train data

cat = CatBoostRegressor(random_state = 123  , max_depth = 14 )
cat.fit(X_train,y_train)

Learning rate set to 0.044503
0:	learn: 0.5362543	total: 634ms	remaining: 10m 33s
1:	learn: 0.5218486	total: 681ms	remaining: 5m 39s
2:	learn: 0.5085464	total: 699ms	remaining: 3m 52s
3:	learn: 0.4967243	total: 703ms	remaining: 2m 55s
4:	learn: 0.4843031	total: 707ms	remaining: 2m 20s
5:	learn: 0.4735010	total: 715ms	remaining: 1m 58s
6:	learn: 0.4623843	total: 739ms	remaining: 1m 44s
7:	learn: 0.4521795	total: 752ms	remaining: 1m 33s
8:	learn: 0.4415589	total: 794ms	remaining: 1m 27s
9:	learn: 0.4316441	total: 1.15s	remaining: 1m 53s
10:	learn: 0.4223400	total: 1.16s	remaining: 1m 44s
11:	learn: 0.4136046	total: 1.21s	remaining: 1m 39s
12:	learn: 0.4057098	total: 1.23s	remaining: 1m 33s
13:	learn: 0.3983877	total: 1.23s	remaining: 1m 26s
14:	learn: 0.3915261	total: 1.27s	remaining: 1m 23s
15:	learn: 0.3849760	total: 1.32s	remaining: 1m 21s
16:	learn: 0.3784134	total: 1.37s	remaining: 1m 19s
17:	learn: 0.3718104	total: 1.38s	remaining: 1m 15s
18:	learn: 0.3657413	total: 1.39s	remaining

157:	learn: 0.2167275	total: 32s	remaining: 2m 50s
158:	learn: 0.2165414	total: 32.1s	remaining: 2m 50s
159:	learn: 0.2164897	total: 32.2s	remaining: 2m 48s
160:	learn: 0.2160584	total: 32.5s	remaining: 2m 49s
161:	learn: 0.2160104	total: 32.5s	remaining: 2m 47s
162:	learn: 0.2155848	total: 32.8s	remaining: 2m 48s
163:	learn: 0.2153579	total: 33.2s	remaining: 2m 49s
164:	learn: 0.2153074	total: 33.2s	remaining: 2m 47s
165:	learn: 0.2150913	total: 33.5s	remaining: 2m 48s
166:	learn: 0.2147864	total: 33.9s	remaining: 2m 48s
167:	learn: 0.2145242	total: 34.2s	remaining: 2m 49s
168:	learn: 0.2143237	total: 34.5s	remaining: 2m 49s
169:	learn: 0.2139232	total: 34.8s	remaining: 2m 50s
170:	learn: 0.2136982	total: 35.1s	remaining: 2m 50s
171:	learn: 0.2134277	total: 35.5s	remaining: 2m 50s
172:	learn: 0.2132303	total: 35.8s	remaining: 2m 51s
173:	learn: 0.2129735	total: 36.1s	remaining: 2m 51s
174:	learn: 0.2124216	total: 36.4s	remaining: 2m 51s
175:	learn: 0.2120993	total: 36.8s	remaining: 2m

314:	learn: 0.1836330	total: 1m 22s	remaining: 2m 58s
315:	learn: 0.1833181	total: 1m 22s	remaining: 2m 58s
316:	learn: 0.1830231	total: 1m 22s	remaining: 2m 58s
317:	learn: 0.1828574	total: 1m 23s	remaining: 2m 58s
318:	learn: 0.1825036	total: 1m 23s	remaining: 2m 57s
319:	learn: 0.1822065	total: 1m 23s	remaining: 2m 57s
320:	learn: 0.1820986	total: 1m 23s	remaining: 2m 57s
321:	learn: 0.1818566	total: 1m 24s	remaining: 2m 57s
322:	learn: 0.1816158	total: 1m 24s	remaining: 2m 57s
323:	learn: 0.1815424	total: 1m 25s	remaining: 2m 57s
324:	learn: 0.1812999	total: 1m 25s	remaining: 2m 57s
325:	learn: 0.1809163	total: 1m 25s	remaining: 2m 57s
326:	learn: 0.1807381	total: 1m 26s	remaining: 2m 57s
327:	learn: 0.1806497	total: 1m 26s	remaining: 2m 56s
328:	learn: 0.1806311	total: 1m 26s	remaining: 2m 56s
329:	learn: 0.1803801	total: 1m 26s	remaining: 2m 56s
330:	learn: 0.1803507	total: 1m 27s	remaining: 2m 56s
331:	learn: 0.1801826	total: 1m 27s	remaining: 2m 56s
332:	learn: 0.1800087	total:

467:	learn: 0.1565085	total: 2m 12s	remaining: 2m 30s
468:	learn: 0.1564814	total: 2m 12s	remaining: 2m 29s
469:	learn: 0.1563458	total: 2m 12s	remaining: 2m 29s
470:	learn: 0.1561943	total: 2m 12s	remaining: 2m 29s
471:	learn: 0.1561737	total: 2m 13s	remaining: 2m 29s
472:	learn: 0.1560083	total: 2m 13s	remaining: 2m 28s
473:	learn: 0.1559905	total: 2m 13s	remaining: 2m 28s
474:	learn: 0.1558001	total: 2m 14s	remaining: 2m 28s
475:	learn: 0.1557665	total: 2m 14s	remaining: 2m 28s
476:	learn: 0.1556362	total: 2m 14s	remaining: 2m 27s
477:	learn: 0.1556112	total: 2m 15s	remaining: 2m 27s
478:	learn: 0.1555819	total: 2m 15s	remaining: 2m 27s
479:	learn: 0.1553737	total: 2m 15s	remaining: 2m 27s
480:	learn: 0.1551750	total: 2m 16s	remaining: 2m 26s
481:	learn: 0.1549776	total: 2m 16s	remaining: 2m 26s
482:	learn: 0.1549461	total: 2m 16s	remaining: 2m 26s
483:	learn: 0.1548025	total: 2m 16s	remaining: 2m 26s
484:	learn: 0.1545458	total: 2m 17s	remaining: 2m 25s
485:	learn: 0.1545286	total:

620:	learn: 0.1385336	total: 3m 1s	remaining: 1m 50s
621:	learn: 0.1383993	total: 3m 1s	remaining: 1m 50s
622:	learn: 0.1382451	total: 3m 1s	remaining: 1m 50s
623:	learn: 0.1381901	total: 3m 2s	remaining: 1m 49s
624:	learn: 0.1381399	total: 3m 2s	remaining: 1m 49s
625:	learn: 0.1380384	total: 3m 2s	remaining: 1m 49s
626:	learn: 0.1378701	total: 3m 3s	remaining: 1m 49s
627:	learn: 0.1377234	total: 3m 3s	remaining: 1m 48s
628:	learn: 0.1376316	total: 3m 3s	remaining: 1m 48s
629:	learn: 0.1374969	total: 3m 4s	remaining: 1m 48s
630:	learn: 0.1373939	total: 3m 4s	remaining: 1m 47s
631:	learn: 0.1373785	total: 3m 4s	remaining: 1m 47s
632:	learn: 0.1372451	total: 3m 5s	remaining: 1m 47s
633:	learn: 0.1371429	total: 3m 5s	remaining: 1m 47s
634:	learn: 0.1370822	total: 3m 5s	remaining: 1m 46s
635:	learn: 0.1368855	total: 3m 6s	remaining: 1m 46s
636:	learn: 0.1367032	total: 3m 6s	remaining: 1m 46s
637:	learn: 0.1365788	total: 3m 6s	remaining: 1m 45s
638:	learn: 0.1365200	total: 3m 6s	remaining: 

773:	learn: 0.1244786	total: 3m 51s	remaining: 1m 7s
774:	learn: 0.1244719	total: 3m 51s	remaining: 1m 7s
775:	learn: 0.1243520	total: 3m 51s	remaining: 1m 6s
776:	learn: 0.1242881	total: 3m 52s	remaining: 1m 6s
777:	learn: 0.1242830	total: 3m 52s	remaining: 1m 6s
778:	learn: 0.1241698	total: 3m 52s	remaining: 1m 6s
779:	learn: 0.1240419	total: 3m 52s	remaining: 1m 5s
780:	learn: 0.1239301	total: 3m 53s	remaining: 1m 5s
781:	learn: 0.1238259	total: 3m 53s	remaining: 1m 5s
782:	learn: 0.1237016	total: 3m 53s	remaining: 1m 4s
783:	learn: 0.1235666	total: 3m 54s	remaining: 1m 4s
784:	learn: 0.1234547	total: 3m 54s	remaining: 1m 4s
785:	learn: 0.1233759	total: 3m 54s	remaining: 1m 3s
786:	learn: 0.1232980	total: 3m 55s	remaining: 1m 3s
787:	learn: 0.1231961	total: 3m 55s	remaining: 1m 3s
788:	learn: 0.1231159	total: 3m 55s	remaining: 1m 3s
789:	learn: 0.1230489	total: 3m 56s	remaining: 1m 2s
790:	learn: 0.1229074	total: 3m 56s	remaining: 1m 2s
791:	learn: 0.1228203	total: 3m 56s	remaining:

930:	learn: 0.1129411	total: 4m 40s	remaining: 20.8s
931:	learn: 0.1128410	total: 4m 41s	remaining: 20.5s
932:	learn: 0.1127322	total: 4m 41s	remaining: 20.2s
933:	learn: 0.1126475	total: 4m 41s	remaining: 19.9s
934:	learn: 0.1126119	total: 4m 41s	remaining: 19.6s
935:	learn: 0.1125309	total: 4m 42s	remaining: 19.3s
936:	learn: 0.1124564	total: 4m 42s	remaining: 19s
937:	learn: 0.1123714	total: 4m 42s	remaining: 18.7s
938:	learn: 0.1123065	total: 4m 43s	remaining: 18.4s
939:	learn: 0.1122511	total: 4m 43s	remaining: 18.1s
940:	learn: 0.1121629	total: 4m 43s	remaining: 17.8s
941:	learn: 0.1120731	total: 4m 44s	remaining: 17.5s
942:	learn: 0.1119775	total: 4m 44s	remaining: 17.2s
943:	learn: 0.1119492	total: 4m 44s	remaining: 16.9s
944:	learn: 0.1118261	total: 4m 45s	remaining: 16.6s
945:	learn: 0.1118059	total: 4m 45s	remaining: 16.3s
946:	learn: 0.1117259	total: 4m 45s	remaining: 16s
947:	learn: 0.1116841	total: 4m 46s	remaining: 15.7s
948:	learn: 0.1116141	total: 4m 46s	remaining: 15.

<catboost.core.CatBoostRegressor at 0x1612a939eb0>

In [15]:
train_score = pd.DataFrame({
    'model': ['ExtraTreesRegressor', 'LGBMRegressor', 'XGB', 'RandomForestRegressor', 'CatBoostRegressor'],
    
    'score': [etr.score(X_train, y_train), lgbm.score(X_train, y_train),
                xgb.score(X_train, y_train) , rf.score(X_train, y_train),
                cat.score(X_train, y_train)]
    })
train_score            

Unnamed: 0,model,score
0,ExtraTreesRegressor,0.999986
1,LGBMRegressor,0.907804
2,XGB,0.999882
3,RandomForestRegressor,0.965881
4,CatBoostRegressor,0.961405


In [16]:
etr_pred = etr.predict(X_test)
lgbm_pred = lgbm.predict(X_test)
xgb_pred = xgb.predict(X_test)
rf_pred = rf.predict(X_test)
cat_pred = cat.predict(X_test)

In [17]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['ExtraTreesRegressor', 'LGBMRegressor', 'XGBRegressor', 'RandomForestRegressor', 'CatBoostRegressor'],
    
    'mae': [mean_absolute_error(y_test, etr_pred), mean_absolute_error(y_test, lgbm_pred),
           mean_absolute_error(y_test, xgb_pred), mean_absolute_error(y_test, rf_pred),
           mean_absolute_error(y_test, cat_pred)],
    
    'mse': [mean_squared_error(y_test, etr_pred), mean_squared_error(y_test, lgbm_pred),
            mean_squared_error(y_test, xgb_pred), mean_squared_error(y_test, rf_pred),
            mean_squared_error(y_test, cat_pred)],
    
    'rmse': [(np.sqrt(metrics.mean_squared_error(y_test, etr_pred))),(np.sqrt(metrics.mean_squared_error(y_test, lgbm_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, xgb_pred))), (np.sqrt(metrics.mean_squared_error(y_test, rf_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, cat_pred)))]
})
best_model

Unnamed: 0,model,mae,mse,rmse
0,ExtraTreesRegressor,0.218828,0.082976,0.288055
1,LGBMRegressor,0.205127,0.072246,0.268786
2,XGBRegressor,0.216739,0.083082,0.288239
3,RandomForestRegressor,0.211458,0.077205,0.277858
4,CatBoostRegressor,0.197216,0.069104,0.262876
