In [19]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Read the cleaned car listings (path is relative to the working directory)
# and preview the first rows.
csv_path = 'cleaned_car_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,name,manufacturer,year,age,mileage,engine,transmission,price
0,Mazda MX5,Mazda,2007,14,63131,Petrol,Manual,7499
1,Jaguar XF,Jaguar,2010,11,61890,Petrol,Automatic,7775
2,Audi A6,Audi,2012,9,129170,Diesel,Automatic,6950
3,Nissan Qashqai,Nissan,2013,8,44900,Petrol,Automatic,7790
4,MINI Mini,Mini,2017,4,32012,Petrol,Manual,15999


In [21]:
# Drop the `name` and `year` columns; they are not used as model features.
# (`age` is kept; in the rows previewed above it equals 2021 - year.)
data = data.drop(columns=['name', 'year'])

In [22]:
# Preview the first rows to confirm `name` and `year` are gone.
data.head()

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,Mazda,14,63131,Petrol,Manual,7499
1,Jaguar,11,61890,Petrol,Automatic,7775
2,Audi,9,129170,Diesel,Automatic,6950
3,Nissan,8,44900,Petrol,Automatic,7790
4,Mini,4,32012,Petrol,Manual,15999


In [23]:
# One-hot encode the categorical columns (manufacturer, engine, transmission);
# the numeric columns pass through unchanged.
encoded = pd.get_dummies(data)
data = encoded
data

Unnamed: 0,age,mileage,price,manufacturer_Abarth,manufacturer_Alfa-Romero,manufacturer_Audi,manufacturer_BMW,manufacturer_Bentley,manufacturer_Chevrolet,manufacturer_Chrysler,...,manufacturer_Volkswagen,manufacturer_Volvo,engine_Diesel,engine_Electric,engine_Hybrid,engine_Petrol,engine_Plug_in_hybrid,transmission_Automatic,transmission_Manual,transmission_Semiautomatic
0,14,63131,7499,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,11,61890,7775,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,9,129170,6950,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,8,44900,7790,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,4,32012,15999,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,1,10290,22000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2982,1,16193,27000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2983,4,59926,16000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2984,1,12355,30000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [24]:
# Features: every column except the target.
target_col = 'price'
X = data.drop(columns=target_col)

# Target: natural log of price. Predictions and error metrics computed
# against `y` are therefore in log units, not in price units.
y = np.log(data[target_col])

In [25]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [26]:
# Shapes of the four split pieces, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2090, 50), (896, 50), (2090,), (896,))

In [27]:
# Min-max scale the features. The scaler learns each column's min/max from
# the training rows only and is then applied unchanged to both splits.
norm = MinMaxScaler()
norm.fit(X_train)

# Replace both splits with their scaled versions.
X_train, X_test = norm.transform(X_train), norm.transform(X_test)

# Print the scaled training matrix, a blank separator, then the scaled test matrix.
for chunk in (X_train, '\n', X_test):
    print(chunk)

[[0.16666667 0.13713793 0.         ... 1.         0.         0.        ]
 [0.04166667 0.0308516  0.         ... 1.         0.         0.        ]
 [0.         0.03725164 0.         ... 0.         0.         1.        ]
 ...
 [0.41666667 0.28336733 0.         ... 1.         0.         0.        ]
 [0.20833333 0.22733844 0.         ... 0.         1.         0.        ]
 [0.375      0.16930954 0.         ... 1.         0.         0.        ]]


[[0.04166667 0.03048589 0.         ... 0.         1.         0.        ]
 [0.125      0.07322899 0.         ... 0.         0.         1.        ]
 [0.04166667 0.02873159 0.         ... 0.         0.         1.        ]
 ...
 [0.58333333 0.7828559  0.         ... 1.         0.         0.        ]
 [0.33333333 0.29713884 0.         ... 1.         0.         0.        ]
 [0.04166667 0.03975451 0.         ... 0.         1.         0.        ]]


In [28]:
# Fit an extra-trees regressor on the scaled training split.
etr_settings = dict(n_estimators=400, max_depth=45, random_state=123)
etr = ExtraTreesRegressor(**etr_settings)
etr.fit(X_train, y_train)

ExtraTreesRegressor(max_depth=45, n_estimators=400, random_state=123)

In [29]:
# Fit a LightGBM regressor on the scaled training split.
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=750,
    max_bin=1200,
    random_state=123,
)
lgbm.fit(X_train, y_train)

LGBMRegressor(learning_rate=0.01, max_bin=1200, n_estimators=1000,
              num_leaves=750, random_state=123)

In [30]:
# Fit an XGBoost regressor on the scaled training split.
xgb_settings = dict(n_estimators=1500, learning_rate=0.2, max_depth=7, random_state=123)
xgb = XGBRegressor(**xgb_settings)
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1500, n_jobs=4, num_parallel_tree=1, random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [31]:
# Fit a random-forest regressor on the scaled training split.
rf = RandomForestRegressor(
    n_estimators=600,
    max_depth=45,
    random_state=123,
)
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=45, n_estimators=600, random_state=123)

In [33]:
# Fit a CatBoost regressor on the scaled training split.
cat_settings = dict(max_depth=14, random_state=123)
cat = CatBoostRegressor(**cat_settings)
# verbose=False turns off CatBoost's training log output.
cat.fit(X_train, y_train, verbose=False)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

<catboost.core.CatBoostRegressor at 0x1612a9703d0>

In [34]:
# Training-set score of every fitted model, from each estimator's own
# `.score` method (R^2 for scikit-learn-style regressors).
fitted_models = {
    'ExtraTreesRegressor': etr,
    'LGBMRegressor': lgbm,
    'XGBRegressor': xgb,
    'RandomForestRegressor': rf,
    'CatBoostRegressor': cat,
}
train_score = pd.DataFrame({
    'model': list(fitted_models),
    'score': [estimator.score(X_train, y_train) for estimator in fitted_models.values()],
})
train_score

Unnamed: 0,model,score
0,ExtraTreesRegressor,0.999986
1,LGBMRegressor,0.907804
2,XGBRegressor,0.999882
3,RandomForestRegressor,0.965881
4,CatBoostRegressor,0.961405


In [35]:
# Predict on the held-out split with each fitted model.
etr_pred, lgbm_pred, xgb_pred, rf_pred, cat_pred = (
    model.predict(X_test) for model in (etr, lgbm, xgb, rf, cat)
)

In [36]:
# Generalisation: error of each model on the held-out split.
# `y_test` holds log(price), so MAE / MSE / RMSE are in log units.
predictions_by_model = {
    'ExtraTreesRegressor': etr_pred,
    'LGBMRegressor': lgbm_pred,
    'XGBRegressor': xgb_pred,
    'RandomForestRegressor': rf_pred,
    'CatBoostRegressor': cat_pred,
}
mae_values = [mean_absolute_error(y_test, pred) for pred in predictions_by_model.values()]
mse_values = [mean_squared_error(y_test, pred) for pred in predictions_by_model.values()]

best_model = pd.DataFrame({
    'model': list(predictions_by_model),
    'mae': mae_values,
    'mse': mse_values,
    # RMSE is the square root of the MSE computed above.
    'rmse': [np.sqrt(mse) for mse in mse_values],
})
best_model

Unnamed: 0,model,mae,mse,rmse
0,ExtraTreesRegressor,0.218828,0.082976,0.288055
1,LGBMRegressor,0.205127,0.072246,0.268786
2,XGBRegressor,0.216739,0.083082,0.288239
3,RandomForestRegressor,0.211458,0.077205,0.277858
4,CatBoostRegressor,0.197216,0.069104,0.262876
