In [43]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [100]:
data = pd.read_csv('cleaned_car_data.csv')
data.head()

Unnamed: 0,name,manufacturer,year,age,mileage,engine,transmission,price
0,Mazda MX5,Mazda,2007,14,63131,Petrol,Manual,7499
1,Jaguar XF,Jaguar,2010,11,61890,Petrol,Automatic,7775
2,Audi A6,Audi,2012,9,129170,Diesel,Automatic,6950
3,Nissan Qashqai,Nissan,2013,8,44900,Petrol,Automatic,7790
4,MINI Mini,Mini,2017,4,32012,Petrol,Manual,15999


In [101]:
#drop the name and year columns because it is irrelevant in our model building
data = data.drop(['name', 'year'], axis=1)

In [102]:
data.head()

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,Mazda,14,63131,Petrol,Manual,7499
1,Jaguar,11,61890,Petrol,Automatic,7775
2,Audi,9,129170,Diesel,Automatic,6950
3,Nissan,8,44900,Petrol,Automatic,7790
4,Mini,4,32012,Petrol,Manual,15999


In [106]:
data['manufacturer'].unique()

array(['Mazda', 'Jaguar', 'Audi', 'Nissan', 'Mini', 'Mercedes-Benz',
       'Volkswagen', 'Vauxhall', 'Toyota', 'Skoda', 'Ford', 'Chevrolet',
       'BMW', 'Suzuki', 'Renault', 'Peugeot', 'Citroen', 'Volvo', 'Fiat',
       'DS', 'Dacia', 'Abarth', 'Smart', 'Seat', 'Mitsubishi', 'MG',
       'Lexus', 'Land-Rover', 'Kia', 'Jeep', 'Hyundai', 'Honda',
       'Maserati', 'Subaru', 'Porsche', 'Infiniti', 'Bentley',
       'Alfa-Romero', 'Chrysler', 'Isuzu'], dtype=object)

In [107]:
# label encode the categorical values
le_manufacturer = LabelEncoder()
le_engine = LabelEncoder()
le_transmission = LabelEncoder()

In [108]:
data['manufacturer'] = le_manufacturer.fit_transform(data['manufacturer'])
data['engine'] = le_engine.fit_transform(data['engine'])
data['transmission'] = le_transmission.fit_transform(data['transmission'])

In [109]:
data.head(10)

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,23,14,63131,3,1,7499
1,16,11,61890,3,0,7775
2,2,9,129170,0,0,6950
3,27,8,44900,3,0,7790
4,25,4,32012,3,1,15999
5,24,6,33050,0,0,10995
6,38,10,62000,0,1,7250
7,37,7,44000,3,1,5990
8,36,8,172000,2,0,6290
9,32,9,74000,0,1,4450


In [110]:
# creating X and y variables
X = data.drop('price', axis=1)

# log transform the price column
y = np.log(data['price'])

In [111]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [112]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2090, 5), (896, 5), (2090,), (896,))

In [113]:
# feature scale the X_train and X_test values

norm = MinMaxScaler().fit(X_train)

# transform training data
X_train = norm.transform(X_train)

# transform testing data
X_test = norm.transform(X_test)

print(X_train)
print('\n')
print(X_test)


[[0.05128205 0.16666667 0.13713793 0.         0.        ]
 [0.46153846 0.04166667 0.0308516  0.5        0.        ]
 [0.82051282 0.         0.03725164 0.         1.        ]
 ...
 [0.07692308 0.41666667 0.28336733 0.         0.        ]
 [0.69230769 0.20833333 0.22733844 0.75       0.5       ]
 [0.92307692 0.375      0.16930954 0.75       0.        ]]


[[0.97435897 0.04166667 0.03048589 0.75       0.5       ]
 [0.33333333 0.125      0.07322899 0.75       1.        ]
 [0.76923077 0.04166667 0.02873159 0.75       1.        ]
 ...
 [0.05128205 0.58333333 0.7828559  0.         0.        ]
 [0.79487179 0.33333333 0.29713884 0.         0.        ]
 [0.94871795 0.04166667 0.03975451 0.75       0.5       ]]


In [81]:
# Fitting extra trees regressor

etr = ExtraTreesRegressor(random_state = 123 , max_depth = 45  , n_estimators = 400)
etr.fit(X_train,y_train)

ExtraTreesRegressor(max_depth=45, n_estimators=400, random_state=123)

In [82]:
#Fitting light gbm model on the train data

lgbm = LGBMRegressor(random_state = 123 ,  num_leaves = 750 , learning_rate = 0.01, max_bin = 1200 , n_estimators = 1000)
lgbm.fit(X_train,y_train)

LGBMRegressor(learning_rate=0.01, max_bin=1200, n_estimators=1000,
              num_leaves=750, random_state=123)

In [83]:
#Fitting xgboost regressor model on the train data

xgb = XGBRegressor(random_state = 123 , max_depth = 7 , learning_rate = 0.2 , n_estimators = 1500)
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1500, n_jobs=4, num_parallel_tree=1, random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [84]:
#Fitting random forest regressor model on the train data

rf = RandomForestRegressor(random_state = 123 , max_depth = 45 , n_estimators = 600)
rf.fit(X_train,y_train)

RandomForestRegressor(max_depth=45, n_estimators=600, random_state=123)

In [85]:
#Fitting catboost regressor model on the train data

cat = CatBoostRegressor(random_state = 123  , max_depth = 14 )
cat.fit(X_train,y_train, verbose = False)

<catboost.core.CatBoostRegressor at 0x2123a8d0e80>

In [86]:
train_score = pd.DataFrame({
    'model': ['ExtraTreesRegressor', 'LGBMRegressor', 'XGBRegressor', 'RandomForestRegressor', 'CatBoostRegressor'],
    
    'score': [etr.score(X_train, y_train), lgbm.score(X_train, y_train),
                xgb.score(X_train, y_train) , rf.score(X_train, y_train),
                cat.score(X_train, y_train)]
    })
train_score            

Unnamed: 0,model,score
0,ExtraTreesRegressor,0.999986
1,LGBMRegressor,0.910343
2,XGBRegressor,0.999859
3,RandomForestRegressor,0.962746
4,CatBoostRegressor,0.968628


In [87]:
etr_pred = etr.predict(X_test)
lgbm_pred = lgbm.predict(X_test)
xgb_pred = xgb.predict(X_test)
rf_pred = rf.predict(X_test)
cat_pred = cat.predict(X_test)

In [88]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['ExtraTreesRegressor', 'LGBMRegressor', 'XGBRegressor', 'RandomForestRegressor', 'CatBoostRegressor'],
    
    'mae': [mean_absolute_error(y_test, etr_pred), mean_absolute_error(y_test, lgbm_pred),
           mean_absolute_error(y_test, xgb_pred), mean_absolute_error(y_test, rf_pred),
           mean_absolute_error(y_test, cat_pred)],
    
    'mse': [mean_squared_error(y_test, etr_pred), mean_squared_error(y_test, lgbm_pred),
            mean_squared_error(y_test, xgb_pred), mean_squared_error(y_test, rf_pred),
            mean_squared_error(y_test, cat_pred)],
    
    'rmse': [(np.sqrt(metrics.mean_squared_error(y_test, etr_pred))),(np.sqrt(metrics.mean_squared_error(y_test, lgbm_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, xgb_pred))), (np.sqrt(metrics.mean_squared_error(y_test, rf_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, cat_pred)))]
})
best_model

Unnamed: 0,model,mae,mse,rmse
0,ExtraTreesRegressor,0.228266,0.090456,0.300759
1,LGBMRegressor,0.204273,0.07251,0.269277
2,XGBRegressor,0.221372,0.082251,0.286794
3,RandomForestRegressor,0.213997,0.0782,0.279643
4,CatBoostRegressor,0.198805,0.070714,0.265921


The CatBoost Regressor has the lowest MAE and RMSE, so it is the chosen model

In [89]:
#create the grid
grid = {'max_depth': [3,4,5],'n_estimators':[100, 200, 300]}

#Instantiate GridSearchCV
model = GridSearchCV (estimator = cat, param_grid = grid, scoring ='neg_root_mean_squared_error', cv = 5)

In [90]:
model.fit(X_train,y_train, verbose = False)

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x000002123A8D0E80>,
             param_grid={'max_depth': [3, 4, 5],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_root_mean_squared_error')

In [91]:
y_pred = model.predict(X_test)

In [92]:
grid_model = pd.DataFrame({
    'model': ['CatBoost Regressor'],
    'mae': [mean_absolute_error(y_test, y_pred)],
    'mse': [mean_squared_error(y_test, y_pred)],
    'rmse': [np.sqrt(metrics.mean_squared_error(y_test, y_pred))]
    })
grid_model


Unnamed: 0,model,mae,mse,rmse
0,CatBoost Regressor,0.194577,0.065322,0.255582


#### Make Predictions on new data.

In [97]:
# manufacturer, age, mileage, engine, transmission
new_data = [['Mercedes-Benz', 2, 12340, 'Petrol', 'Automatic']]
# convert to array
new_data = np.array(new_data)
new_data

array([['Mercedes-Benz', '2', '12340', 'Petrol', 'Automatic']],
      dtype='<U13')

In [114]:
new_data[:,0] = le_manufacturer.transform(new_data[:,0])
new_data[:,3] = le_engine.transform(new_data[:,3])
new_data[:,4] = le_transmission.transform(new_data[:,4])
new_data

array([['24', '2', '12340', '3', '0']], dtype='<U13')

In [115]:
new_data = norm.transform(new_data)
new_data

array([[0.61538462, 0.08333333, 0.07050897, 0.75      , 0.        ]])

In [116]:
# make predictions
price = model.predict(new_data)
# convert the price from log_price to actual price
act_price = np.exp(price) + 1
act_price = round(act_price[0])  # round to the neaarest pounds

print(f"The estimated cost of the car is {act_price:,} pounds")

The estimated cost of the car is 24,187 pounds
