# MODEL BUILDING

In [49]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import pickle

In [50]:
data = pd.read_csv('../input/cleaned_car_data.csv')
data.head()

Unnamed: 0,name,manufacturer,year,age,mileage,engine,transmission,price
0,Mazda MX5,Mazda,2007,14,63131,Petrol,Manual,7499
1,Jaguar XF,Jaguar,2010,11,61890,Petrol,Automatic,7775
2,Audi A6,Audi,2012,9,129170,Diesel,Automatic,6950
3,Nissan Qashqai,Nissan,2013,8,44900,Petrol,Automatic,7790
4,MINI Mini,Mini,2017,4,32012,Petrol,Manual,15999


In [51]:
#drop the name and year columns because it is irrelevant in our model building
data = data.drop(['name', 'year'], axis=1)

In [52]:
data.head()

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,Mazda,14,63131,Petrol,Manual,7499
1,Jaguar,11,61890,Petrol,Automatic,7775
2,Audi,9,129170,Diesel,Automatic,6950
3,Nissan,8,44900,Petrol,Automatic,7790
4,Mini,4,32012,Petrol,Manual,15999


In [53]:
data['manufacturer'].unique()

array(['Mazda', 'Jaguar', 'Audi', 'Nissan', 'Mini', 'Mercedes-Benz',
       'Volkswagen', 'Vauxhall', 'Toyota', 'Skoda', 'Ford', 'Chevrolet',
       'BMW', 'Suzuki', 'Renault', 'Peugeot', 'Citroen', 'Volvo', 'Fiat',
       'DS', 'Dacia', 'Abarth', 'Smart', 'Seat', 'Mitsubishi', 'MG',
       'Lexus', 'Land-Rover', 'Kia', 'Jeep', 'Hyundai', 'Honda',
       'Maserati', 'Subaru', 'Porsche', 'Infiniti', 'Bentley',
       'Alfa-Romero', 'Chrysler', 'Isuzu'], dtype=object)

In [54]:
# label encode the categorical values
le_manufacturer = LabelEncoder()
le_engine = LabelEncoder()
le_transmission = LabelEncoder()

In [55]:
data['manufacturer'] = le_manufacturer.fit_transform(data['manufacturer'])
data['engine'] = le_engine.fit_transform(data['engine'])
data['transmission'] = le_transmission.fit_transform(data['transmission'])

In [56]:
data.head(10)

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,23,14,63131,3,1,7499
1,16,11,61890,3,0,7775
2,2,9,129170,0,0,6950
3,27,8,44900,3,0,7790
4,25,4,32012,3,1,15999
5,24,6,33050,0,0,10995
6,38,10,62000,0,1,7250
7,37,7,44000,3,1,5990
8,36,8,172000,2,0,6290
9,32,9,74000,0,1,4450


In [57]:
# creating X and y variables
X = data.drop('price', axis=1)

# log transform the price column
y = np.log(data['price'])

In [58]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [59]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2090, 5), (896, 5), (2090,), (896,))

In [60]:
# feature scale the X_train and X_test values

norm = MinMaxScaler().fit(X_train)

# transform training data
X_train = norm.transform(X_train)

# transform testing data
X_test = norm.transform(X_test)

print(X_train)
print('\n')
print(X_test)


[[0.05128205 0.16666667 0.13713793 0.         0.        ]
 [0.46153846 0.04166667 0.0308516  0.5        0.        ]
 [0.82051282 0.         0.03725164 0.         1.        ]
 ...
 [0.07692308 0.41666667 0.28336733 0.         0.        ]
 [0.69230769 0.20833333 0.22733844 0.75       0.5       ]
 [0.92307692 0.375      0.16930954 0.75       0.        ]]


[[0.97435897 0.04166667 0.03048589 0.75       0.5       ]
 [0.33333333 0.125      0.07322899 0.75       1.        ]
 [0.76923077 0.04166667 0.02873159 0.75       1.        ]
 ...
 [0.05128205 0.58333333 0.7828559  0.         0.        ]
 [0.79487179 0.33333333 0.29713884 0.         0.        ]
 [0.94871795 0.04166667 0.03975451 0.75       0.5       ]]


In [61]:
models = {
    'linear_model': LinearRegression(),
    'ridge_model': Ridge(random_state=123),
    'extree_model':ExtraTreesRegressor(random_state = 123),
    'lgbm_model':LGBMRegressor(random_state = 123),
    'xgboost_model':XGBRegressor(random_state = 123),
    'rf_model' : RandomForestRegressor(random_state = 123)
 }

In [62]:
def train_model(models: dict) -> pd.DataFrame:
    """
    It takes in a dictionary containing a key-pair of model name and estimators.
    It returns a data frame containing the metrics of the trained model.
    """
    my_dict = {}
    name_list, train_score_list, r_sqd_list, mae_list, rmse_list = [], [], [], [], []
    for name, estimator in models.items():
        # fit
        estimator.fit(X_train, y_train)

        # make predictions
        y_pred = estimator.predict(X_test)

        # metrics
        train_score = estimator.score(X_train, y_train)
        r_sqd = metrics.r2_score(y_test, y_pred)
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        # add the metrics to the empty list
        name_list.append(name)
        train_score_list.append(train_score)
        r_sqd_list.append(r_sqd)
        mae_list.append(mae)
        rmse_list.append(rmse)

    my_dict["Name"] = name_list
    my_dict["Train_Score"] = train_score_list
    my_dict["R_squared"] = r_sqd_list
    my_dict["Mean_absolute_error"] = mae_list
    my_dict["Root_mean_sqd_error"] = rmse_list

    my_dataframe = pd.DataFrame(my_dict)
    my_dataframe = my_dataframe.sort_values("Root_mean_sqd_error")
    return my_dataframe

In [63]:
train_model(models)

Unnamed: 0,Name,Train_Score,R_squared,Mean_absolute_error,Root_mean_sqd_error
3,lgbm_model,0.870977,0.781549,0.197763,0.261602
4,xgboost_model,0.951829,0.775469,0.202,0.265217
5,rf_model,0.961506,0.745092,0.216515,0.28259
2,extree_model,0.999986,0.710235,0.228424,0.301291
1,ridge_model,0.587302,0.506338,0.285117,0.393259
0,linear_model,0.587771,0.4982,0.284037,0.396487


In [64]:
#create the grid
grid = {'max_depth': [3,4,5],'n_estimators':[100, 200, 300]}

#Instantiate GridSearchCV
model = GridSearchCV (estimator = LGBMRegressor(random_state = 123), param_grid = grid, scoring ='neg_root_mean_squared_error', cv = 5)

In [65]:
model.fit(X_train,y_train, verbose = False)

GridSearchCV(cv=5, estimator=LGBMRegressor(random_state=123),
             param_grid={'max_depth': [3, 4, 5],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_root_mean_squared_error')

In [66]:
y_pred = model.predict(X_test)

In [67]:
grid_model = pd.DataFrame({
    'model': ['LGBM'],
    'r_squared': [metrics.r2_score(y_test, y_pred)],
    'mae': [mean_absolute_error(y_test, y_pred)],
    'rmse': [np.sqrt(metrics.mean_squared_error(y_test, y_pred))]
    })
grid_model


Unnamed: 0,model,r_squared,mae,rmse
0,LGBM,0.790897,0.198254,0.255944


#### Make Predictions on new data.

In [68]:
# manufacturer, age, mileage, engine, transmission
new_data = [['Mazda', 14, 63131, 'Petrol', 'Manual']]
# convert to array
new_data = np.array(new_data)
new_data

array([['Mazda', '14', '63131', 'Petrol', 'Manual']], dtype='<U11')

In [69]:
new_data[:,0] = le_manufacturer.transform(new_data[:,0])
new_data[:,3] = le_engine.transform(new_data[:,3])
new_data[:,4] = le_transmission.transform(new_data[:,4])
new_data

array([['23', '14', '63131', '3', '1']], dtype='<U11')

In [70]:
new_data = norm.transform(new_data)
new_data

array([[0.58974359, 0.58333333, 0.36074492, 0.75      , 0.5       ]])

In [71]:
# make predictions
price = model.predict(new_data)
# convert the price from log_price to actual price
act_price = np.exp(price) + 1
act_price = round(act_price[0])  # round to the neaarest pounds

print(f"The estimated cost of the car is {act_price:,} pounds")

The estimated cost of the car is 5,254 pounds


In [72]:
data = {"model": model, "normalization": norm}
with open('../models/regressor.pkl', 'wb') as file:
    pickle.dump(data, file)