# Setup and Imports

In [1]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

# Seed handed to estimators as `random_state` (see the tree model grids) so runs are repeatable.
RS = 35577 # global random state seed
# Relative directory names for the raw and processed data files.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import time

In [19]:
# Load the cleaned listings table from the processed-data directory.
# NOTE(review): `t.from_pkl` presumably unpickles the file — only load pickles from a trusted source.
df_data = t.from_pkl(f'{processed_data_path}/df_clean.pkl')

# Both bare expressions are rendered because ast_node_interactivity is set to 'all' in the setup cell.
df_data.shape
df_data.head(3)

(97323, 8)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,1.4


In [22]:
# Number of distinct models listed under each make.
df_data.groupby('make')['model'].nunique()

make
Audi        21
Bmw         22
Ford        19
Hyundi      11
Merc        20
Skoda       12
Toyota      15
Vauxhall    17
Vw          22
Name: model, dtype: int64

In [26]:
# Listing count per model for the 'Hyundi' make (spelled as it appears in the data),
# selecting rows and the `model` column in a single .loc lookup.
df_data.loc[df_data['make'] == 'Hyundi', 'model'].value_counts()

Tucson      1279
I10         1061
I30          535
I20          487
Kona         322
Ioniq        273
Santa Fe     244
IX20         202
I40          120
IX35         118
I800         117
Name: model, dtype: int64

In [29]:
# Work on the 'Hyundi' listings only; `make` is constant within that subset, so it is dropped.
# The trailing .copy() gives an independent frame that later cells can modify freely.
df_hyundi = (
    df_data[df_data['make'].eq('Hyundi')]
    .drop(columns='make')
    .copy()
)
df_hyundi

Unnamed: 0,model,year,price,transmission,mileage,fuelType,engineSize
39414,I20,2017,7999,Manual,17307,Petrol,1.2
39415,Tucson,2016,14499,Automatic,25233,Diesel,2.0
39416,Tucson,2016,11399,Manual,37877,Diesel,1.7
39417,I10,2016,6499,Manual,23789,Petrol,1.0
39418,IX35,2015,10199,Manual,33177,Diesel,2.0
...,...,...,...,...,...,...,...
44269,I30,2016,8680,Manual,25906,Diesel,1.6
44270,I40,2015,7830,Manual,59508,Diesel,1.7
44271,I10,2017,6830,Manual,13810,Petrol,1.0
44272,Tucson,2018,13994,Manual,23313,Petrol,1.6


In [34]:
# Alias (not a copy) of the Hyundi subset; the mileage-bucket grouping cells read from this name.
df_to_group = df_hyundi

In [41]:
# How much does price still vary among listings that agree on every attribute once
# mileage is bucketed to the nearest 5,000?
mileage_5k = ((df_to_group['mileage'] / 5000).round(0) * 5000).astype(int)

# Aggregations are given as string aliases ('mean', 'std') rather than np.mean / np.std:
# pandas 2.1+ deprecates passing NumPy callables to groupby.agg and will stop mapping
# them to the groupby implementations, which would silently change `std` (ddof=1 today).
# `len` and `m.mad` stay as callables; their function names become the column labels.
# agg() already returns a new frame, so no extra .copy() is needed.
df_grp = (
    df_to_group
    .drop(columns='mileage')
    .groupby(['model', 'year', 'transmission', mileage_5k, 'fuelType', 'engineSize'])
    .agg([len, 'mean', 'std', m.mad])
)
# Flatten the (column, statistic) MultiIndex down to the statistic name.
df_grp.columns = [c[1] for c in df_grp.columns]

'mean group size: ', df_grp['len'].mean()
'price mad in 5k groups: ', df_grp['mad'].mean()
'price std in 5k groups: ', df_grp['std'].mean()

('mean group size: ', 3.7702060221870046)

('price mad in 5k groups: ', 315.97138099375206)

('price std in 5k groups: ', 849.4404889166979)

In [43]:
# Same price-spread check as for the 5,000 buckets, with mileage bucketed to the nearest
# 10,000 (coarser buckets, so groups are larger).
mileage_10k = ((df_to_group['mileage'] / 10000).round(0) * 10000).astype(int)

# Aggregations are given as string aliases ('mean', 'std') rather than np.mean / np.std:
# pandas 2.1+ deprecates passing NumPy callables to groupby.agg and will stop mapping
# them to the groupby implementations, which would silently change `std` (ddof=1 today).
# `len` and `m.mad` stay as callables; their function names become the column labels.
# agg() already returns a new frame, so no extra .copy() is needed.
df_grp = (
    df_to_group
    .drop(columns='mileage')
    .groupby(['model', 'year', 'transmission', mileage_10k, 'fuelType', 'engineSize'])
    .agg([len, 'mean', 'std', m.mad])
)
# Flatten the (column, statistic) MultiIndex down to the statistic name.
df_grp.columns = [c[1] for c in df_grp.columns]

'mean group size: ', df_grp['len'].mean()
'price mad in 10k groups: ', df_grp['mad'].mean()
'price std in 10k groups: ', df_grp['std'].mean()

('mean group size: ', 5.228571428571429)

('price mad in 10k groups: ', 378.451221207289)

('price std in 10k groups: ', 894.6484135765044)

# Encode

In [48]:
# `model` is not used as a feature from here on, so remove it.
# This cell reassigns `df_hyundi` to itself, so a second run would look for a column
# that is already gone; errors='ignore' makes the drop a no-op in that case instead of
# raising KeyError, which keeps the cell safe to re-run.
df_hyundi = df_hyundi.drop(columns='model', errors='ignore')
df_hyundi

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
39414,2017,7999,1,17307,2,1.2
39415,2016,14499,0,25233,0,2.0
39416,2016,11399,1,37877,0,1.7
39417,2016,6499,1,23789,2,1.0
39418,2015,10199,1,33177,0,2.0
...,...,...,...,...,...,...
44269,2016,8680,1,25906,0,1.6
44270,2015,7830,1,59508,0,1.7
44271,2017,6830,1,13810,2,1.0
44272,2018,13994,1,23313,2,1.6


In [49]:
# Replace the two categorical columns with integer codes (see the displayed frame).
# NOTE(review): the result is not assigned, so `m.ordinalEncode` presumably mutates
# `df_hyundi` in place — confirm, and check what a second run on already-encoded
# columns does.
m.ordinalEncode(df_hyundi, ['transmission', 'fuelType'])
df_hyundi

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
39414,2017,7999,1,17307,2,1.2
39415,2016,14499,0,25233,0,2.0
39416,2016,11399,1,37877,0,1.7
39417,2016,6499,1,23789,2,1.0
39418,2015,10199,1,33177,0,2.0
...,...,...,...,...,...,...
44269,2016,8680,1,25906,0,1.6
44270,2015,7830,1,59508,0,1.7
44271,2017,6830,1,13810,2,1.0
44272,2018,13994,1,23313,2,1.6


# Features (X) and Target (y)

In [52]:
# `df_train` is an alias of `df_hyundi` (same object, not a copy); the modelling cells
# below derive X and y from it.
df_train = df_hyundi
df_train.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
39414,2017,7999,1,17307,2,1.2
39415,2016,14499,0,25233,0,2.0
39416,2016,11399,1,37877,0,1.7


In [53]:
# Target is the listing price; every other column is a predictor.
y = df_train['price']
X = df_train.drop(columns='price')

# Quick sanity check of what was split off.
y.shape
X.shape
X.columns
X.head(2)



(4758,)

(4758, 5)

Index(['year', 'transmission', 'mileage', 'fuelType', 'engineSize'], dtype='object')

Unnamed: 0,year,transmission,mileage,fuelType,engineSize
39414,2017,1,17307,2,1.2
39415,2016,0,25233,0,2.0


# Train model

In [54]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import minmax_scale, normalize, scale, PolynomialFeatures

In [None]:
def booger_aids(X, y, n_splits=5):
    """Cross-validate the candidate linear models on ``X`` / ``y``.

    The list of ``(estimator class, constructor kwargs)`` pairs is handed to
    ``t.grid_exec`` together with a callback that runs ``m.cv_regression`` on
    the model it receives, using a ``KFold`` splitter and the MAE / RMSE / R2
    metrics.

    Parameters
    ----------
    X : features, forwarded unchanged to ``m.cv_regression``.
    y : target values, forwarded unchanged to ``m.cv_regression``.
    n_splits : int, default 5
        Number of ``KFold`` folds. The default reproduces the previous
        hard-coded ``KFold(5)``.

    Returns
    -------
    Whatever ``t.grid_exec`` returns; callers pass it to ``m.display_stats``
    or extend a list with it.
    """
    models_and_params = [
        (LinearRegression, {}),
        # Further candidates, currently disabled:
        # (Lasso, {'random_state': RS}),
        # (Ridge, {'random_state': RS}),
    ]
    # No shuffle / random_state is passed, so sklearn's KFold default of contiguous,
    # unshuffled folds applies.
    cv = KFold(n_splits)
    metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]

    return t.grid_exec(
        lambda model: m.cv_regression(model, cv, X, y, metrics),
        models_and_params,
    )

### Raw

In [None]:
# Baseline: fit on the features exactly as encoded, with no rescaling.
y = df_train['price']
X = df_train.drop(columns='price')
r = booger_aids(X, y)
m.display_stats(r, False)

### Standard Scaled

In [None]:
# Standardise every feature column before cross-validating.
# NOTE(review): `scale` is applied to all rows before the folds are split, so each
# validation fold influences the scaling statistics — confirm this is acceptable.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = scale(X)
r = booger_aids(X, y)
m.display_stats(r, False)

### Normalized

In [None]:
# Rescale with sklearn's `normalize` before cross-validating.
# NOTE(review): with its defaults `normalize` rescales each ROW (sample) to unit L2
# norm rather than each feature column — confirm that is the intended transform.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = normalize(X)
r = booger_aids(X, y)
m.display_stats(r, False)

### Min Max

In [None]:
# Min-max scale every feature column before cross-validating.
# NOTE(review): the min / max are taken over all rows before the folds are split.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = minmax_scale(X)
r = booger_aids(X, y)
m.display_stats(r, False)

### Polynomial Features

In [None]:
# Sweep the second argument of `m.polynomialFeatures` over 0..6 on the unscaled
# features and collect the results of every run into one table.
# NOTE(review): the second argument is presumably the polynomial degree — confirm that 0 is a valid value.
all_r = []
for i in range(7):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X_poly = m.polynomialFeatures(X, i)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

In [None]:
# Same sweep over 0..6, with the features standardised before expansion.
# NOTE(review): the second argument of `m.polynomialFeatures` is presumably the polynomial degree — confirm.
all_r = []
for i in range(7):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = scale(X)
    X_poly = m.polynomialFeatures(X, i)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

In [None]:
# Same sweep over 0..6, with `normalize` applied before expansion.
# NOTE(review): sklearn's `normalize` defaults to row-wise (per-sample) L2 scaling — confirm intended.
all_r = []
for i in range(7):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = normalize(X)
    X_poly = m.polynomialFeatures(X, i)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

In [None]:
# Wider sweep (1..19) on min-max-scaled features.
# NOTE(review): if the second argument of `m.polynomialFeatures` is the degree, the
# upper end of this range produces a very large number of columns from 5 features —
# expect this cell to be slow.
all_r = []
for i in range(1, 20):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = minmax_scale(X)
    X_poly = m.polynomialFeatures(X, i)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

## Trees

In [56]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [57]:
# Rebuild the raw feature matrix and target here rather than reusing whatever `X` / `y`
# an earlier cell left in the kernel: the linear-model cells overwrite `X` with scaled,
# normalized or min-max versions, so without this the results depend on execution order.
X, y = df_train.drop(columns='price'), df_train['price']

# One (estimator class, constructor kwargs) entry per model / ensemble-size combination.
n_est_list = [100, 200, 400]
models_and_params = [
    (DecisionTreeRegressor, {}),
    *[ (RandomForestRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostRegressor,             {'random_state': RS, 'n_estimators': n_e}) for n_e in  n_est_list],
    *[ (LGBMRegressor,                 {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBRegressor,                  {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostRegressor,             {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list ],
]
cv = KFold(5)
metrics = [ mean_absolute_error, m.root_mean_squared_error, r2_score]

# The return value is bound to `_` so it is not echoed; the stats table is rendered
# through the on1Completed callback.
# NOTE(review): reverse_rank_idx=[2] presumably marks metric index 2 (r2_score) as
# higher-is-better for ranking — confirm in `m.display_stats`.
_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2])
)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,DecisionTreeRegressor,,5,1787,129,13,3056,421,13,0.693,0.171,13,0.1
1,RandomForestRegressor,'n_estimators': 100,5,1524,140,11,2512,469,10,0.813,0.053,10,2.5
2,RandomForestRegressor,'n_estimators': 200,5,1522,142,10,2520,470,11,0.812,0.053,11,4.9
3,RandomForestRegressor,'n_estimators': 400,5,1519,141,9,2505,469,9,0.813,0.053,9,10.0
4,AdaBoostRegressor,'n_estimators': 100,5,2510,316,15,3366,660,15,0.665,0.091,15,1.0
5,AdaBoostRegressor,'n_estimators': 200,5,2510,316,15,3366,660,15,0.665,0.091,15,1.0
6,AdaBoostRegressor,'n_estimators': 400,5,2510,316,15,3366,660,15,0.665,0.091,15,1.0
7,LGBMRegressor,'n_estimators': 100,5,1394,120,4,2217,546,1,0.853,0.06,1,0.4
8,LGBMRegressor,'n_estimators': 200,5,1405,118,5,2242,535,2,0.85,0.059,2,0.7
9,LGBMRegressor,'n_estimators': 400,5,1426,113,6,2281,520,4,0.844,0.058,4,1.4


In [58]:
# Rebuild the raw feature matrix and target here rather than reusing whatever `X` / `y`
# an earlier cell left in the kernel: the linear-model cells overwrite `X` with scaled,
# normalized or min-max versions, so without this the results depend on execution order.
X, y = df_train.drop(columns='price'), df_train['price']

# Same model grid as the 5-fold run, evaluated with 8 folds.
n_est_list = [100, 200, 400]
models_and_params = [
    (DecisionTreeRegressor, {}),
    *[ (RandomForestRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostRegressor,             {'random_state': RS, 'n_estimators': n_e}) for n_e in  n_est_list],
    *[ (LGBMRegressor,                 {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBRegressor,                  {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostRegressor,             {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list ],
]
cv = KFold(8)
metrics = [ mean_absolute_error, m.root_mean_squared_error, r2_score]

# The return value is bound to `_` so it is not echoed; the stats table is rendered
# through the on1Completed callback.
# NOTE(review): reverse_rank_idx=[2] presumably marks metric index 2 (r2_score) as
# higher-is-better for ranking — confirm in `m.display_stats`.
_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2])
)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,DecisionTreeRegressor,,8,1779,231,13,2960,873,13,0.732,0.111,13,0.1
1,RandomForestRegressor,'n_estimators': 100,8,1507,179,11,2426,662,10,0.822,0.057,10,4.2
2,RandomForestRegressor,'n_estimators': 200,8,1506,177,10,2429,662,11,0.822,0.057,11,8.4
3,RandomForestRegressor,'n_estimators': 400,8,1506,179,9,2431,670,12,0.821,0.058,12,16.6
4,AdaBoostRegressor,'n_estimators': 100,8,2523,293,14,3495,633,14,0.621,0.106,14,1.9
5,AdaBoostRegressor,'n_estimators': 200,8,2532,308,15,3500,620,15,0.619,0.109,15,2.0
6,AdaBoostRegressor,'n_estimators': 400,8,2532,308,15,3500,620,15,0.619,0.109,15,2.0
7,LGBMRegressor,'n_estimators': 100,8,1387,148,4,2175,675,2,0.856,0.06,3,0.9
8,LGBMRegressor,'n_estimators': 200,8,1397,147,5,2200,670,4,0.853,0.059,4,1.4
9,LGBMRegressor,'n_estimators': 400,8,1427,146,7,2246,662,5,0.847,0.058,5,2.7


In [None]:
# NOTE(review): these tuples look like results pasted from another run of the
# mileage-bucket grouping cells (the values differ from the outputs recorded earlier in
# this notebook) — presumably kept for comparison with the model errors; confirm their
# origin or move them into a markdown cell.
('price mad in 5k groups: ', 323.66491789552254)
('price std in 5k groups: ', 701.2696209252479)

('price mad in 10k groups: ', 343.64691608968064)
('price std in 10k groups: ', 729.4423273421423)

In [59]:
# Rebuild the raw feature matrix and target here rather than reusing whatever `X` / `y`
# an earlier cell left in the kernel: the linear-model cells overwrite `X` with scaled,
# normalized or min-max versions, so without this the results depend on execution order.
X, y = df_train.drop(columns='price'), df_train['price']

# Larger ensembles only (the single decision tree has no n_estimators to vary), 5 folds.
n_est_list = [800, 1600]

models_and_params = [
    *[ (RandomForestRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostRegressor,             {'random_state': RS, 'n_estimators': n_e}) for n_e in  n_est_list],
    *[ (LGBMRegressor,                 {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBRegressor,                  {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostRegressor,             {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list ],
]
cv = KFold(5)
metrics = [ mean_absolute_error, m.root_mean_squared_error, r2_score]

# The return value is bound to `_` so it is not echoed; the stats table is rendered
# through the on1Completed callback.
# NOTE(review): reverse_rank_idx=[2] presumably marks metric index 2 (r2_score) as
# higher-is-better for ranking — confirm in `m.display_stats`.
_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2])
)


Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,RandomForestRegressor,'n_estimators': 800,5,1517,139,5,2504,467,5,0.814,0.053,5,20.1
1,RandomForestRegressor,'n_estimators': 1600,5,1518,139,6,2506,467,6,0.813,0.053,6,40.6
2,AdaBoostRegressor,'n_estimators': 800,5,2510,316,9,3366,660,9,0.665,0.091,9,1.0
3,AdaBoostRegressor,'n_estimators': 1600,5,2510,316,9,3366,660,9,0.665,0.091,9,1.0
4,LGBMRegressor,'n_estimators': 800,5,1461,108,3,2339,494,3,0.835,0.058,3,2.9
5,LGBMRegressor,'n_estimators': 1600,5,1502,106,4,2401,473,4,0.826,0.058,4,6.2
6,XGBRegressor,'n_estimators': 800,5,1634,147,7,2628,394,7,0.793,0.053,7,16.0
7,XGBRegressor,'n_estimators': 1600,5,1672,147,8,2670,387,8,0.786,0.055,8,30.3
8,CatBoostRegressor,'n_estimators': 800,5,1362,119,1,2251,506,2,0.848,0.056,2,4.5
9,CatBoostRegressor,'n_estimators': 1600,5,1363,123,2,2242,520,1,0.85,0.056,1,9.4


In [60]:
# Rebuild the raw feature matrix and target here rather than reusing whatever `X` / `y`
# an earlier cell left in the kernel: the linear-model cells overwrite `X` with scaled,
# normalized or min-max versions, so without this the results depend on execution order.
X, y = df_train.drop(columns='price'), df_train['price']

# Larger ensembles only (the single decision tree has no n_estimators to vary), 8 folds.
n_est_list = [800, 1600]

models_and_params = [
    *[ (RandomForestRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostRegressor,             {'random_state': RS, 'n_estimators': n_e}) for n_e in  n_est_list],
    *[ (LGBMRegressor,                 {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBRegressor,                  {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostRegressor,             {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list ],
]
cv = KFold(8)
metrics = [ mean_absolute_error, m.root_mean_squared_error, r2_score]

# The return value is bound to `_` so it is not echoed; the stats table is rendered
# through the on1Completed callback.
# NOTE(review): reverse_rank_idx=[2] presumably marks metric index 2 (r2_score) as
# higher-is-better for ranking — confirm in `m.display_stats`.
_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2])
)


Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,RandomForestRegressor,'n_estimators': 800,8,1505,181,5,2429,672,5,0.822,0.058,5,33.7
1,RandomForestRegressor,'n_estimators': 1600,8,1505,181,4,2430,673,6,0.822,0.058,6,66.1
2,AdaBoostRegressor,'n_estimators': 800,8,2532,308,9,3500,620,9,0.619,0.109,9,2.0
3,AdaBoostRegressor,'n_estimators': 1600,8,2532,308,9,3500,620,9,0.619,0.109,9,1.9
4,LGBMRegressor,'n_estimators': 800,8,1467,137,3,2310,649,3,0.838,0.056,3,3.8
5,LGBMRegressor,'n_estimators': 1600,8,1518,138,6,2375,643,4,0.829,0.056,4,7.6
6,XGBRegressor,'n_estimators': 800,8,1593,189,7,2474,670,7,0.814,0.062,7,22.3
7,XGBRegressor,'n_estimators': 1600,8,1633,187,8,2519,660,8,0.807,0.061,8,46.7
8,CatBoostRegressor,'n_estimators': 800,8,1350,160,2,2193,655,2,0.853,0.06,2,7.0
9,CatBoostRegressor,'n_estimators': 1600,8,1347,161,1,2189,666,1,0.854,0.058,1,13.8
