# Setup and Imports

In [1]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

RS = 35577 # global random state seed
# Relative directory names. Only `processed_data_path` is referenced in the
# visible cells (for loading df_clean.pkl); `raw_data_path` presumably points
# at the un-cleaned inputs — TODO confirm or remove if unused.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import time

In [3]:
# Read the `df_clean.pkl` pickle from the processed-data directory.
# NOTE(review): presumably t.from_pkl unpickles the file — only load pickles
# from a trusted source.
clean_pickle_path = f'{processed_data_path}/df_clean.pkl'
df_data = t.from_pkl(clean_pickle_path)

# Both expressions are echoed because ast_node_interactivity is set to 'all'.
df_data.shape
df_data.head(3)

(97323, 8)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,1.4


In [4]:
# Collapse `make` and `model` into a single `car` key ("<make>__<model>") placed
# in the first column, then drop the two source columns.
# The guard keeps the cell re-runnable: without it a second execution raises
# KeyError because `make` / `model` have already been dropped.
if 'car' not in df_data.columns:
    df_data.insert(0, 'car', df_data['make'] + '__' + df_data['model'])
    df_data = df_data.drop(columns=['make', 'model'])

In [5]:
# Number of listings per make/model key; show the ten most frequent cars.
listings_per_car = df_data.groupby(['car']).size()
listings_per_car.sort_values(ascending=False).head(10)

car
Ford__Fiesta       6508
Vw__Golf           4769
Ford__Focus        4555
Merc__C Class      3689
Vauxhall__Corsa    3285
Vw__Polo           3230
Vauxhall__Astra    2655
Merc__A Class      2474
Bmw__3 Series      2413
Ford__Kuga         2208
dtype: int64

In [6]:
# Restrict the study to the most common car (Ford Fiesta). The `car` column is
# constant within the subset, so it is dropped; `.copy()` detaches the result
# from `df_data`.
is_fiesta = df_data['car'] == 'Ford__Fiesta'
df_fiesta = df_data.loc[is_fiesta].drop(columns=['car']).copy()
df_fiesta.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,Automatic,15944,Petrol,1.0
21452,2019,17500,Manual,10460,Petrol,1.5
21453,2019,16500,Automatic,1482,Petrol,1.0


In [24]:
# Price spread among near-identical cars: bucket mileage to the nearest 5,000
# and group on every remaining feature, leaving `price` as the only aggregated
# column.
mileage_5k = ((df_fiesta['mileage'] / 5000).round(0) * 5000).astype(int)
df_fiesta_grp = (
    df_fiesta
    .drop(columns='mileage')
    .groupby(['year', 'transmission', mileage_5k, 'fuelType', 'engineSize'])
    # 'mean' / 'std' are given by name rather than as np.mean / np.std: pandas
    # deprecates NumPy callables in agg() and will eventually call them
    # directly, which would silently switch std from ddof=1 to ddof=0.
    .agg([len, 'mean', 'std', m.mad])
)
# Flatten the (column, statistic) MultiIndex down to the statistic name.
df_fiesta_grp.columns = [c[1] for c in df_fiesta_grp.columns]

'mean group size: ', df_fiesta_grp['len'].mean()
'price mad in 5k groups: ', df_fiesta_grp['mad'].mean()
'price std in 5k groups: ', df_fiesta_grp['std'].mean()

('mean group size: ', 10.446227929373997)

('price mad in 5k groups: ', 323.66491789552254)

('price std in 5k groups: ', 701.2696209252479)

In [25]:
# Same spread estimate with coarser buckets: mileage rounded to the nearest
# 10,000, grouped on every remaining feature so only `price` is aggregated.
mileage_10k = ((df_fiesta['mileage'] / 10000).round(0) * 10000).astype(int)
df_fiesta_grp = (
    df_fiesta
    .drop(columns='mileage')
    .groupby(['year', 'transmission', mileage_10k, 'fuelType', 'engineSize'])
    # String names instead of np.mean / np.std: pandas deprecates NumPy
    # callables in agg(); once they are called directly, np.std would use
    # ddof=0 instead of the groupby default ddof=1.
    .agg([len, 'mean', 'std', m.mad])
)
# Keep only the statistic name from each (column, statistic) pair.
df_fiesta_grp.columns = [c[1] for c in df_fiesta_grp.columns]

'mean group size: ', df_fiesta_grp['len'].mean()
'price mad in 10k groups: ', df_fiesta_grp['mad'].mean()
'price std in 10k groups: ', df_fiesta_grp['std'].mean()

('mean group size: ', 15.349056603773585)

('price mad in 10k groups: ', 343.64691608968064)

('price std in 10k groups: ', 729.4423273421423)

# Encode

In [9]:
# Replace the two categorical columns with integer codes. The call's result is
# not assigned, yet the frame echoed below shows integer codes, so the helper
# evidently mutates `df_fiesta` in place.
# NOTE(review): re-running this cell on the already-encoded frame may not be
# idempotent — confirm against models.ordinalEncode.
m.ordinalEncode(df_fiesta, ['transmission', 'fuelType'])
df_fiesta

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,0,15944,1,1.0
21452,2019,17500,1,10460,1,1.5
21453,2019,16500,0,1482,1,1.0
21454,2015,10500,1,35432,1,1.6
21456,2017,9000,1,13054,1,1.2
...,...,...,...,...,...,...
39394,2019,15999,1,2813,1,1.0
39395,2015,8999,1,24546,1,1.0
39401,2017,9899,0,16303,1,1.0
39404,2018,12500,0,7047,1,1.0


# Features (X) and target (y)

In [10]:
# `df_train` is a second name for the same object as `df_fiesta` (plain
# assignment, no copy): any in-place change to one is visible through the other.
df_train = df_fiesta
df_train.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,0,15944,1,1.0
21452,2019,17500,1,10460,1,1.5
21453,2019,16500,0,1482,1,1.0


In [11]:
# Target is the `price` column; every other column is a predictor.
y = df_train['price']
X = df_train.drop(columns='price')

# Sanity echo: target shape, feature shape, feature names, first two rows.
y.shape
X.shape
X.columns
X.head(2)


(6508,)

(6508, 5)

Index(['year', 'transmission', 'mileage', 'fuelType', 'engineSize'], dtype='object')

Unnamed: 0,year,transmission,mileage,fuelType,engineSize
21449,2017,0,15944,1,1.0
21452,2019,1,10460,1,1.5


# Train model

In [12]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

## Linear Models

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import minmax_scale, normalize, scale, PolynomialFeatures

In [None]:
def booger_aids(X, y, models_and_params=None, cv=None, metrics=None):
    """Cross-validate the linear baseline model(s) on ``X`` / ``y``.

    The ``(estimator_class, params)`` pairs are handed to ``t.grid_exec``
    together with a callback that runs
    ``m.cv_regression(model, cv, X, y, metrics)`` on the model it receives.

    Parameters
    ----------
    X
        Feature matrix, passed through unchanged to ``m.cv_regression``.
    y
        Target values, passed through unchanged to ``m.cv_regression``.
    models_and_params : list of (estimator_class, dict), optional
        Models to evaluate. Defaults to a single plain ``LinearRegression``.
    cv : optional
        Cross-validation splitter. Defaults to ``KFold(5)``.
    metrics : list of callables, optional
        Scoring functions. Defaults to MAE, RMSE and R2, in that order.

    Returns
    -------
    The value returned by ``t.grid_exec``. Callers in this notebook pass it to
    ``m.display_stats`` and to ``list.extend``, so it is presumably a list of
    per-model result records — TODO confirm against tools.grid_exec.
    """
    if models_and_params is None:
        models_and_params = [
            (LinearRegression, {}),
            # Regularised variants kept for reference; currently disabled.
            # (Lasso, {'random_state': RS}),
            # (Ridge, {'random_state': RS}),
        ]
    if cv is None:
        cv = KFold(5)
    if metrics is None:
        metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]

    return t.grid_exec(
        lambda model: m.cv_regression(model, cv, X, y, metrics),
        models_and_params,
    )

### Raw

In [None]:
# Baseline: linear model(s) on the untransformed features.
y = df_train['price']
X = df_train.drop(columns='price')
r = booger_aids(X, y)
m.display_stats(r, False)

### Standard Scaled

In [None]:
# Linear model(s) on standardised features: sklearn's `scale` centres each
# column and scales it to unit variance.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = scale(X)
r = booger_aids(X, y)
m.display_stats(r, False)

### Normalized

In [None]:
# Linear model(s) on normalised features.
# NOTE(review): with its defaults sklearn's `normalize` rescales each *row*
# (sample) to unit L2 norm, not each column — confirm this is the intended
# transform, since the unscaled mileage column will dominate every row.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = normalize(X)
r = booger_aids(X, y)
m.display_stats(r, False)

### Min Max

In [None]:
# Linear model(s) on min-max scaled features: each column is mapped onto
# [0, 1] by sklearn's `minmax_scale` defaults.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = minmax_scale(X)
r = booger_aids(X, y)
m.display_stats(r, False)

### Poly

In [None]:
# Polynomial-degree sweep on the raw features; results of all degrees are
# pooled into one table.
# NOTE(review): the sweep starts at degree 0 — confirm m.polynomialFeatures
# yields something meaningful there (the min-max sweep further down starts at 1).
all_r = []
for degree in range(7):
    # Fresh copies on every pass so each expansion starts from untouched data.
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X_poly = m.polynomialFeatures(X, degree)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

In [None]:
# Polynomial-degree sweep on standardised features (`scale` is applied before
# the polynomial expansion); results of all degrees are pooled into one table.
# NOTE(review): the sweep starts at degree 0 — confirm m.polynomialFeatures
# yields something meaningful there (the min-max sweep further down starts at 1).
all_r = []
for degree in range(7):
    # Fresh copies on every pass so each expansion starts from untouched data.
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = scale(X)
    X_poly = m.polynomialFeatures(X, degree)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

In [None]:
# Polynomial-degree sweep on `normalize`d features (applied before the
# polynomial expansion); results of all degrees are pooled into one table.
# NOTE(review): with its defaults `normalize` rescales rows, not columns, and
# the sweep starts at degree 0 — confirm both are intended.
all_r = []
for degree in range(7):
    # Fresh copies on every pass so each expansion starts from untouched data.
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = normalize(X)
    X_poly = m.polynomialFeatures(X, degree)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

In [None]:
# Polynomial-degree sweep (degrees 1 through 19) on min-max scaled features;
# results of all degrees are pooled into one table.
all_r = []
for degree in range(1, 20):
    # Fresh copies on every pass so each expansion starts from untouched data.
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = minmax_scale(X)
    X_poly = m.polynomialFeatures(X, degree)
    r = booger_aids(X_poly, y)
    all_r += r

m.display_stats(all_r, False)

## Trees

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [175]:
# Rebuild the untransformed feature matrix explicitly. Without this the cell
# uses whatever `X` / `y` the last executed cell left behind — on Restart &
# Run All that is the min-max-scaled frame from the linear-model section.
X, y = df_train.drop(columns='price'), df_train['price']

# Tree-based regressors, 5-fold CV, small ensemble sizes.
n_est_list = [100, 200, 400]
models_and_params = [
    # Seeded like every other estimator so the single-tree baseline is repeatable.
    (DecisionTreeRegressor, {'random_state': RS}),
    *[(RandomForestRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(AdaBoostRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(LGBMRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(XGBRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(CatBoostRegressor, {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list],
]
cv = KFold(5)
metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]

_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics),
    models_and_params,
    # Callback handed to grid_exec; it renders the stats table for the results
    # it is given. Index 2 (R2) is passed as reverse_rank_idx — presumably
    # because higher R2 is better, unlike MAE / RMSE.
    on1Completed=lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2]),
)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,DecisionTreeRegressor,,5,1016,75,13,1362,102,13,0.758,0.032,13,0.1
1,RandomForestRegressor,'n_estimators': 100,5,874,63,12,1168,86,12,0.822,0.018,12,3.1
2,RandomForestRegressor,'n_estimators': 200,5,873,63,11,1166,86,11,0.823,0.018,11,6.1
3,RandomForestRegressor,'n_estimators': 400,5,873,63,10,1166,86,10,0.823,0.018,10,12.6
4,AdaBoostRegressor,'n_estimators': 100,5,1175,36,15,1438,50,15,0.729,0.035,15,1.0
5,AdaBoostRegressor,'n_estimators': 200,5,1175,36,15,1438,50,15,0.729,0.035,15,1.0
6,AdaBoostRegressor,'n_estimators': 400,5,1175,36,15,1438,50,15,0.729,0.035,15,1.0
7,LGBMRegressor,'n_estimators': 100,5,758,51,4,1011,71,4,0.867,0.014,4,0.5
8,LGBMRegressor,'n_estimators': 200,5,764,53,5,1020,74,5,0.865,0.014,5,0.8
9,LGBMRegressor,'n_estimators': 400,5,773,56,6,1031,76,6,0.862,0.014,6,1.5


In [177]:
# Rebuild the untransformed feature matrix explicitly. Without this the cell
# uses whatever `X` / `y` the last executed cell left behind — on Restart &
# Run All that is the min-max-scaled frame from the linear-model section.
X, y = df_train.drop(columns='price'), df_train['price']

# Tree-based regressors, 8-fold CV, small ensemble sizes.
n_est_list = [100, 200, 400]
models_and_params = [
    # Seeded like every other estimator so the single-tree baseline is repeatable.
    (DecisionTreeRegressor, {'random_state': RS}),
    *[(RandomForestRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(AdaBoostRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(LGBMRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(XGBRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(CatBoostRegressor, {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list],
]
cv = KFold(8)
metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]

_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics),
    models_and_params,
    # Callback handed to grid_exec; it renders the stats table for the results
    # it is given. Index 2 (R2) is passed as reverse_rank_idx — presumably
    # because higher R2 is better, unlike MAE / RMSE.
    on1Completed=lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2]),
)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,DecisionTreeRegressor,,8,1018,82,13,1370,97,13,0.749,0.042,13,0.1
1,RandomForestRegressor,'n_estimators': 100,8,877,70,12,1174,84,12,0.816,0.031,12,5.2
2,RandomForestRegressor,'n_estimators': 200,8,876,68,11,1173,82,11,0.816,0.03,11,10.4
3,RandomForestRegressor,'n_estimators': 400,8,874,67,10,1171,81,10,0.817,0.03,10,21.2
4,AdaBoostRegressor,'n_estimators': 100,8,1159,67,15,1414,65,15,0.734,0.032,15,1.7
5,AdaBoostRegressor,'n_estimators': 200,8,1159,67,15,1414,65,15,0.734,0.032,15,1.8
6,AdaBoostRegressor,'n_estimators': 400,8,1159,67,15,1414,65,15,0.734,0.032,15,1.8
7,LGBMRegressor,'n_estimators': 100,8,750,49,4,1004,65,4,0.865,0.019,4,0.8
8,LGBMRegressor,'n_estimators': 200,8,757,50,5,1013,65,5,0.863,0.02,5,1.3
9,LGBMRegressor,'n_estimators': 400,8,766,54,6,1025,68,6,0.86,0.021,6,2.3


In [None]:
# Values copied verbatim from the outputs of the 5k / 10k mileage-bin grouping
# cells earlier in the notebook.
# NOTE(review): presumably kept here as a noise-floor reference next to the
# model MAE / RMSE tables — consider moving to a markdown cell.
('price mad in 5k groups: ', 323.66491789552254)
('price std in 5k groups: ', 701.2696209252479)

('price mad in 10k groups: ', 343.64691608968064)
('price std in 10k groups: ', 729.4423273421423)

In [176]:
# Rebuild the untransformed feature matrix explicitly. Without this the cell
# uses whatever `X` / `y` the last executed cell left behind — on Restart &
# Run All that is the min-max-scaled frame from the linear-model section.
X, y = df_train.drop(columns='price'), df_train['price']

# Ensemble regressors only, 5-fold CV, large ensemble sizes.
n_est_list = [800, 1600]

models_and_params = [
    *[(RandomForestRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(AdaBoostRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(LGBMRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(XGBRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(CatBoostRegressor, {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list],
]
cv = KFold(5)
metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]

_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics),
    models_and_params,
    # Callback handed to grid_exec; it renders the stats table for the results
    # it is given. Index 2 (R2) is passed as reverse_rank_idx — presumably
    # because higher R2 is better, unlike MAE / RMSE.
    on1Completed=lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2]),
)


Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,DecisionTreeRegressor,,5,1016,75,9,1362,102,9,0.758,0.032,9,0.1
1,RandomForestRegressor,'n_estimators': 800,5,873,63,6,1167,87,6,0.823,0.019,6,23.9
2,RandomForestRegressor,'n_estimators': 1600,5,873,64,5,1167,87,5,0.823,0.019,5,48.4
3,AdaBoostRegressor,'n_estimators': 800,5,1175,36,10,1438,50,10,0.729,0.035,10,1.0
4,AdaBoostRegressor,'n_estimators': 1600,5,1175,36,10,1438,50,10,0.729,0.035,10,1.0
5,LGBMRegressor,'n_estimators': 800,5,787,58,3,1048,79,3,0.857,0.014,3,3.2
6,LGBMRegressor,'n_estimators': 1600,5,804,59,4,1072,81,4,0.85,0.015,4,5.5
7,XGBRegressor,'n_estimators': 800,5,884,55,7,1183,76,7,0.817,0.021,7,15.6
8,XGBRegressor,'n_estimators': 1600,5,925,56,8,1233,75,8,0.801,0.024,8,32.9
9,CatBoostRegressor,'n_estimators': 800,5,742,54,1,994,76,1,0.871,0.015,1,5.2


In [178]:
# Rebuild the untransformed feature matrix explicitly. Without this the cell
# uses whatever `X` / `y` the last executed cell left behind — on Restart &
# Run All that is the min-max-scaled frame from the linear-model section.
X, y = df_train.drop(columns='price'), df_train['price']

# Ensemble regressors only, 8-fold CV, large ensemble sizes.
n_est_list = [800, 1600]

models_and_params = [
    *[(RandomForestRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(AdaBoostRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(LGBMRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(XGBRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list],
    *[(CatBoostRegressor, {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list],
]
cv = KFold(8)
metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]

_ = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics),
    models_and_params,
    # Callback handed to grid_exec; it renders the stats table for the results
    # it is given. Index 2 (R2) is passed as reverse_rank_idx — presumably
    # because higher R2 is better, unlike MAE / RMSE.
    on1Completed=lambda results_1_run: m.display_stats(results_1_run, reverse_rank_idx=[2]),
)


Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,RandomForestRegressor,'n_estimators': 800,8,874,67,6,1170,81,6,0.817,0.03,6,42.2
1,RandomForestRegressor,'n_estimators': 1600,8,873,66,5,1169,81,5,0.817,0.03,5,82.7
2,AdaBoostRegressor,'n_estimators': 800,8,1159,67,9,1414,65,9,0.734,0.032,9,1.8
3,AdaBoostRegressor,'n_estimators': 1600,8,1159,67,9,1414,65,9,0.734,0.032,9,1.7
4,LGBMRegressor,'n_estimators': 800,8,779,56,3,1041,70,3,0.855,0.021,3,4.2
5,LGBMRegressor,'n_estimators': 1600,8,798,59,4,1064,71,4,0.849,0.022,4,9.0
6,XGBRegressor,'n_estimators': 800,8,881,59,7,1177,81,7,0.814,0.032,7,30.8
7,XGBRegressor,'n_estimators': 1600,8,923,60,8,1230,79,8,0.797,0.035,8,56.6
8,CatBoostRegressor,'n_estimators': 800,8,742,49,1,996,65,1,0.867,0.021,1,8.6
9,CatBoostRegressor,'n_estimators': 1600,8,743,50,2,997,66,2,0.867,0.021,2,17.0


In [1]:
# Scratch arithmetic; what the 700 and 1000 stand for is not stated.
# NOTE(review): explain in markdown or remove.
700/1000

0.7

In [2]:
# Scratch arithmetic (reciprocal of the 700/1000 ratio computed just above).
# NOTE(review): explain in markdown or remove.
1000/700

1.4285714285714286