# Setup and Imports

In [1]:
import sys
# Make the helper modules that live in ../Toolkit importable; the membership
# check keeps the path from being appended again when the cell is re-run.
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

# autoreload in mode 1 reloads only the modules registered with %aimport
# (here: tools and models), so edits to those helpers are picked up live.
%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
# Show the value of EVERY top-level expression in a cell, not only the last one.
# Later cells rely on this to display several results (shape, head, ...) at once.
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
# pandas display limits: no cap on columns, long cell text, up to 113 rows.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

RS = 35577 # global random state seed
# Relative directory names for the raw and the processed data files.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
import time

In [3]:
# Load the cleaned dataset produced earlier in the pipeline.
# NOTE(review): from_pkl presumably unpickles the file - only load trusted pickles.
clean_pkl_path = f'{processed_data_path}/df_clean.pkl'
df_data = t.from_pkl(clean_pkl_path)

# Quick sanity check: dimensions and the first few rows.
df_data.shape
df_data.head(3)

(97053, 8)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,1.4


In [4]:
# Collapse 'make' and 'model' into a single '<make>__<model>' key (e.g. 'Ford__Fiesta')
# stored as the first column, then drop the two source columns.
# The guard makes the cell idempotent: without it a second run fails, because
# 'make'/'model' no longer exist and DataFrame.insert refuses an existing 'car' column.
if 'car' not in df_data.columns:
    df_data.insert(0, 'car', df_data['make'] + '__' + df_data['model'])
    df_data = df_data.drop(columns=['make', 'model'])

# Ten most frequent cars by row count.
df_data.groupby(['car']).size().sort_values(ascending=False).head(10)

In [14]:
# Keep only the Ford Fiesta rows; 'car' is constant after the filter, so drop it.
df_1_car = df_data.loc[ (df_data['car'] == 'Ford__Fiesta') ].drop(columns=['car']).copy()
df_1_car.head(3)

# Bucket mileage to the nearest multiple of 5000 so that cars with similar
# mileage land in the same group.
mileage_5k = ((df_1_car['mileage'] / 5000).round(0) * 5000).astype(int)

# Price statistics inside groups of cars that share every feature (with bucketed
# mileage): a rough picture of the price spread the features cannot explain.
# 'mean' and 'std' are passed by name instead of np.mean / np.std: pandas >= 2.1
# deprecates NumPy callables in agg, and np.std would later switch from the
# sample std (ddof=1) used today to the population std (ddof=0).
# The resulting column names ('len', 'mean', 'std', 'mad') are unchanged.
df_grp = (
    df_1_car.drop(columns='mileage')
    .groupby(['year', 'transmission', mileage_5k, 'fuelType', 'engineSize'])['price']
    .agg([len, 'mean', 'std', m.mad])
)

'mean group size: ', df_grp['len'].mean()
'price mad in 5k groups: ', df_grp['mad'].mean()
'price std in 5k groups: ', df_grp['std'].mean()

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,Automatic,15944,Petrol,1.0
21452,2019,17500,Manual,10460,Petrol,1.5
21453,2019,16500,Automatic,1482,Petrol,1.0


('mean group size: ', 15.711864406779661)

('price mad in 5k groups: ', 344.2007726118433)

('price std in 5k groups: ', 722.0580288841817)

# Encode

In [15]:
# Turn the string-valued columns into integer codes.
# The helper is called without reassignment and the frame shown afterwards is
# already encoded, i.e. df_1_car is modified in place.
# NOTE(review): re-running this cell operates on already-encoded data - confirm
# that ordinalEncode tolerates that, or re-run the cell that builds df_1_car first.
categorical_cols = ['transmission', 'fuelType']
m.ordinalEncode(df_1_car, categorical_cols)
df_1_car.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,0,15944,1,1.0
21452,2019,17500,1,10460,1,1.5
21453,2019,16500,0,1482,1,1.0


## X, y

In [16]:
# Training frame is the encoded single-car frame (same object, not a copy).
df_train = df_1_car

# Features are every column except the target; both parts are copied so later
# changes to X / y cannot leak back into df_train.
target_col = 'price'
X = df_train.drop(columns=target_col).copy()
y = df_train[target_col].copy()

# Sanity checks: sizes, feature names and a peek at both parts.
y.shape
X.shape
X.columns
X.head(3)
y.head(3)

(6489,)

(6489, 5)

Index(['year', 'transmission', 'mileage', 'fuelType', 'engineSize'], dtype='object')

Unnamed: 0,year,transmission,mileage,fuelType,engineSize
21449,2017,0,15944,1,1.0
21452,2019,1,10460,1,1.5
21453,2019,0,1482,1,1.0


21449    12000
21452    17500
21453    16500
Name: price, dtype: int64

# Train model

In [17]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import minmax_scale, normalize, scale, PolynomialFeatures

In [18]:
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

## First Run

In [19]:
# Baseline comparison: cross-validate every candidate regressor with default
# hyper-parameters, the ensemble models at two sizes each.
n_est_list = [100, 400]
models_and_params = [
    # Seeded like the ensembles below: a decision tree permutes the features at
    # each split, so an unseeded tree is not reproducible between runs.
    (DecisionTreeRegressor, {'random_state': RS}),

    *[ (RandomForestRegressor, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostRegressor,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],

    *[ (LGBMRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBRegressor,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostRegressor,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter with the global RS: without random_state the folds are
# reshuffled on every run, so the scores below would not be reproducible and
# the models would not be guaranteed to see the same splits.
cv = RepeatedKFold(n_splits=6, n_repeats=1, random_state=RS)
metrics = [ mean_absolute_error, m.root_mean_squared_error, r2_score ]

# grid_exec is handed the scoring callback and the (class, params) list;
# on1Completed receives the results and shows them sorted by mean R2.
# NOTE(review): presumably it is called after each finished entry - confirm in tools.
ret = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics),
    models_and_params,
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['R2_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
10,CatBoostRegressor,'n_estimators': 400,6,730.929,13.654,1,979.683,21.595,1,0.885,0.001,11,9.5
9,CatBoostRegressor,'n_estimators': 100,6,732.701,12.989,2,980.766,23.179,2,0.885,0.002,10,2.5
5,LGBMRegressor,'n_estimators': 100,6,742.663,14.525,3,996.388,20.285,3,0.881,0.001,9,0.6
6,LGBMRegressor,'n_estimators': 400,6,755.231,14.95,4,1013.778,19.802,4,0.877,0.002,8,2.5
7,XGBRegressor,'n_estimators': 100,6,763.523,22.424,5,1022.803,31.85,5,0.874,0.005,7,2.9
8,XGBRegressor,'n_estimators': 400,6,822.647,20.641,6,1100.935,27.578,6,0.854,0.007,6,11.0
2,RandomForestRegressor,'n_estimators': 400,6,855.729,22.191,7,1149.836,26.405,7,0.841,0.008,5,14.7
1,RandomForestRegressor,'n_estimators': 100,6,857.223,22.269,8,1150.637,25.689,8,0.841,0.008,4,3.7
0,DecisionTreeRegressor,,6,1001.308,23.47,9,1352.618,30.422,9,0.78,0.012,3,0.1
3,AdaBoostRegressor,'n_estimators': 100,6,1146.534,24.721,10,1402.989,21.895,10,0.764,0.008,2,1.1


## Best 3 models

In [22]:
# Second pass on the three strongest model families with larger ensembles and
# a repeated CV (6 folds x 5 repeats = 30 fits per configuration).
# trained_models is reset here and passed to cv_regression.
# NOTE(review): presumably cv_regression appends the fitted models to it - confirm in models.
trained_models = []
n_est_list = [400, 800, 1200]
models_and_params = [
    *[ (LGBMRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBRegressor,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostRegressor,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter with the global RS: without random_state every run draws
# different folds, so the reported means/stds would not be reproducible.
cv = RepeatedKFold(n_splits=6, n_repeats=5, random_state=RS)
metrics = [ mean_absolute_error, m.root_mean_squared_error, r2_score ]

# Same driver as the first run; results are displayed sorted by mean R2.
ret = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['R2_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
6,CatBoostRegressor,'n_estimators': 400,30,728.937,16.576,1,978.806,24.285,1,0.885,0.003,9,55.7
7,CatBoostRegressor,'n_estimators': 800,30,729.828,17.167,2,980.374,24.745,2,0.885,0.003,8,121.9
8,CatBoostRegressor,'n_estimators': 1200,30,730.126,16.743,3,981.005,24.697,3,0.884,0.003,7,178.5
0,LGBMRegressor,'n_estimators': 400,30,754.559,17.424,4,1012.153,24.582,4,0.877,0.003,6,9.9
1,LGBMRegressor,'n_estimators': 800,30,767.444,17.712,5,1027.41,23.863,5,0.873,0.003,5,22.1
2,LGBMRegressor,'n_estimators': 1200,30,776.708,17.907,6,1039.049,23.788,6,0.87,0.004,4,35.6
3,XGBRegressor,'n_estimators': 400,30,824.027,22.765,7,1108.142,31.996,7,0.853,0.007,3,61.5
4,XGBRegressor,'n_estimators': 800,30,867.729,23.781,8,1164.065,33.651,8,0.837,0.009,2,129.7
5,XGBRegressor,'n_estimators': 1200,30,892.704,24.756,9,1195.529,34.879,9,0.828,0.009,1,181.8


In [25]:
# Build a per-feature table from the models collected in trained_models: the
# output has one rank column per model plus a 'sum_rank' total.
# NOTE(review): "fe" presumably stands for feature importance/evaluation - confirm in models.
m.get_fe_df(trained_models)

Unnamed: 0,LGBMR_0_rank,LGBMR_1_rank,LGBMR_2_rank,XGBR_3_rank,XGBR_4_rank,XGBR_5_rank,CBR_6_rank,CBR_7_rank,CBR_8_rank,sum_rank
year,4,4,4,5,5,5,5,5,5,42
engineSize,3,3,3,4,4,4,4,4,4,33
mileage,5,5,5,1,1,1,3,3,3,27
fuelType,1,1,1,3,3,3,2,2,2,18
transmission,2,2,2,2,2,2,1,1,1,15
