# Setup and Imports

In [1]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

# Seed handed to estimators as `random_state` so repeated runs give the same results.
RS = 35577 # global random state seed
# Data directories, given as relative paths.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import time

In [30]:
# Load the cleaned dataset pickle from the processed-data directory.
df_data = t.from_pkl('/'.join((processed_data_path, 'df_clean.pkl')))

# Quick look: dimensions first, then the first three rows.
df_data.shape
df_data.head(3)

(97323, 8)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,1.4


In [31]:
# Collapse make and model into a single 'car' key (e.g. 'Ford__Fiesta'),
# stored as the first column, and discard the two source columns.
car_key = df_data['make'] + '__' + df_data['model']
df_data = df_data.drop(columns=['make', 'model'])
df_data.insert(0, 'car', car_key)

In [32]:
# Ten most frequent cars, ranked by number of listings.
listings_per_car = df_data.groupby(['car']).size()
listings_per_car.sort_values(ascending=False).head(10)

car
Ford__Fiesta       6508
Vw__Golf           4769
Ford__Focus        4555
Merc__C Class      3689
Vauxhall__Corsa    3285
Vw__Polo           3230
Vauxhall__Astra    2655
Merc__A Class      2474
Bmw__3 Series      2413
Ford__Kuga         2208
dtype: int64

In [37]:
# Keep only Ford Fiesta listings; 'car' is constant after filtering, so it is dropped.
is_fiesta = df_data['car'] == 'Ford__Fiesta'
df_fiesta = df_data.loc[is_fiesta].drop(columns=['car']).copy()
df_fiesta.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,Automatic,15944,Petrol,1.0
21452,2019,17500,Manual,10460,Petrol,1.5
21453,2019,16500,Automatic,1482,Petrol,1.0


In [52]:
# Bucket mileage to the nearest 5,000 and measure how much price still varies
# among listings that agree on every other attribute.
mileage_5k = ((df_fiesta['mileage'] / 5000).round(0) * 5000).astype(int)
r = (
    df_fiesta
    .drop(columns='mileage')
    .groupby(['year', 'transmission', mileage_5k, 'fuelType', 'engineSize'])
    # 'mean'/'std' as strings select pandas' own groupby reductions (sample std, ddof=1).
    # Passing np.mean/np.std is deprecated since pandas 2.1: once the NumPy callables are
    # used directly, std would silently switch to ddof=0.
    .agg([len, 'mean', 'std', m.mad])
)
# Only 'price' is left to aggregate, so keep just the statistic names as column labels.
r.columns = [c[1] for c in r.columns]

'mean group size: ', r['len'].mean()
'price std in group: ', r['std'].mean()

('mean group size: ', 10.446227929373997)

('price std in group: ', 701.2696209252479)

In [79]:
# Same within-group price spread check as with 5,000 buckets, but with mileage
# bucketed to the nearest 10,000.
mileage_10k = ((df_fiesta['mileage'] / 10000).round(0) * 10000).astype(int)
r = (
    df_fiesta
    .drop(columns='mileage')
    .groupby(['year', 'transmission', mileage_10k, 'fuelType', 'engineSize'])
    # 'mean'/'std' as strings select pandas' own groupby reductions (sample std, ddof=1).
    # Passing np.mean/np.std is deprecated since pandas 2.1: once the NumPy callables are
    # used directly, std would silently switch to ddof=0.
    .agg([len, 'mean', 'std', m.mad])
)
# Only 'price' is left to aggregate, so keep just the statistic names as column labels.
r.columns = [c[1] for c in r.columns]

'mean group size: ', r['len'].mean()
# This line reports the 'mad' column; it was previously labelled as std.
'price mad in group: ', r['mad'].mean()
'price std in group: ', r['std'].mean()

('mean group size: ', 15.349056603773585)

('price std in group: ', 343.64691608968064)

('price std in group: ', 729.4423273421423)

# Encode

In [67]:
# Replace the two categorical columns with integer codes.
# NOTE(review): the return value is not captured, so m.ordinalEncode presumably
# modifies df_fiesta in place — confirm. If so, earlier cells that read the string
# categories need df_fiesta rebuilt before they can be re-run.
m.ordinalEncode(df_fiesta, ['transmission', 'fuelType'])
df_fiesta

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,0,15944,1,1.0
21452,2019,17500,1,10460,1,1.5
21453,2019,16500,0,1482,1,1.0
21454,2015,10500,1,35432,1,1.6
21456,2017,9000,1,13054,1,1.2
...,...,...,...,...,...,...
39394,2019,15999,1,2813,1,1.0
39395,2015,8999,1,24546,1,1.0
39401,2017,9899,0,16303,1,1.0
39404,2018,12500,0,7047,1,1.0


# Features (X) and target (y)

In [68]:
# Training frame used by the modelling cells. This binds a second name to the
# same DataFrame object; no copy is made.
df_train = df_fiesta
df_train.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,0,15944,1,1.0
21452,2019,17500,1,10460,1,1.5
21453,2019,16500,0,1482,1,1.0


In [69]:
# Split the training frame into the target (price) and the feature matrix.
y = df_train['price']
X = df_train.drop(columns='price')

# Shapes, feature names and a two-row preview.
y.shape
X.shape
X.columns
X.head(2)


(6508,)

(6508, 5)

Index(['year', 'transmission', 'mileage', 'fuelType', 'engineSize'], dtype='object')

Unnamed: 0,year,transmission,mileage,fuelType,engineSize
21449,2017,0,15944,1,1.0
21452,2019,1,10460,1,1.5


# Train model

In [99]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# from sklearn.svm import SVC

# from lightgbm import LGBMRegressor
# from xgboost import XGBRegressor
# from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

## Linear Models

In [148]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import minmax_scale, normalize, scale, PolynomialFeatures

In [185]:
def booger_aids(X, y):
    """Cross-validate the linear baseline model(s) on the given data.

    Each (estimator class, constructor kwargs) candidate is handed to
    ``t.grid_exec`` together with a callback that runs ``m.cv_regression``
    with a 5-fold ``KFold`` splitter and the MAE, RMSE and R2 metrics.

    Args:
        X: feature matrix forwarded to ``m.cv_regression``.
        y: target values forwarded to ``m.cv_regression``.

    Returns:
        Whatever ``t.grid_exec`` returns; the notebook passes it to
        ``m.display_stats`` and extends result lists with it.
    """
    folds = KFold(5)
    scorers = [mean_absolute_error, m.root_mean_squared_error, r2_score]
    candidates = [
        (LinearRegression, {}),
        # (Lasso, {'random_state': RS}),
        # (Ridge, {'random_state': RS}),
    ]

    def run_cv(model):
        # Same splitter, data and metrics for every candidate.
        return m.cv_regression(model, folds, X, y, scorers)

    return t.grid_exec(run_cv, candidates)

## Raw

In [186]:
# Baseline: features used exactly as stored in the training frame.
y = df_train['price']
X = df_train.drop(columns='price')
r = booger_aids(X, y)
m.display_stats(r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,1269.885,120.543,1698.917,203.566,0.627,0.037,0.0


## Standard Scaled

In [187]:
# Standardise every feature column with sklearn's scale() before fitting.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = scale(X)
r = booger_aids(X, y)
m.display_stats(r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,1269.885,120.543,1698.917,203.566,0.627,0.037,0.0


## Normalized (each row scaled to unit L2 norm)

In [188]:
# normalize() with default arguments rescales each row (sample) to unit L2 norm,
# unlike the column-wise scalers used in the neighbouring cells.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = normalize(X)
r = booger_aids(X, y)
m.display_stats(r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,1686.642,134.771,2169.482,204.145,0.392,0.021,0.0


## Min Max

In [189]:
# Rescale every feature column with sklearn's minmax_scale() before fitting.
y = df_train['price']
X = df_train.drop(columns='price')
X[X.columns] = minmax_scale(X)
r = booger_aids(X, y)
m.display_stats(r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,1269.885,120.543,1698.917,203.566,0.627,0.037,0.0


## Polynomial features

In [207]:
# Polynomial-degree sweep (0-6) on the unscaled features: gather the CV results
# of every degree and show them in one table.
all_r = []
for degree in range(7):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    r = booger_aids(m.polynomialFeatures(X, degree), y)
    all_r += r

m.display_stats(all_r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,2239,297,2919,375,0,0,0.0
1,LinearRegression,,5,1270,121,1699,204,1,0,0.0
2,LinearRegression,,5,1181,240,2146,1522,0,1,0.0
3,LinearRegression,,5,1644,960,3426,3578,-2,5,0.1
4,LinearRegression,,5,1141,208,2194,1709,0,1,0.2
5,LinearRegression,,5,1263,344,2214,1586,0,1,0.3
6,LinearRegression,,5,1824,1140,5713,8047,-10,22,0.9


In [205]:
# Polynomial-degree sweep (0-6) on standardised features: gather the CV results
# of every degree and show them in one table.
all_r = []
for degree in range(7):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = scale(X)
    r = booger_aids(m.polynomialFeatures(X, degree), y)
    all_r += r

m.display_stats(all_r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,2239,297,2919,375,-0,0,0.0
1,LinearRegression,,5,1270,121,1699,204,1,0,0.0
2,LinearRegression,,5,1181,240,2146,1522,0,1,0.0
3,LinearRegression,,5,1919,2146,9718,17214,-45,91,0.1
4,LinearRegression,,5,6301644411,12603287292,127195250790,254390499551,-9466413776816030,18932827553632060,0.2
5,LinearRegression,,5,75362562040,150725122555,1437048288940,2874096575803,-1208332399438686464,2416664798877372928,0.3
6,LinearRegression,,5,316918391633,633836772810,6087968834675,12175937350872,-21686452738463879168,43372905476927741952,0.9


In [208]:
# Polynomial-degree sweep (0-6) on row-normalised features: gather the CV results
# of every degree and show them in one table.
all_r = []
for degree in range(7):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = normalize(X)
    r = booger_aids(m.polynomialFeatures(X, degree), y)
    all_r += r

m.display_stats(all_r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,2239,297,2919,375,0,0,0.0
1,LinearRegression,,5,1687,135,2169,204,0,0,0.0
2,LinearRegression,,5,1499,118,1957,221,1,0,0.0
3,LinearRegression,,5,1523,606,5993,7436,-10,20,0.1
4,LinearRegression,,5,1692,1116,12999,16488,-52,94,0.2
5,LinearRegression,,5,2028,1254,18709,15837,-79,90,0.4
6,LinearRegression,,5,3833,2812,59776,48761,-800,766,0.9


In [None]:
# Polynomial-degree sweep on min-max scaled features: gather the CV results of
# every degree and show them in one table.
# NOTE(review): this sweep covers degrees 1-19, whereas the other polynomial cells
# cover 0-6; high degrees presumably produce a very wide feature matrix — confirm
# the range is intended.
all_r = []
for degree in range(1, 20):
    y = df_train['price'].copy()
    X = df_train.drop(columns='price').copy()
    X[X.columns] = minmax_scale(X)
    r = booger_aids(m.polynomialFeatures(X, degree), y)
    all_r += r

m.display_stats(all_r, False)

## Trees

In [None]:
# Estimators compared in this cell. They are imported here because no active import
# in the notebook defines them; without these lines the cell fails with NameError.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Rebuild X, y from the training frame so the cell does not depend on whichever
# (possibly scaled) X an earlier cell happened to leave behind.
X, y = df_train.drop(columns='price'), df_train['price']

# Ensemble sizes to try; every ensemble below is built once per entry.
n_est_list = [100]
models_and_params = [
    # (LinearRegression, {}),
    # (Lasso, {'random_state': RS}),
    # (Ridge, {'random_state': RS}),
    (DecisionTreeRegressor, {}),
    *[ (RandomForestRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostRegressor,             {'random_state': RS, 'n_estimators': n_e}) for n_e in  n_est_list],
    *[ (LGBMRegressor,                 {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBRegressor,                  {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostRegressor,             {'random_state': RS, 'n_estimators': n_e, 'silent': True, 'allow_writing_files': False}) for n_e in n_est_list ],
]
# Same 5-fold split and metrics as the linear-model runs, so results are comparable.
cv = KFold(5)
metrics = [ mean_absolute_error, m.root_mean_squared_error, r2_score]

# on1Completed is given a callback that displays the stats it receives.
r = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)
