# Setup and Imports

In [1]:
# Make the local Toolkit directory importable. The membership test keeps the
# path from being appended again when this cell is re-run.
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

# Autoreload mode 1 reloads only the modules registered with %aimport, so edits
# to the two Toolkit modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

# Toolkit helpers, referenced as t.* and m.* throughout the notebook.
import tools as t
import models as m

# Display the value of every expression statement in a cell, not only the last
# one. Later cells list several bare expressions and rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

# pandas display limits: no column cap, long cell text, up to 113 rows.
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide constants: the shared random seed and the data directories
# (relative to the notebook's working directory).
RS = 35577 # global random state seed
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import time

In [3]:
# Load the cleaned dataset from the processed-data directory.
# NOTE(review): t.from_pkl is a Toolkit helper not shown here; presumably it
# unpickles a DataFrame - only load pickle files from a trusted source.
df_data = t.from_pkl(f'{processed_data_path}/df_clean.pkl')

# Quick look at the result: dimensions, then the first three rows.
df_data.shape
df_data.head(3)

(97449, 9)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,tax,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,150,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,20,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,30,1.4


# Encode

In [4]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

## transmission & fuelType

In [5]:
# Encode on a copy so that df_data keeps the original string categories.
df_enc1 = df_data.copy()

In [6]:
# Category frequencies before encoding.
df_enc1['transmission'].value_counts()
df_enc1['fuelType'].value_counts()

# Replace the two string columns with integer codes. The return value is not
# assigned and the counts below come back keyed by integers, so the helper
# modifies df_enc1 in place.
# NOTE(review): integer codes imply an order for what look like nominal
# categories - confirm this is acceptable for the linear models trained later.
m.ordinalEncode(df_enc1, ['transmission', 'fuelType'])

# Same counts after encoding, to check each category kept its frequency.
df_enc1['transmission'].value_counts()
df_enc1['fuelType'].value_counts()

Manual       55469
Semi-Auto    22295
Automatic    19685
Name: transmission, dtype: int64

Petrol    53976
Diesel    40416
Hybrid     3057
Name: fuelType, dtype: int64

1    55469
2    22295
0    19685
Name: transmission, dtype: int64

2    53976
0    40416
1     3057
Name: fuelType, dtype: int64

## make & model

In [7]:
# Restrict the data to Audi listings; 'make' is constant afterwards, so drop it.
df_audi = df_enc1.loc[df_enc1['make'] == 'Audi'].drop(columns='make').copy()

# Listings per model before filtering.
model_vc = df_audi['model'].value_counts()
model_vc

# Keep only models with more than MIN_MODEL_COUNT listings, so that every
# retained model has enough rows to be worth its own indicator column.
MIN_MODEL_COUNT = 10
models_to_keep = model_vc[model_vc > MIN_MODEL_COUNT].index

# Take an explicit copy of the filtered rows: this frame is modified in place
# by the one-hot encoding step, and writing into a bare .loc[mask] selection
# can trigger pandas' SettingWithCopyWarning.
df_audi = df_audi.loc[df_audi['model'].isin(models_to_keep)].copy()
df_audi['model'].value_counts()

A3     1917
Q3     1379
A4     1375
A1     1337
A5      874
Q5      864
Q2      810
A6      748
Q7      394
TT      336
A7      122
A8      118
Q8       69
RS6      39
RS3      33
RS4      31
RS5      28
R8       28
S3       18
SQ5      16
S4       12
SQ7       8
S8        4
S5        3
A2        1
RS7       1
Name: model, dtype: int64

A3     1917
Q3     1379
A4     1375
A1     1337
A5      874
Q5      864
Q2      810
A6      748
Q7      394
TT      336
A7      122
A8      118
Q8       69
RS6      39
RS3      33
RS4      31
RS5      28
R8       28
S3       18
SQ5      16
S4       12
Name: model, dtype: int64

In [8]:
# One-hot encode 'model'. The return value is not assigned; the frame shown in
# the following cells has model__* indicator columns and no 'model' column, so
# the helper modifies df_audi in place.
# NOTE(review): presumably this cannot be re-run on its own once 'model' has
# been replaced - re-run the filtering cell first.
m.oh_encode(df_audi, ['model'])

In [9]:
# Training frame. This binds a second name to the same object as df_audi;
# no copy is made.
df_train = df_audi

In [10]:
# Check the encoded frame: first rows, then column dtypes (all numeric after
# the ordinal and one-hot encoding steps).
df_train.head(3)
df_train.dtypes

Unnamed: 0,year,price,transmission,mileage,fuelType,tax,engineSize,model__A1,model__A6,model__A4,model__A3,model__Q3,model__Q5,model__A5,model__S4,model__Q2,model__A7,model__TT,model__Q7,model__RS6,model__RS3,model__A8,model__Q8,model__RS4,model__RS5,model__R8,model__SQ5,model__S3
0,2017,12500,1,15735,2,150,1.4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2016,16500,0,36203,0,20,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2016,11000,1,29946,2,30,1.4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


year              int64
price             int64
transmission      int64
mileage           int64
fuelType          int64
tax               int64
engineSize      float64
model__A1         int64
model__A6         int64
model__A4         int64
model__A3         int64
model__Q3         int64
model__Q5         int64
model__A5         int64
model__S4         int64
model__Q2         int64
model__A7         int64
model__TT         int64
model__Q7         int64
model__RS6        int64
model__RS3        int64
model__A8         int64
model__Q8         int64
model__RS4        int64
model__RS5        int64
model__R8         int64
model__SQ5        int64
model__S3         int64
dtype: object

In [11]:
# Separate the regression target (price) from the predictor columns.
y = df_train['price']
X = df_train.drop(columns='price')

In [12]:
# Sanity check: X and y should have the same number of rows.
X.shape
y.shape

(10548, 27)

(10548,)

# Train model

In [13]:
from lightgbm import LGBMRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVC

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

In [14]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae, r2_score as r2

def RMSE(y_true, y_pred):
    """Root-mean-squared error between true and predicted targets.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6, so the old call raises ``TypeError`` on current
    versions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, same shape as ``y_true``.

    Returns
    -------
    float
        The RMSE. For multi-output targets the root is taken per output and
        the results are averaged uniformly, matching what ``squared=False``
        used to return.
    """
    # 'raw_values' yields one MSE per output column (a length-1 array for a
    # 1-D target), so the root can be applied before averaging.
    per_output_mse = mse(y_true, y_pred, multioutput='raw_values')
    return (per_output_mse ** 0.5).mean()

In [15]:
# Accumulator for experiment results.
# NOTE(review): nothing in the visible cells appends to or reads this list -
# confirm it is still needed.
all_results = []

In [19]:
# Ensemble sizes to evaluate; every ensemble below is listed once per entry.
n_est_list = [200]

# (estimator class, keyword-argument dict) pairs handed to t.grid_exec.
models_and_params = [
    (LinearRegression, {}),
    (Lasso, {'random_state': RS}),
    (Ridge, {'random_state': RS}),
    # Seeded like the other estimators: an unseeded tree can break ties between
    # equally good splits differently on each run, so its scores would not be
    # reproducible.
    (DecisionTreeRegressor, {'random_state': RS}),
    # Order is unchanged: all RandomForest sizes, then AdaBoost, then LightGBM.
    *[
        (ensemble, {'random_state': RS, 'n_estimators': n_e})
        for ensemble in (RandomForestRegressor, AdaBoostRegressor, LGBMRegressor)
        for n_e in n_est_list
    ],
]

# Five contiguous folds; KFold does not shuffle unless asked to.
# NOTE(review): if the rows carry any ordering (e.g. by model or listing date),
# consider KFold(5, shuffle=True, random_state=RS) - the reported scores would
# change, so decide deliberately.
cv = KFold(5)
metrics = [mae, RMSE, r2]

# Cross-validate every configuration with the same folds and metrics.
# NOTE(review): on1Completed is presumably invoked by t.grid_exec after each
# configuration finishes - confirm in the Toolkit.
r = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics),
    models_and_params,
    on1Completed=lambda stats: m.display_stats(stats),
)


Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,time
0,LinearRegression,,5,2938,241,4534,553,0.837,0.036,0.1
1,Lasso,,5,2935,239,4527,547,0.838,0.036,1.1
2,Ridge,,5,2936,239,4530,542,0.838,0.035,0.1
3,DecisionTreeRegressor,,5,2381,235,3581,419,0.899,0.02,0.2
4,RandomForestRegressor,'n_estimators': 200,5,1966,207,3001,367,0.929,0.015,16.1
5,AdaBoostRegressor,'n_estimators': 200,5,5794,543,6745,511,0.636,0.078,6.7
6,LGBMRegressor,'n_estimators': 200,5,1834,191,2802,330,0.938,0.013,0.9
