# Setup and Imports

In [1]:
import sys
# Make the sibling Toolkit directory importable (path is relative to the working directory).
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

# autoreload mode 1: only modules registered with %aimport are reloaded before each cell runs.
%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one;
# cells below rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
# Display settings: never hide columns, allow long cell text, show at most 113 rows.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

RS = 35577 # global random state seed
# Data directories, relative to the working directory.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import time

In [3]:
# Load df_clean.pkl from the processed-data directory via the Toolkit helper,
# then show its dimensions and the first three rows.
df_data = t.from_pkl(f'{processed_data_path}/df_clean.pkl')

df_data.shape
df_data.iloc[:3]

(97323, 8)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,1.4


In [4]:
# Render the loaded frame; it has more rows than the configured max_rows,
# so pandas shows a truncated head/tail view.
df_data

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,1.4
3,Audi,A4,2017,16800,Automatic,25952,Diesel,2.0
4,Audi,A3,2019,17300,Manual,1998,Petrol,1.0
...,...,...,...,...,...,...,...,...
99153,Vw,California,2019,57991,Automatic,10288,Diesel,2.0
99154,Vw,California,2019,57991,Automatic,6739,Diesel,2.0
99155,Vw,California,2019,57991,Automatic,7259,Diesel,2.0
99156,Vw,California,2019,57991,Automatic,7486,Diesel,2.0


In [None]:
# Restrict the data to a single segment (manual, petrol Ford Fiesta), drop the
# columns that are constant within that segment, bucket mileage to the nearest
# 5,000 and keep only buckets up to 80,000.
df_fiesta = (
    df_data
    .loc[
        lambda cars: cars['make'].eq('Ford')
        & cars['model'].eq('Fiesta')
        & cars['transmission'].eq('Manual')
        & cars['fuelType'].eq('Petrol')
    ]
    .drop(columns=['make', 'model', 'transmission', 'fuelType'])
    # Replace raw mileage by its 5k bucket; the new column is appended last.
    .assign(mileage_5k=lambda cars: cars['mileage'].div(5000).round(0).mul(5000).astype(int))
    .drop(columns='mileage')
    .loc[lambda cars: cars['mileage_5k'] <= 80000]
)

df_fiesta.head(3)
df_fiesta.shape

In [None]:
def mad(s):
    """Return the mean absolute deviation of ``s`` around its own mean.

    ``s`` is expected to behave like a pandas Series (supports ``mean``,
    subtraction and ``abs``).
    """
    centred = s - s.mean()
    return centred.abs().mean()

In [None]:
# Price statistics per (year, engineSize, mileage bucket): group size, mean,
# standard deviation and mean absolute deviation.
# 'mean' / 'std' are passed as string aliases rather than np.mean / np.std:
# pandas >= 2.1 deprecates NumPy callables in agg and will stop mapping them to
# the groupby implementations (np.std would then use ddof=0 instead of ddof=1).
# The resulting column names ('len', 'mean', 'std', 'mad') are unchanged.
d = (
    df_fiesta
    .groupby(['year', 'engineSize', 'mileage_5k'])['price']
    .agg([len, 'mean', 'std', mad])
)

# Average within-group spread, shown as (label, value) pairs.
'std', d['std'].mean()
'mad', d['mad'].mean()

d.tail(20).style.format('{:.0f}')

# Encode

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [None]:
## transmission & fuelType

# NOTE(review): disabled cell -- `df_fiesta_mp` is not defined anywhere in the visible notebook.
# df_enc1 = df_fiesta_mp.copy()

# df_enc1['transmission'].value_counts()
# df_enc1['fuelType'].value_counts()

# t.ordinalEncode(df_enc1, ['transmission', 'fuelType'])

# df_enc1['transmission'].value_counts()
# df_enc1['fuelType'].value_counts()

## Prep X y

In [None]:
# Frame used for modelling below. This binds a second name to df_fiesta (no copy),
# so in-place changes to either would be visible through both.
df_train = df_fiesta

In [None]:
# Pair plot of df_train, with the figure resized to 15x15 inches.
_ = sns.pairplot(data= df_train).fig.set_size_inches(15,15)

In [None]:
# Quick check of the training frame: first rows and column dtypes.
df_train.head(3)
df_train.dtypes

In [None]:
# Features are every column except the target; the target is the price column.
X = df_train.drop(columns='price')
y = df_train.loc[:, 'price']

# Row counts of the two must agree.
X.shape
y.shape

# Train model

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVC

from lightgbm import LGBMRegressor

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

In [None]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae, r2_score as r2

def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    For a single target this is ``sqrt(MSE)``; with several targets the
    per-target RMSE values are averaged.

    The square root is taken explicitly instead of passing ``squared=False``,
    which scikit-learn deprecated in 1.4 and removed in 1.6.
    """
    per_output_mse = mse(y_true, y_pred, multioutput='raw_values')
    return np.sqrt(per_output_mse).mean()

In [None]:
# Presumably meant to collect results across experiments; none of the cells
# visible here append to it -- TODO confirm it is still needed.
all_results = []

In [None]:
# Baseline comparison of several regressors on (X, y) with 5-fold CV.
# Each entry is (estimator class, constructor kwargs).
n_est_list = [200]
models_and_params = [
    (LinearRegression, {}),
    (Lasso, {'random_state': RS}),
    # (Ridge, {'random_state': RS}),
    # Seeded like the other estimators: a decision tree permutes features at
    # every split, so ties between equally good splits are otherwise broken
    # differently from run to run.
    (DecisionTreeRegressor, {'random_state': RS}),
    # *[ (RandomForestRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    # *[ (AdaBoostRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in  n_est_list],
    *[ (LGBMRegressor,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
]
# NOTE(review): KFold without shuffle=True splits rows in their stored order;
# confirm the frame is not sorted in a way that biases the folds.
cv = KFold(5)
metrics = [ mae, rmse, r2]

# Toolkit helpers: grid_exec presumably builds each model from its
# (class, kwargs) pair and hands it to the first callable, calling
# on1Completed after each one -- verify against tools.grid_exec.
r = t.grid_exec(
    lambda model: m.cv_regression(model, cv, X, y, metrics),
    models_and_params,
    on1Completed = lambda r: m.display_stats(r)
)


In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Sweep the polynomial degree of the feature expansion and score a plain
# linear regression on each expanded feature set with repeated 5-fold CV.
n_est_list = [300]
models_and_params = [
    (LinearRegression, {}),
    # (Lasso, {'random_state': RS}),
    # (Ridge, {'random_state': RS}),
]
# Seed the repeated splitter with the global seed so the fold assignment, and
# therefore every reported score, is reproducible across runs.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ r2]

res = []
for i in range(15):
    # Keep X's row labels on the expanded frame: X comes from a filtered frame
    # whose index is not a 0..n-1 range, and y still carries those labels, so a
    # fresh RangeIndex here could misalign X_p and y in label-based code.
    X_p = pd.DataFrame(
        data=PolynomialFeatures(degree=i).fit_transform(X),
        index=X.index,
    )
    r = t.grid_exec(
        lambda model: m.cv_regression(model, cv, X_p, y, metrics),
        models_and_params,
        on1Completed = lambda r: m.display_stats(r)
    )
    res.extend(r)
    # Show the accumulated results after every degree.
    m.display_stats(res)