# Setup and Imports

In [1]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

# Third-party imports used throughout the notebook.
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import clear_output

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display every bare expression in a cell, not just the last one; many cells
# below rely on this to show several values at once.
InteractiveShell.ast_node_interactivity = 'all'

# pandas display limits: show all columns, long cell contents, up to 113 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 999)
pd.set_option('display.max_rows', 113)

# Global random-state seed passed to the estimators that accept one.
RS = 35577
# Data directories, relative to the notebook.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
import time

In [3]:
# Load the cleaned listings table produced by an earlier processing step.
# NOTE(review): `t.from_pkl` presumably unpickles the file -- only load pickles
# that come from a trusted source.
df_data = t.from_pkl(f'{processed_data_path}/df_clean.pkl')

# Fail fast if the file no longer has the columns the cells below index by name.
expected_columns = {'make', 'model', 'year', 'price', 'transmission',
                    'mileage', 'fuelType', 'engineSize'}
missing_columns = expected_columns - set(df_data.columns)
assert not missing_columns, f'df_clean.pkl is missing columns: {sorted(missing_columns)}'

df_data.shape
df_data.head(3)

(97323, 8)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,engineSize
0,Audi,A1,2017,12500,Manual,15735,Petrol,1.4
1,Audi,A6,2016,16500,Automatic,36203,Diesel,2.0
2,Audi,A1,2016,11000,Manual,29946,Petrol,1.4


In [4]:
# Collapse `make` and `model` into a single `car` key ("<make>__<model>") placed
# in the first column.
# The guard keeps the cell idempotent: once `make`/`model` have been dropped, a
# second run would otherwise raise a KeyError. The key is inserted into the
# frame returned by `drop`, so the originally loaded object is not mutated.
if 'car' not in df_data.columns:
    car_key = df_data['make'] + '__' + df_data['model']
    df_data = df_data.drop(columns=['make', 'model'])
    df_data.insert(0, 'car', car_key)

In [5]:
# Number of listings per car; show the ten most frequent, largest first.
listings_per_car = df_data.groupby(['car']).size()
listings_per_car.sort_values(ascending=False).head(10)

car
Ford__Fiesta       6508
Vw__Golf           4769
Ford__Focus        4555
Merc__C Class      3689
Vauxhall__Corsa    3285
Vw__Polo           3230
Vauxhall__Astra    2655
Merc__A Class      2474
Bmw__3 Series      2413
Ford__Kuga         2208
dtype: int64

In [6]:
# Restrict to Ford Fiesta listings. `car` is constant within the subset, so it
# is dropped; the explicit copy gives an independent frame to work on.
is_fiesta = df_data['car'] == 'Ford__Fiesta'
df_fiesta = df_data.loc[is_fiesta].drop(columns=['car']).copy()
df_fiesta.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,Automatic,15944,Petrol,1.0
21452,2019,17500,Manual,10460,Petrol,1.5
21453,2019,16500,Automatic,1482,Petrol,1.0


In [7]:
# Bucket mileage to the nearest 5,000 so near-identical cars fall into one group.
mileage_5k = ((df_fiesta['mileage'] / 5000).round(0) * 5000).astype(int)

# Price statistics per (year, transmission, mileage bucket, fuelType, engineSize).
# 'mean' / 'std' are passed as string aliases instead of np.mean / np.std:
# handing numpy callables to groupby.agg is deprecated (pandas >= 2.1) and will
# later call the function directly, which would silently change the std from
# the sample (ddof=1) to the population (ddof=0) estimate. The aliases keep
# today's results and the same 'mean' / 'std' column labels.
df_fiesta_grp = (
    df_fiesta
    .drop(columns='mileage')
    .groupby(['year', 'transmission', mileage_5k, 'fuelType', 'engineSize'])
    .agg([len, 'mean', 'std', m.mad])
)
# Only `price` is left to aggregate, so drop the outer column level and keep the
# statistic names ('len', 'mean', 'std', 'mad').
df_fiesta_grp.columns = [c[1] for c in df_fiesta_grp.columns]

# Averages over all groups: group size and within-group price spread.
'mean group size: ', df_fiesta_grp['len'].mean()
'price mad in 5k groups: ', df_fiesta_grp['mad'].mean()
'price std in 5k groups: ', df_fiesta_grp['std'].mean()

('mean group size: ', 10.446227929373997)

('price mad in 5k groups: ', 323.66491789552254)

('price std in 5k groups: ', 701.2696209252479)

In [8]:
# Same analysis with coarser buckets: mileage rounded to the nearest 10,000.
mileage_10k = ((df_fiesta['mileage'] / 10000).round(0) * 10000).astype(int)

# Price statistics per (year, transmission, mileage bucket, fuelType, engineSize).
# 'mean' / 'std' are passed as string aliases instead of np.mean / np.std:
# handing numpy callables to groupby.agg is deprecated (pandas >= 2.1) and will
# later call the function directly, which would silently change the std from
# the sample (ddof=1) to the population (ddof=0) estimate. The aliases keep
# today's results and the same 'mean' / 'std' column labels.
df_fiesta_grp = (
    df_fiesta
    .drop(columns='mileage')
    .groupby(['year', 'transmission', mileage_10k, 'fuelType', 'engineSize'])
    .agg([len, 'mean', 'std', m.mad])
)
# Only `price` is left to aggregate, so drop the outer column level and keep the
# statistic names ('len', 'mean', 'std', 'mad').
df_fiesta_grp.columns = [c[1] for c in df_fiesta_grp.columns]

# Averages over all groups: group size and within-group price spread.
'mean group size: ', df_fiesta_grp['len'].mean()
'price mad in 10k groups: ', df_fiesta_grp['mad'].mean()
'price std in 10k groups: ', df_fiesta_grp['std'].mean()

('mean group size: ', 15.349056603773585)

('price mad in 10k groups: ', 343.64691608968064)

('price std in 10k groups: ', 729.4423273421423)

# Encode

In [9]:
# Text-valued columns to turn into integer codes.
categorical_columns = ['transmission', 'fuelType']
# Called for its effect on `df_fiesta`; the result is not assigned, and the
# frame shown below holds integers in these columns afterwards.
m.ordinalEncode(df_fiesta, categorical_columns)
df_fiesta

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,0,15944,1,1.0
21452,2019,17500,1,10460,1,1.5
21453,2019,16500,0,1482,1,1.0
21454,2015,10500,1,35432,1,1.6
21456,2017,9000,1,13054,1,1.2
...,...,...,...,...,...,...
39394,2019,15999,1,2813,1,1.0
39395,2015,8999,1,24546,1,1.0
39401,2017,9899,0,16303,1,1.0
39404,2018,12500,0,7047,1,1.0


## Min-max scaling

# Features (X) and target (y)

In [17]:
from sklearn.preprocessing import minmax_scale, scale

# Reference view of the encoded but unscaled rows.
# (The former `df_train = df_fiesta` alias was removed: it was overwritten right
# below, and if the scaling step failed it left `df_train` pointing at the raw
# frame for every later cell.)
df_fiesta.head(3)

# Min-max scale every column, then standardize the result.
# - The row index is not carried over: df_train is indexed 0..n-1.
# - `price` (the target) is scaled as well, so MAE / RMSE reported by later
#   cells are in standardized units, not in currency.
# NOTE(review): standardizing after min-max scaling should equal `scale` alone
# up to floating point -- consider keeping only one of the two.
df_train = pd.DataFrame(data=scale(minmax_scale(df_fiesta)), columns=df_fiesta.columns)

df_train.head(3)

Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
21449,2017,12000,0,15944,1,1.0
21452,2019,17500,1,10460,1,1.5
21453,2019,16500,0,1482,1,1.0


Unnamed: 0,year,price,transmission,mileage,fuelType,engineSize
0,0.102,0.625971,-3.791209,-0.354737,0.252824,-0.527712
1,1.110838,2.529589,-0.056243,-0.692438,0.252824,2.165051
2,1.110838,2.183476,-3.791209,-1.245298,0.252824,-0.527712


In [18]:
# Target is the `price` column; the features are every remaining column.
y = df_train['price']
X = df_train.drop(columns='price')

# Sanity check of the split: sizes, feature names and a peek at the features.
y.shape
X.shape
X.columns
X.head(2)


(6508,)

(6508, 5)

Index(['year', 'transmission', 'mileage', 'fuelType', 'engineSize'], dtype='object')

Unnamed: 0,year,transmission,mileage,fuelType,engineSize
0,0.102,-3.791209,-0.354737,0.252824,-0.527712
1,1.110838,-0.056243,-0.692438,0.252824,2.165051


# Train model

In [19]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

## Linear Models

In [20]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import minmax_scale, normalize, scale, PolynomialFeatures

In [21]:
# Estimator classes paired with their constructor keyword arguments.
# Lasso and Ridge only receive the seed, so they run with their default
# regularization strength.
models_and_params = [
    (LinearRegression, {}),
    (Lasso, {'random_state': RS}),
    (Ridge, {'random_state': RS}),
]
# 5-fold cross-validation with KFold's defaults.
# NOTE(review): the folds are not shuffled here -- if the rows are ordered,
# consider KFold(5, shuffle=True, random_state=RS).
cv = KFold(5)
# Metrics evaluated for every model, in this order.
metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]


def _cross_validate(model):
    """Cross-validate one model on X, y with the cell's cv splitter and metrics."""
    return m.cv_regression(model, cv, X, y, metrics)


def _show_run(results_1_run):
    """Display the stats of one completed run; metric index 2 (R2) is ranked in reverse."""
    return m.display_stats(results_1_run, reverse_rank_idx=[2])


_ = t.grid_exec(
    _cross_validate,
    models_and_params,
    on1Completed=_show_run,
)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,LinearRegression,,5,0.44,0.042,2,0.588,0.07,2,0.627,0.037,2,0.0
1,Lasso,,5,0.775,0.103,3,1.01,0.13,3,-0.096,0.097,3,0.0
2,Ridge,,5,0.44,0.042,1,0.588,0.07,1,0.628,0.037,1,0.1


In [23]:
def cv_linear_regression(X, y):
    """Cross-validate a plain LinearRegression on the given features and target.

    Runs `t.grid_exec` over a single (LinearRegression, {}) entry, scoring each
    run through `m.cv_regression` with a 5-fold KFold splitter and three
    metrics: mean absolute error, root mean squared error and R2.

    Parameters
    ----------
    X : feature table, forwarded unchanged to `m.cv_regression`.
    y : target values, forwarded unchanged to `m.cv_regression`.

    Returns
    -------
    Whatever `t.grid_exec` returns (the calling cell extends a list with it,
    so presumably a sequence of per-run results).
    """
    models_and_params = [
        (LinearRegression, {}),
    ]
    cv = KFold(5)
    metrics = [mean_absolute_error, m.root_mean_squared_error, r2_score]

    return t.grid_exec(
        lambda model: m.cv_regression(model, cv, X, y, metrics),
        models_and_params,
    )


# Backward-compatible alias: existing cells call the helper by its original name.
booger_aids = cv_linear_regression

### Polynomial features

In [24]:
# Cross-validate LinearRegression on `m.polynomialFeatures(X, k)` for k = 0..6
# (k is presumably the polynomial degree) and pool the results of every run.
all_r = []
for degree in range(7):
    X_poly = m.polynomialFeatures(X, degree)
    all_r += booger_aids(X_poly, y)

# NOTE(review): unlike the linear-model cell, no `reverse_rank_idx` is passed
# here, and the R2 rank column (#3) in the recorded output looks inverted
# (best R2 ranked last) -- confirm against `m.display_stats`.
m.display_stats(all_r, False)

Unnamed: 0,model,params,n_folds,MAE_mean,MAE_std,#1,RMSE_mean,RMSE_std,#2,R2_mean,R2_std,#3,time
0,LinearRegression,,5,1,0,4,1,0,3,0,0,5,0.0
1,LinearRegression,,5,0,0,2,1,0,1,1,0,7,0.0
2,LinearRegression,,5,0,0,1,1,1,2,0,1,6,0.1
3,LinearRegression,,5,1,1,3,3,6,4,-45,91,4,0.1
4,LinearRegression,,5,3816959,7633918,6,77072728,154145456,6,-29014220334073184,58028440668146368,2,0.2
5,LinearRegression,,5,1814411,3628821,5,34650464,69300927,5,-5864453341730582,11728906683461164,3,0.4
6,LinearRegression,,5,12732875,25465745,7,234194769,468389378,7,-267894369909188800,535788739818336512,1,1.0


In [None]:
# Reference values copied by hand from the outputs of the 5k / 10k mileage-bucket
# cells earlier in the notebook: average within-group price spread, in the
# original price units. They are hard-coded, so they go stale if those cells
# are re-run on different data.
('price mad in 5k groups: ', 323.66491789552254)
('price std in 5k groups: ', 701.2696209252479)

('price mad in 10k groups: ', 343.64691608968064)
('price std in 10k groups: ', 729.4423273421423)