In [0]:
# !pip install --upgrade tables
# !pip install eli5
# !pip install xgboost

In [0]:
import pandas as pd
import numpy as np
import eli5
import xgboost as xgb

from eli5.sklearn import PermutationImportance
from sklearn.metrics import mean_absolute_error as mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car"

/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car


In [12]:
# Load the dataset from the HDF5 file (path is relative to the working directory).
car_data_path = 'data/car.h5'
df = pd.read_hdf(car_data_path)
df.shape

(106494, 155)

## Feature Engineering

In [0]:
SUFFIX_CAT = '__cat'

# Integer-encode every column with pandas' factorize.
# Columns already carrying the suffix are overwritten in place; all others get
# a new sibling column named <original>__cat, leaving the original untouched.
for feat in df.columns:
  # Skip columns whose values are lists (probed on the first row only).
  # `.iloc[0]` is positional, so the probe works for any index; the previous
  # `df[feat][0]` was a label lookup that requires a row labelled 0.
  # The length guard keeps an empty frame from raising on the probe.
  if len(df) > 0 and isinstance(df[feat].iloc[0], list): continue

  factorized_value = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_value
  else:
    df[feat + SUFFIX_CAT] = factorized_value

In [15]:
# Candidate model inputs: every column carrying the factorized suffix, except
# those with 'price' in the name (presumably to keep the target out of the
# features -- the target used for training is 'price_value').
cat_feats = [
    col for col in df.columns
    if SUFFIX_CAT in col and 'price' not in col
]
len(cat_feats)

151

## Function  

In [0]:
def run_model(model, feats, cv=3, scoring='neg_mean_absolute_error'):
  """Cross-validate `model` on selected columns of the notebook-global `df`.

  Parameters
  ----------
  model : estimator passed unchanged to `cross_val_score`.
  feats : list of column names in `df` that form the feature matrix.
  cv : cross-validation spec forwarded to `cross_val_score` (default 3,
       the value that used to be hard-coded).
  scoring : scorer name forwarded to `cross_val_score`
       (default 'neg_mean_absolute_error', the previous hard-coded value).

  Returns
  -------
  tuple (mean, std) of the per-fold scores. With the default scorer the
  scores are negated mean absolute errors, so values closer to zero are better.
  """
  X = df[feats].values
  y = df['price_value'].values  # the target is always the 'price_value' column

  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return np.mean(scores), np.std(scores)

### DecisionTree

In [21]:
# Seed the tree so repeated runs give the same score: ties between equally
# good splits are otherwise broken randomly. This matches the seeded
# RandomForest and XGBoost runs in this notebook.
run_model(DecisionTreeRegressor(max_depth=5, random_state=0), cat_feats)

(-19695.13091100928, 148.72570644015792)

### Random Forest

In [22]:
# Random forest settings; the fixed random_state keeps the run repeatable.
rf_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'random_state': 0,
}
model = RandomForestRegressor(**rf_params)
run_model(model, cat_feats)

(-18718.657185256638, 64.5424578125788)

### XGBoost

In [53]:
# XGBoost settings shared by every XGBRegressor built in this notebook.
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    # `random_state` is the seed argument of the scikit-learn wrapper;
    # it replaces the deprecated `seed` keyword (same value, 0).
    'random_state': 0
}

run_model(xgb.XGBRegressor(**xgb_params), cat_feats)



(-13108.379065811214, 74.32158265003798)

In [26]:
# Build the training arrays explicitly. X and y are only local variables of
# run_model, so without these two lines the cell raises NameError on a fresh
# kernel. The columns are cat_feats, matching the feature_names passed below.
X = df[cat_feats].values
y = df['price_value'].values

m = xgb.XGBRegressor(**xgb_params)
m.fit(X, y)

# Permutation importance of each feature for the fitted model, shown as a
# weight table labelled with the cat_feats column names.
imp = PermutationImportance(m, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1295  ± 0.0023,param_napęd__cat
0.0920  ± 0.0009,param_stan__cat
0.0682  ± 0.0007,param_faktura-vat__cat
0.0570  ± 0.0016,param_rok-produkcji__cat
0.0562  ± 0.0018,param_skrzynia-biegów__cat
0.0389  ± 0.0006,param_moc__cat
0.0214  ± 0.0011,feature_kamera-cofania__cat
0.0207  ± 0.0009,param_typ__cat
0.0179  ± 0.0009,seller_name__cat
0.0132  ± 0.0005,feature_światła-led__cat


In [54]:
# Hand-picked subset of 20 factorized features; the first ten are the ones
# listed in the permutation-importance table above.
feats = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_faktura-vat__cat',
    'param_rok-produkcji__cat',
    'param_skrzynia-biegów__cat',
    'param_moc__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'seller_name__cat',
    'feature_światła-led__cat',
    'feature_wspomaganie-kierownicy__cat',
    'feature_system-start-stop__cat',
    'param_pojemność-skokowa__cat',
    'feature_regulowane-zawieszenie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_hud-(wyświetlacz-przezierny)__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'param_marka-pojazdu__cat',
    'param_kod-silnika__cat',
]

run_model(xgb.XGBRegressor(**xgb_params), feats)



(-13393.50596342179, 181.97459015213212)

In [55]:
def _year_to_int(raw):
  """Return `raw` as an int, or -1 when its string form is 'None'."""
  if str(raw) == 'None':
    return -1
  return int(raw)

# Replace the production-year column with integers (-1 marks a missing value).
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(_year_to_int)

# Same 20 features as before, but with the numeric year column in place of
# its factorized '__cat' version.
feats = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_faktura-vat__cat',
    'param_rok-produkcji',
    'param_skrzynia-biegów__cat',
    'param_moc__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'seller_name__cat',
    'feature_światła-led__cat',
    'feature_wspomaganie-kierownicy__cat',
    'feature_system-start-stop__cat',
    'param_pojemność-skokowa__cat',
    'feature_regulowane-zawieszenie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_hud-(wyświetlacz-przezierny)__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'param_marka-pojazdu__cat',
    'param_kod-silnika__cat',
]
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-11472.349388129545, 115.17704045600611)

In [56]:
def _power_to_int(raw):
  """Return the leading space-separated token of `raw` as an int; -1 for 'None'."""
  text = str(raw)
  if text == 'None':
    return -1
  # NOTE(review): only the first token is kept, so a value written with a
  # space as thousands separator (as the displacement column uses) would be
  # truncated -- confirm no such values occur in param_moc.
  return int(text.split(' ')[0])

# Replace the power column with integers (-1 marks a missing value).
df['param_moc'] = df['param_moc'].map(_power_to_int)


# Same feature set as the previous experiment, now with the numeric
# 'param_moc' column in place of its factorized '__cat' version.
feats = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_faktura-vat__cat',
    'param_rok-produkcji',
    'param_skrzynia-biegów__cat',
    'param_moc',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'seller_name__cat',
    'feature_światła-led__cat',
    'feature_wspomaganie-kierownicy__cat',
    'feature_system-start-stop__cat',
    'param_pojemność-skokowa__cat',
    'feature_regulowane-zawieszenie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_hud-(wyświetlacz-przezierny)__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'param_marka-pojazdu__cat',
    'param_kod-silnika__cat',
]
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9790.801239302049, 67.27163103418717)

In [49]:
  # Inspect the distinct raw values of the column to see the text format
  # that has to be parsed (the recorded output shows strings like '1 560 cm3').
  df['param_pojemność-skokowa'].unique()

array(['898 cm3', '1 560 cm3', '3 000 cm3', ..., '5 992 cm3', '1 966 cm3',
       '142 280 cm3'], dtype=object)

In [67]:
def _displacement_to_int(raw):
  """Parse the text before 'cm' (spaces removed) as an int; -1 for 'None'."""
  text = str(raw)
  if text == 'None':
    return -1
  digits = text.split('cm')[0].replace(' ', '')
  return int(digits)

# Replace the displacement column with integers (-1 marks a missing value).
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(_displacement_to_int)

# Same feature set as the previous experiment, now with the numeric
# 'param_pojemność-skokowa' column in place of its factorized '__cat' version.
feats = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_faktura-vat__cat',
    'param_rok-produkcji',
    'param_skrzynia-biegów__cat',
    'param_moc',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'seller_name__cat',
    'feature_światła-led__cat',
    'feature_wspomaganie-kierownicy__cat',
    'feature_system-start-stop__cat',
    'param_pojemność-skokowa',
    'feature_regulowane-zawieszenie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_hud-(wyświetlacz-przezierny)__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'param_marka-pojazdu__cat',
    'param_kod-silnika__cat',
]
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9661.940862244881, 73.39680454747987)

In [69]:
  # Re-inspect the distinct values after the conversion in the previous cell;
  # the recorded output shows plain integers instead of 'cm3' strings.
  df['param_pojemność-skokowa'].unique()

array([   898,   1560,   3000, ...,   5992,   1966, 142280])