In [1]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost
!pip install hyperopt

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 4.9MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 4.9MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [2]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

from hyperopt import hp,fmin,tpe,STATUS_OK

import eli5
from eli5.sklearn import PermutationImportance

Using TensorFlow backend.


In [3]:
cd "/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car"

/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car


In [0]:
df=pd.read_hdf('data/car.h5')

In [5]:
df.shape

(106494, 155)

##Feature Engineering

In [0]:
SUFFIX_CAT='_cat'

for feat in df.columns:
  if isinstance(df[feat][0],list): continue

  factorized_values=df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat]=factorized_values
  else:
    df[feat + SUFFIX_CAT]=factorized_values  

In [0]:
df['param_rok-produkcji']=df['param_rok-produkcji'].map(lambda x:-1 if str(x)=='None' else int(x))
df['param_moc']=df['param_moc'].map(lambda x: -1 if str(x)=='None' else x.split(' ')[0])
df['param_pojemność-skokowa']=df['param_pojemność-skokowa'].map(lambda x: -1 if str(x)=='None' else x.split('cm3')[0].replace(' ',''))

In [0]:
def run_model(model,feats):
  x=df[feats].values
  y=df['price_value'].values
  scores = cross_val_score(model,x,y,cv=3,scoring='neg_mean_absolute_error')
  return np.mean(scores),np.std(scores)

In [0]:
feats = ['param_napęd_cat',
'param_rok-produkcji', #zmiana z cat na num
'param_stan_cat',
'param_skrzynia-biegów_cat',
'param_faktura-vat_cat',
'param_moc', #zmiana z cat na num
'param_marka-pojazdu_cat',
'feature_kamera-cofania_cat',
'param_typ_cat',
'param_pojemność-skokowa', #zmiana z cat na num
'seller_name_cat',
'feature_wspomaganie-kierownicy_cat',
'param_model-pojazdu_cat',
'param_wersja_cat',
'param_kod-silnika_cat',
'feature_system-start-stop_cat',
'feature_asystent-pasa-ruchu_cat',
'feature_czujniki-parkowania-przednie_cat',
'feature_łopatki-zmiany-biegów_cat',
'feature_regulowane-zawieszenie_cat']

In [10]:
xgb_params={
    'max_depth':10,
    'n_estimators':50,
    'learning_rate':0.1,
    'seed':0
}
run_model(xgb.XGBRegressor(**xgb_params),feats)



(-7980.848703913878, 71.02226534839897)

In [18]:
def obj_func(params):
  print('Training with params:')
  print(params)

  mean_mae,score_std = run_model(xgb.XGBRegressor(**params),feats)

  return {'loss':np.abs(mean_mae),'status':STATUS_OK}


#space
xgb_reg_params={
    'learning_rate': hp.choice('learning_rate', np.arange(0.05,0.31,0.05)),
    'max_depth': hp.choice('max_depth', np.arange(5,16,1, dtype=int)),
    'subsample': hp.quniform('subsample' , 0.5,1,0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5,1,0.05),
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'seed': 0,

}

best= fmin(obj_func,xgb_reg_params,algo=tpe.suggest,max_evals=5)





Training with params:
{'colsample_bytree': 0.9, 'learning_rate': 0.25, 'max_depth': 8, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}
Training with params:
{'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}
Training with params:
{'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}
Training with params:
{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}
Training with params:
{'colsample_bytree': 0.9, 'learning_rate': 0.25, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}
100%|██████████| 5/5 [03:24<00:00, 40.55s/it, best loss: 7663.7435695520835]


In [19]:
best

{'colsample_bytree': 0.8,
 'learning_rate': 3,
 'max_depth': 4,
 'subsample': 0.8500000000000001}

In [0]:
!git config --global user.email "jedrzejczak.wa@gmail.com"
!git config --global user.name "Wieslaw"

In [0]:
!git add day5.ipynb
!git commit -m "DecTree,RandomF,XGBoost"

In [0]:
!git push -u origin master