In [0]:
!pip3 install --upgrade tables
!pip3 install xgboost
!pip3 install hyperopt

In [0]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.model_selection import cross_val_score

from hyperopt import STATUS_OK, hp, fmin, tpe,STATUS_FAIL

# Loading and categorizing data

In [3]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

In [4]:
df = df[ df['price_currency'] != 'EUR']
df.shape
df['price_currency'].value_counts()

PLN    106290
Name: price_currency, dtype: int64

In [0]:
suffix = '_cat'
for column in df.columns:
  if isinstance(df[column][0],list):continue

  factorized = df[column].factorize()[0]
  if suffix in column:
    df[column] = factorized
  else:
    df[column+suffix] = factorized

In [0]:
def run_model(model,data):
  x = df[data].values
  y = df['price_value'].values

  scores = cross_val_score(model,x,y,cv=3,scoring='neg_mean_absolute_error')
  return (np.mean(scores),np.std(scores))

In [7]:
data = ['param_napęd_cat','param_faktura-vat_cat','param_stan_cat','param_rok-produkcji_cat','param_moc_cat','param_skrzynia-biegów_cat','feature_kamera-cofania_cat','param_marka-pojazdu_cat','param_pojemność-skokowa_cat','feature_bluetooth_cat','feature_łopatki-zmiany-biegów_cat','feature_światła-led_cat','feature_klimatyzacja-manualna_cat','param_kod-silnika_cat']

xgbconf = {
    'max_depth':5,
    'n_estiminators':50,
    'learning_rate':0.1,
    'seed':0,
    'objective':'reg:squarederror'
}

run_model(xgb.XGBRegressor(**xgbconf),data)

(-12651.943794108565, 113.08085999961047)

In [0]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x)=='None' else int(x))
df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x)=='None' else int(x.replace(' ','').split('KM')[0]))
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x)=='None' else int(x.replace(' ','').split('cm3')[0]))

# Data prediction without optimizer

In [9]:
data = ['param_napęd_cat','param_faktura-vat_cat','param_stan_cat','param_rok-produkcji','param_moc','param_skrzynia-biegów_cat','feature_kamera-cofania_cat','param_marka-pojazdu_cat','param_pojemność-skokowa','feature_bluetooth_cat','feature_łopatki-zmiany-biegów_cat','feature_światła-led_cat','feature_klimatyzacja-manualna_cat','param_kod-silnika_cat']
run_model(xgb.XGBRegressor(**xgbconf),data)

(-9376.088262775234, 57.17602884271151)

# Setting up optimizer

In [0]:
def optimize_model(params):
  try:
    mean,score = run_model(xgb.XGBRegressor(**params),data)
    print('Running on: ',params)
    return {'loss': np.abs(mean),'status':STATUS_OK}
  except:
    return {'status':STATUS_FAIL}

In [0]:
new_xgbconf = {
    'max_depth': hp.choice('max_depth',np.arange(5,16,1,dtype=int)),
    'n_estiminators': 100,
    'learning_rate': hp.quniform('learning_rate',0.05,0.31,0.05),
    'subsample': hp.quniform('subsample',0.5,1,0.05),
    'colsample_bytree': hp.quniform('colsample_bytree',0.5,1,0.05),
    'seed':0,
    'objective':'reg:squarederror'
}

# Data prediction with optimizer

In [12]:
best = fmin(optimize_model,new_xgbconf,algo=tpe.suggest,max_evals=25)
best

Running on: 
{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.2, 'max_depth': 10, 'n_estiminators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}
Running on: 
{'colsample_bytree': 0.5, 'learning_rate': 0.25, 'max_depth': 9, 'n_estiminators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}
Running on: 
{'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.25, 'max_depth': 10, 'n_estiminators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}
Running on: 
{'colsample_bytree': 0.75, 'learning_rate': 0.15000000000000002, 'max_depth': 14, 'n_estiminators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.65}
Running on: 
{'colsample_bytree': 0.55, 'learning_rate': 0.2, 'max_depth': 10, 'n_estiminators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}
Running on: 
{'colsample_bytree': 0.65, 'learning_rate': 0.15000000000000002, 'max_depth': 7, 'n_estiminators': 100, 'objective': 'reg:

{'colsample_bytree': 0.8500000000000001,
 'learning_rate': 0.05,
 'max_depth': 10,
 'subsample': 0.8500000000000001}

Best loss: 8210.47