In [0]:
!pip install eli5
!pip install --upgrade tables

In [0]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

import xgboost as xgb

import eli5
from eli5.sklearn import PermutationImportance

# Loading and categorizing data

In [3]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

In [4]:
[x for x in df.columns if 'price' in x]

['price_currency', 'price_details', 'price_value']

In [5]:
df['price_currency'].value_counts()

PLN    106290
EUR       204
Name: price_currency, dtype: int64

In [6]:
df = df[ df['price_currency'] != 'EUR']
df.shape
df['price_currency'].value_counts()

PLN    106290
Name: price_currency, dtype: int64

In [0]:
suffix = '_cat'
for column in df.columns:
  if isinstance(df[column][0],list):continue

  factorized = df[column].factorize()[0]
  if suffix in column:
    df[column] = factorized
  else:
    df[column+suffix] = factorized

In [0]:
cat_data = [x for x in df.columns if suffix in x]

In [0]:
cat_data = [x for x in cat_data if 'price' not in x]

# Data prediction

In [0]:
def run_model(model,data):
  x = df[data].values
  y = df['price_value'].values

  scores = cross_val_score(model,x,y,cv=3,scoring='neg_mean_absolute_error')
  return (np.mean(scores),np.std(scores))

In [22]:
data = ['param_napęd_cat','param_faktura-vat_cat','param_stan_cat','param_rok-produkcji_cat','param_moc_cat','param_skrzynia-biegów_cat','feature_kamera-cofania_cat','param_marka-pojazdu_cat','param_pojemność-skokowa_cat','feature_bluetooth_cat','feature_łopatki-zmiany-biegów_cat','feature_światła-led_cat','feature_klimatyzacja-manualna_cat','param_kod-silnika_cat']

xgbconf = {
    'max_depth':5,
    'n_estiminators':50,
    'learning_rate':0.1,
    'seed':0,
    'objective':'reg:squarederror'
}

print(run_model(xgb.XGBRegressor(**xgbconf),cat_data))
print(run_model(xgb.XGBRegressor(**xgbconf),data))

(-11724.014890347571, 93.13570351767962)
(-12651.943794108565, 113.08085999961047)


In [0]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x)=='None' else int(x))

In [23]:
data = ['param_napęd_cat','param_faktura-vat_cat','param_stan_cat','param_rok-produkcji','param_moc_cat','param_skrzynia-biegów_cat','feature_kamera-cofania_cat','param_marka-pojazdu_cat','param_pojemność-skokowa_cat','feature_bluetooth_cat','feature_łopatki-zmiany-biegów_cat','feature_światła-led_cat','feature_klimatyzacja-manualna_cat','param_kod-silnika_cat']
print(run_model(xgb.XGBRegressor(**xgbconf),data))

(-11216.071485276958, 89.05607408215677)


In [0]:
df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x)=='None' else int(x.replace(' ','').split('KM')[0]))

In [24]:
data = ['param_napęd_cat','param_faktura-vat_cat','param_stan_cat','param_rok-produkcji','param_moc','param_skrzynia-biegów_cat','feature_kamera-cofania_cat','param_marka-pojazdu_cat','param_pojemność-skokowa_cat','feature_bluetooth_cat','feature_łopatki-zmiany-biegów_cat','feature_światła-led_cat','feature_klimatyzacja-manualna_cat','param_kod-silnika_cat']
print(run_model(xgb.XGBRegressor(**xgbconf),data))

(-9543.999134278894, 48.74135835043408)


In [0]:
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x)=='None' else int(x.replace(' ','').split('cm3')[0]))

In [25]:
data = ['param_napęd_cat','param_faktura-vat_cat','param_stan_cat','param_rok-produkcji','param_moc','param_skrzynia-biegów_cat','feature_kamera-cofania_cat','param_marka-pojazdu_cat','param_pojemność-skokowa','feature_bluetooth_cat','feature_łopatki-zmiany-biegów_cat','feature_światła-led_cat','feature_klimatyzacja-manualna_cat','param_kod-silnika_cat']
print(run_model(xgb.XGBRegressor(**xgbconf),data))

(-9376.088262775234, 57.17602884271151)


Best loss: 9376.08