In [0]:
!pip install --upgrade tables
!pip install eli5

In [0]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

# Loading and categorizing data




In [0]:
# Relative path of the HDF5 dataset that the next cell loads with pd.read_hdf.
DATA_PATH = 'data/car.h5'

In [4]:
# Load the dataset stored at DATA_PATH into a DataFrame.
df = pd.read_hdf(DATA_PATH)
# Show (rows, columns) as a quick sanity check of what was loaded.
df.shape

(106494, 155)

In [5]:
# List every column whose name mentions 'price' to locate the target column.
[column_name for column_name in df.columns if 'price' in column_name]

['price_currency', 'price_details', 'price_value']

In [6]:
# Count rows per currency to check whether all prices share a single currency.
df['price_currency'].value_counts()

PLN    106290
EUR       204
Name: price_currency, dtype: int64

In [7]:
# Drop the EUR rows so price_value is not a mix of currencies (the counts
# above show only PLN and EUR).
# .copy() detaches the filtered result from the original frame, so adding
# columns to `df` in later cells does not trigger SettingWithCopyWarning.
df = df[df['price_currency'] != 'EUR'].copy()
# Confirm which currencies remain after filtering.
df['price_currency'].value_counts()

PLN    106290
Name: price_currency, dtype: int64

In [0]:
# Label-encode the columns into integer codes.
# - Columns whose first value is a list are skipped (presumably because lists
#   cannot be factorized — confirm).
# - Columns whose name already contains the suffix are overwritten in place.
# - Every other column keeps its raw values and gets a '<column>_cat' companion.
suffix = '_cat'
# Iterate over a snapshot of the column names: the loop adds columns to df.
for column in list(df.columns):
  values = df[column]
  # Positional .iloc[0] instead of the label lookup [0]: rows were filtered
  # out earlier, so index label 0 is not guaranteed to exist any more.
  if not values.empty and isinstance(values.iloc[0], list):
    continue

  # factorize() returns (codes, uniques); only the integer codes are kept.
  factorized = values.factorize()[0]
  target = column if suffix in column else column + suffix
  df[target] = factorized

In [0]:
# Candidate features: every column whose name contains the encoding suffix.
cat_data = [name for name in df.columns if suffix in name]

In [0]:
# Remove every price-related column so the target (price_value) and its
# encoded variants cannot leak into the feature set.
cat_data = [name for name in cat_data if 'price' not in name]

In [11]:
# Feature matrix: the encoded columns selected in cat_data.
x = df[cat_data].values
# Target vector: the price_value column.
y = df['price_value'].values

# Shallow decision tree as a baseline. random_state is pinned so that ties
# between equally good splits are resolved the same way on every run, which
# keeps the score below reproducible.
model = DecisionTreeRegressor(max_depth=5, random_state=0)
# 3-fold cross-validation; the scorer is the negated mean absolute error, so
# values closer to zero are better.
scores = cross_val_score(model, x, y, cv=3, scoring='neg_mean_absolute_error')
np.mean(scores)

-19566.58893736832

# Display best features

In [12]:
# Fit the estimator on the full data set so it can be inspected below.
model.fit(x, y)

# Permutation importance of each feature for the fitted model.
# random_state is pinned so the random shuffles, and therefore the reported
# weights, are reproducible across runs.
# NOTE(review): the importances are computed on the same data the model was
# fit on; consider a held-out set.
imp = PermutationImportance(model, random_state=0).fit(x, y)
eli5.show_weights(imp, feature_names=cat_data)

Weight,Feature
0.2565  ± 0.0022,param_napęd_cat
0.2024  ± 0.0062,param_faktura-vat_cat
0.1946  ± 0.0026,param_stan_cat
0.1453  ± 0.0057,param_rok-produkcji_cat
0.0632  ± 0.0038,param_moc_cat
0.0419  ± 0.0009,feature_kamera-cofania_cat
0.0413  ± 0.0019,param_skrzynia-biegów_cat
0.0267  ± 0.0038,param_marka-pojazdu_cat
0.0203  ± 0.0018,param_pojemność-skokowa_cat
0.0167  ± 0.0004,feature_bluetooth_cat
