In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

## Load Data

In [25]:
# Read the dataset from the HDF5 file at data/car.h5 (path is relative to the working directory).
df = pd.read_hdf('data/car.h5')
# Keep a separate copy of the freshly loaded frame; `df` itself gets extra columns added by later cells.
df_copy = df.copy()
# Display (rows, columns) of the loaded frame.
df.shape

(106494, 155)

In [20]:
# Show the column labels of the loaded frame.
df.columns

Index(['breadcrumb', 'created_at', 'price_currency', 'price_details',
       'price_value', 'seller_address', 'seller_name', 'seller_type',
       'feature_czujniki-parkowania-przednie',
       'feature_poduszka-powietrzna-chroniąca-kolana',
       ...
       'param_pearl', 'param_stan', 'param_wersja', 'param_emisja-co2',
       'param_body-type', 'param_matowy', 'param_bezwypadkowy',
       'param_akryl-(niemetalizowany)', 'param_monthly-payment-value',
       'car_id'],
      dtype='object', length=155)

## Data transformations

In [21]:
# Distinct values present in the price_currency column.
df['price_currency'].unique()

array(['PLN', 'EUR'], dtype=object)

In [22]:
# Number of rows per value of price_currency.
df['price_currency'].value_counts()

PLN    106290
EUR       204
Name: price_currency, dtype: int64

In [23]:
# Convert prices quoted in EUR to PLN so that price_value is in a single currency.
# PLN per 1 EUR, fixed rate used for the whole dataset.
euro_rate = 4.58

# Boolean mask of the rows that still need converting. After this cell has run
# once the mask is all False, so re-running the cell does not convert twice.
is_eur = df['price_currency'] == 'EUR'

# Scale only the EUR rows; every other row keeps its original price.
# Series.where keeps values where the condition is True and takes `other` elsewhere.
df['price_value'] = df['price_value'].where(~is_eur, df['price_value'] * euro_rate)

# Relabel only the rows that were actually converted.
df.loc[is_eur, 'price_currency'] = 'PLN'

In [24]:
# Re-check the number of rows per currency after the conversion cell above.
df['price_currency'].value_counts()

PLN    106494
Name: price_currency, dtype: int64

In [33]:
# Integer-encode every column with pandas' factorize.
# A column whose name already contains the suffix is re-encoded in place;
# any other column gets a sibling column named "<col>__cat".
suffix_cat = '__cat'

# Iterate over a snapshot of the labels: the loop adds columns to df as it goes.
for col in list(df.columns):
  # Columns holding Python lists are skipped. Only the first row is inspected,
  # and it is fetched by position (.iloc) so the check does not depend on the
  # index containing the label 0; an empty frame has no first row to inspect.
  if len(df) and isinstance(df[col].iloc[0], list):
    continue

  # factorize() returns (codes, uniques); only the integer codes are kept.
  factor_value = df[col].factorize()[0]
  if suffix_cat in col:
    df[col] = factor_value
  else:
    df[col + suffix_cat] = factor_value

In [41]:
# Feature columns: every column whose name contains the encoding suffix,
# leaving out any column with 'price' in its name (price_value is the
# regression target, so its encoded version must not be used as an input).
cat_features = list(
    filter(
        lambda name: suffix_cat in name and 'price' not in name,
        df.columns,
    )
)
cat_features

['car_id__cat',
 'car_id__cat__cat',
 'created_at__cat',
 'seller_address__cat',
 'seller_name__cat',
 'seller_type__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_poduszka-powietrzna-chroniąca-kolana__cat',
 'feature_kurtyny-powietrzne__cat',
 'feature_klimatyzacja-dwustrefowa__cat',
 'feature_światła-led__cat',
 'feature_czujnik-zmierzchu__cat',
 'feature_elektrycznie-ustawiane-lusterka__cat',
 'feature_asr-(kontrola-trakcji)__cat',
 'feature_poduszka-powietrzna-kierowcy__cat',
 'feature_cd__cat',
 'feature_elektryczne-szyby-przednie__cat',
 'feature_poduszka-powietrzna-pasażera__cat',
 'feature_system-start-stop__cat',
 'feature_światła-do-jazdy-dziennej__cat',
 'feature_komputer-pokładowy__cat',
 'feature_elektryczne-szyby-tylne__cat',
 'feature_klimatyzacja-manualna__cat',
 'feature_tapicerka-welurowa__cat',
 'feature_czujnik-deszczu__cat',
 'feature_światła-przeciwmgielne__cat',
 'feature_ogrzewanie-postojowe__cat',
 'feature_radio-niefabryczne__cat',
 'feature_reg

In [42]:
# Number of selected feature columns.
len(cat_features)

152

## Features

In [57]:
# Build the feature matrix and the target in this cell so it runs top-to-bottom
# on a fresh kernel; X and y were otherwise only assigned in a cell placed
# further down the notebook.
X = df[cat_features]
y = df['price_value']

# Shallow regression tree fitted on all rows; the fitted estimator is the
# cell's displayed value.
model = DecisionTreeRegressor(max_depth=5)
model.fit(X, y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
# Permutation importance of each feature for the fitted tree, computed on the
# same X and y the model was trained on (fixed seed for repeatable shuffles).
imp = PermutationImportance(model, random_state=5)
imp.fit(X, y)

# Render the importance table, labelling rows with the feature column names.
eli5.show_weights(imp, feature_names=cat_features)

## DecisionTreeRegressor

In [56]:
# Feature matrix (the selected encoded columns) and regression target (price).
X, y = df[cat_features], df['price_value']

In [None]:
tree = DecisionTreeRegressor(max_depth=5)