In [1]:
!pip install eli5
!pip install --upgrade tables
!pip install xgboost

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 15.2MB/s eta 0:00:01[K     |██████▏                         | 20kB 1.8MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.6MB/s eta 0:00:01[K     |████████████▍                   | 40kB 1.7MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.1MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.5MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 2.8MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 2.2MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 2.5MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 2.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1
Coll

In [2]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score,KFold

import xgboost as xgb

import eli5
from eli5.sklearn import PermutationImportance

Using TensorFlow backend.


## Wczytywanie danych

In [3]:
cd "/content/drive/My Drive/Colab Notebooks/matrix_two/dw_matrix_car"

/content/drive/My Drive/Colab Notebooks/matrix_two/dw_matrix_car


In [50]:
# Load the car listings from the HDF5 file (path is relative to the working directory).
df = pd.read_hdf('data/car.h5')
# Display (rows, columns) as the cell output.
df.shape

(106494, 155)

In [5]:
# List the column names to get an overview of the available fields.
df.columns

Index(['breadcrumb', 'created_at', 'price_currency', 'price_details',
       'price_value', 'seller_address', 'seller_name', 'seller_type',
       'feature_czujniki-parkowania-przednie',
       'feature_poduszka-powietrzna-chroniąca-kolana',
       ...
       'param_pearl', 'param_stan', 'param_wersja', 'param_emisja-co2',
       'param_body-type', 'param_matowy', 'param_bezwypadkowy',
       'param_akryl-(niemetalizowany)', 'param_monthly-payment-value',
       'car_id'],
      dtype='object', length=155)

## Dummy Model

In [6]:
# Check which columns already have a numeric dtype and can be fed to a model as-is.
df.select_dtypes(np.number).columns

Index(['price_value', 'car_id'], dtype='object')

In [9]:
# Baseline: a DummyRegressor ignores the features and predicts a constant,
# so its MAE on the training data is the number every real model must beat.
feats = ['car_id']
X, y = df[feats].values, df['price_value'].values

model = DummyRegressor().fit(X, y)
y_pred = model.predict(X)

mae(y, y_pred)

39465.934630440985

In [10]:
# Find every column whose name mentions 'price'.
[col for col in df.columns if 'price' in col]

['price_currency', 'price_details', 'price_value']

In [11]:
# Count listings per currency to see whether prices are directly comparable.
df['price_currency'].value_counts()

PLN    106290
EUR       204
Name: price_currency, dtype: int64

In [51]:
# Drop the listings priced in EUR so the target uses a single currency.
# `.copy()` makes the result an independent frame: the following cells add
# new columns to it, which on a filtered slice triggers pandas'
# SettingWithCopyWarning.
df = df[df['price_currency'] != 'EUR'].copy()
df.shape

(106290, 155)

## Features

In [0]:
SUFFIX_CAT = '__cat'
# Iterate over a snapshot of the column names because the loop adds columns.
for feat in list(df.columns):
  # Skip columns whose values are lists (factorize needs hashable values).
  # Only the first row is inspected. `.iloc[0]` reads it by position, whereas
  # `df[feat][0]` is a label lookup that raises KeyError as soon as the row
  # labelled 0 has been filtered out of the frame.
  if len(df) and isinstance(df[feat].iloc[0], list):
    continue

  # Integer code per distinct value; missing values are encoded as -1.
  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    # Already an encoded column (cell re-run): overwrite it in place.
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [53]:
# Encoded feature columns, excluding everything price-related: the target is
# 'price_value', so its encoded variants must not be used as inputs.
cat_feats = [
    col for col in df.columns
    if SUFFIX_CAT in col and 'price' not in col
]
len(cat_feats)

151

In [15]:
# Inspect the names of the selected encoded features.
cat_feats

['created_at__cat',
 'seller_address__cat',
 'seller_name__cat',
 'seller_type__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_poduszka-powietrzna-chroniąca-kolana__cat',
 'feature_kurtyny-powietrzne__cat',
 'feature_klimatyzacja-dwustrefowa__cat',
 'feature_światła-led__cat',
 'feature_czujnik-zmierzchu__cat',
 'feature_elektrycznie-ustawiane-lusterka__cat',
 'feature_asr-(kontrola-trakcji)__cat',
 'feature_poduszka-powietrzna-kierowcy__cat',
 'feature_cd__cat',
 'feature_elektryczne-szyby-przednie__cat',
 'feature_poduszka-powietrzna-pasażera__cat',
 'feature_system-start-stop__cat',
 'feature_światła-do-jazdy-dziennej__cat',
 'feature_komputer-pokładowy__cat',
 'feature_elektryczne-szyby-tylne__cat',
 'feature_klimatyzacja-manualna__cat',
 'feature_tapicerka-welurowa__cat',
 'feature_czujnik-deszczu__cat',
 'feature_światła-przeciwmgielne__cat',
 'feature_ogrzewanie-postojowe__cat',
 'feature_radio-niefabryczne__cat',
 'feature_regulowane-zawieszenie__cat',
 'feature_

In [0]:
def run_model(model, feats, cv=3, scoring='neg_mean_absolute_error'):
  """Cross-validate `model` on the given columns of the notebook-level `df`.

  Args:
    model: estimator accepted by `cross_val_score`.
    feats: list of column names in `df` to use as features.
    cv: cross-validation setting forwarded to `cross_val_score`
      (default 3, the value previously hard-coded).
    scoring: scoring name forwarded to `cross_val_score`
      (default 'neg_mean_absolute_error', the value previously hard-coded).

  Returns:
    Tuple `(mean, std)` of the per-fold scores. With the default scoring the
    scores are negated MAE values, so a result closer to zero is better.
  """
  X = df[feats].values
  y = df['price_value'].values

  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return (np.mean(scores), np.std(scores))

In [17]:
# Build the matrices from the current frame inside this cell. The
# notebook-level X/y were last assigned in the baseline cell (a single
# 'car_id' column, taken before the EUR rows were dropped), so on a fresh
# Restart & Run All they would not line up with the names in cat_feats.
X = df[cat_feats].values
y = df['price_value'].values

m = DecisionTreeRegressor(max_depth=5)
m.fit(X,y)

# Permutation importance of each encoded feature for the fitted tree.
imp = PermutationImportance(m,random_state=325).fit(X,y)
eli5.show_weights(imp,feature_names=cat_feats)

Weight,Feature
0.2532  ± 0.0057,param_napęd__cat
0.1984  ± 0.0050,param_faktura-vat__cat
0.1957  ± 0.0054,param_stan__cat
0.1398  ± 0.0026,param_rok-produkcji__cat
0.0645  ± 0.0039,param_moc__cat
0.0415  ± 0.0013,feature_kamera-cofania__cat
0.0411  ± 0.0028,param_skrzynia-biegów__cat
0.0283  ± 0.0030,param_marka-pojazdu__cat
0.0199  ± 0.0015,param_pojemność-skokowa__cat
0.0162  ± 0.0005,feature_bluetooth__cat


## DecisionTree


In [0]:
# Cross-validated score (negated MAE: mean, std) of a shallow decision tree on all encoded features.
run_model(DecisionTreeRegressor(max_depth=5),cat_feats)

## RandomForest

In [21]:
# Same evaluation with a 50-tree random forest; random_state is fixed for reproducibility.
run_model(RandomForestRegressor(n_estimators=50,random_state=0),cat_feats)

(-11218.24568717095, 347.0322002294832)

## XGBoost

In [22]:
# XGBoost hyper-parameters, kept in a dict because a later cell builds
# another regressor from the same settings.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.1,
    random_state=0,
)
model = xgb.XGBRegressor(**xgb_params)
run_model(model, cat_feats)



(-13039.290196724838, 109.36715375706265)

In [24]:
# Build the matrices from the current frame inside this cell rather than
# relying on whatever X/y happen to be in the kernel: their only visible
# assignment is the baseline cell (single 'car_id' column, before the EUR
# rows were dropped), which does not match the names in cat_feats.
X = df[cat_feats].values
y = df['price_value'].values

m = xgb.XGBRegressor(**xgb_params)
m.fit(X,y)
# Permutation importance of each encoded feature for the fitted booster.
imp = PermutationImportance(m,random_state=0).fit(X,y)
eli5.show_weights(imp,feature_names=cat_feats)



Weight,Feature
0.1209  ± 0.0019,param_napęd__cat
0.1175  ± 0.0030,param_rok-produkcji__cat
0.1113  ± 0.0013,param_stan__cat
0.0625  ± 0.0019,param_skrzynia-biegów__cat
0.0527  ± 0.0016,param_faktura-vat__cat
0.0461  ± 0.0015,param_moc__cat
0.0275  ± 0.0008,param_marka-pojazdu__cat
0.0230  ± 0.0004,param_typ__cat
0.0227  ± 0.0007,feature_kamera-cofania__cat
0.0191  ± 0.0007,param_pojemność-skokowa__cat


In [0]:
# Hand-picked feature subset used for the final model. Three entries have no
# '__cat' suffix ('param_rok-produkcji', 'param_moc',
# 'param_pojemność-skokowa'): those are the raw columns, which the following
# cells convert to numbers.
feats = ['param_napęd__cat',
'param_rok-produkcji',
'param_stan__cat',
'param_skrzynia-biegów__cat',
'param_faktura-vat__cat',
'param_moc',
'param_marka-pojazdu__cat',
'param_typ__cat',
'feature_kamera-cofania__cat',
'param_pojemność-skokowa',
'seller_name__cat',
'param_kod-silnika__cat',
'param_model-pojazdu__cat',
'feature_wspomaganie-kierownicy__cat',
'param_wersja__cat',
'feature_czujniki-parkowania-przednie__cat',
'feature_asystent-pasa-ruchu__cat',
'feature_regulowane-zawieszenie__cat',
'feature_system-start-stop__cat',
'feature_światła-led__cat']

In [0]:
# Convert the production year to an integer, using -1 for missing values.
# Without int() the non-missing entries stay as strings and the column is not
# actually numeric. int() also accepts values that are already integers, so
# the cell can be re-run safely.
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x)=='None' else int(x))

In [0]:
# Keep the integer that precedes the first space of the power string, using
# -1 for missing values. Going through str(x) lets the cell be re-run after
# the column already holds integers (an int has no .split and used to raise
# AttributeError).
df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x)=='None' else int(str(x).split(' ')[0]))

In [0]:
# Keep the number in front of 'cm' with its inner spaces removed, using -1
# for missing values. Going through str(x) lets the cell be re-run after the
# column already holds integers (an int has no .split and used to raise
# AttributeError).
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x)=='None' else int(str(x).split('cm')[0].replace(' ','')))

In [64]:
# Re-evaluate XGBoost with the same hyper-parameters, this time on the
# hand-picked `feats` subset instead of every encoded column.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.1,
    random_state=0,
)
model = xgb.XGBRegressor(**xgb_params)
run_model(model, feats)



(-9449.513980284812, 81.47168211987172)

In [0]:
  # Set the global git author identity (e-mail and name) for this environment.
  !git config --global user.email "nickypolit2@gmail.com"
  !git config --global user.name "Nikita"

In [0]:
# Stage the notebook file (presumably this notebook) and commit it.
!git add day3_simple_model.ipynb
!git commit -m "add simle model"

In [0]:
# Push master to origin; -u also records origin/master as the upstream branch.
!git push -u origin master

Counting objects: 1   Counting objects: 3, done.
Delta compression using up to 2 threads.
Compressing objects:  33% (1/3)   Compressing objects:  66% (2/3)   Compressing objects: 100% (3/3)   Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)   Writing objects:  66% (2/3)   Writing objects: 100% (3/3)   Writing objects: 100% (3/3), 5.68 KiB | 1.89 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/NykPol/dw_matrix_car.git
   c3dab5f..15b0af0  master -> master
Branch 'master' set up to track remote branch 'master' from 'origin'.
