In [39]:
ls

[0m[01;34mData[0m/  LICENSE  [01;34mMatrix-I[0m/  README.md


In [40]:
cd "drive/My Drive/Colab Notebooks/DataWorkshop-Matrix/"

[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks/DataWorkshop-Matrix/'
/content/drive/My Drive/Colab Notebooks/DataWorkshop-Matrix


In [41]:
!pip install eli5



In [0]:
from ast import literal_eval

import eli5
import numpy as np
import pandas as pd
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

In [0]:
# Load the raw dataset; low_memory=False makes pandas parse the file in a
# single pass instead of in chunks.
df_orig = pd.read_csv(
    'Data/MenShoePrices.csv',
    low_memory=False,
)

In [0]:
def run_model(feature, model=None):
  """Cross-validate `model` on the given columns of the global `df_orig`.

  Args:
    feature: list of `df_orig` column names used as predictors.
    model: scikit-learn estimator to evaluate. When omitted, a new
      DecisionTreeRegressor(max_depth=5) is created for this call, so no
      estimator instance is shared between calls.

  Returns:
    Tuple (mean, std) of the per-fold `neg_mean_absolute_error` scores.
    The scores are negated errors, so values closer to zero are better.
  """
  if model is None:
    model = DecisionTreeRegressor(max_depth=5)
  X = df_orig[feature].values
  y = df_orig['prices_amountmin'].values
  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [0]:
# Integer-encode the raw brand values; factorize returns (codes, uniques)
# and only the codes are stored.
df_orig['brand_id'] = pd.factorize(df_orig['brand'])[0]

In [46]:
# Random forest evaluated on the brand code alone.
my_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
run_model(feature=['brand_id'], model=my_model)

(-57.47223572384038, 4.328288468270897)

In [0]:
# Re-encode the brand after lower-casing, so spellings that differ only in
# case share one code. str() also maps missing brands to the literal 'nan'.
df_orig['brand_id'] = pd.factorize(
    df_orig['brand'].map(lambda raw_brand: str(raw_brand).lower())
)[0]

In [48]:
# Re-score the same forest on the case-normalised brand code.
run_model(feature=['brand_id'], model=my_model)

(-57.31783843165656, 4.181246596160967)

In [0]:
def parse_features(x):
  """Convert one raw `features` cell into a flat {key: first value} dict.

  The cell is expected to be the text of a list of
  {'key': ..., 'value': [...]} items. Keys and values are lower-cased and
  stripped; when a key occurs more than once the last occurrence wins.

  Args:
    x: raw cell content: a string, or None / NaN for a missing cell.

  Returns:
    dict mapping each normalised key to its normalised first value. Empty
    for missing cells; items whose value list is empty are skipped.

  Raises:
    ValueError / SyntaxError: from `literal_eval` when the text is not a
      valid Python literal.
  """
  out_dict = {}
  # Missing cells arrive as float NaN (str() gives 'nan') or as None.
  if x is None or str(x) == 'nan':
    return out_dict
  # Un-escape backslash-quoted quotes before evaluating the literal.
  for item in literal_eval(x.replace('\\"', '"')):
    values = item['value']
    if not values:
      # No value to take: skip rather than raise IndexError on values[0].
      continue
    out_dict[item['key'].lower().strip()] = values[0].lower().strip()
  return out_dict
# Parse every row's raw `features` cell into a dict, stored as a new column.
df_orig['features_parsed'] = df_orig['features'].map(parse_features)

In [50]:
# Preview the parsed dicts of the first five rows.
df_orig['features_parsed'].head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [51]:
# Collect every distinct feature key that occurs in any parsed row, then
# show how many there are.
keys_set = {
    feat_key
    for parsed_row in df_orig['features_parsed']
    for feat_key in parsed_row
}
len(keys_set)

476

In [52]:
# Expand the parsed dicts into one `feats_<key>` column per feature key;
# rows that lack the key get NaN.
# Keys are visited in sorted order so the resulting column order does not
# depend on set iteration order.
for key in tqdm(sorted(keys_set)):
  df_orig['feats_' + key] = df_orig['features_parsed'].map(lambda x: x.get(key, np.nan))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [53]:
# Spot-check one of the expanded feature columns.
df_orig['feats_gender'].head()

0    men
1    men
2    men
3    men
4    men
Name: feats_gender, dtype: object

In [54]:
keys_stats = {}
for key in tqdm_notebook(keys_set):
  keys_stats[key] = df_orig[False == df_orig['feats_' + key].isnull()].shape[0] / df_orig.shape[0] * 100

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [55]:
# Feature keys filled in for more than 40% of the rows.
{k:v for k,v in keys_stats.items() if v > 40}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519}

In [0]:
# Integer-encode every feature that is filled in for more than 30% of the
# rows and remember the names of the new `_id` columns.
feats_ids = []
for feat_key, coverage in keys_stats.items():
  if not coverage > 30:
    continue
  source_column = 'feats_' + feat_key
  id_column = source_column + '_id'
  df_orig[id_column] = df_orig[source_column].factorize()[0]
  feats_ids.append(id_column)


In [57]:
# Spot-check one of the integer-encoded feature columns.
df_orig['feats_gender_id'].head()

0    0
1    0
2    0
3    0
4    0
Name: feats_gender_id, dtype: int64

In [58]:
# Show which encoded feature columns were selected.
feats_ids

['feats_gender_id',
 'feats_brand_id',
 'feats_manufacturer part number_id',
 'feats_material_id',
 'feats_color_id']

In [59]:
# Rows where the top-level brand differs from the brand found in `features`.
brand_mismatch = df_orig['brand'] != df_orig['feats_brand']
df_orig.loc[brand_mismatch, ['brand', 'feats_brand']].head()

Unnamed: 0,brand,feats_brand
0,Josmo,josmo
1,Josmo,josmo
2,SERVUS BY HONEYWELL,servus by honeywell
3,SERVUS BY HONEYWELL,servus by honeywell
4,SERVUS BY HONEYWELL,servus by honeywell


In [0]:
# Lower-case the top-level brand so it can be compared with `feats_brand`.
# Note: str() turns a missing brand into the literal string 'nan'.
df_orig['brand'] = df_orig['brand'].apply(lambda brand: str(brand).lower())

In [61]:
# Shape of the subset in which both brand columns agree.
brand_match = df_orig['brand'] == df_orig['feats_brand']
df_orig.loc[brand_match].shape

(8846, 531)

In [70]:
# Same forest configuration, now scored on the encoded `features` columns.
my_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
run_model(feature=feats_ids, model=my_model)

(-60.523207611509655, 4.1212852278312955)

In [63]:
# Show the feature columns currently in use.
feats_ids

['feats_gender_id',
 'feats_brand_id',
 'feats_manufacturer part number_id',
 'feats_material_id',
 'feats_color_id']

In [69]:
# Fit the forest on all rows, then rank the features by permutation
# importance computed on that same data.
X = df_orig[feats_ids].to_numpy()
y = df_orig['prices_amountmin'].to_numpy()
my_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=0,
).fit(X, y)
perm = PermutationImportance(my_model, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats_ids)

Weight,Feature
0.1182  ± 0.0043,feats_material_id
0.1010  ± 0.0096,feats_gender_id
0.0583  ± 0.0032,feats_brand_id


In [30]:
ls

[0m[01;34mData[0m/  LICENSE  [01;34mMatrix-I[0m/  README.md


In [0]:
# Remove the colour and manufacturer-part-number codes from the feature list.
excluded_ids = {'feats_color_id', 'feats_manufacturer part number_id'}
feats_ids = [name for name in feats_ids if name not in excluded_ids]

In [68]:
# Show the feature list after filtering.
feats_ids

['feats_gender_id', 'feats_brand_id', 'feats_material_id']