In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
import eli5
from eli5.sklearn import PermutationImportance
from ast import literal_eval
from tqdm import tqdm_notebook

In [12]:
# Switch the working directory (IPython `cd` automagic) so that the relative
# data path used when loading the CSV resolves inside the project folder.
cd "dw_matix/"

/content/drive/My Drive/Colab Notebooks/First_task/dw_matix


In [63]:
# Load the raw dataset. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
csv_path = 'data/women_shoes.csv'
df = pd.read_csv(csv_path, low_memory=False)
df.shape

(18001, 47)

In [0]:
def run_model(feats, model=None):
  """Cross-validate `model` on selected columns of the global `df`.

  Parameters
  ----------
  feats : list of str
      Column names of `df` that form the feature matrix.
  model : estimator, optional
      Estimator handed to `cross_val_score`. When omitted, a fresh
      `DecisionTreeRegressor(max_depth=5)` is created for this call.

  Returns
  -------
  tuple
      `(mean, std)` of the per-fold `neg_mean_absolute_error` scores
      (negated MAE, so values closer to zero are better).
  """
  if model is None:
    # Build the default per call rather than once at definition time, so
    # no estimator instance is ever shared between separate calls.
    model = DecisionTreeRegressor(max_depth=5)

  x = df[feats].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error')

  return np.mean(scores), np.std(scores)


In [65]:
# Encode the lower-cased brand name as integer codes and score a baseline
# model that uses this single feature.
brand_lower = df['brand'].map(lambda raw: str(raw).lower())
df['brand_cat'] = brand_lower.factorize()[0]
run_model(['brand_cat'])


(-51.57118869808353, 0.6390743725376244)

In [66]:
 # Same single-feature baseline, scored with a random forest instead of a tree.
 run_model(
     feats=['brand_cat'],
     model=RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0),
 )

(-51.356432361171095, 0.7207640114801125)

In [0]:
# Look at the raw `features` column before parsing it.
df['features']

In [0]:
# Parse the first row's raw string to inspect the structure it encodes.
first_raw = df['features'].loc[0]
literal_eval(first_raw)

In [0]:
def parse_features(x):
  """Convert one raw `features` cell into a flat `{key: value}` dict.

  `x` is expected to be the string form of a list of
  `{'key': ..., 'value': [...]}` items; backslash-escaped double quotes are
  unescaped before the string is evaluated as a Python literal. Keys and
  values are lower-cased and stripped. Only the first entry of each `value`
  list is kept, and a repeated key keeps its last occurrence.

  Returns an empty dict for a missing cell (`None` or NaN). Items whose
  `value` list is empty are skipped instead of raising `IndexError`.
  """
  output_dict = {}
  if x is None or str(x) == 'nan':
    return output_dict
  features = literal_eval(x.replace('\\"', '"'))
  for item in features:
    values = item['value']
    if not values:
      # Nothing to record for this key; indexing [0] would raise.
      continue
    key = item['key'].lower().strip()
    output_dict[key] = values[0].lower().strip()
  return output_dict
# Parse every row's raw feature string into a dict, stored in a new column.
df['features_parsed'] = df['features'].apply(parse_features)

In [76]:
# Show the first five parsed dicts as a plain array.
df['features_parsed'].iloc[:5].values

array([{'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer part number': 'z130103201090', 'brand': 'zoot', 'age group': 'adult'},
       {'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer part number': 'z130103201090', 'brand': 'zoot', 'age group': 'adult'},
       {'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer part number': 'z130103201090', 'brand': 'zoot', 'age group': 'adult'},
       {'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer part number': 'z130103201090', 'brand': 'zoot', 'age group': 'adult'},
       {'heel height': 'high (3 in. and up)', 'material': 's

In [77]:
# Collect every distinct feature key that appears in any parsed row.
keys = set()
for parsed in df['features_parsed']:
  keys.update(parsed.keys())
len(keys)

342

In [80]:
def get_name_feat(key):
  """Return the DataFrame column name used for the parsed feature `key`."""
  prefix = 'feat_'
  return prefix + key

# Create one `feat_<key>` column per key; rows lacking that key get NaN.
for key in tqdm_notebook(keys):
  column = get_name_feat(key)
  df[column] = df.features_parsed.map(lambda parsed: parsed.get(key, np.nan))

HBox(children=(IntProgress(value=0, max=342), HTML(value='')))




In [81]:
# List the column labels now that the per-key `feat_*` columns exist.
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_texture', 'feat_legging length', 'feat_rx-able',
       'feat_fabric care instructions', 'feat_shoe size', 'feat_gender',
       'feat_bridge/temple size:', 'feat_unit type',
       'feat_assembled product weight', 'feat_pronation'],
      dtype='object', length=391)

In [82]:
# Shape of the subset of rows whose parsed features have no 'gender' entry.
gender_missing = df["feat_gender"].isna()
df.loc[gender_missing].shape

(7410, 391)

In [84]:
# Total number of rows, for comparison with the missing-value count.
len(df)

18001

In [0]:
# For each feature key: percentage of rows where its column is not null.
total_rows = df.shape[0]
keys_stat = {
  key: int(df[get_name_feat(key)].notnull().sum()) / total_rows * 100
  for key in keys
}
keys_stat

In [96]:
# Keep only the keys that are filled in for more than 30% of the rows.
{name: share for name, share in keys_stat.items() if share > 30}

{'age group': 42.54208099550025,
 'brand': 57.22459863340925,
 'color': 53.1414921393256,
 'gender': 58.835620243319816,
 'manufacturer part number': 44.30309427254041,
 'material': 48.458418976723515,
 'model': 35.91467140714405,
 'shoe size': 32.32042664296428,
 'size': 30.4927504027554}

In [0]:
# Normalise the brand column to lower-case strings (overwrites it in place).
df['brand'] = df['brand'].map(str).str.lower()

In [0]:
# Integer-encode a few parsed features; factorize() marks missing values as -1.
for name in ('brand', 'color', 'gender', 'material', 'shoe size'):
  df['feat_' + name + '_cat'] = df['feat_' + name].factorize()[0]


In [0]:
# Integer-encode every parsed feature column as `feat_<key>_cat`.
for key in keys:
  df[get_name_feat(key) + "_cat"] = df[get_name_feat(key)].factorize()[0]

# Select encoded columns by their `_cat` suffix. A plain substring test
# ('cat' in name) would also pick up raw text columns such as 'categories'.
feats_cat = [x for x in df.columns if x.endswith('_cat')]
feats_cat


In [0]:
# Hand-picked subset of encoded columns used as model inputs.
feats = [
  'brand_cat',
  'feat_brand_cat',
  'feat_color_cat',
  'feat_gender_cat',
  'feat_material_cat',
  'feat_shoe size_cat',
  'feat_origin of components_cat',
  'feat_size/dimensions_cat',
  'feat_width x height_cat',
]

# Uncomment to extend the list with every name in feats_cat (de-duplicated):
# feats += feats_cat
# feats = list(set(feats))

In [102]:
# Reference score: random forest on the brand code alone.
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
run_model(feats=['brand_cat'], model=model)

(-51.356432361171095, 0.7207640114801125)

In [136]:
# Same random forest configuration, scored on the hand-picked feature list.
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
run_model(feats=feats, model=model)

(-50.09273957831771, 1.2635121048869071)

In [135]:
# Fit the model on the full data set and rank the features by permutation
# importance. The importances are computed on the same rows used for fitting.
X, Y = df[feats].values, df['prices_amountmin'].values

fitted_model = model.fit(X, Y)
perm = PermutationImportance(fitted_model, random_state=1).fit(X, Y)
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.6248  ± 0.0219,brand_cat
0.3707  ± 0.0178,feat_brand_cat
0.0736  ± 0.0091,feat_material_cat
0.0387  ± 0.0042,feat_gender_cat
0.0385  ± 0.0033,feat_shoe size_cat
0.0083  ± 0.0028,feat_color_cat
0.0008  ± 0.0001,feat_origin of components_cat
0.0001  ± 0.0000,feat_width x height_cat
0.0000  ± 0.0000,feat_size/dimensions_cat


In [0]:
# Relative frequency of each brand across all rows.
df.brand.value_counts(normalize=True)

In [141]:
# Shell escape: push local commits to the remote repository.
!git push

Everything up-to-date


In [143]:
# Shell escape: show staged and unstaged changes in the working tree.
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes to be committed:
  (use "git reset HEAD <file>..." to unstage)

	[32mnew file:   Day5.ipynb[m

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   Day4.ipynb[m
	[31mmodified:   Day5.ipynb[m

