In [1]:
!pip install eli5



In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
import eli5
from eli5.sklearn import PermutationImportance
from ast import literal_eval
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [3]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
df = pd.read_csv("data/men_shoes.csv", low_memory=False)

In [5]:
df.columns  

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension', 'ean', 'features',
       'flavors', 'imageurls', 'isbn', 'keys', 'manufacturer',
       'manufacturernumber', 'merchants', 'name', 'prices_amountmin',
       'prices_amountmax', 'prices_availability', 'prices_color',
       'prices_condition', 'prices_count', 'prices_currency',
       'prices_dateadded', 'prices_dateseen', 'prices_flavor', 'prices_issale',
       'prices_merchant', 'prices_offer', 'prices_returnpolicy',
       'prices_shipping', 'prices_size', 'prices_source', 'prices_sourceurls',
       'prices_warranty', 'quantities', 'reviews', 'sizes', 'skus',
       'sourceurls', 'upc', 'vin', 'websiteids', 'weight'],
      dtype='object')

In [0]:
def run_model(feats,model=DecisionTreeRegressor(max_depth=5)):
  """Cross-validate `model` on the given columns of the global `df`.

  Args:
    feats: List of column names of `df` used as the input matrix.
    model: Estimator handed to `cross_val_score`; defaults to a depth-5
      decision tree regressor.

  Returns:
    Tuple `(mean, std)` of the per-fold 'neg_mean_absolute_error' scores,
    with 'prices_amountmin' as the target.
  """
  feature_matrix = df[feats].values
  target = df['prices_amountmin'].values

  fold_scores = cross_val_score(
      model,
      feature_matrix,
      target,
      scoring='neg_mean_absolute_error',
  )
  return np.mean(fold_scores), np.std(fold_scores)

In [7]:
# Baseline experiment: integer-encode the raw brand names and score both
# regressors on that single feature.
df['brand_cat'] = pd.factorize(df['brand'])[0]
print("DecisionTreeRegressor model outcome: ", run_model(['brand_cat']))

model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)
print("RandomForestRegressor model outcome: ", run_model(['brand_cat'], model))

DecisionTreeRegressor model outcome:  (-58.38655694633361, 4.223555478221712)
RandomForestRegressor model outcome:  (-57.47223572384038, 4.328288468270897)


In [8]:
# Same encoding as before, but on lower-cased brand names so that spellings
# differing only in letter case receive the same code.
df['brand_cat2'] = pd.factorize(df['brand'].map(str).str.lower())[0]
print("DecisionTreeRegressor model outcome: ", run_model(['brand_cat2']))
print("RandomForestRegressor model outcome: ", run_model(['brand_cat2'], model))

DecisionTreeRegressor model outcome:  (-58.133398968282776, 4.206122611474276)
RandomForestRegressor model outcome:  (-57.31783843165656, 4.181246596160967)


In [9]:
df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [10]:
# `features` holds a list of {"key": ..., "value": [...]} dicts serialized as a str; we have to turn it back into Python objects before the column can be used
# the literal_eval function shown below makes that conversion easy

str_dict = '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]'
literal_eval(str_dict)

[{'key': 'Gender', 'value': ['Men']},
 {'key': 'Shoe Size', 'value': ['M']},
 {'key': 'Shoe Category', 'value': ["Men's Shoes"]},
 {'key': 'Color', 'value': ['Multicolor']},
 {'key': 'Manufacturer Part Number', 'value': ['8190-W-NAVY-7.5']},
 {'key': 'Brand', 'value': ['Josmo']}]

In [11]:
# we want each row's features in a flat form like the one below
{ 
    'Gender': 'Men',
    'Shoe Size': 'M',
    'Shoe Category': "Men's shoes",
    'Color': 'Multicolor',
    'Manufacturer Part Number': '8190-W-NAVY-7.5',
    'Brand': 'Josmo'
 }

{'Brand': 'Josmo',
 'Color': 'Multicolor',
 'Gender': 'Men',
 'Manufacturer Part Number': '8190-W-NAVY-7.5',
 'Shoe Category': "Men's shoes",
 'Shoe Size': 'M'}

In [12]:
def parse_features(x):
  """Turn one raw `features` cell into a flat ``{key: value}`` dict.

  The raw cell is text holding a list of ``{"key": ..., "value": [...]}``
  entries. Keys and values are lower-cased and stripped, and only the first
  element of each `value` list is kept.

  Args:
    x: Raw cell content. Missing cells (None, NaN, empty text or the text
      'nan') produce an empty dict.

  Returns:
    dict mapping normalised feature name to normalised first value. Entries
    whose `value` list is missing or empty are skipped.
  """
  output_dict = {}
  # pandas hands missing cells over as float NaN, whose text form is 'nan'.
  if x is None or str(x).strip() in ('', 'nan'):
    return output_dict

  # Some rows carry backslash-escaped quotes; unescape them before parsing.
  features = literal_eval(x.replace('\\"','"'))
  for item in features:
    # each item looks like {'key': 'Gender', 'value': ['Men']}
    values = item.get('value') or []
    if not values:
      # nothing to record for this key; avoids an IndexError on values[0]
      continue
    key = item['key'].lower().strip()
    output_dict[key] = str(values[0]).lower().strip()
  return output_dict


df['features_parsed'] = df['features'].map(parse_features)
df['features_parsed'].head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [13]:
# Gather every distinct feature name that occurs in any parsed row.
keys = set()
keys.update(*df['features_parsed'])
len(keys)

476

In [14]:
def get_name_feat(key):
  """Return the DataFrame column name used for the parsed feature `key`."""
  return "".join(("feat_", key))

# Give every parsed feature its own `feat_<name>` column; rows that do not
# carry the feature get NaN.
for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df['features_parsed'].map(
      lambda parsed: parsed.get(key, np.nan))

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [15]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_chain/necklace length (in.)', 'feat_impact resistant',
       'feat_bag size range', 'feat_flame resistant', 'feat_manufacturer',
       'feat_part number', 'feat_sole', 'feat_mechanic', 'feat_expandable',
       'feat_brand'],
      dtype='object', length=527)

In [0]:
# Percentage of rows in which each parsed feature is present (non-null).
keys_stats = {
    key: len(df[df[get_name_feat(key)].notnull()]) / len(df) * 100
    for key in keys
}

In [17]:
{k:v for k,v in keys_stats.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
# Integer-encode a few hand-picked feature columns.
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
# Fixed copy-paste slip: this column used to be built from 'feat_brand'.
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]
df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]

# Integer-encode every parsed feature column. Any column above whose feature
# name is in `keys` is simply recomputed here with the same result.
for key in keys:
  df[get_name_feat(key)+'_cat'] = df[get_name_feat(key)].factorize()[0]

In [19]:
# Normalise the raw brand column to lower-case text, then check how many rows
# agree with the brand parsed out of `features`.
df['brand'] = df['brand'].map(str).str.lower()
df.loc[df['brand'] == df['feat_brand']].shape

(8846, 1003)

In [20]:
# Seeded (like the other forests in this notebook) so that the
# cross-validation score is reproducible between runs.
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat2'],model)

(-57.29584149309792, 4.244010718855507)

In [0]:
# Collect the integer-encoded columns by their suffix. A plain substring test
# for 'cat' would also pick up raw text columns such as 'categories'.
feats_cat = [col for col in df.columns if col.endswith(('_cat', '_cat2'))]

In [0]:
# Hand-picked encoded columns that are evaluated together.
# NOTE: 'weight_converted_cat' is only created by the weight-conversion cell
# further down in this notebook, so that cell has to be run before this one.
feats = [
      'brand_cat2',
      'feat_brand_cat',
      'feat_gender_cat',
      'feat_material_cat',
      'feat_movement_cat',
      'feat_adjustable_cat',
      'feat_resizable_cat',
      'feat_fabric content_cat',
      'feat_case thickness_cat',
      'weight_converted_cat']

# Seeded so that the cross-validation score is reproducible between runs.
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
result = run_model(feats,model)

In [50]:
# Fit a seeded forest on the full dataset and rank the selected features with
# eli5's PermutationImportance.
X, y = df[feats].values, df['prices_amountmin'].values
m = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
m.fit(X, y)

print(result)
perm = PermutationImportance(m, random_state=1)
perm.fit(X, y)
eli5.show_weights(perm, feature_names=feats)

(-57.243850579763276, 4.241041760411566)


Weight,Feature
0.2577  ± 0.0069,brand_cat2
0.1146  ± 0.0077,feat_material_cat
0.0238  ± 0.0021,feat_brand_cat
0.0181  ± 0.0006,feat_fabric content_cat
0.0087  ± 0.0010,feat_resizable_cat
0.0084  ± 0.0005,feat_case thickness_cat
0.0079  ± 0.0031,feat_adjustable_cat
0.0054  ± 0.0009,feat_gender_cat
0.0050  ± 0.0010,feat_movement_cat
0.0009  ± 0.0002,weight_converted_cat


In [24]:
df['brand'].value_counts(normalize=True)

nike              0.097210
puma              0.033315
ralph lauren      0.028775
vans              0.021116
new balance       0.020295
                    ...   
calibrate         0.000055
merino            0.000055
louis m gerson    0.000055
owl               0.000055
xtreme couture    0.000055
Name: brand, Length: 1732, dtype: float64

In [25]:
df[df['brand'] == 'nike'].features_parsed.sample(5).values

array([{'material': 'canvas', 'gender': 'men', 'color': 'blue', 'model': '642290 412', 'manufacturer part number': '642290 412', 'brand': 'nike', 'age group': 'adult'},
       {'sport': 'any sport', 'style': 'pants', 'condition': 'new with tags'},
       {'sport': 'football', 'main color': 'purple & green', 'type': 'cleats'},
       {'condition': 'new with box', 'type': 'cleats'},
       {'style': 'basketball shoes', 'country/region of manufacture': 'china', 'condition': 'new with box'}],
      dtype=object)

In [0]:
#df['weight'].unique()

In [48]:
df['weight_string'] = df['weight'].astype(str)
def convert_to_grams(weight):
  """Convert a textual weight such as ``'1.5 lbs'`` into grams, as a string.

  Args:
    weight: Text made of a number followed by a unit, e.g. ``'425 g'``,
      ``'2 Kg'``, ``'1.5 pounds'`` or ``'10 ounces'``. Units are matched
      case-insensitively; whitespace between number and unit is optional.

  Returns:
    ``'0'`` when the text contains ``'nan'`` (missing weight); the number
    text unchanged when the unit is already grams; ``str(number * factor)``
    for the other known units; ``None`` when the text is not a number
    followed by a known unit.
  """
  import re  # local import: the notebook's import cell does not provide it

  grams_per_unit = {
      'kg': 1000.0,
      'lb': 453.592,
      'lbs': 453.592,
      'pound': 453.592,
      'pounds': 453.592,
      'oz': 28.35,
      'ounce': 28.35,
      'ounces': 28.35,
  }

  text = str(weight).strip()
  if 'nan' in text:
    return '0'

  # Split into number and unit instead of slicing off a fixed-width suffix.
  parsed = re.fullmatch(r'(\d+\.?\d*|\.\d+)\s*([A-Za-z]+)\.?', text)
  if parsed is None:
    return None
  number, unit = parsed.group(1), parsed.group(2).lower()

  # Compare the whole unit: a substring test for 'g' would also catch 'Kg'.
  if unit in ('g', 'gram', 'grams'):
    return number
  factor = grams_per_unit.get(unit)
  if factor is None:
    return None
  return str(float(number) * factor)

df['weight_converted'] = df['weight_string'].map(convert_to_grams)
df['weight_converted_cat'] = df['weight_converted'].factorize()[0]
df['weight_converted_cat']

array([0, 0, 0, ..., 0, 0, 0])

In [0]:
def addGitcommit(filepath, message):
  !git add /filepath/
  !git config --global user.email 'janeta.mateusz@gmail.com'
  !git config --global user.name 'SirMatix'
  !git commit -m message
  !git push origin master

In [53]:
ls

[0m[01;34mdata[0m/  HelloGithub.ipynb  LICENSE  [01;34mmatrix_one[0m/  README.md


In [0]:
addGitcommit('day5.ipynb',"Trying out commit function")

fatal: pathspec 'filepath' did not match any files
