# Vestiaire Collective - Data preprocessing 

In [3]:
import pandas as pd
import numpy as np

In [2]:
import re

In [3]:
import plotly.figure_factory as ff

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
from functions import (
    get_quantiles_by_group, 
    feature_prop_table, 
    target_boxplot
)

## Load the dataset 

In [2]:
backup_path = "C:/Users/pemma/OneDrive - Université de Tours/Mécen/M2/S1/02 - Machine Learning/05 - Projet/ML_Vestiaire_Collective/backup/"

In [7]:
data  = pd.read_pickle(backup_path + "vc_data.pkl")

In [8]:
data.head()

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.0,True,09/11/2021,women,shoes,ankle boots,acne studios,very good condition,leather,black,"[39, eu]",italy
1,19181389,https://fr.vestiairecollective.com//women-clot...,1,40.55,True,12/11/2021,women,clothing,jeans,acne studios,good condition,denim - jeans,navy,"[30, us]",poland
2,19182029,https://fr.vestiairecollective.com//men-clothi...,6,332.5,True,12/11/2021,men,clothing,coats,acne studios,good condition,wool,black,"[l, international]",denmark
3,19132670,https://fr.vestiairecollective.com//men-clothi...,3,45.0,False,09/11/2021,men,clothing,jeans,acne studios,"never worn, with tag",cotton,grey,"[28, us]",germany
4,19118182,https://fr.vestiairecollective.com//women-clot...,9,105.0,False,09/11/2021,women,clothing,dresses,acne studios,very good condition,linen,black,"[s, international]",germany


In [9]:
data.shape

(10409, 15)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10409 entries, 0 to 43
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10409 non-null  int64  
 1   url           10409 non-null  object 
 2   num_likes     10409 non-null  int64  
 3   price         10409 non-null  float64
 4   we_love_tag   10409 non-null  bool   
 5   online_date   10409 non-null  object 
 6   gender        10409 non-null  object 
 7   category      10409 non-null  object 
 8   sub_category  10409 non-null  object 
 9   designer      10409 non-null  object 
 10  condition     10409 non-null  object 
 11  material      10409 non-null  object 
 12  color         10409 non-null  object 
 13  size          10409 non-null  object 
 14  location      10409 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(11)
memory usage: 1.2+ MB


In [11]:
# NAs
nan_ix = {
    col: data.loc[pd.isna(data[col]), :].index.values.tolist()
    for col in data.columns
}
nan_ix

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': []}

In [12]:
data_cleaned = data.copy()

In [13]:
# add log price column to the data
data_cleaned["lprice"] = np.log(data.price+1)

## `we_love_tag`

In [14]:
# convert "we_love_tag" to binary (0, 1)
data_cleaned["we_love_tag"] = data_cleaned["we_love_tag"].astype(int)

## `gender`

In [15]:
# convert "gender" to cat variable
data_cleaned["gender"] = pd.Categorical(data_cleaned.gender)

In [16]:
feature_prop_table(feature="gender", data=data_cleaned)

Unnamed: 0,gender,count,freq,cumul_freq
0,women,8266,0.79412,0.79412
1,men,2001,0.192237,0.986358
2,kids,133,0.012777,0.999135
3,life & living,9,0.000865,1.0


In [17]:
# remove "life & living"
data_cleaned = data_cleaned[data_cleaned.gender != "life & living"]
data_cleaned.gender = data_cleaned.gender.cat.remove_unused_categories()

In [18]:
feature_prop_table(feature="gender", data=data_cleaned)

Unnamed: 0,gender,count,freq,cumul_freq
0,women,8266,0.794808,0.794808
1,men,2001,0.192404,0.987212
2,kids,133,0.012788,1.0


## `condition`

In [19]:
# convert "condition" to cat variable
data_cleaned["condition"] = pd.Categorical(data_cleaned.condition)

In [20]:
# value counts "condition"
feature_prop_table(feature="condition", data=data_cleaned)

Unnamed: 0,condition,count,freq,cumul_freq
0,very good condition,4803,0.461827,0.461827
1,never worn,3395,0.326442,0.788269
2,good condition,1196,0.115,0.903269
3,"never worn, with tag",821,0.078942,0.982212
4,fair condition,185,0.017788,1.0


In [21]:
# rename levels
data_cleaned["condition"] = data_cleaned["condition"].replace(to_replace="never worn, with tag", value="never worn")
data_cleaned["condition"] = data_cleaned["condition"].cat.rename_categories(["fair_condition", "good_condition", "never_worn", "very_good_condition"])

In [22]:
feature_prop_table(feature="condition", data=data_cleaned)

Unnamed: 0,condition,count,freq,cumul_freq
0,very_good_condition,4803,0.461827,0.461827
1,never_worn,4216,0.405385,0.867212
2,good_condition,1196,0.115,0.982212
3,fair_condition,185,0.017788,1.0


## `category`

In [23]:
# convert "category" to cat variable
data_cleaned["category"] = pd.Categorical(data_cleaned.category)

In [24]:
# change "boys" and "girls" to "clothing" in "category"
data_cleaned = data_cleaned.replace("boys", "clothing").replace("girls", "clothing")

In [25]:
data_cleaned.category = data_cleaned.category.cat.remove_unused_categories()

In [26]:
feature_prop_table(feature="category", data=data_cleaned)

Unnamed: 0,category,count,freq,cumul_freq
0,shoes,3351,0.322212,0.322212
1,clothing,3316,0.318846,0.641058
2,bags,2265,0.217788,0.858846
3,accessories,1116,0.107308,0.966154
4,jewellery,352,0.033846,1.0


In [27]:
# remove "jewellery" from "category"
data_cleaned = data_cleaned[data_cleaned.category != "jewellery"]

In [28]:
data_cleaned.category = data_cleaned.category.cat.remove_unused_categories()

In [29]:
feature_prop_table(feature="category", data=data_cleaned)

Unnamed: 0,category,count,freq,cumul_freq
0,shoes,3351,0.333499,0.333499
1,clothing,3316,0.330016,0.663515
2,bags,2265,0.225418,0.888933
3,accessories,1116,0.111067,1.0


## `sub-category`

### shoes

In [30]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "shoes"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,trainers,1037,0.30946,0.30946
1,ankle boots,594,0.177261,0.48672
2,sandals,493,0.14712,0.633841
3,heels,487,0.14533,0.77917
4,boots,368,0.109818,0.888988
5,flats,143,0.042674,0.931662
6,ballet flats,140,0.041779,0.973441
7,lace ups,46,0.013727,0.987168
8,espadrilles,31,0.009251,0.996419
9,mules & clogs,11,0.003283,0.999702


In [31]:
# lins some shoes sub categories
shoes_cat_to_link = {
    "ankle boots": "boots", 
    "ballet flats": "flats", 
    "espadrilles": "mules & clogs"
}
data_cleaned = data_cleaned.replace({"sub_category": shoes_cat_to_link})

In [32]:
# rename "mules & clogs" as comfy shoes
data_cleaned = data_cleaned.replace("mules & clogs", "comfy_shoes")

In [33]:
# rename "lace ups" as "lace_ups"
data_cleaned = data_cleaned.replace("lace ups", "lace_ups")

In [34]:
# remove "first shoes" from type of shoes
data_cleaned = data_cleaned.loc[data_cleaned.sub_category != "first shoes"]

In [35]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "shoes"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,trainers,1037,0.309552,0.309552
1,boots,962,0.287164,0.596716
2,sandals,493,0.147164,0.743881
3,heels,487,0.145373,0.889254
4,flats,283,0.084478,0.973731
5,lace_ups,46,0.013731,0.987463
6,comfy_shoes,42,0.012537,1.0


### clothing

In [36]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "clothing"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,jackets,555,0.16737,0.16737
1,coats,525,0.158323,0.325694
2,dresses,334,0.100724,0.426417
3,tops,326,0.098311,0.524729
4,knitwear,301,0.090772,0.615501
5,knitwear & sweatshirts,213,0.064234,0.679735
6,trousers,197,0.059409,0.739144
7,t-shirts,149,0.044934,0.784077
8,jeans,121,0.03649,0.820567
9,skirts,117,0.035283,0.85585


In [37]:
# link some clothing sub categories
clothing_cat_to_link = {
    "jackets & coats": "coats", 
    "trench coats": "coats", 
    "leather jackets": "jackets", 
    "jumpsuits": "suits", 
    "outfits": "suits", 
    "shirts": "polo shirts", 
    "t-shirts": "tops", 
    "knitwear": "knitwear & sweatshirts", 
}
data_cleaned = data_cleaned.replace({"sub_category": clothing_cat_to_link})

In [38]:
# rename "knitwear & sweatshirts" as "knitwear_sweatshirts"
data_cleaned = data_cleaned.replace("knitwear & sweatshirts", "knitwear_sweatshirts")

In [39]:
# rename "polo shirts" as "polo_shirts"
data_cleaned = data_cleaned.replace("polo shirts", "polo_shirts")

In [40]:
# remove clothing sub categories with proportion < 1%
data_cleaned = data_cleaned.loc[-data_cleaned.sub_category.isin(["lingerie", "shorts"])]

In [41]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "clothing"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,coats,681,0.207812,0.207812
1,jackets,602,0.183705,0.391517
2,knitwear_sweatshirts,514,0.156851,0.548367
3,tops,475,0.14495,0.693317
4,dresses,334,0.101922,0.79524
5,trousers,197,0.060116,0.855356
6,jeans,121,0.036924,0.89228
7,skirts,117,0.035703,0.927983
8,swimwear,93,0.02838,0.956363
9,polo_shirts,92,0.028074,0.984437


### bags

In [42]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "bags"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.820309,0.820309
1,clutch bags,121,0.053422,0.873731
2,bags,121,0.053422,0.927152
3,backpacks,58,0.025607,0.952759
4,travel bags,51,0.022517,0.975276
5,"small bags, wallets & cases",49,0.021634,0.996909
6,belt bags,7,0.003091,1.0


In [43]:
bags_cat_to_link = {
    "clutch bags": "small bags, wallets & cases", 
    "belt bags": "small bags, wallets & cases", 
    "backpacks": "travel bags", 
}

In [44]:
data_cleaned = data_cleaned.replace({"sub_category": bags_cat_to_link})

In [45]:
# rename "small bags, wallets & cases" as "small_bags"
data_cleaned.loc[
    data_cleaned.sub_category == "small bags, wallets & cases", 
    "sub_category"
] = "small_bags"

In [46]:
# rename "travel bags" as "travel_bags"
data_cleaned.loc[
    data_cleaned.sub_category == "travel bags", 
    "sub_category"
] = "travel_bags"

In [47]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category=="bags"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.820309,0.820309
1,small_bags,177,0.078146,0.898455
2,bags,121,0.053422,0.951876
3,travel_bags,109,0.048124,1.0


### accessories

In [48]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "accessories"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,belts,202,0.181004,0.181004
1,scarves,189,0.169355,0.350358
2,sunglasses,168,0.150538,0.500896
3,wallets,151,0.135305,0.636201
4,"purses, wallets & cases",92,0.082437,0.718638
5,hats,82,0.073477,0.792115
6,watches,44,0.039427,0.831541
7,silk handkerchief,41,0.036738,0.86828
8,ties,39,0.034946,0.903226
9,hats & pull on hats,33,0.02957,0.932796


In [49]:
accessories_to_link = {
    "silk handkerchief": "scarves", 
    "hats & pull on hats": "hats", 
    "scarves & pocket squares": "scarves", 
    "purses, wallets & cases": "small_bags", 
    "wallets": "small_bags"
}

In [50]:
data_cleaned = data_cleaned.replace({"sub_category": accessories_to_link})

In [51]:
accessories_prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "accessories"]
)
accessories_prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,scarves,259,0.232079,0.232079
1,small_bags,243,0.217742,0.449821
2,belts,202,0.181004,0.630824
3,sunglasses,168,0.150538,0.781362
4,hats,115,0.103047,0.884409
5,watches,44,0.039427,0.923835
6,ties,39,0.034946,0.958781
7,jewellery,25,0.022401,0.981183
8,gloves,15,0.013441,0.994624
9,cufflinks,5,0.00448,0.999104


In [52]:
# remove types of accessories with count < 100
accessories_to_remove = accessories_prop_table.loc[
    accessories_prop_table["count"] < 100, 
    "sub_category"
].tolist()
data_cleaned = data_cleaned.loc[
    -data_cleaned.sub_category.isin(accessories_to_remove)
]

In [53]:
accessories_prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category=="accessories"]
)
accessories_prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,scarves,259,0.262411,0.262411
1,small_bags,243,0.246201,0.508612
2,belts,202,0.204661,0.713273
3,sunglasses,168,0.170213,0.883485
4,hats,115,0.116515,1.0


In [54]:
sub_cat_prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned
)   
sub_cat_prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.188076,0.188076
1,trainers,1037,0.10497,0.293046
2,boots,962,0.097378,0.390424
3,coats,681,0.068934,0.459358
4,jackets,602,0.060937,0.520296
5,knitwear_sweatshirts,514,0.05203,0.572325
6,sandals,493,0.049904,0.622229
7,heels,487,0.049296,0.671525
8,tops,475,0.048082,0.719607
9,small_bags,420,0.042514,0.762122


In [55]:
# remove "comfy_shoes", "suits" and "lace_ups"
sub_cat_to_remove = sub_cat_prop_table.loc[
    sub_cat_prop_table["count"] < 90, 
    "sub_category"
].values.tolist()
data_cleaned = data_cleaned.loc[-data_cleaned.sub_category.isin(sub_cat_to_remove)]

In [56]:
# convert "sub_category" to cat variable
data_cleaned["sub_category"] = pd.Categorical(data_cleaned.sub_category)

In [57]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.19076,0.19076
1,trainers,1037,0.106468,0.297228
2,boots,962,0.098768,0.395996
3,coats,681,0.069918,0.465914
4,jackets,602,0.061807,0.527721
5,knitwear_sweatshirts,514,0.052772,0.580493
6,sandals,493,0.050616,0.631109
7,heels,487,0.05,0.681109
8,tops,475,0.048768,0.729877
9,small_bags,420,0.043121,0.772998


## `designer`

In [58]:
designer_prop_table = feature_prop_table(
    feature="designer", 
    data=data_cleaned
)
designer_prop_table

Unnamed: 0,designer,count,freq,cumul_freq
0,jean paul gaultier,317,0.032546,0.032546
1,coach,309,0.031725,0.064271
2,chloé,308,0.031622,0.095893
3,christian louboutin,306,0.031417,0.12731
4,jimmy choo,306,0.031417,0.158727
5,acne studios,305,0.031314,0.190041
6,burberry,304,0.031211,0.221253
7,balmain,303,0.031109,0.252361
8,moncler,303,0.031109,0.28347
9,off-white,300,0.030801,0.314271


In [59]:
# remove brands with less than 1% items
designer_to_remove = designer_prop_table.loc[
    designer_prop_table.freq < .01, 
    "designer"
].values.tolist()
data_cleaned = data_cleaned.loc[
    -data_cleaned.designer.isin(designer_to_remove)
]

In [60]:
lprice_by_designer = get_quantiles_by_group(
    feature="designer", 
    data=data_cleaned
)
lprice_by_designer

Unnamed: 0_level_0,lprice,lprice,lprice
Unnamed: 0_level_1,q0.25,median,q0.75
designer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
chanel,6.136006,6.862744,7.898595
louis vuitton,5.860786,6.358678,6.891108
hermès,5.673323,6.335437,7.09091
bottega veneta,5.525453,6.11845,6.673298
balenciaga,5.602119,6.111467,6.39693
dior,5.192957,6.089691,6.979994
saint laurent,5.515437,6.036659,6.802395
christian louboutin,5.595028,5.918005,6.175867
balmain,5.198497,5.83773,6.448888
fendi,5.180134,5.755742,6.286825


In [61]:
lprice_by_designer_q1 = lprice_by_designer[("lprice", "q0.25")].values
lprice_by_designer_med = lprice_by_designer[("lprice", "median")].values
lprice_by_designer_q3 = lprice_by_designer[("lprice", "q0.75")].values

In [62]:
fig = ff.create_distplot(
    hist_data=[
        lprice_by_designer_q1,
        lprice_by_designer_med, 
        lprice_by_designer_q3
    ], 
    group_labels=["25%", "Median", "75%"], 
    show_hist=False, 
    show_rug=False
)
fig.update_layout(title_text="Aggregate log price by designer")
fig.show()

In [63]:
lprice_by_designer_quantiles = np.quantile(
    a=lprice_by_designer_med, 
    q=np.linspace(0, 1, 5)
)
lprice_by_designer_quantiles                # log price quantiles
# np.exp(lprice_by_designer_quantiles)-1      # price quantiles

array([4.20599091, 5.31544321, 5.53739703, 5.81723339, 6.86274424])

In [64]:
def classify_designer_by_price(designer: str): 
    lprice_med = lprice_by_designer.loc[
        lprice_by_designer.index == designer, 
        ("lprice", "median")
    ].values[0]
    if lprice_med < lprice_by_designer_quantiles[1]:
        return "cheap_brand"
    else:
        if lprice_med < lprice_by_designer_quantiles[2]: 
            return "average_priced_brand"
        else: 
            if lprice_med < lprice_by_designer_quantiles[3]:
                return "expensive_brand"
            else: 
                return "very_expensive_brand"

In [65]:
data_cleaned.designer = data_cleaned.designer.apply(classify_designer_by_price)

In [66]:
# convert "designer" to cat 
data_cleaned["designer"] = pd.Categorical(data_cleaned["designer"])

## `material`

In [67]:
material_prop_table = feature_prop_table(
    feature="material",
    data=data_cleaned
)
material_prop_table

Unnamed: 0,material,count,freq,cumul_freq
0,leather,3982,0.41077,0.41077
1,cotton,1098,0.113266,0.524035
2,wool,732,0.075511,0.599546
3,cloth,628,0.064782,0.664328
4,polyester,469,0.04838,0.712709
5,synthetic,391,0.040334,0.753043
6,suede,376,0.038787,0.79183
7,silk,328,0.033835,0.825665
8,patent leather,306,0.031566,0.857231
9,viscose,156,0.016092,0.873324


In [68]:
# create "other_material" group for material such that freq < 1%
other_material = material_prop_table.loc[ 
    material_prop_table["freq"] < .01, 
    "material"
].values.tolist()
data_cleaned.loc[ 
    data_cleaned.material.isin(other_material), 
    "material"
] = "other_material"

In [69]:
# rename some categories in "material"
data_cleaned.material = data_cleaned.material.replace("patent leather", "patent_leather").replace("denim - jeans", "denim_jeans")

In [70]:
material_prop_table = feature_prop_table(
    feature="material",
    data=data_cleaned
)
material_prop_table

Unnamed: 0,material,count,freq,cumul_freq
0,leather,3982,0.41077,0.41077
1,cotton,1098,0.113266,0.524035
2,other_material,733,0.075614,0.599649
3,wool,732,0.075511,0.67516
4,cloth,628,0.064782,0.739942
5,polyester,469,0.04838,0.788323
6,synthetic,391,0.040334,0.828657
7,suede,376,0.038787,0.867444
8,silk,328,0.033835,0.901279
9,patent_leather,306,0.031566,0.932845


In [71]:
# convert "material" to cat
categories = [
    "other_material",
    "leather",
    "cotton",
    "wool",
    "cloth",
    "polyester",
    "synthetic",
    "suede",
    "silk",
    "patent_leather",
    "other",
    "viscose",
    "plastic",
    "cashmere",
    "denim_jeans"
]
data_cleaned["material"] = pd.Categorical(
    values=data_cleaned["material"], 
    categories=categories
)

## `color`

In [72]:
color_prop_table = feature_prop_table(
    feature="color", 
    data=data_cleaned
)
color_prop_table

Unnamed: 0,color,count,freq,cumul_freq
0,black,3308,0.341242,0.341242
1,white,878,0.090571,0.431813
2,multicolour,760,0.078399,0.510213
3,brown,695,0.071694,0.581906
4,beige,675,0.069631,0.651537
5,blue,637,0.065711,0.717248
6,pink,393,0.040541,0.757788
7,grey,356,0.036724,0.794512
8,red,296,0.030534,0.825046
9,green,288,0.029709,0.854756


In [73]:
# create "other_color" group for color such that freq < .9%
other_color = color_prop_table.loc[ 
    color_prop_table["freq"] < .009, 
    "color"
].values.tolist()
data_cleaned.loc[ 
    data_cleaned.color.isin(other_color), 
    "color"
] = "other_color"

In [74]:
color_prop_table = feature_prop_table(
    feature="color", 
    data=data_cleaned
)
color_prop_table.color.values.tolist()

['black',
 'white',
 'multicolour',
 'brown',
 'beige',
 'blue',
 'pink',
 'grey',
 'other_color',
 'red',
 'green',
 'camel',
 'navy',
 'burgundy',
 'gold',
 'silver',
 'orange',
 'yellow',
 'purple']

In [75]:
# convert "color" to cat
categories = [
    "other_color",
    "black",
    "white",
    "multicolour",
    "brown",
    "beige",
    "blue",
    "pink",
    "grey",
    "red",
    "green",
    "camel",
    "navy",
    "burgundy",
    "gold",
    "silver",
    "orange",
    "yellow",
    "purple"]
data_cleaned["color"] = pd.Categorical(
    values=data_cleaned["color"], 
    categories=categories
)

## `location`

In [76]:
feature_prop_table(feature="location", data=data_cleaned)

Unnamed: 0,location,count,freq,cumul_freq
0,italy,2801,0.288942,0.288942
1,france,1858,0.191665,0.480607
2,united kingdom,1317,0.135857,0.616464
3,germany,982,0.1013,0.717764
4,spain,483,0.049825,0.767588
5,romania,292,0.030122,0.79771
6,poland,203,0.020941,0.818651
7,greece,188,0.019393,0.838044
8,belgium,173,0.017846,0.85589
9,netherlands,171,0.01764,0.87353


In [77]:
# reduce "location" variable
def map_geo_area(country):
    """Map a country to the corresponding geographical area."""
    if country not in (
        "canada",
        "israel",
        "singapore", 
        "switzerland", 
        "united kingdom", 
        "united states"
    ): 
        return "eu"
    else: 
        if country != "united kingdom":
            return "other_country"
        else: 
            return "us"
data_cleaned.location = data_cleaned.location.replace("vestiaire collective france", "france")
data_cleaned.location = data_cleaned.location.apply(map_geo_area)
data_cleaned.location = data_cleaned.location.replace(" ", "_") 

In [78]:
feature_prop_table(feature="location", data=data_cleaned)

Unnamed: 0,location,count,freq,cumul_freq
0,eu,8361,0.862492,0.862492
1,us,1317,0.135857,0.998349
2,other_country,16,0.001651,1.0


In [79]:
data_cleaned.location = pd.Categorical(
    values=data_cleaned.location, 
    categories=["other_country", "eu", "uk"]
)

## `size`

In [80]:
data_cleaned["size"] = data_cleaned["size"].apply(lambda size: "".join(size))

In [81]:
data_cleaned.loc[
    data_cleaned["size"] == "size", 
    "size"
] = "no_size"

In [82]:
data_cleaned["size"] = data_cleaned["size"].apply( lambda size: size.replace("international", "") )

### bags

In [83]:
data_cleaned.loc[
    data_cleaned.category.isin(["bags"]), 
    "size"
] = "no_size"

### accessories

In [84]:
accessories = list(set(
    data_cleaned.loc[
        data_cleaned.category == "accessories",
        "sub_category"
    ].values
))
accessories

['hats', 'sunglasses', 'belts', 'small_bags', 'scarves']

#### belts

In [85]:
belt_sizes = {
    "70cm": "xxs",
    "75cm": "xs", 
    "80cm": "xs", 
    "85cm": "s", 
    "90cm": "m", 
    "95cm": "l", 
    "100cm": "xl"
}

In [86]:
data_cleaned[
    data_cleaned.sub_category == "belts"
] = data_cleaned[
    data_cleaned.sub_category == "belts"
].replace({"size": belt_sizes})

#### hats

In [87]:
hat_sizes = {
    "21.2inches": "s", 
    "24inches": "xxl", 
    "54cm": "s", 
    "55cm": "s", 
    "56cm": "m",
    "57cm": "m", 
    "58cm": "l", 
    "59cm": "l", 
    "60cm": "xl", 
    "61cm": "xl", 
}

In [88]:
data_cleaned[
    data_cleaned.sub_category == "hats"
] = data_cleaned[
    data_cleaned.sub_category == "hats"
].replace({"size": hat_sizes})

### shoes

In [89]:
uk_shoe_size = data_cleaned[data_cleaned.category == "shoes"]["size"].str.contains("uk").values

In [90]:
# convert UK shoe size to EU shoe size
uk_size_to_eu = {
    "2uk": "35", 
    "3uk": "36", 
    "3.5uk": "36",
    "4uk": "37", 
    "4.5uk": "37", 
    "5uk": "38", 
    "5.5uk": "39", 
    "6uk": "39", 
    "6.5uk": "40", 
    "7uk": "41", 
    "7.5uk": "41", 
    "8uk": "42", 
    "8.5uk": "42", 
    "9uk": "43",
    "9.5uk": "44",  
    "10uk": "45", 
    "10.5uk": "45", 
    "11uk": "46", 
    "12uk": "47", 
}

In [91]:
data_cleaned[data_cleaned.category == "shoes"] = data_cleaned[data_cleaned.category == "shoes"].replace({"size": uk_size_to_eu})

In [92]:
us_shoe_size = data_cleaned[data_cleaned.category == "shoes"]["size"].str.contains("us").values

In [93]:
# convert US shoe size to EU shoe size
us_size_to_eu = {
    "3us": "34", 
    "3.5us": "34.5",
    "4us": "35", 
    "4.5us": "35.5", 
    "5us": "36", 
    "5.5us": "36.5", 
    "6us": "37", 
    "6.5us": "37.5", 
    "7us": "38", 
    "7.5us": "38.5", 
    "8us": "39", 
    "8.5us": "39.5", 
    "9us": "40",
    "9.5us": "40.5",  
    "10us": "41", 
    "10.5us": "41.5", 
    "11us": "42", 
}

In [94]:
data_cleaned[data_cleaned.category == "shoes"] = data_cleaned[data_cleaned.category == "shoes"].replace({"size": us_size_to_eu})

In [95]:
# remove "fr" and "it" from shoe size
data_cleaned.loc[
    data_cleaned.category == "shoes", 
    "size"
] = data_cleaned[
    data_cleaned.category == "shoes"
]["size"].apply( lambda old_size: re.sub("[^\d\.]", "", old_size) )

In [96]:
def classify_shoe_size(shoe_size: str): 
    shoe_size = float(shoe_size)
    if shoe_size <= 35: 
        return "<=35"
    else: 
        if shoe_size >= 45: 
            return ">=45"
        else:
            return str( int( round(shoe_size, 0) ) )

In [97]:
data_cleaned.loc[
    data_cleaned.category == "shoes", 
    "size"
] = data_cleaned[ data_cleaned.category == "shoes" ]["size"].apply(classify_shoe_size).values

In [98]:
feature_prop_table(
    feature="size", 
    data=data_cleaned[data_cleaned.category == "shoes"]
)

Unnamed: 0,size,count,freq,cumul_freq
0,38,748,0.230012,0.230012
1,40,508,0.156212,0.386224
2,39,424,0.130381,0.516605
3,37,411,0.126384,0.642989
4,36,358,0.110086,0.753075
5,41,223,0.068573,0.821648
6,42,211,0.064883,0.886531
7,<=35,116,0.03567,0.922202
8,43,98,0.030135,0.952337
9,44,96,0.02952,0.981857


### clothing

In [99]:
# convert uk size to eu size
clothing_uk_to_eu = {
    "3uk": "3xs", 
    "4uk": "xxs", 
    "6uk": "xs", 
    "8uk": "s", 
    "10uk": "m", 
    "12uk": "l", 
    "14uk": "xl", 
    "15.5uk": "m", 
    "16uk": "xxl", 
    "18-20uk": "3xl", 
    "28uk": "xs", 
    "30uk": "s",
    "32uk": "m", 
    "34uk": "xs", 
    "36uk": "s", 
    "40uk": "l", 
    "42uk": "xl"
}

In [100]:
data_cleaned.loc[
    data_cleaned.category == "clothing", 
    :
]= data_cleaned[
    data_cleaned.category == "clothing"
].replace({"size": clothing_uk_to_eu})

In [101]:
# convert us size to eu size
clothing_us_to_eu = {
    "0us": "xxs", 
    "1us": "xxs", 
    "2us": "xs",
    "4us": "s", 
    "6us": "m", 
    "8us": "l", 
    "10us": "xl", 
    "12us": "xxl", 
    "14-16us": "3xl",  
    "18us": "4xl", 
    "24us": "xs", 
    "25us": "xs", 
    "26us": "s",
    "27us": "s",
    "28us": "m", 
    "29us": "m", 
    "30us": "m",
    "31us": "m",
    "32us": "m", 
    "33us": "l", 
    "34us": "l", 
    "36us": "xl", 
}

In [102]:
data_cleaned = data_cleaned.replace({"size": clothing_us_to_eu})

In [103]:
# convert international numeric size to s, m, l size
it_clothing_sizes = {
    "34it": "3xs", 
    "36it": "xxs", 
    "38it": "xs", 
    "40it": "s", 
    "42it": "m", 
    "44it": "l", 
    "46it": "xl", 
    "48it": "xxl", 
    "50it": "l", 
    "50-52it": "3xl", 
    "52it": "xl", 
    "54it": "xxl", 
    "56it": "3xl", 
}

In [104]:
data_cleaned = data_cleaned.replace({"size": it_clothing_sizes})

In [105]:
# convert fr numeric size to s, m, l size
fr_clothing_sizes = {
    "30fr": "3xs", 
    "32fr": "xxs", 
    "34fr": "xs", 
    "36fr": "s", 
    "38fr": "m", 
    "40fr": "l", 
    "40-": "l", 
    "42fr": "xl", 
    "44fr": "xxl", 
    "46fr": "3xl", 
    "48fr": "m",
    "50fr": "l", 
    "52fr": "xl", 
    "54fr": "xxl", 
    "56fr": "3xl" 
}

In [106]:
data_cleaned = data_cleaned.replace({"size": fr_clothing_sizes})

In [107]:
# convert eu size to s, m, l size
eu_clothing_sizes = {
    "37eu": "xs", 
    "38eu": "s", 
    "39eu": "m", 
    "40eu": "m",
    "41eu": "l",
    "42eu": "xl", 
    "43eu": "xxl", 
}

In [108]:
data_cleaned = data_cleaned.replace({"size": eu_clothing_sizes})

In [109]:
# convert size from "0-5" scale to international size
clothing_size_conversion = {
    "0000\t": "xxs", 
    "000-5": "xxs", 
    "00-5": "xs", 
    "0": "s", 
    "10": "6",
    "10-5": "m", 
    "20": "m", 
    "2": "m", 
    "20-5": "m", 
    "30-5": "l", 
    "30": "l", 
    "40-5": "xl", 
    "40": "xl", 
    "50": "xxl", 
    "60": "3xl", 
    "6": "3xl"
}

In [110]:
data_cleaned.loc[
    data_cleaned.category == "clothing"
] = data_cleaned.loc[
    data_cleaned.category == "clothing"
].replace({ "size": clothing_size_conversion })

In [111]:
data_cleaned.loc[ 
    data_cleaned["size"] == "tailleunique", 
    "size"
] = "no_size"

In [112]:
kid_sizes = list(
    set(
        data_cleaned.loc[
            data_cleaned["size"].str.contains(
                "|".join(["ans", "years", "mois"])
            ), 
            "size"
        ]
    )
)

In [113]:
def rescale_kid_size(kid_size): 
    size = re.sub("[^\d\.]", "", kid_size)
    unit = kid_size.replace(size, "")
    if unit in ["mois", "months"]: 
        return "3xs"
    else: 
        if int(size) <= 2: 
            return "xxs"
        if int(size) <= 4: 
            return "xs"
        if int(size) <= 6: 
            return "s"
        if int(size) <= 10: 
            return "m"
        if int(size) <= 14: 
            return "l"
        if int(size) <= 18: 
            return "xl" 
        else: 
            return "xxl"

In [114]:
data_cleaned.loc[ 
    data_cleaned["size"].isin(kid_sizes), 
    "size"
] = data_cleaned.loc[ 
    data_cleaned["size"].isin(kid_sizes), 
    "size"
].apply(rescale_kid_size)

In [115]:
data_cleaned.loc[ 
    data_cleaned["size"] == "18months", 
    "size"
] = "3xs"

In [116]:
data_cleaned.loc[ 
    data_cleaned["size"] == "6", 
    "size"
] = "3xl"

In [117]:
data_cleaned.loc[ 
    data_cleaned["size"] == "xxxl", 
    "size"
] = "3xl"

In [118]:
data_cleaned.loc[ 
    data_cleaned["size"] == "xxxl",  
    "size"
] = "3xl"

### Convert size to `pd.Categorical`

In [119]:
feature_prop_table(
    feature="size",
    data=data_cleaned
)

Unnamed: 0,size,count,freq,cumul_freq
0,no_size,2259,0.233031,0.233031
1,m,1107,0.114194,0.347225
2,s,787,0.081184,0.428409
3,38,748,0.077161,0.50557
4,no size,658,0.067877,0.573447
5,l,612,0.063132,0.636579
6,40,508,0.052404,0.688983
7,39,424,0.043738,0.732721
8,37,411,0.042397,0.775119
9,xs,379,0.039096,0.814215


In [120]:
# add "size_" before each size
data_cleaned["size"] = "size_" + data_cleaned["size"].astype(str)
data_cleaned.loc[
    data_cleaned["size"] == "size_no_size", 
    "size"
] = "no_size"

In [121]:
data_cleaned["size"] = pd.Categorical(
    values=data_cleaned["size"], 
    categories=[
        "no_size",
        "size_3xs",
        "size_xxs",
        "size_xs",
        "size_s",
        "size_m",
        "size_l",
        "size_xl",
        "size_xxl",
        "size_3xl",
        "size_<=35",
        "size_36",
        "size_37",
        "size_38",
        "size_39", 
        "size_40", 
        "size_41",
        "size_42",
        "size_43",
        "size_44",
        "size_>=45"
    ]
)

## Feature selection

In [122]:
data_cleaned.index = data_cleaned.id

In [123]:
data_cleaned = data_cleaned.drop(["id", "url", "online_date"], axis=1)

In [124]:
data_cleaned.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,gender,category,sub_category,designer,condition,material,color,size,location,lprice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19126896,7,180.0,1,women,shoes,boots,cheap_brand,very_good_condition,leather,black,size_39,eu,5.198497
19181389,1,40.55,1,women,clothing,jeans,cheap_brand,good_condition,denim_jeans,navy,size_m,eu,3.726898
19182029,6,332.5,1,men,clothing,coats,cheap_brand,good_condition,wool,black,size_l,eu,5.809643
19132670,3,45.0,0,men,clothing,jeans,cheap_brand,never_worn,cotton,grey,size_m,eu,3.828641
19118182,9,105.0,0,women,clothing,dresses,cheap_brand,very_good_condition,other_material,black,size_s,eu,4.663439


## Transform categories to dummies 

In [125]:
# convert cat variables to dummies
data_dummies = pd.get_dummies(
    data_cleaned, 
    columns=[
        "gender", 
        "category", 
        "sub_category", 
        "designer", 
        "condition",
        "material", 
        "color", 
        "size", 
        "location"
    ], 
    prefix="", 
    prefix_sep="", 
    drop_first=True
)

In [126]:
data_dummies.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,lprice,men,women,bags,clothing,shoes,belts,...,size_38,size_39,size_40,size_41,size_42,size_43,size_44,size_>=45,eu,uk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19126896,7,180.0,1,5.198497,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
19181389,1,40.55,1,3.726898,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19182029,6,332.5,1,5.809643,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19132670,3,45.0,0,3.828641,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19118182,9,105.0,0,4.663439,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [127]:
data_dummies.columns

Index(['num_likes', 'price', 'we_love_tag', 'lprice', 'men', 'women', 'bags',
       'clothing', 'shoes', 'belts', 'boots', 'coats', 'dresses', 'flats',
       'handbags', 'hats', 'heels', 'jackets', 'jeans', 'knitwear_sweatshirts',
       'polo_shirts', 'sandals', 'scarves', 'skirts', 'small_bags',
       'sunglasses', 'swimwear', 'tops', 'trainers', 'travel_bags', 'trousers',
       'cheap_brand', 'expensive_brand', 'very_expensive_brand',
       'good_condition', 'never_worn', 'very_good_condition', 'leather',
       'cotton', 'wool', 'cloth', 'polyester', 'synthetic', 'suede', 'silk',
       'patent_leather', 'other', 'viscose', 'plastic', 'cashmere',
       'denim_jeans', 'black', 'white', 'multicolour', 'brown', 'beige',
       'blue', 'pink', 'grey', 'red', 'green', 'camel', 'navy', 'burgundy',
       'gold', 'silver', 'orange', 'yellow', 'purple', 'size_3xs', 'size_xxs',
       'size_xs', 'size_s', 'size_m', 'size_l', 'size_xl', 'size_xxl',
       'size_3xl', 'size_<=35', 'size

In [128]:
data_dummies.shape

(9694, 91)

## Save cleaned data

In [129]:
data_cleaned.to_pickle(path=backup_path + "vc_data_cleaned.pkl")
data_dummies.to_pickle(path=backup_path + "vc_data_dummies.pkl")

In [5]:
pd.read_pickle(backup_path + "vc_data_cleaned.pkl")

Unnamed: 0_level_0,num_likes,price,we_love_tag,gender,category,sub_category,designer,condition,material,color,size,location,lprice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19126896,7,180.00,1,women,shoes,boots,cheap_brand,very_good_condition,leather,black,size_39,eu,5.198497
19181389,1,40.55,1,women,clothing,jeans,cheap_brand,good_condition,denim_jeans,navy,size_m,eu,3.726898
19182029,6,332.50,1,men,clothing,coats,cheap_brand,good_condition,wool,black,size_l,eu,5.809643
19132670,3,45.00,0,men,clothing,jeans,cheap_brand,never_worn,cotton,grey,size_m,eu,3.828641
19118182,9,105.00,0,women,clothing,dresses,cheap_brand,very_good_condition,other_material,black,size_s,eu,4.663439
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19201767,1,95.00,0,women,bags,small_bags,cheap_brand,never_worn,cloth,black,no_size,eu,4.564348
19062770,4,44.00,1,women,accessories,scarves,cheap_brand,very_good_condition,polyester,navy,,eu,3.806662
19210693,15,80.00,0,women,accessories,belts,cheap_brand,very_good_condition,leather,blue,size_xs,eu,4.394449
18970201,46,162.00,1,women,bags,handbags,cheap_brand,very_good_condition,synthetic,pink,no_size,eu,5.093750


In [7]:
pd.read_pickle(backup_path + "vc_data_dummies.pkl")

Unnamed: 0_level_0,num_likes,price,we_love_tag,lprice,men,women,bags,clothing,shoes,belts,...,size_38,size_39,size_40,size_41,size_42,size_43,size_44,size_>=45,eu,uk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19126896,7,180.00,1,5.198497,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
19181389,1,40.55,1,3.726898,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19182029,6,332.50,1,5.809643,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19132670,3,45.00,0,3.828641,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19118182,9,105.00,0,4.663439,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19201767,1,95.00,0,4.564348,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
19062770,4,44.00,1,3.806662,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
19210693,15,80.00,0,4.394449,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
18970201,46,162.00,1,5.093750,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
