# Vestiaire Collective - Data preprocessing 

In [1]:
import pandas as pd
import numpy as np

In [2]:
import re

In [3]:
import plotly.figure_factory as ff

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
from functions import (
    get_quantiles_by_group, 
    feature_prop_table, 
    target_boxplot
)

## Load the dataset 

In [6]:
backup_path = "C:/Users/pemma/OneDrive - Université de Tours/Mécen/M2/S1/02 - Machine Learning/05 - Projet/price_prediction_vestiaire_collective/backup/"

In [7]:
data  = pd.read_pickle(backup_path + "data/vc_data.pkl")

In [8]:
data.head()

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.0,True,09/11/2021,women,shoes,ankle boots,acne studios,very good condition,leather,black,"[39, eu]",italy
1,19181389,https://fr.vestiairecollective.com//women-clot...,1,40.55,True,12/11/2021,women,clothing,jeans,acne studios,good condition,denim - jeans,navy,"[30, us]",poland
2,19182029,https://fr.vestiairecollective.com//men-clothi...,6,332.5,True,12/11/2021,men,clothing,coats,acne studios,good condition,wool,black,"[l, international]",denmark
3,19132670,https://fr.vestiairecollective.com//men-clothi...,3,45.0,False,09/11/2021,men,clothing,jeans,acne studios,"never worn, with tag",cotton,grey,"[28, us]",germany
4,19118182,https://fr.vestiairecollective.com//women-clot...,9,105.0,False,09/11/2021,women,clothing,dresses,acne studios,very good condition,linen,black,"[s, international]",germany


In [9]:
data.shape

(10409, 15)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10409 entries, 0 to 43
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10409 non-null  int64  
 1   url           10409 non-null  object 
 2   num_likes     10409 non-null  int64  
 3   price         10409 non-null  float64
 4   we_love_tag   10409 non-null  bool   
 5   online_date   10409 non-null  object 
 6   gender        10409 non-null  object 
 7   category      10409 non-null  object 
 8   sub_category  10409 non-null  object 
 9   designer      10409 non-null  object 
 10  condition     10409 non-null  object 
 11  material      10409 non-null  object 
 12  color         10409 non-null  object 
 13  size          10409 non-null  object 
 14  location      10409 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(11)
memory usage: 1.2+ MB


In [11]:
# NaNs
def detect_nan(df: pd.DataFrame):
    return {
        col: df.loc[pd.isna(df[col]), :].index.values.tolist()
        for col in df.columns
    }

In [12]:
detect_nan(df = data)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': []}

In [13]:
data_cleaned = data.copy()

In [14]:
# add log price column to the data
data_cleaned["lprice"] = np.log(data.price+1)

## `we_love_tag`

In [15]:
data_cleaned["we_love_tag"] = data_cleaned["we_love_tag"].astype("int64")

## `gender`

In [16]:
# convert "gender" to cat variable
data_cleaned["gender"] = pd.Categorical(data_cleaned.gender)

In [17]:
feature_prop_table(feature="gender", data=data_cleaned)

Unnamed: 0,gender,count,freq,cumul_freq
0,women,8266,0.79412,0.79412
1,men,2001,0.192237,0.986358
2,kids,133,0.012777,0.999135
3,life & living,9,0.000865,1.0


In [18]:
# remove "life & living" and "kids"
data_cleaned = data_cleaned[data_cleaned.gender.isin(["men", "women"])]
data_cleaned.gender = data_cleaned.gender.cat.remove_unused_categories()

In [19]:
feature_prop_table(feature="gender", data=data_cleaned)

Unnamed: 0,gender,count,freq,cumul_freq
0,women,8266,0.805104,0.805104
1,men,2001,0.194896,1.0


## `condition`

In [20]:
# convert "condition" to cat variable
data_cleaned["condition"] = pd.Categorical(data_cleaned.condition)

In [21]:
# value counts "condition"
feature_prop_table(feature="condition", data=data_cleaned)

Unnamed: 0,condition,count,freq,cumul_freq
0,very good condition,4722,0.45992,0.45992
1,never worn,3379,0.329113,0.789033
2,good condition,1173,0.11425,0.903282
3,"never worn, with tag",810,0.078894,0.982176
4,fair condition,183,0.017824,1.0


In [22]:
# rename levels
data_cleaned["condition"] = data_cleaned["condition"].replace(to_replace="never worn, with tag", value="never worn")
data_cleaned["condition"] = data_cleaned["condition"].cat.rename_categories(["fair_condition", "good_condition", "never_worn", "very_good_condition"])

In [23]:
feature_prop_table(feature="condition", data=data_cleaned)

Unnamed: 0,condition,count,freq,cumul_freq
0,very_good_condition,4722,0.45992,0.45992
1,never_worn,4189,0.408006,0.867926
2,good_condition,1173,0.11425,0.982176
3,fair_condition,183,0.017824,1.0


## `category`

In [24]:
# convert "category" to cat variable
data_cleaned["category"] = pd.Categorical(data_cleaned.category)

In [25]:
# change "boys" and "girls" to "clothing" in "category"
data_cleaned = data_cleaned.replace("boys", "clothing").replace("girls", "clothing")

In [26]:
data_cleaned.category = data_cleaned.category.cat.remove_unused_categories()

In [27]:
feature_prop_table(feature="category", data=data_cleaned)

Unnamed: 0,category,count,freq,cumul_freq
0,shoes,3325,0.323853,0.323853
1,clothing,3214,0.313042,0.636895
2,bags,2265,0.22061,0.857505
3,accessories,1111,0.108211,0.965715
4,jewellery,352,0.034285,1.0


In [28]:
# remove "jewellery" from "category"
data_cleaned = data_cleaned[data_cleaned.category != "jewellery"]

In [29]:
data_cleaned.category = data_cleaned.category.cat.remove_unused_categories()

In [30]:
feature_prop_table(feature="category", data=data_cleaned)

Unnamed: 0,category,count,freq,cumul_freq
0,shoes,3325,0.33535,0.33535
1,clothing,3214,0.324155,0.659506
2,bags,2265,0.228442,0.887948
3,accessories,1111,0.112052,1.0


## `sub-category`

### shoes

In [31]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "shoes"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,trainers,1016,0.305564,0.305564
1,ankle boots,594,0.178647,0.484211
2,sandals,493,0.148271,0.632481
3,heels,487,0.146466,0.778947
4,boots,366,0.110075,0.889023
5,flats,142,0.042707,0.931729
6,ballet flats,139,0.041805,0.973534
7,lace ups,46,0.013835,0.987368
8,espadrilles,31,0.009323,0.996692
9,mules & clogs,11,0.003308,1.0


In [32]:
# lins some shoes sub categories
shoes_cat_to_link = {
    "ankle boots": "boots", 
    "ballet flats": "flats", 
    "espadrilles": "mules & clogs"
}
data_cleaned = data_cleaned.replace({"sub_category": shoes_cat_to_link})

In [33]:
# rename "mules & clogs" as comfy shoes
data_cleaned = data_cleaned.replace("mules & clogs", "comfy_shoes")

In [34]:
# rename "lace ups" as "lace_ups"
data_cleaned = data_cleaned.replace("lace ups", "lace_ups")

In [35]:
# remove "first shoes" from type of shoes
data_cleaned = data_cleaned.loc[data_cleaned.sub_category != "first shoes"]

In [36]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "shoes"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,trainers,1016,0.305564,0.305564
1,boots,960,0.288722,0.594286
2,sandals,493,0.148271,0.742556
3,heels,487,0.146466,0.889023
4,flats,281,0.084511,0.973534
5,lace_ups,46,0.013835,0.987368
6,comfy_shoes,42,0.012632,1.0


### clothing

In [37]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "clothing"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,jackets,555,0.172682,0.172682
1,coats,525,0.163348,0.33603
2,dresses,329,0.102365,0.438395
3,tops,318,0.098942,0.537337
4,knitwear,287,0.089297,0.626633
5,knitwear & sweatshirts,213,0.066273,0.692906
6,trousers,192,0.059739,0.752645
7,t-shirts,149,0.04636,0.799004
8,jeans,121,0.037648,0.836652
9,skirts,116,0.036092,0.872744


In [38]:
# link some clothing sub categories
clothing_cat_to_link = {
    "jackets & coats": "coats", 
    "trench coats": "coats", 
    "leather jackets": "jackets", 
    "jumpsuits": "suits", 
    "outfits": "suits", 
    "shirts": "polo shirts", 
    "t-shirts": "tops", 
    "knitwear": "knitwear & sweatshirts", 
}
data_cleaned = data_cleaned.replace({"sub_category": clothing_cat_to_link})

In [39]:
# rename "knitwear & sweatshirts" as "knitwear_sweatshirts"
data_cleaned = data_cleaned.replace("knitwear & sweatshirts", "knitwear_sweatshirts")

In [40]:
# rename "polo shirts" as "polo_shirts"
data_cleaned = data_cleaned.replace("polo shirts", "polo_shirts")

In [41]:
# remove clothing sub categories with proportion < 1%
data_cleaned = data_cleaned.loc[-data_cleaned.sub_category.isin(["lingerie", "shorts"])]

In [42]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "clothing"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,coats,621,0.195591,0.195591
1,jackets,602,0.189606,0.385197
2,knitwear_sweatshirts,500,0.15748,0.542677
3,tops,467,0.147087,0.689764
4,dresses,329,0.103622,0.793386
5,trousers,192,0.060472,0.853858
6,jeans,121,0.03811,0.891969
7,skirts,116,0.036535,0.928504
8,swimwear,93,0.029291,0.957795
9,polo_shirts,92,0.028976,0.986772


### bags

In [43]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "bags"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.820309,0.820309
1,clutch bags,121,0.053422,0.873731
2,bags,121,0.053422,0.927152
3,backpacks,58,0.025607,0.952759
4,travel bags,51,0.022517,0.975276
5,"small bags, wallets & cases",49,0.021634,0.996909
6,belt bags,7,0.003091,1.0


In [44]:
bags_cat_to_link = {
    "clutch bags": "small bags, wallets & cases", 
    "belt bags": "small bags, wallets & cases", 
    "backpacks": "travel bags", 
}

In [45]:
data_cleaned = data_cleaned.replace({"sub_category": bags_cat_to_link})

In [46]:
# rename "small bags, wallets & cases" as "small_bags"
data_cleaned.loc[
    data_cleaned.sub_category == "small bags, wallets & cases", 
    "sub_category"
] = "small_bags"

In [47]:
# rename "travel bags" as "travel_bags"
data_cleaned.loc[
    data_cleaned.sub_category == "travel bags", 
    "sub_category"
] = "travel_bags"

In [48]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category=="bags"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.820309,0.820309
1,small_bags,177,0.078146,0.898455
2,bags,121,0.053422,0.951876
3,travel_bags,109,0.048124,1.0


### accessories

In [49]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "accessories"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,belts,202,0.181818,0.181818
1,scarves,186,0.167417,0.349235
2,sunglasses,167,0.150315,0.49955
3,wallets,151,0.135914,0.635464
4,"purses, wallets & cases",92,0.082808,0.718272
5,hats,82,0.073807,0.792079
6,watches,44,0.039604,0.831683
7,silk handkerchief,41,0.036904,0.868587
8,ties,39,0.035104,0.90369
9,hats & pull on hats,33,0.029703,0.933393


In [50]:
accessories_to_link = {
    "silk handkerchief": "scarves", 
    "hats & pull on hats": "hats", 
    "scarves & pocket squares": "scarves", 
    "purses, wallets & cases": "small_bags", 
    "wallets": "small_bags"
}

In [51]:
data_cleaned = data_cleaned.replace({"sub_category": accessories_to_link})

In [52]:
accessories_prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category == "accessories"]
)
accessories_prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,scarves,256,0.230423,0.230423
1,small_bags,243,0.218722,0.449145
2,belts,202,0.181818,0.630963
3,sunglasses,167,0.150315,0.781278
4,hats,115,0.10351,0.884788
5,watches,44,0.039604,0.924392
6,ties,39,0.035104,0.959496
7,jewellery,25,0.022502,0.981998
8,gloves,15,0.013501,0.9955
9,cufflinks,5,0.0045,1.0


In [53]:
# remove types of accessories with count < 100
accessories_to_remove = accessories_prop_table.loc[
    accessories_prop_table["count"] < 100, 
    "sub_category"
].tolist()
data_cleaned = data_cleaned.loc[
    -data_cleaned.sub_category.isin(accessories_to_remove)
]

In [54]:
accessories_prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category=="accessories"]
)
accessories_prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,scarves,256,0.260427,0.260427
1,small_bags,243,0.247202,0.50763
2,belts,202,0.205493,0.713123
3,sunglasses,167,0.169888,0.883011
4,hats,115,0.116989,1.0


In [55]:
sub_cat_prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned
)   
sub_cat_prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.190603,0.190603
1,trainers,1016,0.104227,0.29483
2,boots,960,0.098482,0.393311
3,coats,621,0.063705,0.457017
4,jackets,602,0.061756,0.518773
5,knitwear_sweatshirts,500,0.051293,0.570066
6,sandals,493,0.050574,0.62064
7,heels,487,0.049959,0.670599
8,tops,467,0.047907,0.718506
9,small_bags,420,0.043086,0.761592


In [56]:
# remove "comfy_shoes", "suits" and "lace_ups"
sub_cat_to_remove = sub_cat_prop_table.loc[
    sub_cat_prop_table["count"] < 90, 
    "sub_category"
].values.tolist()
data_cleaned = data_cleaned.loc[-data_cleaned.sub_category.isin(sub_cat_to_remove)]

In [57]:
# convert "sub_category" to cat variable
data_cleaned["sub_category"] = pd.Categorical(data_cleaned.sub_category)

In [58]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.193179,0.193179
1,trainers,1016,0.105635,0.298815
2,boots,960,0.099813,0.398628
3,coats,621,0.064566,0.463194
4,jackets,602,0.062591,0.525785
5,knitwear_sweatshirts,500,0.051986,0.577771
6,sandals,493,0.051258,0.629029
7,heels,487,0.050634,0.679663
8,tops,467,0.048555,0.728218
9,small_bags,420,0.043668,0.771886


## `designer`

In [59]:
designer_prop_table = feature_prop_table(
    feature="designer", 
    data=data_cleaned
)
designer_prop_table

Unnamed: 0,designer,count,freq,cumul_freq
0,jean paul gaultier,316,0.032855,0.032855
1,coach,309,0.032127,0.064982
2,christian louboutin,306,0.031815,0.096798
3,jimmy choo,306,0.031815,0.128613
4,acne studios,305,0.031711,0.160324
5,chloé,305,0.031711,0.192036
6,off-white,300,0.031192,0.223227
7,isabel marant,299,0.031088,0.254315
8,salvatore ferragamo,296,0.030776,0.28509
9,alexander mcqueen,295,0.030672,0.315762


In [60]:
# remove brands with less than 1% items
designer_to_remove = designer_prop_table.loc[
    designer_prop_table.freq < .01, 
    "designer"
].values.tolist()
data_cleaned = data_cleaned.loc[
    -data_cleaned.designer.isin(designer_to_remove)
]

In [61]:
lprice_by_designer = get_quantiles_by_group(
    feature="designer", 
    data=data_cleaned
)
lprice_by_designer

Unnamed: 0_level_0,lprice,lprice,lprice
Unnamed: 0_level_1,q0.25,median,q0.75
designer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
chanel,6.136006,6.862744,7.898595
louis vuitton,5.860786,6.358678,6.891108
hermès,5.673323,6.335437,7.09091
bottega veneta,5.525453,6.11845,6.673298
balenciaga,5.638355,6.111467,6.398179
dior,5.192957,6.090337,6.981877
saint laurent,5.515437,6.036659,6.802395
christian louboutin,5.595028,5.918005,6.175867
balmain,5.249175,5.860786,6.478149
moncler,5.382682,5.844908,6.395679


In [62]:
# convert "designer" to cat 
data_cleaned["designer"] = pd.Categorical(data_cleaned["designer"])

## `material`

In [63]:
material_prop_table = feature_prop_table(
    feature="material",
    data=data_cleaned
)
material_prop_table

Unnamed: 0,material,count,freq,cumul_freq
0,leather,3968,0.414542,0.414542
1,cotton,1069,0.11168,0.526222
2,wool,724,0.075637,0.60186
3,cloth,623,0.065086,0.666945
4,polyester,440,0.045967,0.712913
5,synthetic,373,0.038968,0.75188
6,suede,372,0.038863,0.790744
7,silk,328,0.034267,0.82501
8,patent leather,305,0.031864,0.856874
9,other,156,0.016298,0.873172


In [64]:
# create "other_material" group for material such that freq < 1%
other_material = material_prop_table.loc[ 
    material_prop_table["freq"] < .01, 
    "material"
].values.tolist()
data_cleaned.loc[ 
    data_cleaned.material.isin(other_material), 
    "material"
] = "other_material"

In [65]:
# rename some categories in "material"
data_cleaned.material = data_cleaned.material.replace("patent leather", "patent_leather").replace("denim - jeans", "denim_jeans")

In [66]:
material_prop_table = feature_prop_table(
    feature="material",
    data=data_cleaned
)
material_prop_table

Unnamed: 0,material,count,freq,cumul_freq
0,leather,3968,0.414542,0.414542
1,cotton,1069,0.11168,0.526222
2,other_material,726,0.075846,0.602069
3,wool,724,0.075637,0.677706
4,cloth,623,0.065086,0.742791
5,polyester,440,0.045967,0.788759
6,synthetic,373,0.038968,0.827727
7,suede,372,0.038863,0.86659
8,silk,328,0.034267,0.900857
9,patent_leather,305,0.031864,0.93272


In [67]:
# convert "material" to cat
categories = [
    "other_material",
    "leather",
    "cotton",
    "wool",
    "cloth",
    "polyester",
    "synthetic",
    "suede",
    "silk",
    "patent_leather",
    "other",
    "viscose",
    "plastic",
    "cashmere",
    "denim_jeans"
]
data_cleaned["material"] = pd.Categorical(
    values=data_cleaned["material"], 
    categories=categories
)

## `color`

In [68]:
color_prop_table = feature_prop_table(
    feature="color", 
    data=data_cleaned
)
color_prop_table

Unnamed: 0,color,count,freq,cumul_freq
0,black,3281,0.342771,0.342771
1,white,861,0.08995,0.43272
2,multicolour,755,0.078876,0.511596
3,brown,693,0.072399,0.583995
4,beige,671,0.0701,0.654095
5,blue,614,0.064145,0.718241
6,pink,380,0.039699,0.75794
7,grey,349,0.036461,0.7944
8,red,292,0.030506,0.824906
9,green,283,0.029565,0.854471


In [69]:
# create "other_color" group for color such that freq < .9%
other_color = color_prop_table.loc[ 
    color_prop_table["freq"] < .009, 
    "color"
].values.tolist()
data_cleaned.loc[ 
    data_cleaned.color.isin(other_color), 
    "color"
] = "other_color"

In [70]:
color_prop_table = feature_prop_table(
    feature="color", 
    data=data_cleaned
)

In [71]:
# convert "color" to cat
categories = [
    "other_color",
    "black",
    "white",
    "multicolour",
    "brown",
    "beige",
    "blue",
    "pink",
    "grey",
    "red",
    "green",
    "camel",
    "navy",
    "burgundy",
    "gold",
    "silver",
    "orange",
    "yellow",
    "purple"]
data_cleaned["color"] = pd.Categorical(
    values=data_cleaned["color"], 
    categories=categories
)

## `location`

In [72]:
feature_prop_table(feature="location", data=data_cleaned)

Unnamed: 0,location,count,freq,cumul_freq
0,italy,2741,0.286356,0.286356
1,france,1837,0.191914,0.47827
2,united kingdom,1304,0.136231,0.614501
3,germany,978,0.102173,0.716674
4,spain,479,0.050042,0.766715
5,romania,289,0.030192,0.796908
6,poland,202,0.021103,0.818011
7,greece,188,0.019641,0.837651
8,belgium,173,0.018074,0.855725
9,netherlands,171,0.017865,0.87359


In [73]:
# reduce "location" variable
def map_geo_area(country):
    """Map a country to the corresponding geographical area."""
    if country not in (
        "canada",
        "israel",
        "singapore", 
        "switzerland", 
        "united kingdom", 
        "united states"
    ): 
        return "eu"
    else: 
        if country != "united kingdom":
            return "other_country"
        else: 
            return "uk"
data_cleaned.location = data_cleaned.location.replace("vestiaire collective france", "france")
data_cleaned.location = data_cleaned.location.apply(map_geo_area)
data_cleaned.location = data_cleaned.location.replace(" ", "_") 

In [74]:
feature_prop_table(feature="location", data=data_cleaned)

Unnamed: 0,location,count,freq,cumul_freq
0,eu,8252,0.862098,0.862098
1,uk,1304,0.136231,0.998328
2,other_country,16,0.001672,1.0


In [75]:
data_cleaned.location = pd.Categorical(
    values=data_cleaned.location, 
    categories=["other_country", "eu", "uk"]
)

In [76]:
detect_nan(df=data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

## `size`

In [77]:
data_cleaned["size"] = data_cleaned["size"].apply(lambda size: "".join(size))

In [78]:
data_cleaned.loc[
    data_cleaned["size"] == "no size", 
    "size"
] = "no_size"

In [79]:
data_cleaned["size"] = data_cleaned["size"].apply( lambda size: size.replace("international", "") )

### bags

In [80]:
data_cleaned.loc[
    data_cleaned.category.isin(["bags"]), 
    "size"
] = "no_size"

### accessories

In [81]:
accessories = list(set(
    data_cleaned.loc[
        data_cleaned.category == "accessories",
        "sub_category"
    ].values
))
accessories

['scarves', 'sunglasses', 'small_bags', 'hats', 'belts']

#### belts

In [82]:
belt_sizes = {
    "70cm": "xxs",
    "75cm": "xs", 
    "80cm": "xs", 
    "85cm": "s", 
    "90cm": "m", 
    "95cm": "l", 
    "100cm": "xl"
}

In [83]:
data_cleaned[
    data_cleaned.sub_category == "belts"
] = data_cleaned[
    data_cleaned.sub_category == "belts"
].replace({"size": belt_sizes})

#### hats

In [84]:
hat_sizes = {
    "21.2inches": "s", 
    "24inches": "xxl", 
    "54cm": "s", 
    "55cm": "s", 
    "56cm": "m",
    "57cm": "m", 
    "58cm": "l", 
    "59cm": "l", 
    "60cm": "xl", 
    "61cm": "xl", 
}

In [85]:
data_cleaned[
    data_cleaned.sub_category == "hats"
] = data_cleaned[
    data_cleaned.sub_category == "hats"
].replace({"size": hat_sizes})

In [86]:
detect_nan(data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

### shoes

In [87]:
uk_shoe_size = data_cleaned[data_cleaned.category == "shoes"]["size"].str.contains("uk").values

In [88]:
# convert UK shoe size to EU shoe size
uk_size_to_eu = {
    "2uk": "35", 
    "3uk": "36", 
    "3.5uk": "36",
    "4uk": "37", 
    "4.5uk": "37", 
    "5uk": "38", 
    "5.5uk": "39", 
    "6uk": "39", 
    "6.5uk": "40", 
    "7uk": "41", 
    "7.5uk": "41", 
    "8uk": "42", 
    "8.5uk": "42", 
    "9uk": "43",
    "9.5uk": "44",  
    "10uk": "45", 
    "10.5uk": "45", 
    "11uk": "46", 
    "12uk": "47", 
}

In [89]:
data_cleaned[data_cleaned.category == "shoes"] = data_cleaned[data_cleaned.category == "shoes"].replace({"size": uk_size_to_eu})

In [90]:
us_shoe_size = data_cleaned[data_cleaned.category == "shoes"]["size"].str.contains("us").values

In [91]:
# convert US shoe size to EU shoe size
us_size_to_eu = {
    "3us": "34", 
    "3.5us": "34.5",
    "4us": "35", 
    "4.5us": "35.5", 
    "5us": "36", 
    "5.5us": "36.5", 
    "6us": "37", 
    "6.5us": "37.5", 
    "7us": "38", 
    "7.5us": "38.5", 
    "8us": "39", 
    "8.5us": "39.5", 
    "9us": "40",
    "9.5us": "40.5",  
    "10us": "41", 
    "10.5us": "41.5", 
    "11us": "42", 
}

In [92]:
data_cleaned[data_cleaned.category == "shoes"] = data_cleaned[data_cleaned.category == "shoes"].replace({"size": us_size_to_eu})

In [93]:
# remove "fr" and "it" from shoe size
data_cleaned.loc[
    data_cleaned.category == "shoes", 
    "size"
] = data_cleaned[
    data_cleaned.category == "shoes"
]["size"].apply( lambda old_size: re.sub("[^\d\.]", "", old_size) )

In [94]:
def classify_shoe_size(shoe_size: str): 
    shoe_size = float(shoe_size)
    if shoe_size <= 35: 
        return "<=35"
    else: 
        if shoe_size >= 45: 
            return ">=45"
        else:
            return str( int( round(shoe_size, 0) ) )

In [95]:
data_cleaned.loc[
    data_cleaned.category == "shoes", 
    "size"
] = data_cleaned[ data_cleaned.category == "shoes" ]["size"].apply(classify_shoe_size).values

In [96]:
feature_prop_table(
    feature="size", 
    data=data_cleaned[data_cleaned.category == "shoes"]
)

Unnamed: 0,size,count,freq,cumul_freq
0,38,748,0.231794,0.231794
1,40,508,0.157422,0.389216
2,39,424,0.131391,0.520607
3,37,411,0.127363,0.64797
4,36,358,0.110939,0.758909
5,41,223,0.069104,0.828014
6,42,211,0.065386,0.893399
7,43,98,0.030369,0.923768
8,44,96,0.029749,0.953517
9,<=35,91,0.0282,0.981717


In [97]:
detect_nan(data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

### clothing

In [98]:
# convert uk size to eu size
clothing_uk_to_eu = {
    "3uk": "3xs", 
    "4uk": "xxs", 
    "6uk": "xs", 
    "8uk": "s", 
    "10uk": "m", 
    "12uk": "l", 
    "14uk": "xl", 
    "15.5uk": "m", 
    "16uk": "xxl", 
    "18-20uk": "3xl", 
    "28uk": "xs", 
    "30uk": "s",
    "32uk": "m", 
    "34uk": "xs", 
    "36uk": "s", 
    "40uk": "l", 
    "42uk": "xl"
}

In [99]:
data_cleaned.loc[
    data_cleaned.category == "clothing", 
    :
]= data_cleaned[
    data_cleaned.category == "clothing"
].replace({"size": clothing_uk_to_eu})

In [100]:
detect_nan(data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

In [101]:
# convert us size to eu size
clothing_us_to_eu = {
    "0us": "xxs", 
    "1us": "xxs", 
    "2us": "xs",
    "4us": "s", 
    "6us": "m", 
    "8us": "l", 
    "10us": "xl", 
    "12us": "xxl", 
    "14-16us": "3xl",  
    "18us": "4xl", 
    "24us": "xs", 
    "25us": "xs", 
    "26us": "s",
    "27us": "s",
    "28us": "m", 
    "29us": "m", 
    "30us": "m",
    "31us": "m",
    "32us": "m", 
    "33us": "l", 
    "34us": "l", 
    "36us": "xl", 
}

In [102]:
data_cleaned = data_cleaned.replace({"size": clothing_us_to_eu})

In [103]:
detect_nan(data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

In [104]:
# convert international numeric size to s, m, l size
it_clothing_sizes = {
    "34it": "3xs", 
    "36it": "xxs", 
    "38it": "xs", 
    "40it": "s", 
    "42it": "m", 
    "44it": "l", 
    "46it": "xl", 
    "48it": "xxl", 
    "50it": "l", 
    "50-52it": "3xl", 
    "52it": "xl", 
    "54it": "xxl", 
    "56it": "3xl", 
}

In [105]:
data_cleaned = data_cleaned.replace({"size": it_clothing_sizes})

In [106]:
# convert fr numeric size to s, m, l size
fr_clothing_sizes = {
    "30fr": "3xs", 
    "32fr": "xxs", 
    "34fr": "xs", 
    "36fr": "s", 
    "38fr": "m", 
    "40fr": "l", 
    "40-": "l", 
    "42fr": "xl", 
    "44fr": "xxl", 
    "46fr": "3xl", 
    "48fr": "m",
    "50fr": "l", 
    "52fr": "xl", 
    "54fr": "xxl", 
    "56fr": "3xl" 
}

In [107]:
data_cleaned = data_cleaned.replace({"size": fr_clothing_sizes})

In [108]:
# convert eu size to s, m, l size
eu_clothing_sizes = {
    "37eu": "xs", 
    "38eu": "s", 
    "39eu": "m", 
    "40eu": "m",
    "41eu": "l",
    "42eu": "xl", 
    "43eu": "xxl", 
}

In [109]:
data_cleaned = data_cleaned.replace({"size": eu_clothing_sizes})

In [110]:
detect_nan(data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

In [111]:
# convert size from "0-5" scale to international size
clothing_size_conversion = {
    "0000\t": "xxs", 
    "000-5": "xxs", 
    "00-5": "xs", 
    "0": "s", 
    "10": "6",
    "10-5": "m", 
    "20": "m", 
    "2": "m", 
    "20-5": "m", 
    "30-5": "l", 
    "30": "l", 
    "40-5": "xl", 
    "40": "xl", 
    "50": "xxl", 
    "60": "3xl", 
    "6": "3xl"
}

In [112]:
data_cleaned.loc[
    data_cleaned.category == "clothing"
] = data_cleaned.loc[
    data_cleaned.category == "clothing"
].replace({ "size": clothing_size_conversion })

In [113]:
data_cleaned.loc[ 
    data_cleaned["size"] == "tailleunique", 
    "size"
] = "no_size"

In [114]:
data_cleaned.loc[ 
    data_cleaned["size"] == "6", 
    "size"
] = "3xl"

In [115]:
data_cleaned.loc[ 
    data_cleaned["size"] == "xxxl", 
    "size"
] = "3xl"

In [116]:
data_cleaned.loc[ 
    data_cleaned["size"] == "xxxl",  
    "size"
] = "3xl"

In [117]:
feature_prop_table(
    feature="size", 
    data=data_cleaned[data_cleaned.category == "clothing"]
)

Unnamed: 0,size,count,freq,cumul_freq
0,m,952,0.304445,0.304445
1,s,708,0.226415,0.53086
2,l,534,0.170771,0.701631
3,xs,342,0.10937,0.811001
4,xl,301,0.096258,0.907259
5,xxl,148,0.04733,0.954589
6,xxs,66,0.021106,0.975696
7,3xl,42,0.013431,0.989127
8,3xs,23,0.007355,0.996482
9,no_size,11,0.003518,1.0


In [118]:
detect_nan(data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

### Convert size to `pd.Categorical`

In [119]:
# add "size_" before each size
data_cleaned["size"] = "size_" + data_cleaned["size"].astype(str)
data_cleaned.loc[
    data_cleaned["size"] == "size_no_size", 
    "size"
] = "no_size"

In [120]:
data_cleaned["size"] = pd.Categorical(data_cleaned["size"]) 

In [121]:
feature_prop_table(
    feature="size", 
    data=data_cleaned
)

Unnamed: 0,size,count,freq,cumul_freq
0,no_size,2913,0.304325,0.304325
1,size_m,1080,0.112829,0.417154
2,size_s,776,0.08107,0.498224
3,size_38,748,0.078145,0.576369
4,size_l,593,0.061952,0.63832
5,size_40,508,0.053071,0.691392
6,size_39,424,0.044296,0.735687
7,size_37,411,0.042938,0.778625
8,size_xs,370,0.038654,0.81728
9,size_36,358,0.037401,0.85468


In [122]:
detect_nan(df=data_cleaned)

{'id': [],
 'url': [],
 'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'online_date': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

## Feature selection

In [123]:
data_cleaned.index = data_cleaned.id

In [124]:
data_cleaned = data_cleaned.drop(["id", "url", "online_date"], axis=1)

In [125]:
data_cleaned.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,gender,category,sub_category,designer,condition,material,color,size,location,lprice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19126896,7,180.0,1,women,shoes,boots,acne studios,very_good_condition,leather,black,size_39,eu,5.198497
19181389,1,40.55,1,women,clothing,jeans,acne studios,good_condition,denim_jeans,navy,size_m,eu,3.726898
19182029,6,332.5,1,men,clothing,coats,acne studios,good_condition,wool,black,size_l,eu,5.809643
19132670,3,45.0,0,men,clothing,jeans,acne studios,never_worn,cotton,grey,size_m,eu,3.828641
19118182,9,105.0,0,women,clothing,dresses,acne studios,very_good_condition,other_material,black,size_s,eu,4.663439


In [126]:
detect_nan(df=data_cleaned)

{'num_likes': [],
 'price': [],
 'we_love_tag': [],
 'gender': [],
 'category': [],
 'sub_category': [],
 'designer': [],
 'condition': [],
 'material': [],
 'color': [],
 'size': [],
 'location': [],
 'lprice': []}

In [127]:
data_cleaned.to_pickle(path=backup_path + "data/vc_data_cleaned.pkl")