# Vestiaire Collective - Data preprocessing 

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from functions import (
    get_quantiles_by_group, 
    feature_prop_table, 
    target_boxplot
)

## Load the dataset 

In [4]:
file_path = "C:/Users/pemma/OneDrive - Université de Tours/Mécen/M2/S1/02 - Machine Learning/05 - Projet/ML_Vestiaire_Collective/backup/vc_data.pkl"

In [5]:
data  = pd.read_pickle(file_path)

In [6]:
data.head()

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.0,True,09/11/2021,women,shoes,ankle boots,acne studios,very good condition,leather,black,39,italy
1,19181389,https://fr.vestiairecollective.com//women-clot...,1,40.55,True,12/11/2021,women,clothing,jeans,acne studios,good condition,denim - jeans,navy,30,poland
2,19182029,https://fr.vestiairecollective.com//men-clothi...,6,332.5,True,12/11/2021,men,clothing,coats,acne studios,good condition,wool,black,l,denmark
3,19132670,https://fr.vestiairecollective.com//men-clothi...,3,45.0,False,09/11/2021,men,clothing,jeans,acne studios,"never worn, with tag",cotton,grey,28,germany
4,19118182,https://fr.vestiairecollective.com//women-clot...,9,105.0,False,09/11/2021,women,clothing,dresses,acne studios,very good condition,linen,black,s,germany


In [7]:
data.shape

(10409, 15)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10409 entries, 0 to 43
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10409 non-null  int64  
 1   url           10409 non-null  object 
 2   num_likes     10409 non-null  int64  
 3   price         10409 non-null  float64
 4   we_love_tag   10409 non-null  bool   
 5   online_date   10409 non-null  object 
 6   gender        10409 non-null  object 
 7   category      10409 non-null  object 
 8   sub_category  10409 non-null  object 
 9   designer      10409 non-null  object 
 10  condition     10409 non-null  object 
 11  material      10409 non-null  object 
 12  color         10409 non-null  object 
 13  size          10409 non-null  object 
 14  location      10409 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(11)
memory usage: 1.2+ MB


In [9]:
data_cleaned = data.copy()

In [10]:
# add log price column to the data
data_cleaned["lprice"] = np.log(data.price+1)

## `we_love_tag`

In [11]:
# convert "we_love_tag" to binary (0, 1)
data_cleaned["we_love_tag"] = data_cleaned["we_love_tag"].astype(int)

In [12]:
data_cleaned.we_love_tag

0     1
1     1
2     1
3     0
4     0
     ..
39    1
40    0
41    1
42    1
43    0
Name: we_love_tag, Length: 10409, dtype: int32

## `gender`

In [13]:
# convert "gender" to cat variable
data_cleaned["gender"] = pd.Categorical(data_cleaned.gender)

In [14]:
data_cleaned.gender.value_counts()

women            8266
men              2001
kids              133
life & living       9
Name: gender, dtype: int64

In [15]:
# remove "life & living"
data_cleaned = data_cleaned[data_cleaned.gender != "life & living"]
data_cleaned.gender = data_cleaned.gender.cat.remove_unused_categories()

In [16]:
data_cleaned.gender.value_counts()

women    8266
men      2001
kids      133
Name: gender, dtype: int64

## `condition`

In [17]:
# convert "condition" to cat variable
data_cleaned["condition"] = pd.Categorical(data_cleaned.condition)

In [18]:
# value counts "condition"
data_cleaned.condition.value_counts()

very good condition     4803
never worn              3395
good condition          1196
never worn, with tag     821
fair condition           185
Name: condition, dtype: int64

In [19]:
# rename levels
data_cleaned["condition"] = data_cleaned["condition"].replace(to_replace="never worn, with tag", value="never worn")
data_cleaned["condition"] = data_cleaned["condition"].cat.rename_categories(["fair_condition", "good_condition", "never_worn", "very_good_condition"])

In [20]:
data_cleaned.condition.value_counts(normalize=True)

very_good_condition    0.461827
never_worn             0.405385
good_condition         0.115000
fair_condition         0.017788
Name: condition, dtype: float64

## `category`

In [21]:
# convert "category" to cat variable
data_cleaned["category"] = pd.Categorical(data_cleaned.category)

In [22]:
# change "boys" and "girls" to "clothing" in "category"
data_cleaned = data_cleaned.replace("boys", "clothing").replace("girls", "clothing")

In [23]:
data_cleaned.category = data_cleaned.category.cat.remove_unused_categories()

In [24]:
data_cleaned.category.value_counts()

shoes          3351
clothing       3316
bags           2265
accessories    1116
jewellery       352
Name: category, dtype: int64

In [25]:
# remove "jewellery" from "category"
data_cleaned = data_cleaned[data_cleaned.category != "jewellery"]

In [26]:
data_cleaned.category = data_cleaned.category.cat.remove_unused_categories()

In [27]:
data_cleaned.category.value_counts(normalize=True)

shoes          0.333499
clothing       0.330016
bags           0.225418
accessories    0.111067
Name: category, dtype: float64

## `sub-category`

### shoes

In [29]:
# link some shoes sub categories
shoes_cat_to_link = {
    "ankle boots": "boots", 
    "ballet flats": "flats", 
    "espadrilles": "mules & clogs"
}
data_cleaned = data_cleaned.replace({"sub_category": shoes_cat_to_link})

In [30]:
# rename "mules & clogs" as comfy shoes
data_cleaned = data_cleaned.replace("mules & clogs", "comfy_shoes")

In [31]:
# rename "lace ups" as "lace_ups"
data_cleaned = data_cleaned.replace("lace ups", "lace_ups")

In [32]:
# remove "first shoes" from type of shoes
data_cleaned = data_cleaned.loc[data_cleaned.sub_category != "first shoes"]

In [33]:
category = "shoes"
prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category==category]
)
prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,trainers,1037,0.309552,0.309552
1,boots,962,0.287164,0.596716
2,sandals,493,0.147164,0.743881
3,heels,487,0.145373,0.889254
4,flats,283,0.084478,0.973731
5,lace_ups,46,0.013731,0.987463
6,comfy_shoes,42,0.012537,1.0


### clothing

In [34]:
# link some clothing sub categories
clothing_cat_to_link = {
    "jackets & coats": "coats", 
    "trench coats": "coats", 
    "leather jackets": "jackets", 
    "jumpsuits": "suits", 
    "outfits": "suits", 
    "shirts": "polo shirts", 
    "t-shirts": "tops", 
    "knitwear": "knitwear & sweatshirts", 
}
data_cleaned = data_cleaned.replace({"sub_category": clothing_cat_to_link})

In [35]:
# rename "knitwear & sweatshirts" as "knitwear_sweatshirts"
data_cleaned = data_cleaned.replace("knitwear & sweatshirts", "knitwear_sweatshirts")

In [36]:
# rename "polo shirts" as "polo_shirts"
data_cleaned = data_cleaned.replace("polo shirts", "polo_shirts")

In [38]:
# remove clothing sub categories with proportion < 1%
data_cleaned = data_cleaned.loc[-data_cleaned.sub_category.isin(["lingerie", "shorts"])]

In [37]:
category = "clothing"
prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category==category]
)
prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,coats,681,0.205368,0.205368
1,jackets,602,0.181544,0.386912
2,knitwear_sweatshirts,514,0.155006,0.541918
3,tops,475,0.143245,0.685163
4,dresses,334,0.100724,0.785887
5,trousers,197,0.059409,0.845296
6,jeans,121,0.03649,0.881785
7,skirts,117,0.035283,0.917069
8,swimwear,93,0.028046,0.945115
9,polo_shirts,92,0.027744,0.972859


### bags

In [40]:
bags_cat_to_link = {
    "clutch bags": "small bags, wallets & cases", 
    "belt bags": "small bags, wallets & cases", 
    "backpacks": "travel bags", 
}

In [42]:
data_cleaned = data_cleaned.replace({"sub_category": bags_cat_to_link})

In [46]:
# rename "small bags, wallets & cases" as "small_bags"
data_cleaned.loc[
    data_cleaned.sub_category == "small bags, wallets & cases", 
    "sub_category"
] = "small_bags"

In [48]:
# rename "travel bags" as "travel_bags"
data_cleaned.loc[
    data_cleaned.sub_category == "travel bags", 
    "sub_category"
] = "travel_bags"

In [49]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category=="bags"]
)

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.820309,0.820309
1,small_bags,177,0.078146,0.898455
2,bags,121,0.053422,0.951876
3,travel_bags,109,0.048124,1.0


### accessories

In [50]:
accessories_to_link = {
    "silk handkerchief": "scarves", 
    "hats & pull on hats": "hats", 
    "scarves & pocket squares": "scarves", 
    "purses, wallets & cases": "small_bags", 
    "wallets": "small_bags"
}

In [51]:
data_cleaned = data_cleaned.replace({"sub_category": accessories_to_link})

In [58]:
# remove types of accessories with count < 100
accessories_to_remove = accessories_prop_table.loc[
    accessories_prop_table["count"] < 100, 
    "sub_category"
].tolist()
data_cleaned = data_cleaned.loc[
    -data_cleaned.sub_category.isin(accessories_to_remove)
]

In [59]:
accessories_prop_table = feature_prop_table(
    feature="sub_category", 
    data=data_cleaned.loc[data_cleaned.category=="accessories"]
)
accessories_prop_table

Unnamed: 0,sub_category,count,freq,cumul_freq
0,scarves,259,0.262411,0.262411
1,small_bags,243,0.246201,0.508612
2,belts,202,0.204661,0.713273
3,sunglasses,168,0.170213,0.883485
4,hats,115,0.116515,1.0


In [60]:
# convert "sub_category" to cat variable
data_cleaned["sub_category"] = pd.Categorical(data_cleaned.sub_category)

In [61]:
feature_prop_table(
    feature="sub_category", 
    data=data_cleaned
)   

Unnamed: 0,sub_category,count,freq,cumul_freq
0,handbags,1858,0.188076,0.188076
1,trainers,1037,0.10497,0.293046
2,boots,962,0.097378,0.390424
3,coats,681,0.068934,0.459358
4,jackets,602,0.060937,0.520296
5,knitwear_sweatshirts,514,0.05203,0.572325
6,sandals,493,0.049904,0.622229
7,heels,487,0.049296,0.671525
8,tops,475,0.048082,0.719607
9,small_bags,420,0.042514,0.762122


## `designer`

In [260]:
feature_prop_table(feature="designer", data=data_cleaned)

Unnamed: 0,designer,count,freq,cumul_freq
0,moncler,292,0.044062,0.044062
1,balmain,292,0.044062,0.088124
2,jean paul gaultier,291,0.043911,0.132036
3,christian louboutin,290,0.04376,0.175796
4,acne studios,279,0.0421,0.217896
5,maison martin margiela,269,0.040592,0.258488
6,golden goose,268,0.040441,0.298929
7,jimmy choo,263,0.039686,0.338615
8,isabel marant,261,0.039384,0.377999
9,alexander mcqueen,244,0.036819,0.414818


In [261]:
lprice_by_designer = get_quantiles_by_group(
    feature="designer", 
    data=data_cleaned
)
lprice_by_designer

Unnamed: 0_level_0,lprice,lprice,lprice
Unnamed: 0_level_1,q0.25,median,q0.75
designer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
hermès,6.006353,6.552508,7.09091
chanel,6.084468,6.545324,7.044649
balenciaga,5.56452,6.090823,6.341153
bottega veneta,5.488711,5.942799,6.397728
christian louboutin,5.56452,5.902702,6.175867
dior,5.303305,5.874832,6.628841
saint laurent,5.419426,5.860786,6.397346
balmain,5.214891,5.857933,6.460846
moncler,5.134245,5.70711,6.296533
valentino garavani,5.334793,5.70711,6.013273


In [34]:
# convert "designer" to cat & rename levels
data_cleaned["designer"] = pd.Categorical(data_cleaned["designer"])
data_cleaned["designer"] = data_cleaned["designer"].cat.rename_categories(
    lambda x: x.replace(" ", "_").replace("-", "_")
)

In [35]:
data_cleaned = data_cleaned[data_cleaned["designer"] != "démoo"]

In [36]:
data_cleaned.designer = data_cleaned.designer.cat.remove_unused_categories()

## `material`

In [37]:
# convert "material" to cat & rename levels
data_cleaned["material"] = pd.Categorical(data_cleaned["material"])
data_cleaned["material"] = data_cleaned["material"].cat.rename_categories(
    lambda x: x.replace(" - ", "_").replace("-", "_").replace(" ", "_")
)

In [38]:
data_cleaned.material.value_counts()

leather                2179
cotton                 1026
wool                    646
polyester               426
suede                   339
synthetic               326
cloth                   304
patent_leather          250
silk                    197
viscose                 158
denim_jeans              90
other                    87
velvet                   81
glitter                  77
rubber                   71
cashmere                 68
fur                      63
cotton_elasthane         62
pony_style_calfskin      37
tweed                    37
exotic_leathers          37
linen                    30
plastic                  28
vegan_leather            16
faux_fur                 12
lycra                     7
lace                      5
spandex                   3
sponge                    3
polyamide                 2
Name: material, dtype: int64

In [39]:
# remove materials with counts < 10
to_keep = [
    key 
    for key, val in data_cleaned["material"].value_counts().items()
    if val >= 10
]
data_cleaned = data_cleaned[data_cleaned.material.isin(to_keep)]

In [40]:
data_cleaned.material = data_cleaned.material.cat.remove_unused_categories()

In [41]:
data_cleaned.material.value_counts()

leather                2179
cotton                 1026
wool                    646
polyester               426
suede                   339
synthetic               326
cloth                   304
patent_leather          250
silk                    197
viscose                 158
denim_jeans              90
other                    87
velvet                   81
glitter                  77
rubber                   71
cashmere                 68
fur                      63
cotton_elasthane         62
pony_style_calfskin      37
exotic_leathers          37
tweed                    37
linen                    30
plastic                  28
vegan_leather            16
faux_fur                 12
Name: material, dtype: int64

## `color`

In [42]:
# convert "color" to cat & rename levels
data_cleaned["color"] = pd.Categorical(data_cleaned["color"])
data_cleaned["color"] = data_cleaned["color"].cat.rename_categories(
    lambda x: x.replace(" / ", "_")
)

In [43]:
data_cleaned.color.value_counts()

black          2336
white           752
multicolour     519
blue            464
beige           437
brown           328
grey            254
pink            222
green           181
red             180
navy            149
camel           120
silver           84
burgundy         78
khaki            70
ecru             68
gold             67
purple           65
other            60
yellow           55
orange           52
anthracite       44
metallic         41
turquoise        20
white_black       1
Name: color, dtype: int64

In [44]:
# remove "white_black" category
data_cleaned = data_cleaned[data_cleaned.color != "white_black"]
data_cleaned.color = data_cleaned.color.cat.remove_unused_categories()

In [45]:
data_cleaned.color.value_counts()

black          2336
white           752
multicolour     519
blue            464
beige           437
brown           328
grey            254
pink            222
green           181
red             180
navy            149
camel           120
silver           84
burgundy         78
khaki            70
ecru             68
gold             67
purple           65
other            60
yellow           55
orange           52
anthracite       44
metallic         41
turquoise        20
Name: color, dtype: int64

## `location`

In [46]:
# reduce "location" variable
def map_geo_area(country):
    """Map a country to the corresponding geographical area."""
    if country not in (
        "canada",
        "israel",
        "singapore", 
        "switzerland", 
        "united kingdom", 
        "united states"
    ): 
        return "EU"
    else: 
        if country != "united kingdom":
            return "other_countries"
        else: 
            return country
data_cleaned.location = data_cleaned.location.replace("vestiaire collective france", "france")
data_cleaned.location = data_cleaned.location.apply(map_geo_area)
data_cleaned.location = data_cleaned.location.replace(" ", "_") 

In [47]:
data_cleaned.location.value_counts()

EU                 5666
united kingdom      969
other_countries      11
Name: location, dtype: int64

## `size`

In [48]:
data_cleaned

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.00,1,09/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,39,EU
1,19181389,https://fr.vestiairecollective.com//women-clot...,1,40.55,1,12/11/2021,women,clothing,jeans,acne_studios,good_condition,denim_jeans,navy,30,EU
2,19182029,https://fr.vestiairecollective.com//men-clothi...,6,332.50,1,12/11/2021,men,clothing,coats,acne_studios,good_condition,wool,black,l,EU
3,19132670,https://fr.vestiairecollective.com//men-clothi...,3,45.00,0,09/11/2021,men,clothing,jeans,acne_studios,never_worn,cotton,grey,28,EU
4,19118182,https://fr.vestiairecollective.com//women-clot...,9,105.00,0,09/11/2021,women,clothing,dresses,acne_studios,very_good_condition,linen,black,s,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,19041594,https://fr.vestiairecollective.com//women-shoe...,2,335.00,0,05/11/2021,women,shoes,sandals,yves_saint_laurent,very_good_condition,leather,green,40,EU
35,18991208,https://fr.vestiairecollective.com//women-clot...,0,163.00,1,03/11/2021,women,clothing,jackets,yves_saint_laurent,very_good_condition,silk,brown,38,EU
36,19203307,https://fr.vestiairecollective.com//women-clot...,4,251.76,0,13/11/2021,women,clothing,jackets,yves_saint_laurent,never_worn,wool,black,34,united kingdom
38,18982027,https://fr.vestiairecollective.com//men-clothi...,1,135.00,1,02/11/2021,men,clothing,suits,yves_saint_laurent,good_condition,wool,grey,54,EU


In [49]:
# add no size for items of type "accessories", "bags" or "jewellery"
data_cleaned.loc[
    data_cleaned.category.isin(["accessories", "bags", "jewellery"]), 
    "size"
] = "no size"

### Shoes

In [50]:
sizes = [
    str( int(size) ) 
    if size.is_integer() 
    else str(size)
    for size in np.linspace(start=0, stop=20, num=41)
]

In [51]:
uk_shoe_sizes = np.array(list(set( 
    data_cleaned[
        ( data_cleaned["category"] == "shoes" ) &
        ( data_cleaned["size"].isin(sizes) )
    ]["size"]
))).astype(float)
np.sort(uk_shoe_sizes).tolist()

[2.0,
 3.0,
 3.5,
 4.0,
 4.5,
 5.0,
 5.5,
 6.0,
 6.5,
 7.0,
 7.5,
 8.0,
 8.5,
 9.0,
 9.5,
 10.0,
 10.5,
 11.0,
 12.0,
 17.0]

In [52]:
# convert UK shoe size to EU shoe size
conversion_shoe_size = {
    "2": "35", 
    "3": "36", 
    "3.5": "36",
    "4": "37", 
    "4.5": "37", 
    "5": "38", 
    "5.5": "39", 
    "6": "39", 
    "6.5": "40", 
    "7": "41", 
    "7.5": "41", 
    "8": "42", 
    "8.5": "42", 
    "9": "43",
    "9.5": "44",  
    "10": "45", 
    "10.5": "45", 
    "11": "46", 
    "12": "47", 
}

In [53]:
data_cleaned[data_cleaned.category == "shoes"] = data_cleaned[data_cleaned.category == "shoes"].replace({"size": conversion_shoe_size})

In [54]:
temp = data_cleaned[data_cleaned.category == "shoes"]["size"].value_counts()
temp

39      432
37      419
38      409
40      319
41      247
36      236
42      192
37.5    162
38.5    153
43      117
39.5    111
36.5     94
44       89
35       76
45       62
40.5     61
41.5     32
35.5     30
42.5     27
46       21
34       14
43.5     11
44.5      9
31        5
30        4
32        4
34.5      3
24        2
17        2
47        2
45.5      2
23        1
28        1
29        1
27        1
Name: size, dtype: int64

In [55]:
shoe_size_count = {
    "size": [], 
    "count": [],
    "freq": []
}
cumul_count = sum(temp.values)
for size, count in temp.items(): 
    shoe_size_count["size"].append(size)
    shoe_size_count["count"].append(count)
    shoe_size_count["freq"].append(count/cumul_count)

In [56]:
shoe_size_count = pd.DataFrame(shoe_size_count)
shoe_size_count

Unnamed: 0,size,count,freq
0,39.0,432,0.128917
1,37.0,419,0.125037
2,38.0,409,0.122053
3,40.0,319,0.095195
4,41.0,247,0.073709
5,36.0,236,0.070427
6,42.0,192,0.057296
7,37.5,162,0.048344
8,38.5,153,0.045658
9,43.0,117,0.034915


In [57]:
q1, q2, q3 = np.quantile(a=shoe_size_count["freq"], q=[.25, .5, .75])

In [58]:
def classify_shoe_size(size): 
    freq = shoe_size_count.loc[shoe_size_count["size"] == size, "freq"].values[0]
    if freq < q1: 
        return "rare_size"
    if freq >= q1 and freq < q2: 
        return "not_common_size"
    if freq >= q2 and freq < q3:
        return "common_size"
    else: 
        return "very_common_size" 

In [59]:
data_cleaned.loc[data_cleaned.category=="shoes", "size"] = data_cleaned.loc[data_cleaned.category=="shoes", "size"].apply(classify_shoe_size)

In [60]:
data_cleaned.loc[data_cleaned.category=="shoes", ]

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.00,1,09/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,very_common_size,EU
10,19068472,https://fr.vestiairecollective.com//women-shoe...,2,77.00,1,06/11/2021,women,shoes,ankle boots,acne_studios,good_condition,leather,black,very_common_size,EU
13,19077860,https://fr.vestiairecollective.com//women-shoe...,38,220.00,1,07/11/2021,women,shoes,trainers,acne_studios,very_good_condition,suede,black,very_common_size,EU
19,19203385,https://fr.vestiairecollective.com//women-shoe...,5,110.00,0,13/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,very_common_size,EU
21,19214900,https://fr.vestiairecollective.com//women-shoe...,2,110.00,0,13/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,green,very_common_size,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,18971724,https://fr.vestiairecollective.com//men-shoes/...,2,250.00,0,02/11/2021,men,shoes,boots,yves_saint_laurent,very_good_condition,leather,brown,common_size,EU
11,19206161,https://fr.vestiairecollective.com//women-shoe...,2,347.60,0,13/11/2021,women,shoes,boots,yves_saint_laurent,very_good_condition,leather,black,very_common_size,EU
25,19223255,https://fr.vestiairecollective.com//women-shoe...,0,150.00,0,14/11/2021,women,shoes,sandals,yves_saint_laurent,very_good_condition,suede,black,very_common_size,EU
33,19176212,https://fr.vestiairecollective.com//men-shoes/...,3,245.76,0,12/11/2021,men,shoes,trainers,yves_saint_laurent,never_worn,leather,black,very_common_size,EU


### Clothing

In [61]:
set( data_cleaned.loc[data_cleaned["category"] == "clothing", "size"] )

{'0',
 '00',
 '0000',
 '1',
 '10',
 '12',
 '14',
 '14-16',
 '15.5',
 '16',
 '18',
 '18-20',
 '2',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '32',
 '33',
 '34',
 '36',
 '37',
 '38',
 '39',
 '3xs',
 '4',
 '40',
 '41',
 '42',
 '43',
 '44',
 '46',
 '48',
 '5',
 '50',
 '50-52',
 '52',
 '54',
 '56',
 '6',
 '8',
 '9',
 'international',
 'l',
 'm',
 'no size',
 's',
 'taille',
 'xl',
 'xs',
 'xxl',
 'xxs',
 'xxxl'}

In [62]:
data_cleaned.loc[
    (data_cleaned["size"] == "2") &
    (data_cleaned.location == "EU"), 
    "size"
] = "m"

In [63]:
data_cleaned.loc[
    (data_cleaned["size"] == "2") &
    (data_cleaned.location != "EU"), 
    "size"
] = "xs"

In [64]:
data_cleaned.loc[data_cleaned["size"] == "0000", "size"] = "xxs"
data_cleaned.loc[data_cleaned["size"] == "00", "size"] = "xs"
data_cleaned.loc[data_cleaned["size"] == "0", "size"] = "s"
data_cleaned.loc[data_cleaned["size"] == "1", "size"] = "m"

In [65]:
data_cleaned.loc[
    (data_cleaned["category"] == "clothing") &
    (data_cleaned["size"] == "14-16"), 
    "size"
] = "l"

In [66]:
data_cleaned.loc[
    (data_cleaned["category"] == "clothing") &
    (data_cleaned["size"] == "15.5"), 
    "size"
] = "m"

In [67]:
data_cleaned.loc[
    (data_cleaned["category"] == "clothing") &
    (data_cleaned["size"] == "18-20"), 
    "size"
] = "xl"

In [68]:
data_cleaned.loc[
    (data_cleaned["category"] == "clothing") &
    (data_cleaned["size"] == "50-52"), 
    "size"
] = "xxl"

In [69]:
data_cleaned.loc[
    ( data_cleaned["category"] == "clothing" ) &
    ( data_cleaned["size"].isin(["international", "taille"]) ), 
    "size"
] = "no size"

In [70]:
# jeans
def resize_jeans(old_size): 
    try:
        new_size = int(old_size)
        if new_size < 26: 
            return "xs"
        if new_size < 28: 
            return "s"
        if new_size < 32: 
            return "m"
        if new_size < 34: 
            return "l"
        if new_size < 36:
            return "xl"
        else: 
            return "xxl"
    except ValueError: 
        return old_size

In [71]:
data_cleaned.loc[
    data_cleaned.sub_category == "jeans", 
    "size"
] = data_cleaned.loc[
    data_cleaned.sub_category == "jeans", 
    "size"
].apply(resize_jeans) 

In [72]:
# trousers
def resize_trousers(old_size): 
    try:
        new_size = int(old_size)
        if new_size >= 26: 
            if new_size < 32: 
                return "xxs"
            if new_size < 34: 
                return "xs"
            if new_size < 38: 
                return "s"
            if new_size < 42: 
                return "m"
            if new_size < 44:
                return "l"
            if new_size < 46:
                return "xl"
            else: 
                return "xxl"
    except ValueError: 
        return old_size

In [73]:
data_cleaned.loc[
    data_cleaned.category == "clothing", 
    "size"
] = data_cleaned.loc[
    data_cleaned.category == "clothing",
    "size"
].apply(resize_trousers)

In [74]:
data_cleaned.loc[data_cleaned["size"] == "14", "size"] = "l"
data_cleaned.loc[data_cleaned["size"].isin(["14-16", "16"]), "size"] = "xl"
data_cleaned.loc[data_cleaned["size"] == "15.5", "size"] = "s" 

In [75]:
data_cleaned.loc[
    data_cleaned["size"].isin( ["18", "18-20"] ),
    "size" 
] = "xxl"

In [76]:
data_cleaned.loc[
    data_cleaned.category == "clothing", 
    "size"
].value_counts()

m          1132
s           694
l           415
xxl         351
xl          236
xs          172
no size      27
xxs          27
xxxl          2
3xs           1
Name: size, dtype: int64

In [77]:
data_cleaned.loc[
    (data_cleaned.category == "clothing") &
    data_cleaned["size"].isin(["s", "m", "l"]), 
    "size"
] = "very_common_size"

In [78]:
data_cleaned.loc[
    (data_cleaned.category == "clothing") &
    (data_cleaned["size"].isin(["xxl", "xl", "xs"])), 
    "size"
] = "common_size"

In [79]:
data_cleaned.loc[
    (data_cleaned.category == "clothing") &
    (data_cleaned["size"] == "xxs"), 
    "size"
] = "not_common_size"

In [80]:
data_cleaned.loc[
    (data_cleaned.category == "clothing") &
    (data_cleaned["size"].isin(["xxxl", "3xs"])), 
    "size"
] = "rare_size"

In [81]:
# convert size to cat variable 
data_cleaned["size"] = pd.Categorical(data_cleaned["size"])

In [82]:
data_cleaned["size"].value_counts()

very_common_size    4810
common_size         1431
not_common_size      122
no size               27
rare_size             18
Name: size, dtype: int64

## Feature selection

In [83]:
data_cleaned.index = data_cleaned.id

In [84]:
# feature selection
data_cleaned = data_cleaned.drop(["id", "url", "online_date"], axis=1)

In [85]:
data_cleaned.columns

Index(['num_likes', 'price', 'we_love_tag', 'gender', 'category',
       'sub_category', 'designer', 'condition', 'material', 'color', 'size',
       'location'],
      dtype='object')

In [86]:
data_cleaned.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,gender,category,sub_category,designer,condition,material,color,size,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
19126896,7,180.0,1,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,very_common_size,EU
19181389,1,40.55,1,women,clothing,jeans,acne_studios,good_condition,denim_jeans,navy,very_common_size,EU
19182029,6,332.5,1,men,clothing,coats,acne_studios,good_condition,wool,black,very_common_size,EU
19132670,3,45.0,0,men,clothing,jeans,acne_studios,never_worn,cotton,grey,very_common_size,EU
19118182,9,105.0,0,women,clothing,dresses,acne_studios,very_good_condition,linen,black,very_common_size,EU


## Transform categories to dummies 

In [87]:
# convert cat variables to dummies
data_cleaned = pd.get_dummies(
    data_cleaned, 
    columns=[
        "gender", 
        "category", 
        "sub_category", 
        "designer", 
        "condition",
        "material", 
        "color", 
        "size", 
        "location"
    ], 
    prefix="", 
    prefix_sep="", 
    drop_first=True
)

In [88]:
data_cleaned.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,men,women,shoes,ballet flats,boots,coats,dresses,...,silver,turquoise,white,yellow,no size,not_common_size,rare_size,very_common_size,other_countries,united kingdom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19126896,7,180.0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19181389,1,40.55,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19182029,6,332.5,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
19132670,3,45.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19118182,9,105.0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [89]:
for col in data_cleaned.columns: 
    print(col)

num_likes
price
we_love_tag
men
women
shoes
ballet flats
boots
coats
dresses
espadrilles
first shoes
flats
heels
jackets
jackets & coats
jeans
jumpsuits
knitwear
knitwear & sweatshirts
lace ups
leather jackets
lingerie
mules & clogs
outfits
polo shirts
sandals
shirts
shorts
skirts
suits
swimwear
t-shirts
tops
trainers
trench coats
trousers
alexander_mcqueen
alexander_wang
balenciaga
balmain
bottega_veneta
burberry
celine
chanel
chloé
christian_louboutin
coach
dior
dolce_&_gabbana
fendi
givenchy
golden_goose
gucci
hermès
isabel_marant
jean_paul_gaultier
jimmy_choo
kate_spade
louis_vuitton
maison_martin_margiela
michael_kors
moncler
off_white
prada
saint_laurent
salvatore_ferragamo
tory_burch
valentino_garavani
versace
yves_saint_laurent
good_condition
never_worn
very_good_condition
cloth
cotton
cotton_elasthane
denim_jeans
exotic_leathers
faux_fur
fur
glitter
leather
linen
other
patent_leather
plastic
polyester
pony_style_calfskin
rubber
silk
suede
synthetic
tweed
vegan_leather
velvet
v

## Save cleaned data

In [90]:
save_path = "./backup/vc_data_cleaned.pkl"
data_cleaned.to_pickle(path=save_path)

FileNotFoundError: [Errno 2] No such file or directory: './backup/vc_data_cleaned.pkl'