# Vestiaire Collective - Data preprocessing 

In [1]:
import pandas as pd
import numpy as np

## Load the dataset 

In [2]:
file_path = "C:/Users/pemma/OneDrive - Université de Tours/Mécen/M2/S1/02 - Machine Learning/05 - Projet/ML_Vestiaire_Collective/backup/vc_data.pkl"

In [3]:
data  = pd.read_pickle(file_path)

In [4]:
data.head()

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.0,True,09/11/2021,women,shoes,ankle boots,acne studios,very good condition,leather,black,39,italy
1,19181389,https://fr.vestiairecollective.com//women-clot...,1,40.55,True,12/11/2021,women,clothing,jeans,acne studios,good condition,denim - jeans,navy,30,poland
2,19182029,https://fr.vestiairecollective.com//men-clothi...,6,332.5,True,12/11/2021,men,clothing,coats,acne studios,good condition,wool,black,l,denmark
3,19132670,https://fr.vestiairecollective.com//men-clothi...,3,45.0,False,09/11/2021,men,clothing,jeans,acne studios,"never worn, with tag",cotton,grey,28,germany
4,19118182,https://fr.vestiairecollective.com//women-clot...,9,105.0,False,09/11/2021,women,clothing,dresses,acne studios,very good condition,linen,black,s,germany


In [9]:
data.shape

(10409, 15)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10409 entries, 0 to 43
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10409 non-null  int64  
 1   url           10409 non-null  object 
 2   num_likes     10409 non-null  int64  
 3   price         10409 non-null  float64
 4   we_love_tag   10409 non-null  bool   
 5   online_date   10409 non-null  object 
 6   gender        10409 non-null  object 
 7   category      10409 non-null  object 
 8   sub_category  10409 non-null  object 
 9   designer      10409 non-null  object 
 10  condition     10409 non-null  object 
 11  material      10409 non-null  object 
 12  color         10409 non-null  object 
 13  size          10409 non-null  object 
 14  location      10409 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(11)
memory usage: 1.2+ MB


In [11]:
data_mod = data.copy()

## `we_love_tag`

In [12]:
# convert "we_love_tag" to binary (0, 1)
data_mod["we_love_tag"] = data_mod["we_love_tag"].astype(int)

## `gender`

In [13]:
# convert "gender" to cat variable
data_mod["gender"] = pd.Categorical(data_mod.gender)

In [14]:
data_mod.gender.value_counts()

women            8266
men              2001
kids              133
life & living       9
Name: gender, dtype: int64

In [15]:
# remove "life & living"
data_mod = data_mod[data_mod.gender != "life & living"]
data_mod.gender = data_mod.gender.cat.remove_unused_categories()

In [16]:
data_mod.gender.value_counts()

women    8266
men      2001
kids      133
Name: gender, dtype: int64

## `category`

In [17]:
# convert "category" to cat variable
data_mod["category"] = pd.Categorical(data_mod.category)

In [18]:
data_mod.category.value_counts()

shoes          3351
clothing       3214
bags           2265
accessories    1116
jewellery       352
girls            56
boys             46
Name: category, dtype: int64

In [19]:
# change "boys" and "girls" to "clothing" in "category"
data_mod = data_mod.replace("boys", "clothing").replace("girls", "clothing")

In [20]:
data_mod.category = data_mod.category.cat.remove_unused_categories()

In [21]:
data_mod.category.value_counts()

shoes          3351
clothing       3316
bags           2265
accessories    1116
jewellery       352
Name: category, dtype: int64

## `sub-category`

In [22]:
# convert "sub_category" to cat variable
data_mod["sub_category"] = pd.Categorical(data_mod.sub_category)

In [23]:
# remove sub categories with counts < 10
to_keep = [
    key 
    for key, val in data_mod["sub_category"].value_counts().items()
    if val >= 10
]
data_mod = data_mod[data_mod.sub_category.isin(to_keep)]

In [24]:
data_mod.sub_category = data_mod.sub_category.cat.remove_unused_categories()

In [25]:
data_mod.sub_category.value_counts()

handbags                       1858
trainers                       1037
ankle boots                     594
jackets                         555
coats                           525
sandals                         493
heels                           487
boots                           368
dresses                         334
tops                            326
knitwear                        301
knitwear & sweatshirts          213
belts                           202
trousers                        197
scarves                         189
sunglasses                      168
wallets                         151
t-shirts                        149
flats                           143
ballet flats                    140
clutch bags                     121
jeans                           121
bags                            121
skirts                          117
bracelets                       116
trench coats                     96
swimwear                         93
purses, wallets & cases     

## `designer`

In [26]:
# convert "designer" to cat & rename levels
data_mod["designer"] = pd.Categorical(data_mod["designer"])
data_mod["designer"] = data_mod["designer"].cat.rename_categories(
    lambda x: x.replace(" ", "_").replace("-", "_")
)

In [27]:
data_mod = data_mod[data_mod["designer"] != "démoo"]

In [28]:
data_mod.designer = data_mod.designer.cat.remove_unused_categories()

## `condition`

In [29]:
# convert "condition" to cat variable
data_mod["condition"] = pd.Categorical(data_mod.condition)

In [30]:
# value counts "condition"
data_mod.condition.value_counts()

very good condition     4780
never worn              3384
good condition          1193
never worn, with tag     817
fair condition           184
Name: condition, dtype: int64

In [31]:
# rename levels
data_mod["condition"] = data_mod["condition"].replace(to_replace="never worn, with tag", value="never worn")
data_mod["condition"] = data_mod["condition"].cat.rename_categories(["fair_condition", "good_condition", "never_worn", "very_good_condition"])

## `material`

In [32]:
# convert "material" to cat & rename levels
data_mod["material"] = pd.Categorical(data_mod["material"])
data_mod["material"] = data_mod["material"].cat.rename_categories(
    lambda x: x.replace(" - ", "_").replace("-", "_").replace(" ", "_")
)

In [33]:
data_mod.material.value_counts()

leather                4135
cotton                 1127
wool                    762
cloth                   653
polyester               471
synthetic               393
silk                    382
suede                   378
patent_leather          311
other                   199
metal                   195
viscose                 162
plastic                 149
denim_jeans             105
cashmere                103
velvet                   93
glitter                  91
fur                      82
rubber                   73
exotic_leathers          72
cotton_elasthane         62
pony_style_calfskin      52
tweed                    46
linen                    41
silver                   39
gold_plated              39
steel                    35
vegan_leather            31
faux_fur                 14
lycra                     7
gold_and_steel            6
pearls                    6
yellow_gold               5
wicker                    5
lace                      5
wood                

In [34]:
# remove materials with counts < 10
to_keep = [
    key 
    for key, val in data_mod["material"].value_counts().items()
    if val >= 10
]
data_mod = data_mod[data_mod.material.isin(to_keep)]

In [35]:
data_mod.material = data_mod.material.cat.remove_unused_categories()

In [36]:
data_mod.material.value_counts()

leather                4135
cotton                 1127
wool                    762
cloth                   653
polyester               471
synthetic               393
silk                    382
suede                   378
patent_leather          311
other                   199
metal                   195
viscose                 162
plastic                 149
denim_jeans             105
cashmere                103
velvet                   93
glitter                  91
fur                      82
rubber                   73
exotic_leathers          72
cotton_elasthane         62
pony_style_calfskin      52
tweed                    46
linen                    41
silver                   39
gold_plated              39
steel                    35
vegan_leather            31
faux_fur                 14
Name: material, dtype: int64

## `color`

In [37]:
# convert "color" to cat & rename levels
data_mod["color"] = pd.Categorical(data_mod["color"])
data_mod["color"] = data_mod["color"].cat.rename_categories(
    lambda x: x.replace(" / ", "_")
)

In [38]:
data_mod.color.value_counts()

black          3422
white           906
multicolour     790
brown           731
beige           699
blue            671
pink            411
grey            375
red             309
green           303
gold            282
navy            201
camel           196
silver          169
burgundy        158
orange          108
purple          100
yellow           98
ecru             82
khaki            79
other            68
anthracite       57
metallic         55
turquoise        24
white_black       1
Name: color, dtype: int64

In [39]:
# remove "white_black" category
data_mod = data_mod[data_mod.color != "white_black"]
data_mod.color = data_mod.color.cat.remove_unused_categories()

In [40]:
data_mod.color.value_counts()

black          3422
white           906
multicolour     790
brown           731
beige           699
blue            671
pink            411
grey            375
red             309
green           303
gold            282
navy            201
camel           196
silver          169
burgundy        158
orange          108
purple          100
yellow           98
ecru             82
khaki            79
other            68
anthracite       57
metallic         55
turquoise        24
Name: color, dtype: int64

## `location`

In [41]:
# reduce "location" variable
def map_geo_area(country):
    """Map a country to the corresponding geographical area."""
    if country not in (
        "canada",
        "israel",
        "singapore", 
        "switzerland", 
        "united kingdom", 
        "united states"
    ): 
        return "EU"
    else: 
        if country != "united kingdom":
            return "other_countries"
        else: 
            return country
data_mod.location = data_mod.location.replace("vestiaire collective france", "france")
data_mod.location = data_mod.location.apply(map_geo_area)
data_mod.location = data_mod.location.replace(" ", "_") 

In [42]:
data_mod.location.value_counts()

EU                 8884
united kingdom     1394
other_countries      16
Name: location, dtype: int64

## `size`

In [43]:
data_mod

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.00,1,09/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,39,EU
1,19181389,https://fr.vestiairecollective.com//women-clot...,1,40.55,1,12/11/2021,women,clothing,jeans,acne_studios,good_condition,denim_jeans,navy,30,EU
2,19182029,https://fr.vestiairecollective.com//men-clothi...,6,332.50,1,12/11/2021,men,clothing,coats,acne_studios,good_condition,wool,black,l,EU
3,19132670,https://fr.vestiairecollective.com//men-clothi...,3,45.00,0,09/11/2021,men,clothing,jeans,acne_studios,never_worn,cotton,grey,28,EU
4,19118182,https://fr.vestiairecollective.com//women-clot...,9,105.00,0,09/11/2021,women,clothing,dresses,acne_studios,very_good_condition,linen,black,s,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,18982027,https://fr.vestiairecollective.com//men-clothi...,1,135.00,1,02/11/2021,men,clothing,suits,yves_saint_laurent,good_condition,wool,grey,54,EU
39,19062770,https://fr.vestiairecollective.com//women-acce...,4,44.00,1,06/11/2021,women,accessories,scarves,yves_saint_laurent,very_good_condition,polyester,navy,no size,EU
40,19210693,https://fr.vestiairecollective.com//women-acce...,15,80.00,0,13/11/2021,women,accessories,belts,yves_saint_laurent,very_good_condition,leather,blue,75,EU
41,18970201,https://fr.vestiairecollective.com//women-bags...,46,162.00,1,02/11/2021,women,bags,handbags,yves_saint_laurent,very_good_condition,synthetic,pink,no size,EU


In [44]:
# add no size for items of type "accessories", "bags" or "jewellery"
data_mod.loc[
    data_mod.category.isin(["accessories", "bags", "jewellery"]), 
    "size"
] = "no size"

### Shoes

In [45]:
sizes = [
    str( int(size) ) 
    if size.is_integer() 
    else str(size)
    for size in np.linspace(start=0, stop=20, num=41)
]

In [46]:
uk_shoe_sizes = np.array(list(set( 
    data_mod[
        ( data_mod["category"] == "shoes" ) &
        ( data_mod["size"].isin(sizes) )
    ]["size"]
))).astype(float)
np.sort(uk_shoe_sizes).tolist()

[2.0,
 3.0,
 3.5,
 4.0,
 4.5,
 5.0,
 5.5,
 6.0,
 6.5,
 7.0,
 7.5,
 8.0,
 8.5,
 9.0,
 9.5,
 10.0,
 10.5,
 11.0,
 12.0,
 17.0]

In [47]:
# convert UK shoe size to EU shoe size
conversion_shoe_size = {
    "2": "35", 
    "3": "36", 
    "3.5": "36",
    "4": "37", 
    "4.5": "37", 
    "5": "38", 
    "5.5": "39", 
    "6": "39", 
    "6.5": "40", 
    "7": "41", 
    "7.5": "41", 
    "8": "42", 
    "8.5": "42", 
    "9": "43",
    "9.5": "44",  
    "10": "45", 
    "10.5": "45", 
    "11": "46", 
    "12": "47", 
}

In [48]:
data_mod[data_mod.category == "shoes"] = data_mod[data_mod.category == "shoes"].replace({"size": conversion_shoe_size})

In [49]:
temp = data_mod[data_mod.category == "shoes"]["size"].value_counts()
temp

39      432
37      419
38      409
40      319
41      247
36      236
42      192
37.5    162
38.5    153
43      117
39.5    111
36.5     94
44       89
35       76
45       62
40.5     61
41.5     32
35.5     30
42.5     27
46       21
34       14
43.5     11
44.5      9
31        5
30        4
32        4
34.5      3
24        2
47        2
45.5      2
17        1
23        1
28        1
29        1
27        1
Name: size, dtype: int64

In [50]:
shoe_size_count = {
    "size": [], 
    "count": [],
    "freq": []
}
cumul_count = sum(temp.values)
for size, count in temp.items(): 
    shoe_size_count["size"].append(size)
    shoe_size_count["count"].append(count)
    shoe_size_count["freq"].append(count/cumul_count)

In [51]:
shoe_size_count = pd.DataFrame(shoe_size_count)
shoe_size_count

Unnamed: 0,size,count,freq
0,39.0,432,0.128955
1,37.0,419,0.125075
2,38.0,409,0.12209
3,40.0,319,0.095224
4,41.0,247,0.073731
5,36.0,236,0.070448
6,42.0,192,0.057313
7,37.5,162,0.048358
8,38.5,153,0.045672
9,43.0,117,0.034925


In [52]:
q1, q2, q3 = np.quantile(a=shoe_size_count["freq"], q=[.25, .5, .75])

In [53]:
def classify_shoe_size(size): 
    freq = shoe_size_count.loc[shoe_size_count["size"] == size, "freq"].values[0]
    if freq < q1: 
        return "rare_size"
    if freq >= q1 and freq < q2: 
        return "not_common_size"
    if freq >= q2 and freq < q3:
        return "common_size"
    else: 
        return "very_common_size" 

In [54]:
data_mod.loc[data_mod.category=="shoes", "size"] = data_mod.loc[data_mod.category=="shoes", "size"].apply(classify_shoe_size)

In [55]:
data_mod.loc[data_mod.category=="shoes", ]

Unnamed: 0,id,url,num_likes,price,we_love_tag,online_date,gender,category,sub_category,designer,condition,material,color,size,location
0,19126896,https://fr.vestiairecollective.com//women-shoe...,7,180.00,1,09/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,very_common_size,EU
10,19068472,https://fr.vestiairecollective.com//women-shoe...,2,77.00,1,06/11/2021,women,shoes,ankle boots,acne_studios,good_condition,leather,black,very_common_size,EU
13,19077860,https://fr.vestiairecollective.com//women-shoe...,38,220.00,1,07/11/2021,women,shoes,trainers,acne_studios,very_good_condition,suede,black,very_common_size,EU
19,19203385,https://fr.vestiairecollective.com//women-shoe...,5,110.00,0,13/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,very_common_size,EU
21,19214900,https://fr.vestiairecollective.com//women-shoe...,2,110.00,0,13/11/2021,women,shoes,ankle boots,acne_studios,very_good_condition,leather,green,very_common_size,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,18971724,https://fr.vestiairecollective.com//men-shoes/...,2,250.00,0,02/11/2021,men,shoes,boots,yves_saint_laurent,very_good_condition,leather,brown,common_size,EU
11,19206161,https://fr.vestiairecollective.com//women-shoe...,2,347.60,0,13/11/2021,women,shoes,boots,yves_saint_laurent,very_good_condition,leather,black,very_common_size,EU
25,19223255,https://fr.vestiairecollective.com//women-shoe...,0,150.00,0,14/11/2021,women,shoes,sandals,yves_saint_laurent,very_good_condition,suede,black,very_common_size,EU
33,19176212,https://fr.vestiairecollective.com//men-shoes/...,3,245.76,0,12/11/2021,men,shoes,trainers,yves_saint_laurent,never_worn,leather,black,very_common_size,EU


### Clothing

In [56]:
set( data_mod.loc[data_mod["category"] == "clothing", "size"] )

{'0',
 '00',
 '0000',
 '1',
 '10',
 '12',
 '14',
 '14-16',
 '15.5',
 '16',
 '18',
 '18-20',
 '2',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '32',
 '33',
 '34',
 '36',
 '37',
 '38',
 '39',
 '3xs',
 '4',
 '40',
 '41',
 '42',
 '43',
 '44',
 '46',
 '48',
 '5',
 '50',
 '50-52',
 '52',
 '54',
 '56',
 '6',
 '8',
 '9',
 'international',
 'l',
 'm',
 'no size',
 's',
 'taille',
 'xl',
 'xs',
 'xxl',
 'xxs',
 'xxxl'}

In [57]:
data_mod.loc[
    (data_mod["size"] == "2") &
    (data_mod.location == "EU"), 
    "size"
] = "m"

In [58]:
data_mod.loc[
    (data_mod["size"] == "2") &
    (data_mod.location != "EU"), 
    "size"
] = "xs"

In [59]:
data_mod.loc[data_mod["size"] == "0000", "size"] = "xxs"
data_mod.loc[data_mod["size"] == "00", "size"] = "xs"
data_mod.loc[data_mod["size"] == "0", "size"] = "s"
data_mod.loc[data_mod["size"] == "1", "size"] = "m"

In [60]:
data_mod.loc[
    (data_mod["category"] == "clothing") &
    (data_mod["size"] == "14-16"), 
    "size"
] = "l"

In [61]:
data_mod.loc[
    (data_mod["category"] == "clothing") &
    (data_mod["size"] == "15.5"), 
    "size"
] = "m"

In [62]:
data_mod.loc[
    (data_mod["category"] == "clothing") &
    (data_mod["size"] == "18-20"), 
    "size"
] = "xl"

In [63]:
data_mod.loc[
    (data_mod["category"] == "clothing") &
    (data_mod["size"] == "50-52"), 
    "size"
] = "xxl"

In [64]:
data_mod.loc[
    ( data_mod["category"] == "clothing" ) &
    ( data_mod["size"].isin(["international", "taille"]) ), 
    "size"
] = "no size"

In [65]:
# jeans
def resize_jeans(old_size): 
    try:
        new_size = int(old_size)
        if new_size < 26: 
            return "xs"
        if new_size < 28: 
            return "s"
        if new_size < 32: 
            return "m"
        if new_size < 34: 
            return "l"
        if new_size < 36:
            return "xl"
        else: 
            return "xxl"
    except ValueError: 
        return old_size

In [66]:
data_mod.loc[
    data_mod.sub_category == "jeans", 
    "size"
] = data_mod.loc[
    data_mod.sub_category == "jeans", 
    "size"
].apply(resize_jeans) 

In [67]:
# trousers
def resize_trousers(old_size): 
    try:
        new_size = int(old_size)
        if new_size >= 26: 
            if new_size < 32: 
                return "xxs"
            if new_size < 34: 
                return "xs"
            if new_size < 38: 
                return "s"
            if new_size < 42: 
                return "m"
            if new_size < 44:
                return "l"
            if new_size < 46:
                return "xl"
            else: 
                return "xxl"
    except ValueError: 
        return old_size

In [68]:
data_mod.loc[
    data_mod.category == "clothing", 
    "size"
] = data_mod.loc[
    data_mod.category == "clothing",
    "size"
].apply(resize_trousers)

In [69]:
data_mod.loc[data_mod["size"] == "14", "size"] = "l"
data_mod.loc[data_mod["size"].isin(["14-16", "16"]), "size"] = "xl"
data_mod.loc[data_mod["size"] == "15.5", "size"] = "s" 

In [70]:
data_mod.loc[
    data_mod["size"].isin( ["18", "18-20"] ),
    "size" 
] = "xxl"

In [71]:
data_mod.loc[
    data_mod.category == "clothing", 
    "size"
].value_counts()

m          1130
s           694
l           415
xxl         351
xl          236
xs          172
no size      27
xxs          27
xxxl          2
3xs           1
Name: size, dtype: int64

In [72]:
data_mod.loc[
    (data_mod.category == "clothing") &
    data_mod["size"].isin(["s", "m", "l"]), 
    "size"
] = "very_common_size"

In [73]:
data_mod.loc[
    (data_mod.category == "clothing") &
    (data_mod["size"].isin(["xxl", "xl", "xs"])), 
    "size"
] = "common_size"

In [74]:
data_mod.loc[
    (data_mod.category == "clothing") &
    (data_mod["size"] == "xxs"), 
    "size"
] = "not_common_size"

In [75]:
data_mod.loc[
    (data_mod.category == "clothing") &
    (data_mod["size"].isin(["xxxl", "3xs"])), 
    "size"
] = "rare_size"

In [76]:
# convert size to cat variable 
data_mod["size"] = pd.Categorical(data_mod["size"])

In [77]:
data_mod["size"].value_counts()

very_common_size    4808
no size             3685
common_size         1431
not_common_size      122
rare_size             17
Name: size, dtype: int64

## Feature selection

In [81]:
data_mod.index = data_mod.id

In [83]:
# feature selection
data_mod = data_mod.drop(["id", "url", "online_date"], axis=1)

In [84]:
data_mod.columns

Index(['num_likes', 'price', 'we_love_tag', 'gender', 'category',
       'sub_category', 'designer', 'condition', 'material', 'color', 'size',
       'location'],
      dtype='object')

In [85]:
data_mod.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,gender,category,sub_category,designer,condition,material,color,size,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
19126896,7,180.0,1,women,shoes,ankle boots,acne_studios,very_good_condition,leather,black,very_common_size,EU
19181389,1,40.55,1,women,clothing,jeans,acne_studios,good_condition,denim_jeans,navy,very_common_size,EU
19182029,6,332.5,1,men,clothing,coats,acne_studios,good_condition,wool,black,very_common_size,EU
19132670,3,45.0,0,men,clothing,jeans,acne_studios,never_worn,cotton,grey,very_common_size,EU
19118182,9,105.0,0,women,clothing,dresses,acne_studios,very_good_condition,linen,black,very_common_size,EU


## Transform categories to dummies 

In [86]:
# convert cat variables to dummies
data_mod = pd.get_dummies(
    data_mod, 
    columns=[
        "gender", 
        "category", 
        "sub_category", 
        "designer", 
        "condition",
        "material", 
        "color", 
        "size", 
        "location"
    ], 
    prefix="", 
    prefix_sep="", 
    drop_first=True
)

In [87]:
data_mod.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,men,women,bags,clothing,jewellery,shoes,backpacks,...,silver,turquoise,white,yellow,no size,not_common_size,rare_size,very_common_size,other_countries,united kingdom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19126896,7,180.0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
19181389,1,40.55,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19182029,6,332.5,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19132670,3,45.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19118182,9,105.0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [88]:
for col in data_mod.columns: 
    print(col)

num_likes
price
we_love_tag
men
women
bags
clothing
jewellery
shoes
backpacks
bag charms
bags
ballet flats
belts
boots
bracelets
clutch bags
coats
dresses
earrings
espadrilles
flats
gloves
hair accessories
handbags
hats
hats & pull on hats
heels
jackets
jackets & coats
jeans
jewellery
jumpsuits
knitwear
knitwear & sweatshirts
lace ups
leather jackets
lingerie
mules & clogs
necklaces
pins & brooches
polo shirts
purses, wallets & cases
rings
sandals
scarves
scarves & pocket squares
shirts
shorts
silk handkerchief
skirts
small bags, wallets & cases
suits
sunglasses
swimwear
t-shirts
ties
tops
trainers
travel bags
trench coats
trousers
wallets
watches
alexander_mcqueen
alexander_wang
balenciaga
balmain
bottega_veneta
burberry
celine
chanel
chloé
christian_louboutin
coach
dior
dolce_&_gabbana
fendi
givenchy
golden_goose
gucci
hermès
isabel_marant
jean_paul_gaultier
jimmy_choo
kate_spade
louis_vuitton
maison_martin_margiela
michael_kors
moncler
off_white
prada
saint_laurent
salvatore_ferraga

## Save cleaned data

In [89]:
save_path = "./backup/vc_data_mod.pkl"
data_mod.to_pickle(path=save_path)