# Machine Learning

Goal: predict the price of an item from [Vestiaire Collective](https://fr.vestiairecollective.com/)

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [61]:
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    GridSearchCV
)

In [58]:
from sklearn.linear_model import (
    LinearRegression, 
    Ridge,
    Lasso,
    ElasticNet
)

In [73]:
from sklearn.ensemble import (
    GradientBoostingRegressor, 
    RandomForestRegressor, 
)

## Data

In [5]:
data_path = "C:/Users/pemma/OneDrive - Université de Tours/Mécen/M2/S1/02 - Machine Learning/05 - Projet/ML_Vestiaire_Collective/backup/vc_data_cleaned.pkl"

In [6]:
data = pd.read_pickle(data_path)

In [7]:
data.head()

Unnamed: 0_level_0,num_likes,price,we_love_tag,lprice,men,women,bags,clothing,shoes,belts,...,size_38,size_39,size_40,size_41,size_42,size_43,size_44,size_>=45,eu,uk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19126896,7,180.0,1,5.198497,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
19181389,1,40.55,1,3.726898,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19182029,6,332.5,1,5.809643,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19132670,3,45.0,0,3.828641,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19118182,9,105.0,0,4.663439,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
data.columns

Index(['num_likes', 'price', 'we_love_tag', 'lprice', 'men', 'women', 'bags',
       'clothing', 'shoes', 'belts', 'boots', 'coats', 'dresses', 'flats',
       'handbags', 'hats', 'heels', 'jackets', 'jeans', 'knitwear_sweatshirts',
       'polo_shirts', 'sandals', 'scarves', 'skirts', 'small_bags',
       'sunglasses', 'swimwear', 'tops', 'trainers', 'travel_bags', 'trousers',
       'cheap_brand', 'expensive_brand', 'very_expensive_brand',
       'good_condition', 'never_worn', 'very_good_condition', 'leather',
       'cotton', 'wool', 'cloth', 'polyester', 'synthetic', 'suede', 'silk',
       'patent_leather', 'other', 'viscose', 'plastic', 'cashmere',
       'denim_jeans', 'black', 'white', 'multicolour', 'brown', 'beige',
       'blue', 'pink', 'grey', 'red', 'green', 'camel', 'navy', 'burgundy',
       'gold', 'silver', 'orange', 'yellow', 'purple', 'size_3xs', 'size_xxs',
       'size_xs', 'size_s', 'size_m', 'size_l', 'size_xl', 'size_xxl',
       'size_3xl', 'size_<=35', 'size

In [9]:
data.shape

(9694, 91)

In [13]:
X = data.drop(
    labels=["price", "lprice"], 
    axis=1
).values
y = data["lprice"].values

In [16]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y)

## Learning phase

Create a dictionnary to store models' score where `results_full` refers to the feature vector with all features and `results_cons` to the contrained feature vector without `num_likes` and `we_love_tag`. 

In [51]:
results_full = dict()
results_cons = dict()

### Linear regression 

In [53]:
lr = LinearRegression()

In [54]:
score = cross_val_score(
    lr, 
    X_tr,
    y_tr
)
results_full[lr] = np.mean(score)

In [55]:
score = cross_val_score(
    lr, 
    X_tr[:, 2:],
    y_tr
)
results_cons[lr] = np.mean(score)

### Linear models with regularization

In [59]:
r = Ridge()

In [60]:
r.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': 'deprecated',
 'positive': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [64]:
g = GridSearchCV(
    r,
    {
        "fit_intercept": [True, False],
        "alpha": [0.01, 0.1, 1., 10.],
        "max_iter": [10000],
    },
)

In [65]:
g.fit(X_tr, y_tr)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.01, 0.1, 1.0, 10.0],
                         'fit_intercept': [True, False], 'max_iter': [10000]})

In [69]:
results_full[g] = g.best_score_

In [70]:
g.fit(X_tr[:, 2:], y_tr)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.01, 0.1, 1.0, 10.0],
                         'fit_intercept': [True, False], 'max_iter': [10000]})

In [71]:
results_cons[g] = g.best_score_

### Random Forest Regressor

In [82]:
rf = RandomForestRegressor(oob_score=True)

In [83]:
rf.fit(X_tr, y_tr)

RandomForestRegressor(oob_score=True)

In [85]:
results_full[rf] = rf.oob_score_

In [87]:
rf.fit(X_tr[:, 2:], y_tr)

RandomForestRegressor(oob_score=True)

In [88]:
results_cons[rf] = rf.oob_score_

## Results 

In [97]:
score_full = np.mean( list(results_full.values()) )

In [98]:
score_cons = np.mean( list(results_cons.values()) )

In [100]:
score_full

0.48707850260240826

In [101]:
score_cons

0.4598349403715982