# Model selection 

## Setup

In [12]:
# Make the project root the working directory for the rest of the notebook.
# NOTE(review): this is a machine-specific absolute path (one user's OneDrive folder);
# the notebook will not run elsewhere until it is replaced by a configurable/relative root.
%cd "C:/Users/pemma/OneDrive - Université de Tours/Mécen/M2/S1/02 - Machine Learning/05 - Projet/price_prediction_vestiaire_collective/"

C:\Users\pemma\OneDrive - Université de Tours\Mécen\M2\S1\02 - Machine Learning\05 - Projet\price_prediction_vestiaire_collective


In [13]:
import pandas as pd
import numpy as np

In [14]:
from vc_ml import (
    read_data, 
    to_dummies, 
    load_feature_vector, 
    load_target, 
    load_best_estimator,
    get_files_paths, 
    get_cv_results,
    get_best_estimator,
    save_best_estimator,
)

In [15]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Best estimator testing

In [18]:
# Retrieve the estimator through the project helper from `vc_ml`.
# Presumably this is the model persisted by `save_best_estimator` after model selection — TODO confirm.
best_estimator = load_best_estimator()

In [22]:
# Training split: features and target are both read from the same "train.pkl" file.
X_tr = load_feature_vector(file_name="train.pkl")
y_tr = load_target(file_name="train.pkl")

In [5]:
# Testing split: features and target are both read from the same "test.pkl" file.
X_te = load_feature_vector(file_name="test.pkl")
y_te = load_target(file_name="test.pkl")

In [24]:
# Re-fit the loaded pipeline on the full training split.
# NOTE(review): the estimator's `random_state` is None (see `get_params()` further down),
# so scores may vary slightly from one run to the next.
best_estimator.fit(X_tr, y_tr)

Pipeline(steps=[('model',
                 GradientBoostingRegressor(criterion='squared_error',
                                           loss='huber', max_depth=10,
                                           min_samples_leaf=5,
                                           min_samples_split=20,
                                           n_estimators=250, tol=0.001))])

In [25]:
# Goodness of fit on the training split (reported as R² in the prediction plot below).
best_estimator.score(X_tr, y_tr)

0.4601943756449274

In [26]:
# Predictions on both splits, reused by the predicted-vs-observed figure.
pred_tr = best_estimator.predict(X_tr)
pred_te = best_estimator.predict(X_te)

In [27]:
# Goodness of fit on the held-out testing split (reported as R² in the prediction plot below).
best_estimator.score(X_te, y_te)

0.4269489150057041

In [28]:
# Predicted-vs-observed scatter plots: training set on the left, testing set on the right.
# Each panel also carries the identity line y = x (perfect predictions fall on it).

AXIS_RANGE = [0, 6000]  # price window displayed on every axis


def _r2(observed, predicted):
    """Return the coefficient of determination of `predicted` against `observed`.

    Computed from the predictions that already exist so the legend can never
    drift away from the fitted model (the labels used to be typed in by hand).
    """
    observed = np.asarray(observed, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    ss_res = np.sum((observed - predicted) ** 2)
    ss_tot = np.sum((observed - observed.mean()) ** 2)
    return 1.0 - ss_res / ss_tot


def _add_panel(fig, observed, predicted, label, col, yaxis_title, identity_in_legend):
    """Add one predicted-vs-observed panel (scatter + identity line) in column `col`."""
    fig.add_trace(
        go.Scatter(
            x=observed,
            y=predicted,
            mode="markers",
            name=f"{label} (R² = {_r2(observed, predicted):.2f})",
            marker=dict(opacity=0.5),
        ),
        row=1, col=col,
    )

    # Two endpoints are enough for a straight line; the span still covers every
    # observed price, as it did when the line was drawn through all observations.
    flat = np.asarray(observed, dtype=float).ravel()
    lo, hi = float(flat.min()), float(flat.max())
    fig.add_trace(
        go.Scatter(
            x=[lo, hi],
            y=[lo, hi],
            mode="lines",
            name="y=x",
            legendgroup="identity",  # one legend entry toggles both identity lines
            showlegend=identity_in_legend,
            line=dict(color="red"),  # same colour in both panels
        ),
        row=1, col=col,
    )

    fig.update_xaxes(title_text=f"{label} prices", range=AXIS_RANGE, row=1, col=col)
    fig.update_yaxes(title_text=yaxis_title, range=AXIS_RANGE, row=1, col=col)


fig = make_subplots(rows=1, cols=2)

_add_panel(
    fig, y_tr, pred_tr,
    label="Train", col=1,
    yaxis_title="Predicted prices",
    identity_in_legend=True,
)
_add_panel(
    fig, y_te, pred_te,
    label="Test", col=2,
    yaxis_title=" ",  # the y-axis title is only shown on the left panel
    identity_in_legend=False,
)

fig.update_layout(
    title_text="Predicted prices VS Observed prices for training and testing sets",
)

fig.show()

In [29]:
# Export the figure as a static PNG.
# The path is relative to the current working directory, which the `%cd` in the Setup
# section sets to the project root; this avoids repeating a machine-specific absolute path.
img_path = "imgs/"
fig.write_image(
    img_path + "prediction_plot.png",
    width=800, height=400
)

In [34]:
# Full hyper-parameter set of the pipeline (the `model__*` keys belong to the 'model' step).
best_estimator.get_params() 

{'memory': None,
 'steps': [('model',
   GradientBoostingRegressor(criterion='squared_error', loss='huber', max_depth=10,
                             min_samples_leaf=5, min_samples_split=20,
                             n_estimators=250, tol=0.001))],
 'verbose': False,
 'model': GradientBoostingRegressor(criterion='squared_error', loss='huber', max_depth=10,
                           min_samples_leaf=5, min_samples_split=20,
                           n_estimators=250, tol=0.001),
 'model__alpha': 0.9,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'squared_error',
 'model__init': None,
 'model__learning_rate': 0.1,
 'model__loss': 'huber',
 'model__max_depth': 10,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 5,
 'model__min_samples_split': 20,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 250,
 'model__n_iter_no_change': None,
 'model__random_state': None,
 'model__subsample': 1.0

Test the model on a new example collected on [Vestiaire Collective](https://fr.vestiairecollective.com/chaussures-femme/baskets/balenciaga/baskets-balenciaga-track-en-toile-noir-20411089.shtml).

In [38]:
# Load "vc_data_cleaned.pkl" through the project helper; the new listing is appended to it below.
# Presumably this is the cleaned dataset the train/test splits were derived from — TODO confirm.
data = read_data(file_name="vc_data_cleaned.pkl")

In [39]:
# Hand-entered record for the listing linked above; its id is used as the row index.
new_item_id = 20411089

d = dict(
    num_likes=4,
    we_love_tag=0,
    price=560.,
    gender="women",
    category="shoes",
    sub_category="trainers",
    designer="balenciaga",
    condition="never_worn",
    material="other_material",
    color="black",
    size="size_38",
    location="uk",
    lprice=np.log(560. + 1.),  # log(price + 1)
)

# Scalar values + a one-element index -> a single-row frame.
new_data = pd.DataFrame(d, index=[new_item_id])

In [40]:
# Append the hand-entered listing to the dataset so it is encoded together with it.
# `data` is reassigned from itself, so any row already carrying the new listing's id is
# dropped first: re-running this cell then no longer stacks duplicate copies of the row.
data = pd.concat(
    objs=[data.drop(index=new_data.index, errors="ignore"), new_data],
    axis=0
)

In [42]:
# Encode the whole dataset, new listing included, with the project helper.
# Presumably this one-hot encodes the categorical columns; encoding the new row together with
# the full data would then give it the same dummy columns — TODO confirm against `vc_ml.to_dummies`.
data_encoded = to_dummies(data)

In [53]:
# Feature row of the new listing: the last encoded row, without the columns listed below.
# NOTE(review): assumes the remaining columns match, in order, the features returned by
# `load_feature_vector` — confirm.
non_feature_columns = ["num_likes", "price", "we_love_tag", "category", "lprice"]
X_new = data_encoded.tail(1).drop(columns=non_feature_columns).values

In [51]:
# Observed price of the new listing (last row), kept as a one-element array.
y_new = data_encoded["price"].tail(1).values

In [63]:
# Model prediction for the single new row (first and only element of the output).
# NOTE(review): it is compared directly with `price` below — confirm the model's target is
# the raw price rather than `lprice`.
pred_new = best_estimator.predict(X_new)[0]

In [66]:
# Signed relative deviation of the prediction from the listed price (positive = over-estimate).
observed_price = y_new[0]
error = (pred_new - observed_price) / observed_price
print(f"Relative error = {round(100*error, 2)}%")

Relative error = 18.33%
