### ***1. Baseline RandomForest - Préparation des données (user, product)***

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

# Charger les données brutes
df = pd.read_csv("data_clean.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,0,2019-10-01 06:03:28 UTC,cart,5844670,1487580007852147670,,bluesky,0.79,9794320,1be8fa80-8036-4d95-93da-494a08d82cb5
1,1,2019-10-01 06:03:41 UTC,cart,5824819,1487580007852147670,,domix,1.24,9794320,1be8fa80-8036-4d95-93da-494a08d82cb5
2,2,2019-10-01 06:10:59 UTC,remove_from_cart,5830883,1487580007852147670,,,9.37,9794320,1be8fa80-8036-4d95-93da-494a08d82cb5
3,3,2019-10-01 06:11:04 UTC,view,5844670,1487580007852147670,,bluesky,0.79,9794320,1be8fa80-8036-4d95-93da-494a08d82cb5
4,4,2019-10-01 06:12:01 UTC,cart,5844670,1487580007852147670,,bluesky,0.79,9794320,1be8fa80-8036-4d95-93da-494a08d82cb5


#### ***1.1 Construction de la cible et des features agrégées***

In [None]:
# Cible: au moins un purchase pour (user_id, product_id)
agg_target = (df.groupby(["user_id", "product_id"])["event_type"].apply(lambda x: int((x == "purchase").any()))).rename("label_purchase") # aggregation cible

# Features d'interaction simples
agg_inter = df.groupby(["user_id", "product_id"]).agg(
    nb_view = ("event_type", lambda x: (x == "view").sum()), # nombre de vues
    nb_cart = ("event_type", lambda x: (x == "cart").sum()), # nombre d'ajouts au panier
    nb_remove = ("event_type", lambda x: (x == "remove_from_cart").sum()), # nombre de retraits du panier
    nb_total_events = ("event_type", "count"),) # nombre total d'événements

agg_inter["has_cart"] = (agg_inter["nb_cart"] > 0).astype(int) # indicateur d'ajout au panier
agg_inter["has_remove"] = (agg_inter["nb_remove"] > 0).astype(int) # indicateur de retrait du panier

agg_user_prod = agg_inter.join(agg_target).reset_index() 
agg_user_prod.head()


Unnamed: 0,user_id,product_id,nb_view,nb_cart,nb_remove,nb_total_events,has_cart,has_remove,label_purchase
0,9794320,4905,1,0,0,1,0,0,0
1,9794320,5705033,2,1,1,4,1,1,0
2,9794320,5724282,1,1,1,3,1,1,0
3,9794320,5724305,3,1,2,6,1,1,0
4,9794320,5724608,3,1,1,5,1,1,0


#### ***1.2 Ajout de quelques features produit / utilisateur (simplifiées)***

In [None]:
# Stats produit
prod_stats = df.groupby("product_id").agg(
    prod_total_views=("event_type", lambda x: (x == "view").sum()),
    prod_total_purchases=("event_type", lambda x: (x == "purchase").sum()),)

# Taux de conversion produit
prod_stats["prod_conversion_rate"] = (
    prod_stats["prod_total_purchases"] / prod_stats["prod_total_views"].clip(lower=1))

# Stats utilisateur
user_stats = df.groupby("user_id").agg(
    user_total_views=("event_type", lambda x: (x == "view").sum()),
    user_total_purchases=("event_type", lambda x: (x == "purchase").sum()),
)

# Merge
agg_user_prod = agg_user_prod.merge(prod_stats, on="product_id", how="left")
agg_user_prod = agg_user_prod.merge(user_stats, on="user_id", how="left")
agg_user_prod.fillna(0, inplace=True)
agg_user_prod.head()


Unnamed: 0,user_id,product_id,nb_view,nb_cart,nb_remove,nb_total_events,has_cart,has_remove,label_purchase,prod_total_views,prod_total_purchases,prod_conversion_rate,user_total_views,user_total_purchases
0,9794320,4905,1,0,0,1,0,0,0,345,209,0.605797,90,4
1,9794320,5705033,2,1,1,4,1,1,0,127,23,0.181102,90,4
2,9794320,5724282,1,1,1,3,1,1,0,480,148,0.308333,90,4
3,9794320,5724305,3,1,2,6,1,1,0,615,175,0.284553,90,4
4,9794320,5724608,3,1,1,5,1,1,0,351,145,0.413105,90,4


#### ***1.3 Préparation de X, y et train_test_split***


In [None]:
feature_cols = [
    "nb_view", "nb_cart", "nb_remove", "nb_total_events", "has_cart", "has_remove", "prod_total_views",
    "prod_total_purchases", "prod_conversion_rate", "user_total_views", "user_total_purchases",
]

X = agg_user_prod[feature_cols]
y = agg_user_prod["label_purchase"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_train.shape, X_test.shape


((3669048, 11), (917262, 11))

#### ***1.4 Entraînement RandomForest et premières métriques***

In [None]:
## 1.4 Entraînement RandomForest et métriques

rf = RandomForestClassifier(n_estimators = 200, max_depth = None, n_jobs = -1, class_weight = "balanced_subsample", random_state = 42,)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

print("AUC:", roc_auc_score(y_test, y_proba))
print("F1:", f1_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


AUC: 0.9999168617796206
F1: 0.9928044069951268
Accuracy: 0.996149409874169


#### ***1.5 Sauvegarde du dataset agrégé propre***

In [None]:
cols_to_keep = [
    "user_id", "product_id",
    "nb_view", "nb_cart", "nb_remove", "nb_total_events",
    "has_cart", "has_remove",
    "prod_total_views", "prod_total_purchases", "prod_conversion_rate",
    "user_total_views", "user_total_purchases",
    "label_purchase",]

agg_user_prod_clean = agg_user_prod[cols_to_keep].drop_duplicates()
agg_user_prod_clean.to_csv("agg_user_prod_clean.csv", index=False)
agg_user_prod_clean.head()


Unnamed: 0,user_id,product_id,nb_view,nb_cart,nb_remove,nb_total_events,has_cart,has_remove,prod_total_views,prod_total_purchases,prod_conversion_rate,user_total_views,user_total_purchases,label_purchase
0,9794320,4905,1,0,0,1,0,0,345,209,0.605797,90,4,0
1,9794320,5705033,2,1,1,4,1,1,127,23,0.181102,90,4,0
2,9794320,5724282,1,1,1,3,1,1,480,148,0.308333,90,4,0
3,9794320,5724305,3,1,2,6,1,1,615,175,0.284553,90,4,0
4,9794320,5724608,3,1,1,5,1,1,351,145,0.413105,90,4,0
