### ***2. Modèle de recommandation avec embeddings (PyTorch)***

In [1]:
### ***2. Modèle de recommandation avec embeddings (PyTorch)***

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score
import joblib
import os

os.makedirs("encoders", exist_ok=True)
os.makedirs("models", exist_ok=True)


#### **2.1 Chargement des données**

In [None]:
df_raw = pd.read_csv("data_clean.csv")
agg_user_prod = pd.read_csv("agg_user_prod_clean.csv")  

df_raw.head(), agg_user_prod.head()


(   Unnamed: 0               event_time        event_type  product_id  \
 0           0  2019-10-01 06:03:28 UTC              cart     5844670   
 1           1  2019-10-01 06:03:41 UTC              cart     5824819   
 2           2  2019-10-01 06:10:59 UTC  remove_from_cart     5830883   
 3           3  2019-10-01 06:11:04 UTC              view     5844670   
 4           4  2019-10-01 06:12:01 UTC              cart     5844670   
 
            category_id category_code    brand  price  user_id  \
 0  1487580007852147670           NaN  bluesky   0.79  9794320   
 1  1487580007852147670           NaN    domix   1.24  9794320   
 2  1487580007852147670           NaN      NaN   9.37  9794320   
 3  1487580007852147670           NaN  bluesky   0.79  9794320   
 4  1487580007852147670           NaN  bluesky   0.79  9794320   
 
                            user_session  
 0  1be8fa80-8036-4d95-93da-494a08d82cb5  
 1  1be8fa80-8036-4d95-93da-494a08d82cb5  
 2  1be8fa80-8036-4d95-93da-494a0

#### **2.2 Encoders user / product et normalisation du prix**

In [None]:
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

df_raw["user_idx"] = user_encoder.fit_transform(df_raw["user_id"])
df_raw["product_idx"] = product_encoder.fit_transform(df_raw["product_id"])

price_scaler = MinMaxScaler()
df_raw["price_normalized"] = price_scaler.fit_transform(df_raw[["price"]]) # on normalise pour que le modèle converge mieux

joblib.dump(user_encoder, "encoders/user_encoder.pkl")
joblib.dump(product_encoder, "encoders/product_encoder.pkl")
joblib.dump(price_scaler, "encoders/price_scaler.pkl")

df_raw[["user_id", "user_idx", "product_id", "product_idx", "price", "price_normalized"]].head()


Unnamed: 0,user_id,user_idx,product_id,product_idx,price,price_normalized
0,9794320,0,5844670,21990,0.79,0.196881
1,9794320,0,5824819,17712,1.24,0.197986
2,9794320,0,5830883,18981,9.37,0.217954
3,9794320,0,5844670,21990,0.79,0.196881
4,9794320,0,5844670,21990,0.79,0.196881


#### **2.3 Merge des indices dans le dataset agrégé**

In [4]:

df_ids_price = df_raw[["user_id", "product_id", "user_idx", "product_idx", "price_normalized"]].drop_duplicates()

agg_user_prod = agg_user_prod.merge(df_ids_price, on=["user_id", "product_id"], how="left")
agg_user_prod.dropna(subset=["user_idx", "product_idx", "price_normalized"], inplace=True)

agg_user_prod.to_csv("agg_user_prod_ncf.csv", index=False)
agg_user_prod.head()


Unnamed: 0,user_id,product_id,nb_view,nb_cart,nb_remove,nb_total_events,has_cart,has_remove,prod_total_views,prod_total_purchases,prod_conversion_rate,user_total_views,user_total_purchases,label_purchase,user_idx,product_idx,price_normalized
0,9794320,4905,1,0,0,1,0,0,345,209,0.605797,90,4,0,0,156,0.19774
1,9794320,5705033,2,1,1,4,1,1,127,23,0.181102,90,4,0,0,5534,0.2014
2,9794320,5705033,2,1,1,4,1,1,127,23,0.181102,90,4,0,0,5534,0.201768
3,9794320,5724282,1,1,1,3,1,1,480,148,0.308333,90,4,0,0,6333,0.200712
4,9794320,5724282,1,1,1,3,1,1,480,148,0.308333,90,4,0,0,6333,0.201032


#### **2.4 Dataset NCF (user_idx, product_idx, price_normalized, label)**

In [5]:
data_nn = agg_user_prod[["user_idx", "product_idx", "price_normalized", "label_purchase"]].reset_index(drop=True)
data_nn["label_purchase"].value_counts(normalize=True)


label_purchase
0    0.72876
1    0.27124
Name: proportion, dtype: float64

#### **2.5 Split train / test**

In [6]:
X = data_nn[["user_idx", "product_idx", "price_normalized"]]
y = data_nn["label_purchase"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_train.shape, X_test.shape


((3804997, 3), (951250, 3))

#### **2.6 Dataset PyTorch**

In [None]:

class InteractionsDataset(Dataset):
    def __init__(self, X, y):
        self.user_idx = torch.tensor(X["user_idx"].values, dtype=torch.long)
        self.product_idx = torch.tensor(X["product_idx"].values, dtype=torch.long)
        self.price = torch.tensor(X["price_normalized"].values, dtype=torch.float32)
        self.label = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        return (
            self.user_idx[idx],
            self.product_idx[idx],
            self.price[idx],
            self.label[idx],)

train_dataset = InteractionsDataset(X_train, y_train)
test_dataset = InteractionsDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)
len(train_dataset), len(test_dataset)


(3804997, 951250)

#### **2.7 Définition du modèle**

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_users = int(data_nn["user_idx"].max()) + 1
num_products = int(data_nn["product_idx"].max()) + 1

class NCFModel(nn.Module):
    def __init__(self, num_users, num_items, emb_dim=32):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)
        input_dim = emb_dim * 2 + 1
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        )

    def forward(self, user_idx, item_idx, price):
        u = self.user_emb(user_idx)
        i = self.item_emb(item_idx)
        x = torch.cat([u, i, price.unsqueeze(1)], dim=1)
        logit = self.mlp(x).squeeze(1)
        return logit

model = NCFModel(num_users, num_products, emb_dim=32).to(device)
model


NCFModel(
  (user_emb): Embedding(110518, 32)
  (item_emb): Embedding(40777, 32)
  (mlp): Sequential(
    (0): Linear(in_features=65, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=16, bias=True)
    (5): ReLU()
    (6): Linear(in_features=16, out_features=1, bias=True)
  )
)

#### **2.8 Entraînement NCF**

In [9]:

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_one_epoch(model, loader):
    model.train()
    total_loss = 0.0
    for user_idx, item_idx, price, label in loader:
        user_idx = user_idx.to(device)
        item_idx = item_idx.to(device)
        price = price.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        logits = model(user_idx, item_idx, price)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * label.size(0)
    return total_loss / len(loader.dataset)

def evaluate_auc(model, loader):
    model.eval()
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for user_idx, item_idx, price, label in loader:
            user_idx = user_idx.to(device)
            item_idx = item_idx.to(device)
            price = price.to(device)
            logits = model(user_idx, item_idx, price)
            probs = torch.sigmoid(logits)
            all_labels.extend(label.numpy().tolist())
            all_probs.extend(probs.cpu().numpy().tolist())
    return roc_auc_score(all_labels, all_probs)

for epoch in range(5):
    train_loss = train_one_epoch(model, train_loader)
    val_auc = evaluate_auc(model, test_loader)
    print(f"Epoch {epoch+1}: loss={train_loss:.4f}, AUC={val_auc:.4f}")


Epoch 1: loss=0.5340, AUC=0.7581
Epoch 2: loss=0.4753, AUC=0.7772
Epoch 3: loss=0.4597, AUC=0.7804
Epoch 4: loss=0.4516, AUC=0.7810
Epoch 5: loss=0.4450, AUC=0.7802


#### **2.9 Sauvegarde du modèle entraîné**

In [10]:

torch.save(model.state_dict(), "models/ncf_model.pt")


#### **2.10 Construction de df_products.csv pour Streamlit**

In [None]:
# On crée une table produit unique avec meta + stats + prix normalisé

df_products = df_raw[[
    "product_id", "product_idx", "category_id", "category_code", "brand", "price", "price_normalized"
]].drop_duplicates()

prod_stats = agg_user_prod.groupby("product_id").agg(
    prod_total_views=("prod_total_views", "max"),
    prod_total_purchases=("prod_total_purchases", "max"),
    prod_conversion_rate=("prod_conversion_rate", "max"),).reset_index()

df_products = df_products.merge(prod_stats, on="product_id", how="left")
df_products.fillna({"prod_total_views": 0, "prod_total_purchases": 0, "prod_conversion_rate": 0}, inplace=True)

df_products.to_csv("df_products.csv", index=False)
df_products.head()


Unnamed: 0,product_id,product_idx,category_id,category_code,brand,price,price_normalized,prod_total_views,prod_total_purchases,prod_conversion_rate
0,5844670,21990,1487580007852147670,,bluesky,0.79,0.196881,665,661,0.993985
1,5824819,17712,1487580007852147670,,domix,1.24,0.197986,113,97,0.858407
2,5830883,18981,1487580007852147670,,,9.37,0.217954,11,3,0.272727
3,5811668,14757,1487580005427839846,,irisk,2.48,0.201032,633,244,0.385466
4,5724608,6387,1487580005427839846,,irisk,2.48,0.201032,351,145,0.413105
