### ***3. Modèle séquentiel (GRU) sur les sessions***

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score

df = pd.read_csv("data_clean.csv")

# Encodage simple de event_type
event_type_map = {"view": 0, "cart": 1, "purchase": 2, "remove_from_cart": 3}
df = df[df["event_type"].isin(event_type_map.keys())].copy()
df["event_type_id"] = df["event_type"].map(event_type_map)

# Si product_idx existe déjà (via embeddings.ipynb), on peut le recharger
try:
    df_idx = pd.read_csv("df_products.csv")
    df = df.merge(df_idx[["product_id", "product_idx"]].drop_duplicates(), on="product_id", how="left")
    df["product_idx"] = df["product_idx"].fillna(0).astype(int)
except FileNotFoundError:
    # sinon on reste sur product_id comme proxy d'index
    df["product_idx"] = df["product_id"]

df["event_time"] = pd.to_datetime(df["event_time"])
df = df.sort_values(["user_session", "event_time"])

df[["user_session", "event_time", "event_type", "product_id", "event_type_id"]].head()


Unnamed: 0,user_session,event_time,event_type,product_id,event_type_id
5804085,0000061d-f3e9-484b-8c73-e54f355032a3,2020-01-16 03:30:41+00:00,view,5560754,0
1465757,000013d6-68a4-40cf-9452-6577dbfab515,2019-10-23 09:07:38+00:00,view,5859210,0
1465763,000013d6-68a4-40cf-9452-6577dbfab515,2019-10-23 09:12:50+00:00,view,30195,0
1465764,000013d6-68a4-40cf-9452-6577dbfab515,2019-10-23 09:44:43+00:00,cart,30195,1
1465772,000013d6-68a4-40cf-9452-6577dbfab515,2019-10-23 09:57:19+00:00,remove_from_cart,5817690,3


#### ***3.1 Séquences par session***

In [None]:

session_label = (df.groupby("user_session")["event_type"].apply(lambda x: int((x == "purchase").any())).rename("label_purchase_session"))

def build_session_sequences(group):
    return pd.Series({"product_idx_seq": group["product_idx"].tolist(),"event_type_seq": group["event_type_id"].tolist(),})

session_seqs = df.groupby("user_session").apply(build_session_sequences)
session_seqs = session_seqs.join(session_label)
session_seqs.head()


  session_seqs = df.groupby("user_session").apply(build_session_sequences)


Unnamed: 0_level_0,product_idx_seq,event_type_seq,label_purchase_session
user_session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000061d-f3e9-484b-8c73-e54f355032a3,[1513],[0],0
000013d6-68a4-40cf-9452-6577dbfab515,"[26044, 786, 786, 16285, 786, 11719, 26274, 30...","[0, 0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, ...",1
00002b0e-d7f7-454e-8386-431c4021a9f6,"[29796, 33564, 33518, 2238, 2247, 2469, 2469, ...","[3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",0
00002f36-401d-4bc5-bac6-1a683f52ac2b,[20961],[0],0
00004c7d-9507-474e-a559-0fdaccaaa390,[27613],[0],0


#### **3.2 Troncature / filtrage**

In [3]:

MAX_LEN = 30

def truncate(seq):
    return seq[-MAX_LEN:]

session_seqs["product_idx_seq"] = session_seqs["product_idx_seq"].apply(truncate)
session_seqs["event_type_seq"] = session_seqs["event_type_seq"].apply(truncate)

session_seqs = session_seqs[session_seqs["product_idx_seq"].str.len() > 1]
len(session_seqs)


720186

#### **3.3 Dataset PyTorch**

In [4]:

class SessionDataset(Dataset):
    def __init__(self, df_sessions, max_len=MAX_LEN):
        self.product_seqs = df_sessions["product_idx_seq"].tolist()
        self.event_seqs = df_sessions["event_type_seq"].tolist()
        self.labels = df_sessions["label_purchase_session"].values.astype("float32")
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        p_seq = self.product_seqs[idx]
        e_seq = self.event_seqs[idx]
        L = len(p_seq)

        pad_len = self.max_len - L
        if pad_len > 0:
            p_seq = [0] * pad_len + p_seq
            e_seq = [0] * pad_len + e_seq
        else:
            p_seq = p_seq[-self.max_len:]
            e_seq = e_seq[-self.max_len:]

        return (
            torch.tensor(p_seq, dtype=torch.long),
            torch.tensor(e_seq, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float32),
        )

from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    np.arange(len(session_seqs)),
    test_size=0.2,
    random_state=42,
    stratify=session_seqs["label_purchase_session"],
)

train_sessions = session_seqs.iloc[train_idx]
test_sessions = session_seqs.iloc[test_idx]

train_dataset = SessionDataset(train_sessions)
test_dataset = SessionDataset(test_sessions)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

len(train_dataset), len(test_dataset)


(576148, 144038)

#### **3.4 Modèle GRU**

In [5]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_products = int(df["product_idx"].max()) + 1
num_event_types = len(event_type_map)

class GRUSessionModel(nn.Module):
    def __init__(self, num_items, num_event_types, emb_dim=32, hidden_dim=64):
        super().__init__()
        self.item_emb = nn.Embedding(num_items, emb_dim)
        self.event_emb = nn.Embedding(num_event_types, emb_dim)
        self.gru = nn.GRU(input_size=emb_dim * 2, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, p_seq, e_seq):
        p_e = self.item_emb(p_seq)
        e_e = self.event_emb(e_seq)
        x = torch.cat([p_e, e_e], dim=-1)
        output, h_n = self.gru(x)
        h_last = h_n.squeeze(0)
        logit = self.fc(h_last).squeeze(1)
        return logit

model_seq = GRUSessionModel(num_products, num_event_types).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_seq.parameters(), lr=1e-3)


#### **Entraînement rapide du modèle séquentiel**

In [6]:
def train_epoch_seq(model, loader):
    model.train()
    total_loss = 0.0
    for p_seq, e_seq, label in loader:
        p_seq = p_seq.to(device)
        e_seq = e_seq.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        logits = model(p_seq, e_seq)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * label.size(0)
    return total_loss / len(loader.dataset)

def eval_auc_seq(model, loader):
    model.eval()
    all_labels, all_probs = [], []
    with torch.no_grad():
        for p_seq, e_seq, label in loader:
            p_seq = p_seq.to(device)
            e_seq = e_seq.to(device)
            logits = model(p_seq, e_seq)
            probs = torch.sigmoid(logits)
            all_labels.extend(label.numpy().tolist())
            all_probs.extend(probs.cpu().numpy().tolist())
    return roc_auc_score(all_labels, all_probs)

for epoch in range(3):
    loss_tr = train_epoch_seq(model_seq, train_loader)
    auc_te = eval_auc_seq(model_seq, test_loader)
    print(f"Epoch {epoch+1}: loss={loss_tr:.4f}, AUC={auc_te:.4f}")


Epoch 1: loss=0.0123, AUC=0.9998
Epoch 2: loss=0.0050, AUC=0.9998
Epoch 3: loss=0.0039, AUC=0.9998


In [2]:
# generation du fichier de reqierements.txt automatiquement
!pip freeze > requirements.txt

