# GRU4Rec - Oct

In [1]:
import pandas as pd

df = pd.read_csv(
    "nov_reduced.csv",
    usecols=["event_time", "user_id", "product_id", "user_session"]
)

# Convertir el timestamp a datetime
df["event_time"] = pd.to_datetime(df["event_time"])

print(df.head())
print(df.dtypes)

                 event_time  product_id    user_id  \
0 2019-11-01 00:00:00+00:00     1003461  520088904   
1 2019-11-01 00:00:00+00:00     5000088  530496790   
2 2019-11-01 00:00:01+00:00    17302664  561587266   
3 2019-11-01 00:00:01+00:00     3601530  518085591   
4 2019-11-01 00:00:01+00:00     1004775  558856683   

                           user_session  
0  4d3b30da-a5e4-49df-b1a8-ba5943f1dd33  
1  8e5f4f83-366c-4f70-860e-ca7417414283  
2  755422e7-9040-477b-9bd2-6a6e8fd97387  
3  3bfb58cd-7892-48cc-8020-2f17e6de6e7f  
4  313628f1-68b8-460d-84f6-cec7a8796ef2  
event_time      datetime64[ns, UTC]
product_id                    int64
user_id                       int64
user_session                 object
dtype: object


In [2]:
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter

# ================================================================
#   1. CONFIGURACI√ìN GLOBAL
# ================================================================
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

MAX_LEN = 50
MIN_SESSION_LEN = 5
TOP_N_ITEMS = 2000  # ajustar seg√∫n tu dataset
PAD_IDX = 0

df_small = df.sample(frac=0.5, random_state=42)


# ===================================================================
#   2. CREAR SESIONES + AUMENTACI√ìN POR DWELL 
# ===================================================================

DWELL_THRESHOLD = 75   

def build_sessions_with_dwell_repetitions(df):
    sessions = []

    df = df.sort_values(["user_session", "event_time"])

    for session_id, group in df.groupby("user_session"):
        group = group.sort_values("event_time")

        items = group["product_id"].tolist()
        times = group["event_time"].values

        # dwell entre eventos en segundos
        dwells = np.diff(times).astype("timedelta64[s]").astype(float)
        dwells = np.append(dwells, 1.0)

        seq = []
        for item, d in zip(items, dwells):

            # reps = floor(dwell/75) + 1
            reps = int(max(d, 1) // DWELL_THRESHOLD) + 1

            # REPETIR EL ITEM reps VECES
            seq.extend([item] * reps)

        if len(seq) >= 2:
            sessions.append(seq)

    return sessions


sessions = build_sessions_with_dwell_repetitions(df_small)
print("Sesiones creadas:", len(sessions))
print("Ejemplo sesi√≥n aumentada:", sessions[0][:25])

# ===================================================================
#   3. SPLIT TRAIN/TEST
# ===================================================================

split = int(0.8 * len(sessions))
train_sessions = sessions[:split]
test_sessions = sessions[split:]

print("Train:", len(train_sessions), " Test:", len(test_sessions))

# ===================================================================
#   4. FILTRAR TOP ITEMS (IGUAL A ANTES)
# ===================================================================

counter = Counter([item for sess in train_sessions for item in sess])
top_items = [item for item, _ in counter.most_common(TOP_N_ITEMS)]

item2idx = {item: i + 1 for i, item in enumerate(top_items)}
idx2item = {v: k for k, v in item2idx.items()}
N_ITEMS = len(item2idx) + 1

def encode_session(sess):
    return [item2idx[item] for item in sess if item in item2idx]

train_encoded = [encode_session(s) for s in train_sessions]
test_encoded = [encode_session(s) for s in test_sessions]

train_encoded = [s for s in train_encoded if len(s) >= MIN_SESSION_LEN]
test_encoded = [s for s in test_encoded if len(s) >= MIN_SESSION_LEN]

print("Train encoded:", len(train_encoded))
print("Test encoded:", len(test_encoded))
print("N_ITEMS:", N_ITEMS)


Device: cuda
Sesiones creadas: 5859994
Ejemplo sesi√≥n aumentada: [5100816, 5100816, 5100816, 1005107, 1005107, 11200402]
Train: 4687995  Test: 1171999
Train encoded: 1309888
Test encoded: 327080
N_ITEMS: 2001


In [3]:
# ===================================================================
#   DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]

        # limitar a MAX_LEN
        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]

        items = torch.tensor(seq[:-1], dtype=torch.long)
        targets = torch.tensor(seq[1:], dtype=torch.long)

        return items, targets


def collate_fn(batch):
    items_batch, targets_batch = zip(*batch)
    max_len = max(len(x) for x in items_batch)

    def pad(x, pad_value=0):
        return torch.cat([x, torch.full((max_len - len(x),), pad_value, dtype=x.dtype)])

    items = torch.stack([pad(x, PAD_IDX) for x in items_batch])
    targets = torch.stack([pad(x, PAD_IDX) for x in targets_batch])

    return items, targets


train_loader = DataLoader(
    GRUDataset(train_encoded),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    GRUDataset(test_encoded),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn
)

# ===================================================================
#   1. DEFINICI√ìN DEL MODELO SASREC
# ===================================================================

class SASRec(nn.Module):
    def __init__(
        self,
        n_items,
        emb_size=128,
        hidden_size=128,
        n_heads=2,
        n_layers=2,
        max_len=MAX_LEN,
        dropout=0.2
    ):
        super().__init__()

        self.item_emb = nn.Embedding(n_items, emb_size, padding_idx=PAD_IDX)
        self.pos_emb = nn.Embedding(max_len, emb_size)

        self.dropout = nn.Dropout(dropout)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=emb_size,
                nhead=n_heads,
                dim_feedforward=hidden_size,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=n_layers
        )

        self.fc = nn.Linear(emb_size, n_items)
        self.max_len = max_len

    def forward(self, items):
        B, T = items.size()

        positions = torch.arange(T, device=items.device).unsqueeze(0)
        emb = self.item_emb(items) + self.pos_emb(positions)

        emb = self.dropout(emb)

        pad_mask = items.eq(PAD_IDX)

        out = self.transformer(emb, src_key_padding_mask=pad_mask)

        logits = self.fc(out)
        return logits


# ===================================================================
#   2. ENTRENAMIENTO SASREC
# ===================================================================

def train_epoch_sasrec(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for items, targets in loader:
        items, targets = items.to(device), targets.to(device)

        optimizer.zero_grad()
        logits = model(items)

        B, T, C = logits.shape
        loss = criterion(logits.reshape(B*T, C), targets.reshape(B*T))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


# ===================================================================
#   3. EVALUACI√ìN SASREC
# ===================================================================

def evaluate_sasrec(model, loader):
    model.eval()

    recall_k = 10
    recall_sum = 0
    mrr_sum = 0
    ndcg_sum = 0
    total = 0

    for items, targets in loader:
        items, targets = items.to(device), targets.to(device)

        logits = model(items)[0, -1]
        topk = torch.topk(logits, recall_k).indices.tolist()

        target = targets[0, -1].item()
        if target == PAD_IDX:
            continue

        total += 1

        if target in topk:
            rank = topk.index(target) + 1
            recall_sum += 1
            mrr_sum += 1 / rank
            ndcg_sum += 1 / math.log2(rank + 1)

    return (
        recall_sum / total,
        mrr_sum / total,
        ndcg_sum / total
    )


# ===================================================================
#   4. SASRec CON DWELL (usa train_loader y test_loader)
# ===================================================================

print("\n\n================ SASREC CON DWELL ================\n")

sasrec_dwell = SASRec(N_ITEMS).to(device)
criterion_sasrec = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_sasrec = torch.optim.Adam(sasrec_dwell.parameters(), lr=1e-4)

EPOCHS = 10

import time

start = time.time()

for ep in range(1, EPOCHS + 1):
    loss = train_epoch_sasrec(sasrec_dwell, train_loader, optimizer_sasrec, criterion_sasrec)
    recall, mrr, ndcg = evaluate_sasrec(sasrec_dwell, test_loader)

    print(f"\nEpoch {ep}/{EPOCHS} (SASRec con dwell)")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n‚è± Tiempo total: {end - start:.2f} segundos")





Epoch 1/10 (SASRec con dwell)
Loss: 0.9563
Recall@10: 0.6874  MRR@10: 0.5998  NDCG@10: 0.6207

Epoch 2/10 (SASRec con dwell)
Loss: 0.3561
Recall@10: 0.7171  MRR@10: 0.6117  NDCG@10: 0.6367

Epoch 3/10 (SASRec con dwell)
Loss: 0.3131
Recall@10: 0.7311  MRR@10: 0.6164  NDCG@10: 0.6436

Epoch 4/10 (SASRec con dwell)
Loss: 0.2946
Recall@10: 0.7380  MRR@10: 0.6185  NDCG@10: 0.6469

Epoch 5/10 (SASRec con dwell)
Loss: 0.2844
Recall@10: 0.7421  MRR@10: 0.6198  NDCG@10: 0.6488

Epoch 6/10 (SASRec con dwell)
Loss: 0.2780
Recall@10: 0.7441  MRR@10: 0.6204  NDCG@10: 0.6498

Epoch 7/10 (SASRec con dwell)
Loss: 0.2739
Recall@10: 0.7462  MRR@10: 0.6210  NDCG@10: 0.6507

Epoch 8/10 (SASRec con dwell)
Loss: 0.2707
Recall@10: 0.7480  MRR@10: 0.6215  NDCG@10: 0.6515

Epoch 9/10 (SASRec con dwell)
Loss: 0.2677
Recall@10: 0.7489  MRR@10: 0.6218  NDCG@10: 0.6519

Epoch 10/10 (SASRec con dwell)
Loss: 0.2659
Recall@10: 0.7493  MRR@10: 0.6221  NDCG@10: 0.6522

‚è± Tiempo total: 9334.48 segundos


In [None]:
import gc
import torch

# üî• 1. Elimina variables GRANDES creadas en la parte con dwell
del sessions
del train_sessions
del test_sessions
del train_encoded
del test_encoded
del train_loader
del test_loader
del sasrec_dwell
del optimizer_sasrec
del criterion_sasrec

# üî• 2. Forzar recolector de basura
gc.collect()

# üî• 3. Liberar memoria de GPU (si corresponde)
if torch.cuda.is_available():
    torch.cuda.empty_cache()


In [9]:
# ===================================================================
#   1. RECREAR SESIONES ORIGINALES (SIN REPETICI√ìN POR DWELL)
# ===================================================================


def build_sessions_no_dwell(df):
    sessions = []

    df = df.sort_values(["user_session", "event_time"])

    for session_id, group in df.groupby("user_session"):
        group = group.sort_values("event_time")

        items = group["product_id"].tolist()

        if len(items) >= 2:
            sessions.append(items)

    return sessions


sessions_no_dwell = build_sessions_no_dwell(df_small)

print("Total sesiones SIN dwell:", len(sessions_no_dwell))
print("Ejemplo sesi√≥n SIN dwell:", sessions_no_dwell[0][:20])


# ===================================================================
#   2. SPLIT TRAIN / TEST
# ===================================================================

split = int(0.8 * len(sessions_no_dwell))
train_sessions_no_dwell = sessions_no_dwell[:split]
test_sessions_no_dwell = sessions_no_dwell[split:]

print("Train:", len(train_sessions_no_dwell), "Test:", len(test_sessions_no_dwell))


# ===================================================================
#   3. FILTRAR TOP ITEMS (SE REUSA item2idx DEL MODELO CON DWELL)
#      IMPORTANTE: usar el MISMO diccionario para comparaci√≥n justa
# ===================================================================

def encode_session_no_dwell(sess):
    return [item2idx[item] for item in sess if item in item2idx]

train_encoded_no_dwell = [encode_session_no_dwell(s) for s in train_sessions_no_dwell]
test_encoded_no_dwell = [encode_session_no_dwell(s) for s in test_sessions_no_dwell]

train_encoded_no_dwell = [s for s in train_encoded_no_dwell if len(s) >= MIN_SESSION_LEN]
test_encoded_no_dwell  = [s for s in test_encoded_no_dwell if len(s) >= MIN_SESSION_LEN]

print("Train encoded (sin dwell):", len(train_encoded_no_dwell))
print("Test encoded  (sin dwell):", len(test_encoded_no_dwell))


# ===================================================================
#   4. DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDatasetNoDwell(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]

        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]

        items = torch.tensor(seq[:-1], dtype=torch.long)
        targets = torch.tensor(seq[1:], dtype=torch.long)

        return items, targets


def collate_fn_no_dwell(batch):
    items_batch, targets_batch = zip(*batch)
    max_len = max(len(x) for x in items_batch)

    def pad(x, pad_value=0):
        return torch.cat([x, torch.full((max_len - len(x),), pad_value, dtype=x.dtype)])

    items = torch.stack([pad(x, PAD_IDX) for x in items_batch])
    targets = torch.stack([pad(x, PAD_IDX) for x in targets_batch])

    return items, targets


train_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(train_encoded_no_dwell),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn_no_dwell
)

test_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(test_encoded_no_dwell),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn_no_dwell
)


# ===================================================================

print("\n\n================ SASREC SIN DWELL ================\n")

sasrec_no_dwell = SASRec(N_ITEMS).to(device)
criterion_sasrec_nd = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_sasrec_nd = torch.optim.Adam(sasrec_no_dwell.parameters(), lr=1e-4)

start = time.time()

for ep in range(1, EPOCHS + 1):
    loss = train_epoch_sasrec(sasrec_no_dwell, train_loader_no_dwell, optimizer_sasrec_nd, criterion_sasrec_nd)
    recall, mrr, ndcg = evaluate_sasrec(sasrec_no_dwell, test_loader_no_dwell)

    print(f"\nEpoch {ep}/{EPOCHS} (SASRec sin dwell)")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n‚è± Tiempo total: {end - start:.2f} segundos")


Total sesiones SIN dwell: 5859994
Ejemplo sesi√≥n SIN dwell: [5100816, 1005107, 11200402]
Train: 4687995 Test: 1171999
Train encoded (sin dwell): 578083
Test encoded  (sin dwell): 144489




Epoch 1/10 (SASRec sin dwell)
Loss: 1.8606
Recall@10: 0.5961  MRR@10: 0.4775  NDCG@10: 0.5061

Epoch 2/10 (SASRec sin dwell)
Loss: 0.7330
Recall@10: 0.6352  MRR@10: 0.4982  NDCG@10: 0.5309

Epoch 3/10 (SASRec sin dwell)
Loss: 0.6392
Recall@10: 0.6561  MRR@10: 0.5079  NDCG@10: 0.5432

Epoch 4/10 (SASRec sin dwell)
Loss: 0.5969
Recall@10: 0.6684  MRR@10: 0.5138  NDCG@10: 0.5506

Epoch 5/10 (SASRec sin dwell)
Loss: 0.5720
Recall@10: 0.6760  MRR@10: 0.5166  NDCG@10: 0.5545

Epoch 6/10 (SASRec sin dwell)
Loss: 0.5566
Recall@10: 0.6818  MRR@10: 0.5195  NDCG@10: 0.5581

Epoch 7/10 (SASRec sin dwell)
Loss: 0.5452
Recall@10: 0.6852  MRR@10: 0.5210  NDCG@10: 0.5601

Epoch 8/10 (SASRec sin dwell)
Loss: 0.5370
Recall@10: 0.6882  MRR@10: 0.5214  NDCG@10: 0.5611

Epoch 9/10 (SASRec sin dwell)
Loss: 0.5301
Recall