# GRU4Rec - Nov

In [1]:
import pandas as pd

# Only these four columns are read from nov_reduced.csv.
SESSION_COLUMNS = ["event_time", "user_id", "product_id", "user_session"]
df = pd.read_csv("nov_reduced.csv", usecols=SESSION_COLUMNS)

# Convert the timestamp column to datetime
df["event_time"] = pd.to_datetime(df["event_time"])

print(df.head())
print(df.dtypes)

                 event_time  product_id    user_id  \
0 2019-11-01 00:00:00+00:00     1003461  520088904   
1 2019-11-01 00:00:00+00:00     5000088  530496790   
2 2019-11-01 00:00:01+00:00    17302664  561587266   
3 2019-11-01 00:00:01+00:00     3601530  518085591   
4 2019-11-01 00:00:01+00:00     1004775  558856683   

                           user_session  
0  4d3b30da-a5e4-49df-b1a8-ba5943f1dd33  
1  8e5f4f83-366c-4f70-860e-ca7417414283  
2  755422e7-9040-477b-9bd2-6a6e8fd97387  
3  3bfb58cd-7892-48cc-8020-2f17e6de6e7f  
4  313628f1-68b8-460d-84f6-cec7a8796ef2  
event_time      datetime64[ns, UTC]
product_id                    int64
user_id                       int64
user_session                 object
dtype: object


In [2]:
# ================================================================
#   GRU4Rec con Dwell Time 
#   - Dwell correctamente integrado al modelo
# ================================================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter

# ================================================================
#   1. CONFIGURACIÓN GLOBAL
# ================================================================
# Seed every RNG used in this notebook (torch CPU/CUDA, NumPy, stdlib random)
# with the same value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# MAX_LEN: GRUDataset keeps only the last MAX_LEN items of a sequence.
MAX_LEN = 50
# MIN_SESSION_LEN: encoded sessions shorter than this are dropped.
MIN_SESSION_LEN = 5
TOP_N_ITEMS = 2000  # adjust to your dataset
# PAD_IDX: index used for padding; real items are numbered from 1.
PAD_IDX = 0

# NOTE(review): df.sample draws individual event ROWS, not whole sessions.
# A session can therefore lose some of its events, and the time gap between
# the events that survive (the "dwell") grows accordingly. Consider sampling
# user_session ids instead -- and apply the same change to every experiment
# that is compared against this one.
df_small = df.sample(frac=0.547, random_state=42)

# ===================================================================
#   2. BUILD SESSIONS + DWELL-BASED AUGMENTATION
# ===================================================================
print("\n\n================ GRU4REC CON DWELL ================")

# Dwell threshold in seconds: an item gets one extra repetition for every
# full DWELL_THRESHOLD seconds spent before the next event.
DWELL_THRESHOLD = 75

def build_sessions_with_dwell_repetitions(df, dwell_threshold=None, max_reps=None):
    """Build one item sequence per session, repeating items according to dwell time.

    Events are ordered by ``event_time`` inside each ``user_session``. The dwell
    of an event is the number of whole seconds until the next event of the same
    session; the last event of a session is given a dwell of 1 second. Each
    ``product_id`` is emitted ``floor(max(dwell, 1) / threshold) + 1`` times.

    Args:
        df: DataFrame with ``user_session``, ``event_time`` (datetime) and
            ``product_id`` columns. It is not modified.
        dwell_threshold: Seconds per extra repetition. ``None`` (default) uses
            the module-level ``DWELL_THRESHOLD``.
        max_reps: Optional upper bound on the repetitions of a single event.
            ``None`` (default) leaves them unbounded, as before. Without a
            bound a single long idle gap (e.g. one hour at 75 s -> 49 copies)
            can fill most of a truncated sequence with one item.

    Returns:
        list[list]: one augmented sequence per session, in sorted
        ``user_session`` order. Sequences shorter than 2 items are dropped.

    Raises:
        ValueError: if the threshold is not positive or ``max_reps`` < 1.
    """
    threshold = DWELL_THRESHOLD if dwell_threshold is None else dwell_threshold
    if threshold <= 0:
        raise ValueError("dwell_threshold must be positive")
    if max_reps is not None and max_reps < 1:
        raise ValueError("max_reps must be >= 1")

    # One global sort is enough: groupby keeps the row order inside each group,
    # so re-sorting every group again (millions of tiny sorts) is unnecessary.
    ordered = df.sort_values(["user_session", "event_time"])

    sessions = []
    for _, group in ordered.groupby("user_session"):
        items = group["product_id"].to_numpy()
        times = group["event_time"].values

        # Dwell between consecutive events, in whole seconds; the final event
        # has no successor and is assigned 1 second.
        dwells = np.diff(times).astype("timedelta64[s]").astype(float)
        dwells = np.append(dwells, 1.0)

        # fmax (unlike max) ignores NaN, so an event with a missing timestamp
        # counts as a single view instead of crashing the integer conversion.
        reps = (np.fmax(dwells, 1.0) // threshold).astype(np.int64) + 1
        if max_reps is not None:
            reps = np.minimum(reps, max_reps)

        seq = np.repeat(items, reps).tolist()

        if len(seq) >= 2:
            sessions.append(seq)

    return sessions


sessions = build_sessions_with_dwell_repetitions(df_small)
print("Sesiones creadas:", len(sessions))
print("Ejemplo sesión aumentada:", sessions[0][:25])

# ===================================================================
#   3. TRAIN/TEST SPLIT
# ===================================================================

# First 80% of the session list is used for training, the rest for testing.
# The list follows the order in which build_sessions_with_dwell_repetitions
# emits sessions (grouped by user_session), so this is not a split by time.
split = int(0.8 * len(sessions))
train_sessions = sessions[:split]
test_sessions = sessions[split:]

print("Train:", len(train_sessions), " Test:", len(test_sessions))

# ===================================================================
#   4. KEEP ONLY THE TOP ITEMS
# ===================================================================

# Count item occurrences with a generator: Counter consumes it lazily, so the
# flat list of every item occurrence in the training set is never materialised.
counter = Counter(item for sess in train_sessions for item in sess)
top_items = [item for item, _ in counter.most_common(TOP_N_ITEMS)]

# Vocabulary indices start at 1; index 0 is left for padding (PAD_IDX).
item2idx = {item: i + 1 for i, item in enumerate(top_items)}
idx2item = {v: k for k, v in item2idx.items()}
N_ITEMS = len(item2idx) + 1

def encode_session(sess):
    """Translate a session's product ids into vocabulary indices.

    Ids that are not keys of ``item2idx`` are skipped, so the result can be
    shorter than ``sess``.
    """
    encoded = []
    for product in sess:
        index = item2idx.get(product)
        if index is not None:
            encoded.append(index)
    return encoded

# Encode every session and keep only those that still have at least
# MIN_SESSION_LEN items once out-of-vocabulary products were removed.
train_encoded = [
    enc for enc in map(encode_session, train_sessions)
    if len(enc) >= MIN_SESSION_LEN
]
test_encoded = [
    enc for enc in map(encode_session, test_sessions)
    if len(enc) >= MIN_SESSION_LEN
]

print("Train encoded:", len(train_encoded))
print("Test encoded:", len(test_encoded))
print("N_ITEMS:", N_ITEMS)


Device: cuda


Sesiones creadas: 6220590
Ejemplo sesión aumentada: [5100816, 1005135, 1005135, 1005107, 1005107, 11200402]
Train: 4976472  Test: 1244118
Train encoded: 1405892
Test encoded: 351002
N_ITEMS: 2001


In [None]:
# ===================================================================
#   5. DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDataset(Dataset):
    """Next-item prediction dataset over encoded sessions.

    Every sample is an ``(items, targets)`` pair of ``torch.long`` tensors in
    which ``targets`` is the same sequence shifted one position ahead.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Keep at most the MAX_LEN most recent entries of the sequence.
        recent = self.sequences[idx][-MAX_LEN:]

        inputs = torch.tensor(recent[:-1], dtype=torch.long)
        shifted = torch.tensor(recent[1:], dtype=torch.long)
        return inputs, shifted


def collate_fn(batch):
    """Right-pad a list of ``(items, targets)`` pairs into two 2-D tensors.

    Both outputs have one row per sample and are filled with ``PAD_IDX`` after
    the end of each sequence. Each pair is expected to hold two tensors of the
    same length, as produced by the dataset above.
    """
    inputs, labels = (list(column) for column in zip(*batch))

    items = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=PAD_IDX)
    targets = nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=PAD_IDX)

    return items, targets


# Training batches: 32 sessions per step, reshuffled each epoch and padded by
# collate_fn.
train_loader = DataLoader(
    GRUDataset(train_encoded),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

# Test sessions are served one at a time, in their stored order.
test_loader = DataLoader(
    GRUDataset(test_encoded),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn
)

# ===================================================================
#   6. MODELO GRU4REC ESTÁNDAR (SIN DWELL COMO FEATURE)
# ===================================================================

class GRU4Rec(nn.Module):
    """Embedding -> single-layer GRU -> linear scorer over the item vocabulary.

    Args:
        n_items: Size of the vocabulary, including the padding index.
        emb_size: Width of the item embeddings.
        hidden_size: Width of the GRU hidden state.
    """

    def __init__(self, n_items, emb_size=128, hidden_size=128):
        super().__init__()

        # The embedding row at PAD_IDX is the padding vector.
        self.embed = nn.Embedding(
            num_embeddings=n_items,
            embedding_dim=emb_size,
            padding_idx=PAD_IDX,
        )
        self.gru = nn.GRU(
            input_size=emb_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=n_items)

    def forward(self, items):
        """Return one score per vocabulary entry for every position of ``items``."""
        hidden_states, _ = self.gru(self.embed(items))
        return self.fc(hidden_states)


# Instantiate the model on the selected device.
model = GRU4Rec(N_ITEMS).to(device)
# Padded target positions (PAD_IDX) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ===================================================================
#   7. ENTRENAMIENTO
# ===================================================================

def train_epoch():
    """Run one optimisation pass over ``train_loader``.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    batch_losses = []

    for items, targets in train_loader:
        items = items.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        logits = model(items)
        n_classes = logits.size(-1)

        # Flatten (batch, time) so every position is one classification example.
        loss = criterion(logits.reshape(-1, n_classes), targets.reshape(-1))
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(train_loader)

# ===================================================================
#   8. EVALUACIÓN
# ===================================================================

def evaluate():
    """Score the last step of every test sequence with Recall/MRR/NDCG at 10.

    For each sequence in ``test_loader`` the logits at its last real (non-pad)
    position are ranked and compared with the target at that position.

    Changes with respect to the previous version:
      * runs under ``torch.no_grad()`` so no autograd graph is built;
      * locates the last real position per row instead of reading ``[0, -1]``,
        so it stays correct if ``test_loader`` uses a batch size above 1
        (assumes padding is trailing and real targets are never ``PAD_IDX``);
      * returns zeros instead of dividing by zero when nothing was scored.
    With batch size 1 the reported numbers are unchanged.

    Returns:
        tuple: ``(recall, mrr, ndcg)`` averaged over the scored sequences.
    """
    model.eval()

    top_k = 10
    recall_sum = mrr_sum = ndcg_sum = 0.0
    total = 0

    with torch.no_grad():
        for items, targets in test_loader:
            items, targets = items.to(device), targets.to(device)

            # Number of real targets per row; rows with none are skipped.
            lengths = (targets != PAD_IDX).sum(dim=1)
            rows = torch.nonzero(lengths > 0, as_tuple=True)[0]
            if rows.numel() == 0:
                continue
            last = lengths[rows] - 1

            logits = model(items)[rows, last]
            truth = targets[rows, last]
            topk = torch.topk(logits, top_k, dim=-1).indices

            # top-k indices are distinct, so each row matches at most once;
            # the matching column is the 0-based rank of the target.
            hit_cols = torch.nonzero(topk == truth.unsqueeze(1), as_tuple=True)[1]

            total += rows.numel()
            for rank in (hit_cols + 1).tolist():
                recall_sum += 1
                mrr_sum += 1 / rank
                ndcg_sum += 1 / np.log2(rank + 1)

    if total == 0:
        return 0.0, 0.0, 0.0

    return (
        recall_sum / total,
        mrr_sum / total,
        ndcg_sum / total
    )

# ===================================================================
#   9. LOOP DE ENTRENAMIENTO
# ===================================================================
import time

# Wall-clock timer around the whole train + evaluate loop.
start = time.time()

EPOCHS = 10
for ep in range(1, EPOCHS + 1):
    # One training pass followed by a full evaluation on the test loader.
    loss = train_epoch()
    recall, mrr, ndcg = evaluate()
    print(f"\nEpoch {ep}/{EPOCHS}")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n Tiempo total: {end - start:.2f} segundos")



Epoch 1/10
Loss: 1.5457
Recall@10: 0.7606  MRR@10: 0.6311  NDCG@10: 0.6618

Epoch 2/10
Loss: 1.4414
Recall@10: 0.7628  MRR@10: 0.6318  NDCG@10: 0.6628

Epoch 3/10
Loss: 1.4319
Recall@10: 0.7631  MRR@10: 0.6322  NDCG@10: 0.6632

Epoch 4/10
Loss: 1.4269
Recall@10: 0.7640  MRR@10: 0.6323  NDCG@10: 0.6636

Epoch 5/10
Loss: 1.4232
Recall@10: 0.7636  MRR@10: 0.6324  NDCG@10: 0.6635

Epoch 6/10
Loss: 1.4204
Recall@10: 0.7649  MRR@10: 0.6327  NDCG@10: 0.6640

Epoch 7/10
Loss: 1.4183
Recall@10: 0.7648  MRR@10: 0.6328  NDCG@10: 0.6641

Epoch 8/10
Loss: 1.4166
Recall@10: 0.7647  MRR@10: 0.6327  NDCG@10: 0.6640

Epoch 9/10
Loss: 1.4152
Recall@10: 0.7650  MRR@10: 0.6328  NDCG@10: 0.6642

Epoch 10/10
Loss: 1.4140
Recall@10: 0.7655  MRR@10: 0.6329  NDCG@10: 0.6644

 Tiempo total: 4057.27 segundos


: 

In [3]:
# ===================================================================
#   GRU4Rec SIN DWELL — sesiones limpias, independientes del modelo con dwell
# ===================================================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter

# ================================================================
#   1. CONFIGURACIÓN GLOBAL
# ================================================================
# Re-seed every RNG so this cell starts from the same random state regardless
# of what ran before it.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# MAX_LEN: the dataset class keeps only the last MAX_LEN items of a sequence.
MAX_LEN = 50
# MIN_SESSION_LEN: encoded sessions shorter than this are dropped.
MIN_SESSION_LEN = 5
TOP_N_ITEMS = 2000  # adjust to your dataset
# PAD_IDX: index used for padding; real items are numbered from 1.
PAD_IDX = 0

# NOTE(review): df.sample draws individual event ROWS, not whole sessions, so
# sessions can lose events. Consider sampling user_session ids instead -- and
# apply the same change to every experiment compared against this one.
df_small = df.sample(frac=0.547, random_state=42)

print("\n\n================ GRU4REC SIN DWELL ================")

# ===================================================================
#   1. RECREAR SESIONES ORIGINALES (SIN REPETICIÓN POR DWELL)
# ===================================================================

def build_sessions_no_dwell(df, min_len=2):
    """Build one chronological ``product_id`` list per session.

    Args:
        df: DataFrame with ``user_session``, ``event_time`` and ``product_id``
            columns. It is not modified.
        min_len: Sessions with fewer items than this are dropped (default 2,
            the previous hard-coded value).

    Returns:
        list[list]: one item sequence per kept session, in sorted
        ``user_session`` order.
    """
    # One global sort is enough: groupby keeps the row order inside each group,
    # so the previous per-group re-sort was redundant work. Events that share a
    # timestamp keep the order this single sort gave them.
    ordered = df.sort_values(["user_session", "event_time"])

    sessions = []
    # Iterate over the product_id column only, so no full per-session
    # DataFrame has to be built for every group.
    for _, products in ordered.groupby("user_session")["product_id"]:
        items = products.tolist()
        if len(items) >= min_len:
            sessions.append(items)

    return sessions


sessions_no_dwell = build_sessions_no_dwell(df_small)

print("Total sesiones SIN dwell:", len(sessions_no_dwell))
print("Ejemplo sesión SIN dwell:", sessions_no_dwell[0][:20])


# ===================================================================
#   2. TRAIN / TEST SPLIT
# ===================================================================

# First 80% of the session list is used for training, the rest for testing.
split = int(0.8 * len(sessions_no_dwell))
train_sessions_no_dwell = sessions_no_dwell[:split]
test_sessions_no_dwell = sessions_no_dwell[split:]

print("Train:", len(train_sessions_no_dwell), "Test:", len(test_sessions_no_dwell))

# NOTE: split, counter, top_items, item2idx, idx2item and N_ITEMS are
# module-level names that the dwell experiment also assigns; running this cell
# overwrites them.
# Count with a generator: Counter consumes it lazily, so the flat list of every
# item occurrence in the training set is never materialised.
counter = Counter(item for sess in train_sessions_no_dwell for item in sess)
top_items = [item for item, _ in counter.most_common(TOP_N_ITEMS)]

# Vocabulary indices start at 1; index 0 is left for padding (PAD_IDX).
item2idx = {item: i + 1 for i, item in enumerate(top_items)}
idx2item = {v: k for k, v in item2idx.items()}
N_ITEMS = len(item2idx) + 1


# ===================================================================
#   3. KEEP ONLY THE TOP ITEMS
#      NOTE: item2idx is rebuilt just above from the no-dwell training
#      sessions. It is NOT the dictionary of the dwell model, so the two
#      vocabularies may differ.
# ===================================================================

def encode_session_no_dwell(sess):
    """Convert product ids to vocabulary indices, skipping ids missing from ``item2idx``."""
    indices = []
    for product in sess:
        mapped = item2idx.get(product)
        if mapped is not None:
            indices.append(mapped)
    return indices

# Encode each session, then discard those left with fewer than
# MIN_SESSION_LEN items after out-of-vocabulary products were removed.
train_encoded_no_dwell = [
    enc for enc in map(encode_session_no_dwell, train_sessions_no_dwell)
    if len(enc) >= MIN_SESSION_LEN
]
test_encoded_no_dwell = [
    enc for enc in map(encode_session_no_dwell, test_sessions_no_dwell)
    if len(enc) >= MIN_SESSION_LEN
]

print("Train encoded (sin dwell):", len(train_encoded_no_dwell))
print("Test encoded  (sin dwell):", len(test_encoded_no_dwell))


# ===================================================================
#   4. DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDatasetNoDwell(Dataset):
    """Wraps encoded sessions as ``(items, targets)`` next-item samples.

    ``targets`` is ``items`` shifted one step forward; both are ``torch.long``.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Truncate from the left so only the latest MAX_LEN entries remain.
        tail = self.sequences[idx][-MAX_LEN:]

        history = torch.tensor(tail[:-1], dtype=torch.long)
        upcoming = torch.tensor(tail[1:], dtype=torch.long)
        return history, upcoming


def collate_fn_no_dwell(batch):
    """Stack ``(items, targets)`` pairs into two tensors, right-padded with ``PAD_IDX``.

    Each pair is expected to contain two tensors of equal length, as produced
    by the dataset above.
    """
    histories, futures = (list(column) for column in zip(*batch))

    items = nn.utils.rnn.pad_sequence(histories, batch_first=True, padding_value=PAD_IDX)
    targets = nn.utils.rnn.pad_sequence(futures, batch_first=True, padding_value=PAD_IDX)

    return items, targets


# Training batches: 32 sessions per step, reshuffled each epoch and padded by
# collate_fn_no_dwell.
train_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(train_encoded_no_dwell),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn_no_dwell
)

# Test sessions are served one at a time, in their stored order.
test_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(test_encoded_no_dwell),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn_no_dwell
)


# ===================================================================
#   5. MODELO GRU4Rec SIN DWELL
# ===================================================================

class GRU4RecNoDwell(nn.Module):
    """Item embedding, one GRU layer and a linear layer that scores every item.

    Args:
        n_items: Vocabulary size, padding index included.
        emb_size: Embedding dimension.
        hidden_size: GRU hidden dimension.
    """

    def __init__(self, n_items, emb_size=128, hidden_size=128):
        super().__init__()
        self.embed = nn.Embedding(
            num_embeddings=n_items,
            embedding_dim=emb_size,
            padding_idx=PAD_IDX,
        )
        self.gru = nn.GRU(
            input_size=emb_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=n_items)

    def forward(self, items):
        """Return per-position scores over the vocabulary for ``items``."""
        states, _ = self.gru(self.embed(items))
        return self.fc(states)


# Instantiate the model on the selected device.
model_no_dwell = GRU4RecNoDwell(N_ITEMS).to(device)
# Padded target positions (PAD_IDX) do not contribute to the loss.
criterion_no_dwell = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_no_dwell = torch.optim.Adam(model_no_dwell.parameters(), lr=1e-3)


# ===================================================================
#   6. ENTRENAMIENTO
# ===================================================================

def train_epoch_no_dwell():
    """Train ``model_no_dwell`` for one pass over ``train_loader_no_dwell``.

    Returns:
        The mean of the per-batch losses.
    """
    model_no_dwell.train()
    losses = []

    for items, targets in train_loader_no_dwell:
        items = items.to(device)
        targets = targets.to(device)

        optimizer_no_dwell.zero_grad()

        logits = model_no_dwell(items)
        vocab = logits.size(-1)

        # Treat every (sequence, position) pair as one classification example.
        loss = criterion_no_dwell(logits.reshape(-1, vocab), targets.reshape(-1))
        loss.backward()
        optimizer_no_dwell.step()

        losses.append(loss.item())

    return sum(losses) / len(train_loader_no_dwell)


# ===================================================================
#   7. EVALUACIÓN
# ===================================================================

def evaluate_no_dwell():
    """Score the last step of every test sequence with Recall/MRR/NDCG at 10.

    For each sequence in ``test_loader_no_dwell`` the logits at its last real
    (non-pad) position are ranked and compared with the target there.

    Changes with respect to the previous version:
      * runs under ``torch.no_grad()`` so no autograd graph is built;
      * locates the last real position per row instead of reading ``[0, -1]``,
        so it stays correct if the loader uses a batch size above 1 (assumes
        padding is trailing and real targets are never ``PAD_IDX``);
      * returns zeros instead of dividing by zero when nothing was scored.
    With batch size 1 the reported numbers are unchanged.

    Returns:
        tuple: ``(recall, mrr, ndcg)`` averaged over the scored sequences.
    """
    model_no_dwell.eval()

    recall_k = 10
    recall_sum = 0.0
    mrr_sum = 0.0
    ndcg_sum = 0.0
    total = 0

    with torch.no_grad():
        for items, targets in test_loader_no_dwell:
            items, targets = items.to(device), targets.to(device)

            # Number of real targets per row; rows with none are skipped.
            lengths = (targets != PAD_IDX).sum(dim=1)
            rows = torch.nonzero(lengths > 0, as_tuple=True)[0]
            if rows.numel() == 0:
                continue
            last = lengths[rows] - 1

            logits = model_no_dwell(items)[rows, last]
            truth = targets[rows, last]
            topk = torch.topk(logits, recall_k, dim=-1).indices

            # top-k indices are distinct, so each row matches at most once;
            # the matching column is the 0-based rank of the target.
            hit_cols = torch.nonzero(topk == truth.unsqueeze(1), as_tuple=True)[1]

            total += rows.numel()
            for rank in (hit_cols + 1).tolist():
                recall_sum += 1
                mrr_sum += 1 / rank
                ndcg_sum += 1 / np.log2(rank + 1)

    if total == 0:
        return 0.0, 0.0, 0.0

    return (
        recall_sum / total,
        mrr_sum / total,
        ndcg_sum / total
    )


# ===================================================================
#   8. LOOP DE ENTRENAMIENTO
# ===================================================================

print("\nEntrenando GRU4Rec SIN dwell...\n")

import time

# Wall-clock timer around the whole train + evaluate loop.
start = time.time()


EPOCHS = 10
for ep in range(1, EPOCHS + 1):
    # One training pass followed by a full evaluation on the test loader.
    loss = train_epoch_no_dwell()
    recall, mrr, ndcg = evaluate_no_dwell()

    print(f"\nEpoch {ep}/{EPOCHS} (sin dwell)")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n⏱ Tiempo total: {end - start:.2f} segundos")


Device: cuda


Total sesiones SIN dwell: 6220590
Ejemplo sesión SIN dwell: [5100816, 1005135, 1005107, 11200402]
Train: 4976472 Test: 1244118
Train encoded (sin dwell): 789742
Test encoded  (sin dwell): 197378

Entrenando GRU4Rec SIN dwell...


Epoch 1/10 (sin dwell)
Loss: 3.4575
Recall@10: 0.6992  MRR@10: 0.5182  NDCG@10: 0.5612

Epoch 2/10 (sin dwell)
Loss: 3.2823
Recall@10: 0.7031  MRR@10: 0.5194  NDCG@10: 0.5631

Epoch 3/10 (sin dwell)
Loss: 3.2595
Recall@10: 0.7042  MRR@10: 0.5206  NDCG@10: 0.5643

Epoch 4/10 (sin dwell)
Loss: 3.2460
Recall@10: 0.7044  MRR@10: 0.5203  NDCG@10: 0.5641

Epoch 5/10 (sin dwell)
Loss: 3.2363
Recall@10: 0.7056  MRR@10: 0.5205  NDCG@10: 0.5645

Epoch 6/10 (sin dwell)
Loss: 3.2291
Recall@10: 0.7058  MRR@10: 0.5206  NDCG@10: 0.5646

Epoch 7/10 (sin dwell)
Loss: 3.2233
Recall@10: 0.7060  MRR@10: 0.5210  NDCG@10: 0.5650

Epoch 8/10 (sin dwell)
Loss: 3.2181
Recall@10: 0.7066  MRR@10: 0.5207  NDCG@10: 0.5649

Epoch 9/10 (sin dwell)
Loss: 3.2143
Recall@10: 0.70