# Notebook GRU4Rec sin Dwell

In [1]:
import pandas as pd

nov = pd.read_csv("nov_reduced.csv")

# evento_time como datetime
nov["event_time"] = pd.to_datetime(nov["event_time"])


In [2]:
def split_train_test(df):

    df = df.sort_values("event_time")
    cutoff = df["event_time"].quantile(0.9)

    train = df[df["event_time"] < cutoff]
    test  = df[df["event_time"] >= cutoff]

    return train, test


In [3]:
nov_train, nov_test = split_train_test(nov)
print(len(nov_train), len(nov_test))

60751765 6750214


In [5]:
import gc
gc.collect()

0

In [4]:
def build_sessions(df):
    df = df.sort_values(["user_session", "event_time"])
    sessions = df.groupby("user_session")["product_id"].apply(list)
    return sessions


In [5]:
nov_train_sessions = build_sessions(nov_train)
nov_test_sessions  = build_sessions(nov_test)

In [6]:
def filter_sessions_min_length(sessions, min_len=5):
    return [seq for seq in sessions if len(seq) >= min_len]

In [7]:
nov_train_sessions = filter_sessions_min_length(nov_train_sessions, min_len=5)
nov_test_sessions  = filter_sessions_min_length(nov_test_sessions,  min_len=5)

In [8]:
from collections import defaultdict

def build_id_maps(sessions):
    items = set()
    for seq in sessions:
        for item in seq:
            items.add(item)

    item2idx = {item: i+1 for i, item in enumerate(sorted(items))}
    idx2item = {v: k for k, v in item2idx.items()}
    return item2idx, idx2item


In [9]:
item2idx, idx2item = build_id_maps(nov_train_sessions)

In [10]:
from collections import Counter

def filter_top_items(train_sessions, max_items=15000):
    item_counts = Counter()
    for session in train_sessions:
        for item in session:
            item_counts[item] += 1

    top_items = set([item for item, _ in item_counts.most_common(max_items)])

    filtered_sessions = []
    for session in train_sessions:
        filtered = [item for item in session if item in top_items]
        if len(filtered) >= 5:
            filtered_sessions.append(filtered)

    return filtered_sessions, top_items

nov_train_sessions, top_items = filter_top_items(nov_train_sessions, max_items=15000)

In [11]:
nov_test_sessions_filtered = []
for session in nov_test_sessions:
    filtered = [item for item in session if item in top_items]
    if len(filtered) >= 5:
        nov_test_sessions_filtered.append(filtered)

In [12]:
items_present = set(item for session in nov_train_sessions
                    for item in session)

item2idx_filtered = {item: i+1 for i, item in enumerate(sorted(items_present))}
idx2item_filtered = {i: item for item, i in item2idx_filtered.items()}


In [13]:
def encode_sessions(sessions, item2idx, min_len=5):
    encoded = []
    for seq in sessions:
        seq = [item2idx[x] for x in seq if x in item2idx]
        if len(seq) >= min_len:
            encoded.append(seq)
    return encoded

train_encoded = encode_sessions(nov_train_sessions, item2idx_filtered)
test_encoded  = encode_sessions(nov_test_sessions, item2idx_filtered)

### Modelo GRU sin Dwell

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

MAX_LEN = 15

class GRUDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]   # conservar las últimas interacciones
        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])


In [15]:
PAD_IDX = 0

def collate_fn(batch):
    X_batch, y_batch = zip(*batch)

    # Longitudes individuales
    lengths = [len(x) for x in X_batch]
    max_len = min(MAX_LEN, max(lengths))

    # Pading
    X_padded = [torch.cat([x, torch.full((max_len - len(x),), PAD_IDX)]) for x in X_batch]
    y_padded = [torch.cat([y, torch.full((max_len - len(y),), PAD_IDX)]) for y in y_batch]

    return torch.stack(X_padded), torch.stack(y_padded)

In [16]:
BATCH_SIZE = 32

train_loader = DataLoader(
    GRUDataset(train_encoded),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    GRUDataset(test_encoded),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn
)

print("Número de batches en train_loader:", len(train_loader))

Número de batches en train_loader: 96789


In [17]:
import torch.nn as nn

class GRU4Rec(nn.Module):
    def __init__(self, n_items, emb_size=128, hidden_size=128):
        super().__init__()
        self.embed = nn.Embedding(n_items+1, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, n_items+1)

    def forward(self, x):
        x = self.embed(x)
        out, _ = self.gru(x)
        logits = self.fc(out)
        return logits


In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
n_items = len(item2idx)
model = GRU4Rec(n_items).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

Using device: cuda


In [19]:
def train_gru(model, loader, optimizer, criterion, epochs=10):
    model.train()

    for epoch in range(1, epochs+1):
        total_loss = 0
        n_batch = 0
        for X, y in loader:
            X, y = X.to(device), y.to(device)

            optimizer.zero_grad()

            logits = model(X)

            logits = logits.reshape(-1, logits.size(-1))
            y = y.reshape(-1)

            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batch += 1

            print(f"\rEpoch {epoch} - Batch Loss: {loss.item():.4f} - N Batch: {n_batch}", end="")

        print(f"[Epoch {epoch}] Loss = {total_loss/len(loader):.4f}")


In [20]:
train_gru(model, train_loader, optimizer, criterion)

Epoch 1 - Batch Loss: 3.8820 - N Batch: 96789[Epoch 1] Loss = 3.8155
Epoch 2 - Batch Loss: 3.3484 - N Batch: 96789[Epoch 2] Loss = 3.5327
Epoch 3 - Batch Loss: 3.0185 - N Batch: 96789[Epoch 3] Loss = 3.4961
Epoch 4 - Batch Loss: 3.7615 - N Batch: 96789[Epoch 4] Loss = 3.4752
Epoch 5 - Batch Loss: 3.0139 - N Batch: 96789[Epoch 5] Loss = 3.4605
Epoch 6 - Batch Loss: 3.3034 - N Batch: 96789[Epoch 6] Loss = 3.4495
Epoch 7 - Batch Loss: 4.0285 - N Batch: 96789[Epoch 7] Loss = 3.4416
Epoch 8 - Batch Loss: 3.7380 - N Batch: 96789[Epoch 8] Loss = 3.4352
Epoch 9 - Batch Loss: 3.3445 - N Batch: 96789[Epoch 9] Loss = 3.4301
Epoch 10 - Batch Loss: 3.2809 - N Batch: 96789[Epoch 10] Loss = 3.4260


In [21]:
import numpy as np

def evaluate_model(model, loader, k=10):
    model.eval()
    
    recall_list = []
    mrr_list = []
    ndcg_list = []
    
    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            
            # Predecir
            logits = model(X)  # (batch, seq_len, n_items)
            
            # Evaluar cada secuencia
            for i in range(X.size(0)):
                for j in range(X.size(1)):
                    target = y[i, j].item()
                    
                    # Ignorar padding
                    if target == PAD_IDX:
                        continue
                    
                    # Obtener scores y top-k
                    scores = logits[i, j]
                    _, top_k_indices = torch.topk(scores, k)
                    top_k_indices = top_k_indices.cpu().numpy()
                    
                    # Recall@k
                    recall = 1.0 if target in top_k_indices else 0.0
                    recall_list.append(recall)
                    
                    # MRR@k
                    if target in top_k_indices:
                        rank = np.where(top_k_indices == target)[0][0] + 1
                        mrr_list.append(1.0 / rank)
                    else:
                        mrr_list.append(0.0)
                    
                    # NDCG@k
                    if target in top_k_indices:
                        rank = np.where(top_k_indices == target)[0][0] + 1
                        ndcg_list.append(1.0 / np.log2(rank + 1))
                    else:
                        ndcg_list.append(0.0)
    
    recall_at_k = np.mean(recall_list)
    mrr_at_k = np.mean(mrr_list)
    ndcg_at_k = np.mean(ndcg_list)
    
    return recall_at_k, mrr_at_k, ndcg_at_k

In [22]:
# Evaluar en el test set
print("Evaluando modelo en test set...")
recall_10, mrr_10, ndcg_10 = evaluate_model(model, test_loader, k=10)

print(f"\nResultados en Test Set:")
print(f"Recall@10: {recall_10:.4f}")
print(f"MRR@10:    {mrr_10:.4f}")
print(f"NDCG@10:   {ndcg_10:.4f}")

Evaluando modelo en test set...

Resultados en Test Set:
Recall@10: 0.6679
MRR@10:    0.4930
NDCG@10:   0.5344
