In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter
import time
import math

# --- Reproducibility --------------------------------------------------------
# One seed is pushed into every random number generator used by the notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Data / vocabulary settings ---------------------------------------------
PAD_IDX = 0            # index reserved for padding; real items start at 1
TOP_N_ITEMS = 2000     # how many of the most frequent training items are kept
MIN_SESSION_LEN = 5    # encoded sessions shorter than this are discarded
MAX_LEN = 50           # longer sessions are cut down to their last MAX_LEN items

# --- Compute device ---------------------------------------------------------
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

print(f"Device: {device}")

Device: cuda


In [None]:
# Read only the columns that the session builder needs, then turn the raw
# timestamp strings into datetimes so that events can be ordered in time.
df = pd.read_csv(
    "../nov_reduced.csv",
    usecols=["event_time", "user_id", "product_id", "user_session"],
)
df["event_time"] = df["event_time"].pipe(pd.to_datetime)


def build_sessions_no_dwell(df, min_len=2):
    """Group an event log into per-session lists of product ids.

    Events are ordered by ``user_session`` and then ``event_time``; each
    session becomes the list of its ``product_id`` values in that order.
    The input frame is not modified.

    Args:
        df: DataFrame with at least the columns ``user_session``,
            ``event_time`` and ``product_id``.
        min_len: Minimum number of events a session must contain to be kept.
            Defaults to 2, the threshold that was previously hard-coded.

    Returns:
        A list of sessions, each a list of product ids (plain Python values
        as produced by ``Series.tolist()``), ordered by session key.
    """
    ordered = df.sort_values(["user_session", "event_time"])

    # Group only the column we need: iterating a SeriesGroupBy avoids
    # materialising a full DataFrame for every single session.
    grouped_items = ordered.groupby("user_session")["product_id"]

    sessions = []
    for _, session_items in grouped_items:
        if len(session_items) >= min_len:
            sessions.append(session_items.tolist())
    return sessions

# Turn the loaded event log into one list of product ids per session.
sessions_no_dwell = build_sessions_no_dwell(df)

# Sanity check: number of sessions kept and the contents of the first one.
print(f"Total sesiones SIN dwell: {len(sessions_no_dwell)}")
print(f"Ejemplo sesión: {sessions_no_dwell[0]}")

Total sesiones SIN dwell: 5859994
Ejemplo sesión: [5100816, 1005107, 11200402]


In [None]:
# Hold out the trailing 20% of the session list for testing.
split = int(0.8 * len(sessions_no_dwell))
train_sessions, test_sessions = sessions_no_dwell[:split], sessions_no_dwell[split:]

print(f"Train: {len(train_sessions)}, Test: {len(test_sessions)}")

# Item popularity is measured on the training sessions only; the generator
# feeds the counter directly instead of building a flat list first.
counter = Counter(item for sess in train_sessions for item in sess)
top_items = [pair[0] for pair in counter.most_common(TOP_N_ITEMS)]

# Indices start at 1 so that 0 stays free for padding; N_ITEMS therefore
# counts the kept items plus that one reserved slot.
item2idx = dict(zip(top_items, range(1, len(top_items) + 1)))
N_ITEMS = len(item2idx) + 1

print(f"N_ITEMS (Vocabulario): {N_ITEMS}")

Train: 4687995, Test: 1171999
N_ITEMS (Vocabulario): 2001


In [None]:
def encode_session(sess):
    """Map a session's raw item ids to vocabulary indices.

    Items that are not present in the module-level ``item2idx`` mapping are
    dropped, so the result may be shorter than ``sess`` (and neighbours in
    the result are not necessarily neighbours in the original session).

    Args:
        sess: Iterable of raw item ids.

    Returns:
        List of vocabulary indices, in the original order.
    """
    encoded = []
    for raw_item in sess:
        idx = item2idx.get(raw_item)
        if idx is not None:
            encoded.append(idx)
    return encoded

# Encode every session and, in the same pass, keep only those that still
# contain at least MIN_SESSION_LEN items after unknown items were dropped.
train_encoded = [
    enc for enc in map(encode_session, train_sessions)
    if len(enc) >= MIN_SESSION_LEN
]
test_encoded = [
    enc for enc in map(encode_session, test_sessions)
    if len(enc) >= MIN_SESSION_LEN
]


class GRUDataset(Dataset):
    """Dataset of encoded sessions for next-item prediction.

    Each element is a pair ``(items, targets)`` of ``torch.long`` tensors,
    where ``targets`` is ``items`` shifted one step into the future.
    """

    def __init__(self, sequences):
        """Store the list of encoded sessions (lists of item indices)."""
        self.sequences = sequences

    def __len__(self):
        """Number of sessions held by the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the input/target pair for session ``idx``.

        Only the most recent ``MAX_LEN`` items of the session are used.
        """
        window = self.sequences[idx][-MAX_LEN:]
        inputs, shifted = window[:-1], window[1:]
        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(shifted, dtype=torch.long),
        )

def collate_fn(batch):
    """Right-pad a batch of ``(items, targets)`` pairs to a common length.

    Args:
        batch: Sequence of ``(items, targets)`` 1-D tensor pairs.

    Returns:
        Tuple ``(items, targets)`` of 2-D tensors with one row per pair,
        as wide as the longest ``items`` tensor and filled with ``PAD_IDX``
        after the end of each sequence.
    """
    item_seqs, target_seqs = zip(*batch)
    width = max(map(len, item_seqs))

    def pad_and_stack(seqs):
        # Start from an all-padding matrix and copy each sequence into the
        # front of its row.
        out = torch.full((len(seqs), width), PAD_IDX, dtype=seqs[0].dtype)
        for row, seq in zip(out, seqs):
            row[: len(seq)] = seq
        return out

    return pad_and_stack(item_seqs), pad_and_stack(target_seqs)

# Training batches are shuffled and padded per batch by collate_fn.
train_loader = DataLoader(
    dataset=GRUDataset(train_encoded),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
# Test sessions are served one per batch and in order, so no padding is
# introduced at evaluation time.
test_loader = DataLoader(
    dataset=GRUDataset(test_encoded),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
class NARM(nn.Module):
    """GRU encoder with additive attention that scores the next item.

    For every time step ``t`` the model combines the GRU state at ``t`` (the
    "global" view of the prefix) with an attention-weighted sum of the GRU
    states at steps ``<= t`` (the "local" view) and projects the pair onto
    the item vocabulary.

    The attention is causal: the logits at step ``t`` depend only on
    ``items[:, :t + 1]``. This matters because the training targets are the
    inputs shifted by one step; a representation built from the whole input
    and copied to every step would already contain the item it is asked to
    predict at all but the last step, and with right-padded batches it would
    also absorb the padded steps. For an unpadded sequence the logits at the
    last step are identical to attending over the full sequence with the
    final GRU state as query.

    Args:
        n_items: Size of the item vocabulary, including the padding index.
        emb_size: Dimension of the item embeddings.
        hidden_size: Dimension of the GRU hidden state.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, n_items, emb_size=128, hidden_size=128, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(n_items, emb_size, padding_idx=PAD_IDX)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True)
        self.linear_one = nn.Linear(hidden_size, hidden_size)
        self.linear_two = nn.Linear(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size * 2, n_items)
        self.dropout = nn.Dropout(dropout)

    def forward(self, items):
        """Compute next-item logits for every position of every sequence.

        Args:
            items: Long tensor of item indices with shape ``(B, T)``,
                right-padded with ``PAD_IDX``.

        Returns:
            Float tensor of shape ``(B, T, n_items)``; entry ``[b, t]`` holds
            the scores for the item following ``items[b, t]``.
        """
        emb = self.dropout(self.embedding(items))
        gru_out, _ = self.gru(emb)                          # (B, T, H)
        seq_len = items.size(1)

        # Additive attention between every query step t and key step j.
        # The intermediate tensor is (B, T, T, H), i.e. quadratic in T.
        keys = self.linear_one(gru_out).unsqueeze(1)        # (B, 1, T, H)
        queries = self.linear_two(gru_out).unsqueeze(2)     # (B, T, 1, H)
        attn_scores = torch.tanh(keys + queries).sum(dim=-1)  # (B, T, T)

        # Step t may only look at steps j <= t. The diagonal is always kept,
        # so every softmax row has at least one finite entry.
        causal = torch.ones(
            seq_len, seq_len, dtype=torch.bool, device=items.device
        ).tril()
        attn_scores = attn_scores.masked_fill(~causal, float("-inf"))
        attn_weights = torch.softmax(attn_scores, dim=-1)

        context = torch.bmm(attn_weights, gru_out)          # (B, T, H)
        final_rep = torch.cat([context, gru_out], dim=-1)   # (B, T, 2H)
        return self.fc(final_rep)

In [None]:
def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module mapping a ``(B, T)`` item tensor to ``(B, T, V)`` logits.
        loader: Iterable of ``(items, targets)`` batches; must support ``len``.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss taking flattened ``(B*T, V)`` logits and ``(B*T,)``
            targets.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    for batch_items, batch_targets in loader:
        batch_items = batch_items.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        scores = model(batch_items)
        # Collapse batch and time so each position is one classification row.
        flat_scores = scores.view(-1, scores.size(-1))
        batch_loss = criterion(flat_scores, batch_targets.view(-1))
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

def evaluate(model, loader, k=10):
    """Compute Recall@k, MRR@k and NDCG@k for last-item prediction.

    For every sequence in every batch, the logits at the position of the
    last non-padding target are ranked and compared with that target.
    Batches of any size are supported; rows whose targets are all
    ``PAD_IDX`` are skipped. With one unpadded sequence per batch this is
    exactly "rank the logits of the final step against the final target".

    Note: for batches larger than one the result is only meaningful if the
    model's logits at a real position are not influenced by the padding
    that follows it.

    Args:
        model: Module mapping a ``(B, T)`` item tensor to ``(B, T, V)`` logits.
        loader: Iterable of ``(items, targets)`` batches, right-padded with
            ``PAD_IDX``.
        k: Cut-off of the ranking metrics (clamped to the vocabulary size).

    Returns:
        Tuple ``(recall, mrr, ndcg)`` averaged over the scored sequences, or
        ``(0.0, 0.0, 0.0)`` when no sequence could be scored.
    """
    model.eval()
    recall_sum, mrr_sum, ndcg_sum, total = 0, 0.0, 0.0, 0

    with torch.no_grad():
        for items, targets in loader:
            items, targets = items.to(device), targets.to(device)

            # Padding is on the right, so the number of real targets in a row
            # is also one past the index of its last real target.
            lengths = (targets != PAD_IDX).sum(dim=1)
            valid = lengths > 0
            if not bool(valid.any()):
                continue
            last = (lengths - 1).clamp(min=0)
            rows = torch.arange(targets.size(0), device=targets.device)

            logits = model(items)[rows, last]               # (B, V)
            target = targets[rows, last]                    # (B,)

            top_k = min(k, logits.size(-1))
            topk = torch.topk(logits, top_k, dim=-1).indices  # (B, top_k)
            # At most one column per row can match; invalid rows never match.
            match = (topk == target.unsqueeze(1)) & valid.unsqueeze(1)

            total += int(valid.sum())
            hit_columns = match.nonzero(as_tuple=True)[1]
            for rank in (hit_columns + 1).tolist():
                recall_sum += 1
                mrr_sum += 1 / rank
                ndcg_sum += 1 / math.log2(rank + 1)

    if total == 0:
        # Nothing to score (empty loader or only padding).
        return 0.0, 0.0, 0.0
    return recall_sum / total, mrr_sum / total, ndcg_sum / total

print("\n================ NARM SIN DWELL ================\n")

EPOCHS = 10

# Model, optimiser and loss. Padding positions are excluded from the loss
# through ignore_index.
model = NARM(N_ITEMS).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# The wall-clock timer covers training and the per-epoch evaluation.
start = time.time()

for ep in range(1, EPOCHS + 1):
    # One pass over the training data, then metrics on the test loader.
    loss = train_epoch(model, train_loader, optimizer, criterion)
    recall, mrr, ndcg = evaluate(model, test_loader)
    print(f"Epoch {ep} | Loss: {loss:.4f} | R@10: {recall:.4f} | MRR@10: {mrr:.4f} | NDCG@10: {ndcg:.4f}")

print(f"\n⏱ Tiempo total: {time.time() - start:.2f} s")



Epoch 1 | Loss: 2.5406 | R@10: 0.6155 | MRR@10: 0.3843 | NDCG@10: 0.4405
Epoch 2 | Loss: 2.1201 | R@10: 0.6183 | MRR@10: 0.3815 | NDCG@10: 0.4391
Epoch 3 | Loss: 2.0440 | R@10: 0.6208 | MRR@10: 0.3801 | NDCG@10: 0.4387
Epoch 4 | Loss: 2.0027 | R@10: 0.6220 | MRR@10: 0.3857 | NDCG@10: 0.4431
Epoch 5 | Loss: 1.9750 | R@10: 0.6237 | MRR@10: 0.3857 | NDCG@10: 0.4435
Epoch 6 | Loss: 1.9657 | R@10: 0.6218 | MRR@10: 0.3836 | NDCG@10: 0.4415
Epoch 7 | Loss: 2.1225 | R@10: 0.6152 | MRR@10: 0.3752 | NDCG@10: 0.4335
Epoch 8 | Loss: 2.1983 | R@10: 0.6178 | MRR@10: 0.3795 | NDCG@10: 0.4374
Epoch 9 | Loss: 2.1266 | R@10: 0.6226 | MRR@10: 0.3805 | NDCG@10: 0.4393
Epoch 10 | Loss: 1.9862 | R@10: 0.6287 | MRR@10: 0.3900 | NDCG@10: 0.4479

⏱ Tiempo total: 2774.79 s
