# NARM - Oct

In [1]:
import pandas as pd

df = pd.read_csv(
    "oct_reduced.csv",
    usecols=["event_time", "user_id", "product_id", "user_session"]
)

# Convertir el timestamp a datetime
df["event_time"] = pd.to_datetime(df["event_time"])

print(df.head())
print(df.dtypes)

                 event_time  product_id    user_id  \
0 2019-10-01 00:00:00+00:00    44600062  541312140   
1 2019-10-01 00:00:00+00:00     3900821  554748717   
2 2019-10-01 00:00:01+00:00    17200506  519107250   
3 2019-10-01 00:00:01+00:00     1307067  550050854   
4 2019-10-01 00:00:04+00:00     1004237  535871217   

                           user_session  
0  72d76fde-8bb3-4e00-8c23-a032dfed738c  
1  9333dfbd-b87a-4708-9857-6336556b0fcc  
2  566511c2-e2e3-422b-b695-cf8e6e792ca8  
3  7c90fc70-0e80-4590-96f3-13c02c18c713  
4  c6bd7419-2748-4c56-95b4-8cec9ff8b80d  
event_time      datetime64[ns, UTC]
product_id                    int64
user_id                       int64
user_session                 object
dtype: object


In [2]:
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter

# ================================================================
#   1. CONFIGURACI√ìN GLOBAL
# ================================================================
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

MAX_LEN = 50
MIN_SESSION_LEN = 5
TOP_N_ITEMS = 2000  # ajustar seg√∫n tu dataset
PAD_IDX = 0

#df_small = df.sample(frac=0.5, random_state=42)


# ===================================================================
#   2. CREAR SESIONES + AUMENTACI√ìN POR DWELL 
# ===================================================================
print("\n\n================ NARM Con DWELL Sessions ================")

# ===================================================================
#   2. CREAR SESIONES + AUMENTACI√ìN POR DWELL 
# ===================================================================

DWELL_THRESHOLD = 75   

def build_sessions_with_dwell_repetitions(df):
    sessions = []

    df = df.sort_values(["user_session", "event_time"])

    for session_id, group in df.groupby("user_session"):
        group = group.sort_values("event_time")

        items = group["product_id"].tolist()
        times = group["event_time"].values

        # dwell entre eventos en segundos
        dwells = np.diff(times).astype("timedelta64[s]").astype(float)
        dwells = np.append(dwells, 1.0)

        seq = []
        for item, d in zip(items, dwells):

            # reps = floor(dwell/75) + 1
            reps = int(max(d, 1) // DWELL_THRESHOLD) + 1

            # REPETIR EL ITEM reps VECES
            seq.extend([item] * reps)

        if len(seq) >= 2:
            sessions.append(seq)

    return sessions


sessions = build_sessions_with_dwell_repetitions(df)
print("Sesiones creadas:", len(sessions))
print("Ejemplo sesi√≥n aumentada:", sessions[0][:25])

# ===================================================================
#   3. SPLIT TRAIN/TEST
# ===================================================================

split = int(0.8 * len(sessions))
train_sessions = sessions[:split]
test_sessions = sessions[split:]

print("Train:", len(train_sessions), " Test:", len(test_sessions))

# ===================================================================
#   4. FILTRAR TOP ITEMS (IGUAL A ANTES)
# ===================================================================

counter = Counter([item for sess in train_sessions for item in sess])
top_items = [item for item, _ in counter.most_common(TOP_N_ITEMS)]

item2idx = {item: i + 1 for i, item in enumerate(top_items)}
idx2item = {v: k for k, v in item2idx.items()}
N_ITEMS = len(item2idx) + 1

def encode_session(sess):
    return [item2idx[item] for item in sess if item in item2idx]

train_encoded = [encode_session(s) for s in train_sessions]
test_encoded = [encode_session(s) for s in test_sessions]

train_encoded = [s for s in train_encoded if len(s) >= MIN_SESSION_LEN]
test_encoded = [s for s in test_encoded if len(s) >= MIN_SESSION_LEN]

print("Train encoded:", len(train_encoded))
print("Test encoded:", len(test_encoded))
print("N_ITEMS:", N_ITEMS)


Device: cuda


Sesiones creadas: 5974844
Ejemplo sesi√≥n aumentada: [54900011, 54900011]
Train: 4779875  Test: 1194969
Train encoded: 1467627
Test encoded: 366642
N_ITEMS: 2001


In [3]:
# ===================================================================
#   DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]

        # limitar a MAX_LEN
        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]

        items = torch.tensor(seq[:-1], dtype=torch.long)
        targets = torch.tensor(seq[1:], dtype=torch.long)

        return items, targets


def collate_fn(batch):
    items_batch, targets_batch = zip(*batch)
    max_len = max(len(x) for x in items_batch)

    def pad(x, pad_value=0):
        return torch.cat([x, torch.full((max_len - len(x),), pad_value, dtype=x.dtype)])

    items = torch.stack([pad(x, PAD_IDX) for x in items_batch])
    targets = torch.stack([pad(x, PAD_IDX) for x in targets_batch])

    return items, targets


train_loader = DataLoader(
    GRUDataset(train_encoded),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    GRUDataset(test_encoded),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn
)

# ===================================================================
#   1. DEFINICI√ìN DEL MODELO SASREC
# ===================================================================

# ===================================================================
#   MODELO NARM (Neural Attentive Recommendation Machine)
#   Compatible 1:1 con tu pipeline actual
# ===================================================================

class NARM(nn.Module):
    def __init__(
        self,
        n_items,
        emb_size=128,
        hidden_size=128,
        dropout=0.2
    ):
        super().__init__()

        self.embedding = nn.Embedding(n_items, emb_size, padding_idx=PAD_IDX)

        # GRU para la evoluci√≥n secuencial
        self.gru = nn.GRU(
            input_size=emb_size,
            hidden_size=hidden_size,
            batch_first=True
        )

        # Atenci√≥n de intenci√≥n (global + local)
        self.linear_one = nn.Linear(hidden_size, hidden_size)
        self.linear_two = nn.Linear(hidden_size, hidden_size)

        # Capa para puntuar √≠tems
        self.fc = nn.Linear(hidden_size * 2, n_items)

        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size

    def forward(self, items):
        """
        items: (B, T)
        Retorna: logits (B, T, n_items)
        Igual que SASRec, para que tu training loop siga funcionando tal cual.
        """
        emb = self.dropout(self.embedding(items))       # (B, T, E)

        gru_out, h_last = self.gru(emb)                 # gru_out: (B,T,H)  h_last: (1,B,H)
        h_last = h_last.squeeze(0)                      # (B, H)

        # -------- ATTENTION ----------
        # score_i = q^T tanh(W1*h_i + W2*h_last)
        q1 = self.linear_one(gru_out)                   # (B,T,H)
        q2 = self.linear_two(h_last).unsqueeze(1)       # (B,1,H)

        attn_scores = torch.sum(torch.tanh(q1 + q2), dim=-1)  # (B,T)
        attn_weights = torch.softmax(attn_scores, dim=-1)      # (B,T)

        # Contexto atencional
        context = torch.bmm(attn_weights.unsqueeze(1), gru_out)  # (B,1,H)
        context = context.squeeze(1)                             # (B,H)

        # Representaci√≥n final = [contexto ; estado final]
        final_rep = torch.cat([context, h_last], dim=-1)          # (B, 2H)

        # Repetimos la predicci√≥n a lo largo de T para compatibilidad con CrossEntropy
        B, T = items.size()
        logits = self.fc(final_rep).unsqueeze(1).repeat(1, T, 1)

        return logits


# ===================================================================
#   2. ENTRENAMIENTO SASREC
# ===================================================================

def train_epoch_sasrec(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for items, targets in loader:
        items, targets = items.to(device), targets.to(device)

        optimizer.zero_grad()
        logits = model(items)

        B, T, C = logits.shape
        loss = criterion(logits.reshape(B*T, C), targets.reshape(B*T))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


# ===================================================================
#   3. EVALUACI√ìN SASREC
# ===================================================================

def evaluate_sasrec(model, loader):
    model.eval()

    recall_k = 10
    recall_sum = 0
    mrr_sum = 0
    ndcg_sum = 0
    total = 0

    for items, targets in loader:
        items, targets = items.to(device), targets.to(device)

        logits = model(items)[0, -1]
        topk = torch.topk(logits, recall_k).indices.tolist()

        target = targets[0, -1].item()
        if target == PAD_IDX:
            continue

        total += 1

        if target in topk:
            rank = topk.index(target) + 1
            recall_sum += 1
            mrr_sum += 1 / rank
            ndcg_sum += 1 / math.log2(rank + 1)

    return (
        recall_sum / total,
        mrr_sum / total,
        ndcg_sum / total
    )


# ===================================================================
#   4. SASRec CON DWELL (usa train_loader y test_loader)
# ===================================================================

print("\n\n================ NARM CON DWELL ================\n")

narm_dwell = NARM(N_ITEMS).to(device)
criterion_narm = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_narm = torch.optim.Adam(narm_dwell.parameters(), lr=1e-3)

import time

start = time.time()

EPOCHS = 10

for ep in range(1, EPOCHS + 1):
    loss = train_epoch_sasrec(narm_dwell, train_loader, optimizer_narm, criterion_narm)
    recall, mrr, ndcg = evaluate_sasrec(narm_dwell, test_loader)

    print(f"\nEpoch {ep}/{EPOCHS} (NARM CON DWELL)")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n‚è± Tiempo total: {end - start:.2f} segundos")





Epoch 1/10 (NARM CON DWELL)
Loss: 1.5623
Recall@10: 0.7351  MRR@10: 0.5326  NDCG@10: 0.5823

Epoch 2/10 (NARM CON DWELL)
Loss: 1.2773
Recall@10: 0.7401  MRR@10: 0.5388  NDCG@10: 0.5881

Epoch 3/10 (NARM CON DWELL)
Loss: 1.2366
Recall@10: 0.7440  MRR@10: 0.5395  NDCG@10: 0.5896

Epoch 4/10 (NARM CON DWELL)
Loss: 1.2178
Recall@10: 0.7465  MRR@10: 0.5409  NDCG@10: 0.5913

Epoch 5/10 (NARM CON DWELL)
Loss: 1.2065
Recall@10: 0.7482  MRR@10: 0.5405  NDCG@10: 0.5913

Epoch 6/10 (NARM CON DWELL)
Loss: 1.1991
Recall@10: 0.7497  MRR@10: 0.5406  NDCG@10: 0.5918

Epoch 7/10 (NARM CON DWELL)
Loss: 1.1944
Recall@10: 0.7504  MRR@10: 0.5416  NDCG@10: 0.5926

Epoch 8/10 (NARM CON DWELL)
Loss: 1.1901
Recall@10: 0.7518  MRR@10: 0.5418  NDCG@10: 0.5932

Epoch 9/10 (NARM CON DWELL)
Loss: 1.1886
Recall@10: 0.7516  MRR@10: 0.5420  NDCG@10: 0.5933

Epoch 10/10 (NARM CON DWELL)
Loss: 1.1870
Recall@10: 0.7517  MRR@10: 0.5415  NDCG@10: 0.5929

‚è± Tiempo total: 5355.16 segundos


In [4]:
import gc
import torch

print("\n========== LIMPIANDO MEMORIA ==========\n")

# üî• 1. Elimina variables GRANDES creadas en la parte con dwell
del sessions
del train_sessions
del test_sessions
del train_encoded
del test_encoded
del train_loader
del test_loader
del narm_dwell
del optimizer_narm
del criterion_narm

# borra tambi√©n diccionarios si quieres ahorrar a√∫n m√°s
# OJO: item2idx lo necesitas de nuevo? 
# Si quieres mantenerlo para usar el MISMO diccionario, NO lo borres.
# del item2idx
# del idx2item

# üî• 2. Forzar recolector de basura
gc.collect()

# üî• 3. Liberar memoria de GPU (si corresponde)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(">> Memoria limpiada.\n")



>> Memoria limpiada.



In [5]:
# ===================================================================
#   1. RECREAR SESIONES ORIGINALES (SIN REPETICI√ìN POR DWELL)
# ===================================================================


def build_sessions_no_dwell(df):
    sessions = []

    df = df.sort_values(["user_session", "event_time"])

    for session_id, group in df.groupby("user_session"):
        group = group.sort_values("event_time")

        items = group["product_id"].tolist()

        if len(items) >= 2:
            sessions.append(items)

    return sessions


sessions_no_dwell = build_sessions_no_dwell(df)

print("Total sesiones SIN dwell:", len(sessions_no_dwell))
print("Ejemplo sesi√≥n SIN dwell:", sessions_no_dwell[0][:20])


# ===================================================================
#   2. SPLIT TRAIN / TEST
# ===================================================================

split = int(0.8 * len(sessions_no_dwell))
train_sessions_no_dwell = sessions_no_dwell[:split]
test_sessions_no_dwell = sessions_no_dwell[split:]

print("Train:", len(train_sessions_no_dwell), "Test:", len(test_sessions_no_dwell))


# ===================================================================
#   3. FILTRAR TOP ITEMS (SE REUSA item2idx DEL MODELO CON DWELL)
#      IMPORTANTE: usar el MISMO diccionario para comparaci√≥n justa
# ===================================================================

def encode_session_no_dwell(sess):
    return [item2idx[item] for item in sess if item in item2idx]

train_encoded_no_dwell = [encode_session_no_dwell(s) for s in train_sessions_no_dwell]
test_encoded_no_dwell = [encode_session_no_dwell(s) for s in test_sessions_no_dwell]

train_encoded_no_dwell = [s for s in train_encoded_no_dwell if len(s) >= MIN_SESSION_LEN]
test_encoded_no_dwell  = [s for s in test_encoded_no_dwell if len(s) >= MIN_SESSION_LEN]

print("Train encoded (sin dwell):", len(train_encoded_no_dwell))
print("Test encoded  (sin dwell):", len(test_encoded_no_dwell))


# ===================================================================
#   4. DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDatasetNoDwell(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]

        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]

        items = torch.tensor(seq[:-1], dtype=torch.long)
        targets = torch.tensor(seq[1:], dtype=torch.long)

        return items, targets


def collate_fn_no_dwell(batch):
    items_batch, targets_batch = zip(*batch)
    max_len = max(len(x) for x in items_batch)

    def pad(x, pad_value=0):
        return torch.cat([x, torch.full((max_len - len(x),), pad_value, dtype=x.dtype)])

    items = torch.stack([pad(x, PAD_IDX) for x in items_batch])
    targets = torch.stack([pad(x, PAD_IDX) for x in targets_batch])

    return items, targets


train_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(train_encoded_no_dwell),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn_no_dwell
)

test_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(test_encoded_no_dwell),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn_no_dwell
)


# ===================================================================

print("\n\n================ NARM SIN DWELL ================\n")

narm_no_dwell = NARM(N_ITEMS).to(device)
criterion_narm_nd = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_narm_nd = torch.optim.Adam(narm_no_dwell.parameters(), lr=1e-4)

start = time.time()

for ep in range(1, EPOCHS + 1):
    loss = train_epoch_sasrec(narm_no_dwell, train_loader_no_dwell, optimizer_narm_nd, criterion_narm_nd)
    recall, mrr, ndcg = evaluate_sasrec(narm_no_dwell, test_loader_no_dwell)

    print(f"\nEpoch {ep}/{EPOCHS} (NARM sin dwell)")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n‚è± Tiempo total: {end - start:.2f} segundos")


Total sesiones SIN dwell: 5974844
Ejemplo sesi√≥n SIN dwell: [54900011, 54900011]
Train: 4779875 Test: 1194969
Train encoded (sin dwell): 980336
Test encoded  (sin dwell): 244745




Epoch 1/10 (NARM sin dwell)
Loss: 3.2747
Recall@10: 0.6705  MRR@10: 0.4450  NDCG@10: 0.4997

Epoch 2/10 (NARM sin dwell)
Loss: 2.2008
Recall@10: 0.6947  MRR@10: 0.4634  NDCG@10: 0.5196

Epoch 3/10 (NARM sin dwell)
Loss: 2.0426
Recall@10: 0.7025  MRR@10: 0.4717  NDCG@10: 0.5279

Epoch 4/10 (NARM sin dwell)
Loss: 1.9633
Recall@10: 0.7057  MRR@10: 0.4782  NDCG@10: 0.5336

Epoch 5/10 (NARM sin dwell)
Loss: 1.9125
Recall@10: 0.7078  MRR@10: 0.4803  NDCG@10: 0.5358

Epoch 6/10 (NARM sin dwell)
Loss: 1.8768
Recall@10: 0.7090  MRR@10: 0.4811  NDCG@10: 0.5366

Epoch 7/10 (NARM sin dwell)
Loss: 1.8500
Recall@10: 0.7098  MRR@10: 0.4831  NDCG@10: 0.5384

Epoch 8/10 (NARM sin dwell)
Loss: 1.8297
Recall@10: 0.7109  MRR@10: 0.4834  NDCG@10: 0.5388

Epoch 9/10 (NARM sin dwell)
Loss: 1.8136
Recall@10: 0.7113  MRR@10: 0.482