# BERT4Rec - Oct

In [1]:
import pandas as pd

df = pd.read_csv(
    "oct_reduced.csv",
    usecols=["event_time", "user_id", "product_id", "user_session"]
)

# Convertir el timestamp a datetime
df["event_time"] = pd.to_datetime(df["event_time"])

print(df.head())
print(df.dtypes)

                 event_time  product_id    user_id  \
0 2019-10-01 00:00:00+00:00    44600062  541312140   
1 2019-10-01 00:00:00+00:00     3900821  554748717   
2 2019-10-01 00:00:01+00:00    17200506  519107250   
3 2019-10-01 00:00:01+00:00     1307067  550050854   
4 2019-10-01 00:00:04+00:00     1004237  535871217   

                           user_session  
0  72d76fde-8bb3-4e00-8c23-a032dfed738c  
1  9333dfbd-b87a-4708-9857-6336556b0fcc  
2  566511c2-e2e3-422b-b695-cf8e6e792ca8  
3  7c90fc70-0e80-4590-96f3-13c02c18c713  
4  c6bd7419-2748-4c56-95b4-8cec9ff8b80d  
event_time      datetime64[ns, UTC]
product_id                    int64
user_id                       int64
user_session                 object
dtype: object


In [2]:
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter

# ================================================================
#   1. CONFIGURACIÓN GLOBAL
# ================================================================
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

MAX_LEN = 50
MIN_SESSION_LEN = 5
TOP_N_ITEMS = 2000  # ajustar según tu dataset
PAD_IDX = 0

#df_small = df.sample(frac=0.1, random_state=42)


# ===================================================================
#   2. CREAR SESIONES + AUMENTACIÓN POR DWELL 
# ===================================================================
print("\n\n================ GRU4REC CON DWELL ================")

# ===================================================================
#   2. CREAR SESIONES + AUMENTACIÓN POR DWELL 
# ===================================================================

DWELL_THRESHOLD = 75   

def build_sessions_with_dwell_repetitions(df):
    sessions = []

    df = df.sort_values(["user_session", "event_time"])

    for session_id, group in df.groupby("user_session"):
        group = group.sort_values("event_time")

        items = group["product_id"].tolist()
        times = group["event_time"].values

        # dwell entre eventos en segundos
        dwells = np.diff(times).astype("timedelta64[s]").astype(float)
        dwells = np.append(dwells, 1.0)

        seq = []
        for item, d in zip(items, dwells):

            # reps = floor(dwell/75) + 1
            reps = int(max(d, 1) // DWELL_THRESHOLD) + 1

            # REPETIR EL ITEM reps VECES
            seq.extend([item] * reps)

        if len(seq) >= 2:
            sessions.append(seq)

    return sessions


sessions = build_sessions_with_dwell_repetitions(df)
print("Sesiones creadas:", len(sessions))
print("Ejemplo sesión aumentada:", sessions[0][:25])

# ===================================================================
#   3. SPLIT TRAIN/TEST
# ===================================================================

split = int(0.8 * len(sessions))
train_sessions = sessions[:split]
test_sessions = sessions[split:]

print("Train:", len(train_sessions), " Test:", len(test_sessions))

# ===================================================================
#   4. FILTRAR TOP ITEMS (IGUAL A ANTES)
# ===================================================================

counter = Counter([item for sess in train_sessions for item in sess])
top_items = [item for item, _ in counter.most_common(TOP_N_ITEMS)]

item2idx = {item: i + 1 for i, item in enumerate(top_items)}
idx2item = {v: k for k, v in item2idx.items()}
N_ITEMS = len(item2idx) + 1

def encode_session(sess):
    return [item2idx[item] for item in sess if item in item2idx]

train_encoded = [encode_session(s) for s in train_sessions]
test_encoded = [encode_session(s) for s in test_sessions]

train_encoded = [s for s in train_encoded if len(s) >= MIN_SESSION_LEN]
test_encoded = [s for s in test_encoded if len(s) >= MIN_SESSION_LEN]

print("Train encoded:", len(train_encoded))
print("Test encoded:", len(test_encoded))
print("N_ITEMS:", N_ITEMS)


Device: cuda


Sesiones creadas: 5974844
Ejemplo sesión aumentada: [54900011, 54900011]
Train: 4779875  Test: 1194969
Train encoded: 1467627
Test encoded: 366642
N_ITEMS: 2001


In [None]:
# ===================================================================
#   DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]

        # limitar a MAX_LEN
        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]

        items = torch.tensor(seq[:-1], dtype=torch.long)
        targets = torch.tensor(seq[1:], dtype=torch.long)

        return items, targets


def collate_fn(batch):
    items_batch, targets_batch = zip(*batch)
    max_len = max(len(x) for x in items_batch)

    def pad(x, pad_value=0):
        return torch.cat([x, torch.full((max_len - len(x),), pad_value, dtype=x.dtype)])

    items = torch.stack([pad(x, PAD_IDX) for x in items_batch])
    targets = torch.stack([pad(x, PAD_IDX) for x in targets_batch])

    return items, targets


train_loader = DataLoader(
    GRUDataset(train_encoded),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    GRUDataset(test_encoded),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn
)

# ===================================================================
#   1. DEFINICIÓN DEL MODELO BERT
# ===================================================================

class BERT4Rec(nn.Module):
    def __init__(
        self,
        n_items,
        emb_size=128,
        hidden_size=256,
        n_heads=4,
        n_layers=2,
        max_len=MAX_LEN,
        dropout=0.2,
        mask_prob=0.15
    ):
        super().__init__()

        self.mask_prob = mask_prob
        self.max_len = max_len

        # 1. Embeddings
        self.item_emb = nn.Embedding(n_items, emb_size, padding_idx=PAD_IDX)
        self.pos_emb = nn.Embedding(max_len, emb_size)

        # 2. Transformer bidireccional (BERT)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_size,
            nhead=n_heads,
            dim_feedforward=hidden_size,
            dropout=dropout,
            batch_first=True,
            activation="gelu"
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=n_layers
        )

        # 3. Capa final de predicción
        self.fc = nn.Linear(emb_size, n_items)

        self.dropout = nn.Dropout(dropout)

    # ----------------------------------------------------------
    #   Genera máscara MLM para entrenamiento
    # ----------------------------------------------------------
    def random_mask(self, items):
        prob = torch.rand(items.shape, device=items.device)
        mask = (prob < self.mask_prob) & (items != PAD_IDX)

        masked = items.clone()
        masked[mask] = 1   # token especial [MASK] → en tu diccionario puedes usar 1

        return masked, mask


    def forward(self, items):
        B, T = items.size()

        # Aplicar máscara solo en entrenamiento
        if self.training:
            masked_items, _ = self.random_mask(items)
        else:
            masked_items = items

        positions = torch.arange(T, device=items.device).unsqueeze(0)

        x = self.item_emb(masked_items) + self.pos_emb(positions)
        x = self.dropout(x)

        # Máscara de padding
        pad_mask = items.eq(PAD_IDX)

        # Self-attention bidireccional (sin máscara causal)
        x = self.transformer(x, src_key_padding_mask=pad_mask)

        logits = self.fc(x)
        return logits



# ===================================================================
#   2. ENTRENAMIENTO BERT4REC
# ===================================================================

def train_epoch_sasrec(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for items, targets in loader:
        items, targets = items.to(device), targets.to(device)

        optimizer.zero_grad()
        logits = model(items)

        B, T, C = logits.shape
        loss = criterion(logits.reshape(B*T, C), targets.reshape(B*T))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


# ===================================================================
#   3. EVALUACIÓN BERT4REC
# ===================================================================

def evaluate_sasrec(model, loader):
    model.eval()

    recall_k = 10
    recall_sum = 0
    mrr_sum = 0
    ndcg_sum = 0
    total = 0

    for items, targets in loader:
        items, targets = items.to(device), targets.to(device)

        logits = model(items)[0, -1]
        topk = torch.topk(logits, recall_k).indices.tolist()

        target = targets[0, -1].item()
        if target == PAD_IDX:
            continue

        total += 1

        if target in topk:
            rank = topk.index(target) + 1
            recall_sum += 1
            mrr_sum += 1 / rank
            ndcg_sum += 1 / math.log2(rank + 1)

    return (
        recall_sum / total,
        mrr_sum / total,
        ndcg_sum / total
    )


# ===================================================================
#   4. BERT4REC CON DWELL (usa train_loader y test_loader)
# ===================================================================

print("\n\n================ BERT4REC CON DWELL ================\n")

bert4rec = BERT4Rec(N_ITEMS).to(device)
criterion_bert = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_bert = torch.optim.Adam(bert4rec.parameters(), lr=1e-4)


EPOCHS = 10

import time

start = time.time()

for ep in range(1, EPOCHS + 1):
    loss = train_epoch_sasrec(bert4rec, train_loader, optimizer_bert, criterion_bert)
    recall, mrr, ndcg = evaluate_sasrec(bert4rec, test_loader)

    print(f"\nEpoch {ep}/{EPOCHS} (BERT4Rec con dwell)")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n⏱ Tiempo total: {end - start:.2f} segundos")





Epoch 1/10 (BERT4Rec con dwell)
Loss: 1.1698
Recall@10: 0.7537  MRR@10: 0.6355  NDCG@10: 0.6639

Epoch 2/10 (BERT4Rec con dwell)
Loss: 0.5534
Recall@10: 0.7743  MRR@10: 0.6516  NDCG@10: 0.6809

Epoch 3/10 (BERT4Rec con dwell)
Loss: 0.5003
Recall@10: 0.7822  MRR@10: 0.6560  NDCG@10: 0.6861

Epoch 4/10 (BERT4Rec con dwell)
Loss: 0.4757
Recall@10: 0.7856  MRR@10: 0.6581  NDCG@10: 0.6885

Epoch 5/10 (BERT4Rec con dwell)
Loss: 0.4614
Recall@10: 0.7886  MRR@10: 0.6596  NDCG@10: 0.6903

Epoch 6/10 (BERT4Rec con dwell)
Loss: 0.4515
Recall@10: 0.7897  MRR@10: 0.6603  NDCG@10: 0.6911

Epoch 7/10 (BERT4Rec con dwell)
Loss: 0.4443
Recall@10: 0.7915  MRR@10: 0.6611  NDCG@10: 0.6922

Epoch 8/10 (BERT4Rec con dwell)
Loss: 0.4394
Recall@10: 0.7928  MRR@10: 0.6616  NDCG@10: 0.6928

Epoch 9/10 (BERT4Rec con dwell)
Loss: 0.4349
Recall@10: 0.7930  MRR@10: 0.6616  NDCG@10: 0.6929

Epoch 10/10 (BERT4Rec con dwell)
Loss: 0.4315
Recall@10: 0.7936  MRR@10: 0.6620  NDCG@10: 0.6933

⏱ Tiempo total: 10550.20 

In [None]:
import gc
import torch

print("\n========== LIMPIANDO MEMORIA ==========\n")

del sessions
del train_sessions
del test_sessions
del train_encoded
del test_encoded
del train_loader
del test_loader
del bert4rec    
del optimizer_bert
del criterion_bert

gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(">> Memoria limpiada.\n")



>> Memoria limpiada.



In [5]:
# ===================================================================
#   1. RECREAR SESIONES ORIGINALES (SIN REPETICIÓN POR DWELL)
# ===================================================================


def build_sessions_no_dwell(df):
    sessions = []

    df = df.sort_values(["user_session", "event_time"])

    for session_id, group in df.groupby("user_session"):
        group = group.sort_values("event_time")

        items = group["product_id"].tolist()

        if len(items) >= 2:
            sessions.append(items)

    return sessions


sessions_no_dwell = build_sessions_no_dwell(df)

print("Total sesiones SIN dwell:", len(sessions_no_dwell))
print("Ejemplo sesión SIN dwell:", sessions_no_dwell[0][:20])


# ===================================================================
#   2. SPLIT TRAIN / TEST
# ===================================================================

split = int(0.8 * len(sessions_no_dwell))
train_sessions_no_dwell = sessions_no_dwell[:split]
test_sessions_no_dwell = sessions_no_dwell[split:]

print("Train:", len(train_sessions_no_dwell), "Test:", len(test_sessions_no_dwell))


# ===================================================================
#   3. FILTRAR TOP ITEMS (SE REUSA item2idx DEL MODELO CON DWELL)
#      IMPORTANTE: usar el MISMO diccionario para comparación justa
# ===================================================================

def encode_session_no_dwell(sess):
    return [item2idx[item] for item in sess if item in item2idx]

train_encoded_no_dwell = [encode_session_no_dwell(s) for s in train_sessions_no_dwell]
test_encoded_no_dwell = [encode_session_no_dwell(s) for s in test_sessions_no_dwell]

train_encoded_no_dwell = [s for s in train_encoded_no_dwell if len(s) >= MIN_SESSION_LEN]
test_encoded_no_dwell  = [s for s in test_encoded_no_dwell if len(s) >= MIN_SESSION_LEN]

print("Train encoded (sin dwell):", len(train_encoded_no_dwell))
print("Test encoded  (sin dwell):", len(test_encoded_no_dwell))


# ===================================================================
#   4. DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class GRUDatasetNoDwell(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]

        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]

        items = torch.tensor(seq[:-1], dtype=torch.long)
        targets = torch.tensor(seq[1:], dtype=torch.long)

        return items, targets


def collate_fn_no_dwell(batch):
    items_batch, targets_batch = zip(*batch)
    max_len = max(len(x) for x in items_batch)

    def pad(x, pad_value=0):
        return torch.cat([x, torch.full((max_len - len(x),), pad_value, dtype=x.dtype)])

    items = torch.stack([pad(x, PAD_IDX) for x in items_batch])
    targets = torch.stack([pad(x, PAD_IDX) for x in targets_batch])

    return items, targets


train_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(train_encoded_no_dwell),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn_no_dwell
)

test_loader_no_dwell = DataLoader(
    GRUDatasetNoDwell(test_encoded_no_dwell),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn_no_dwell
)


# ===================================================================

print("\n\n================ BERT4REC SIN DWELL ================\n")

bert_no_dwell = BERT4Rec(N_ITEMS).to(device)
criterion_bert_nd = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_bert_nd = torch.optim.Adam(bert_no_dwell.parameters(), lr=1e-4)

start = time.time()

for ep in range(1, EPOCHS + 1):
    loss = train_epoch_sasrec(bert_no_dwell, train_loader_no_dwell, optimizer_bert_nd, criterion_bert_nd)
    recall, mrr, ndcg = evaluate_sasrec(bert_no_dwell, test_loader_no_dwell)
    
    print(f"\nEpoch {ep}/{EPOCHS} (BERT4Rec sin dwell)")
    print(f"Loss: {loss:.4f}")
    print(f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}")

end = time.time()

print(f"\n⏱ Tiempo total: {end - start:.2f} segundos")


Total sesiones SIN dwell: 5974844
Ejemplo sesión SIN dwell: [54900011, 54900011]
Train: 4779875 Test: 1194969
Train encoded (sin dwell): 980336
Test encoded  (sin dwell): 244745




Epoch 1/10 (BERT4Rec sin dwell)
Loss: 1.7716
Recall@10: 0.7121  MRR@10: 0.5662  NDCG@10: 0.6013

Epoch 2/10 (BERT4Rec sin dwell)
Loss: 0.9420
Recall@10: 0.7363  MRR@10: 0.5856  NDCG@10: 0.6217

Epoch 3/10 (BERT4Rec sin dwell)
Loss: 0.8601
Recall@10: 0.7459  MRR@10: 0.5921  NDCG@10: 0.6288

Epoch 4/10 (BERT4Rec sin dwell)
Loss: 0.8230
Recall@10: 0.7508  MRR@10: 0.5954  NDCG@10: 0.6325

Epoch 5/10 (BERT4Rec sin dwell)
Loss: 0.8016
Recall@10: 0.7541  MRR@10: 0.5974  NDCG@10: 0.6348

Epoch 6/10 (BERT4Rec sin dwell)
Loss: 0.7869
Recall@10: 0.7566  MRR@10: 0.5986  NDCG@10: 0.6362

Epoch 7/10 (BERT4Rec sin dwell)
Loss: 0.7756
Recall@10: 0.7580  MRR@10: 0.5990  NDCG@10: 0.6369

Epoch 8/10 (BERT4Rec sin dwell)
Loss: 0.7666
Recall@10: 0.7592  MRR@10: 0.5999  NDCG@10: 0.6378

Epoch 9/10 (BERT4Rec sin dwell)
Loss: 0.75