# BERT4Rec

Se usó este mismo notebook para el dataset de octubre y noviembre, solo se cambió la ubicación del archivo

## Sin dwell time

In [1]:
import pandas as pd
import numpy as np


df = pd.read_csv('../oct_reduced.csv')
df = df[df['event_type'] == 'view']
df = df[['event_time', 'user_id', 'product_id']].copy()
df['event_time'] = pd.to_datetime(df['event_time'], utc=True)


item_counts = df['product_id'].value_counts()
top_items = item_counts.head(50000).index  
df = df[df['product_id'].isin(top_items)]

print(f"Items únicos después de filtrar: {df['product_id'].nunique()}")

df = df.sort_values(['user_id', 'event_time'])

user_sequences = df.groupby('user_id')['product_id'].apply(list).reset_index()
user_sequences.columns = ['user_id', 'item_sequence']
user_sequences['seq_length'] = user_sequences['item_sequence'].apply(len)
user_sequences = user_sequences[
    (user_sequences['seq_length'] >= 5) & 
    (user_sequences['seq_length'] <= 50)
]

print(f"Usuarios: {len(user_sequences)}")
print(f"Secuencia promedio: {user_sequences['seq_length'].mean():.2f}")

Items únicos después de filtrar: 50000
Usuarios: 1282243
Secuencia promedio: 15.36
Usuarios: 1282243
Secuencia promedio: 15.36


In [2]:

all_items = df['product_id'].unique()
print(f"Items únicos: {len(all_items)}")

item2idx = {item: idx+2 for idx, item in enumerate(all_items)}
idx2item = {idx: item for item, idx in item2idx.items()}
item2idx['[PAD]'] = 0
item2idx['[MASK]'] = 1

n_items = len(all_items) + 2
print(f"Total items (incluyendo tokens): {n_items}")


user_sequences['item_sequence'] = user_sequences['item_sequence'].apply(
    lambda seq: [item2idx[item] for item in seq]
)


all_idx = [idx for seq in user_sequences['item_sequence'] for idx in seq]
print(f"Índice mínimo: {min(all_idx)}, máximo: {max(all_idx)}")
print(f"Válido: {max(all_idx) < n_items}")

Items únicos: 50000
Total items (incluyendo tokens): 50002
Índice mínimo: 2, máximo: 50001
Válido: True
Índice mínimo: 2, máximo: 50001
Válido: True


In [3]:
def split_sequences(sequences):
    train_data, val_data, test_data = [], [], []
    
    for seq in sequences:
        if len(seq) >= 5:
            train_seq = seq[:-2]
            val_item = seq[-2]
            test_item = seq[-1]
            
            if len(train_seq) >= 3:
                train_data.append(train_seq)
                val_data.append((train_seq + [val_item], val_item))
                test_data.append((train_seq + [val_item], test_item))
    
    return train_data, val_data, test_data

train_seqs, val_data, test_data = split_sequences(
    user_sequences['item_sequence'].tolist()
)

print(f"Train: {len(train_seqs)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 1282243, Val: 1282243, Test: 1282243


In [None]:
import torch
import torch.nn as nn
import gc


torch.cuda.empty_cache()
gc.collect()

class BERT4Rec(nn.Module):
    def __init__(self, n_items, hidden_size=64, num_heads=2, num_layers=1, 
                 max_len=50, dropout=0.1):
        super().__init__()
        
        self.n_items = n_items
        self.hidden_size = hidden_size
        self.max_len = max_len
        
        
        self.item_embedding = nn.Embedding(n_items, hidden_size, padding_idx=0)
        self.position_embedding = nn.Embedding(max_len, hidden_size)
        
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=hidden_size * 2,  
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        
        self.output_layer = nn.Linear(hidden_size, n_items)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, item_seq):
        batch_size, seq_len = item_seq.shape
        
        position_ids = torch.arange(seq_len, device=item_seq.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
        
        item_emb = self.item_embedding(item_seq)
        pos_emb = self.position_embedding(position_ids)
        
        sequence_emb = self.dropout(item_emb + pos_emb)
        attention_mask = (item_seq == 0)
        
        hidden = self.transformer(sequence_emb, src_key_padding_mask=attention_mask)
        output = self.output_layer(hidden)
        
        return output

device = torch.device('cuda')
model = BERT4Rec(
    n_items=n_items,
    hidden_size=64,
    num_heads=2,
    num_layers=1,
    max_len=50,
    dropout=0.1
).to(device)

print(f"Modelo en: {device}")
print(f"Parámetros: {sum(p.numel() for p in model.parameters()):,}")
print(f"Memoria GPU: {torch.cuda.memory_allocated(0) / (1024**3):.2f} GB")

Modelo en: cuda
Parámetros: 6,486,930
Memoria GPU: 0.02 GB


In [5]:
def mask_sequence(seq, mask_prob=0.15, mask_token=1, n_items=None):
    """
    Enmascara aleatoriamente items en la secuencia
    """
    masked_seq = seq.copy()
    labels = [-100] * len(seq)
    
    for i in range(len(seq)):
        if np.random.random() < mask_prob:
            labels[i] = seq[i]
            
            prob = np.random.random()
            if prob < 0.8:  
                masked_seq[i] = mask_token
            elif prob < 0.9:  
                masked_seq[i] = np.random.randint(2, n_items)
    
    return masked_seq, labels

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

class BERT4RecDataset(Dataset):
    def __init__(self, sequences, max_len=50, n_items=None):
        self.sequences = sequences
        self.max_len = max_len
        self.mask_token = 1
        self.n_items = n_items
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq = self.sequences[idx]
        
        if len(seq) > self.max_len:
            seq = seq[-self.max_len:]
        
        masked_seq, labels = mask_sequence(
            seq, 
            mask_prob=0.15, 
            mask_token=self.mask_token,
            n_items=self.n_items
        )
        
        pad_len = self.max_len - len(masked_seq)
        masked_seq = [0] * pad_len + masked_seq
        labels = [-100] * pad_len + labels
        
        return {
            'input_ids': torch.tensor(masked_seq, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

train_dataset = BERT4RecDataset(train_seqs, max_len=50, n_items=n_items)
train_loader = DataLoader(
    train_dataset, 
    batch_size=32,  
    shuffle=True,
    num_workers=0
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
scaler = GradScaler()

model.train()
for epoch in range(10):
    total_loss = 0
    
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        with autocast():
            outputs = model(input_ids)
            loss = criterion(outputs.view(-1, n_items), labels.view(-1))
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
        
        if (i + 1) % 50 == 0:
            torch.cuda.empty_cache()
            print(f"  Batch {i+1}/{len(train_loader)}, Loss: {loss.item():.4f}")
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/10, Avg Loss: {avg_loss:.4f}")
    torch.cuda.empty_cache()

  scaler = GradScaler()
  with autocast():


  Batch 50/40071, Loss: 10.0418
  Batch 100/40071, Loss: 9.9543
  Batch 100/40071, Loss: 9.9543
  Batch 150/40071, Loss: 9.8260
  Batch 150/40071, Loss: 9.8260
  Batch 200/40071, Loss: 9.2967
  Batch 200/40071, Loss: 9.2967
  Batch 250/40071, Loss: 9.9008
  Batch 250/40071, Loss: 9.9008
  Batch 300/40071, Loss: 9.4623
  Batch 300/40071, Loss: 9.4623
  Batch 350/40071, Loss: 8.7372
  Batch 350/40071, Loss: 8.7372
  Batch 400/40071, Loss: 8.9853
  Batch 400/40071, Loss: 8.9853
  Batch 450/40071, Loss: 9.1588
  Batch 450/40071, Loss: 9.1588
  Batch 500/40071, Loss: 10.0288
  Batch 500/40071, Loss: 10.0288
  Batch 550/40071, Loss: 9.6573
  Batch 550/40071, Loss: 9.6573
  Batch 600/40071, Loss: 8.3443
  Batch 600/40071, Loss: 8.3443
  Batch 650/40071, Loss: 9.6011
  Batch 650/40071, Loss: 9.6011
  Batch 700/40071, Loss: 8.6494
  Batch 700/40071, Loss: 8.6494
  Batch 750/40071, Loss: 8.9813
  Batch 750/40071, Loss: 8.9813
  Batch 800/40071, Loss: 9.4557
  Batch 800/40071, Loss: 9.4557
  Batc

In [7]:
@torch.no_grad()
def evaluate(model, test_data, k=10, max_len=50):
    model.eval()
    recalls, mrrs, ndcgs = [], [], []
    
    for i, (seq, target) in enumerate(test_data):
        if len(seq) > max_len:
            seq = seq[-max_len:]
        
        pad_len = max_len - len(seq)
        input_seq = [0] * pad_len + seq
        input_tensor = torch.tensor([input_seq], dtype=torch.long).to(device)
        
        output = model(input_tensor)
        scores = output[0, -1, :]
        
        _, top_k = torch.topk(scores, k)
        top_k = top_k.cpu().numpy()
        
        if target in top_k:
            recalls.append(1.0)
            rank = np.where(top_k == target)[0][0] + 1
            mrrs.append(1.0 / rank)
            ndcgs.append(1.0 / np.log2(rank + 1))
        else:
            recalls.append(0.0)
            mrrs.append(0.0)
            ndcgs.append(0.0)
        
        if (i + 1) % 1000 == 0:
            torch.cuda.empty_cache()
    
    return np.mean(recalls), np.mean(mrrs), np.mean(ndcgs)

recall, mrr, ndcg = evaluate(model, test_data, k=10, max_len=50)
print(f"Recall@10: {recall:.4f}")
print(f"MRR@10: {mrr:.4f}")
print(f"NDCG@10: {ndcg:.4f}")

Recall@10: 0.4681
MRR@10: 0.3289
NDCG@10: 0.3620


## Con Dwell Time

In [None]:
%reset -f
import gc
gc.collect()


import torch
torch.cuda.empty_cache()

print("Memoria limpiada.")



Memoria limpiada.


In [None]:
import pandas as pd
import numpy as np


df = pd.read_csv('../oct_reduced.csv')
df = df[df['event_type'] == 'view']
df = df[['event_time', 'user_id', 'product_id', 'user_session']].copy()
df['event_time'] = pd.to_datetime(df['event_time'], utc=True)


item_counts = df['product_id'].value_counts()
top_items = item_counts.head(50000).index  
df = df[df['product_id'].isin(top_items)]

print(f"Items únicos después de filtrar: {df['product_id'].nunique()}")


df = df.sort_values(['user_id', 'user_session', 'event_time'])


df['next_event_time'] = df.groupby(['user_id', 'user_session'])['event_time'].shift(-1)
df['dwell_time'] = (df['next_event_time'] - df['event_time']).dt.total_seconds()


df['dwell_time'] = df['dwell_time'].fillna(0)
df['dwell_time'] = df['dwell_time'].clip(0, 3600) 

print(f"Dwell time stats:")
print(df['dwell_time'].describe())


user_data = df.groupby('user_id').apply(
    lambda x: pd.Series({
        'item_sequence': x['product_id'].tolist(),
        'dwell_sequence': x['dwell_time'].tolist()
    })
).reset_index()

user_data['seq_length'] = user_data['item_sequence'].apply(len)
user_data = user_data[
    (user_data['seq_length'] >= 5) & 
    (user_data['seq_length'] <= 50)
]

print(f"Usuarios: {len(user_data)}")
print(f"Secuencia promedio: {user_data['seq_length'].mean():.2f}")

Items únicos después de filtrar: 50000
Dwell time stats:
Dwell time stats:
count    3.862741e+07
mean     6.816604e+01
std      2.452554e+02
min      0.000000e+00
25%      5.000000e+00
50%      2.500000e+01
75%      5.700000e+01
max      3.600000e+03
Name: dwell_time, dtype: float64
count    3.862741e+07
mean     6.816604e+01
std      2.452554e+02
min      0.000000e+00
25%      5.000000e+00
50%      2.500000e+01
75%      5.700000e+01
max      3.600000e+03
Name: dwell_time, dtype: float64


  user_data = df.groupby('user_id').apply(


Usuarios: 1282243
Secuencia promedio: 15.36


In [None]:
# Normalizar dwell time a diferentes escalas
def normalize_dwell_time(dwell_times, method='log'):
    """
    Normaliza dwell times
    method: 'log', 'minmax', 'bins'
    """
    dwell_array = np.array(dwell_times)
    
    if method == 'log':
        normalized = np.log1p(dwell_array)  
        normalized = normalized / (np.log1p(3600))  
    
    elif method == 'minmax':
        normalized = dwell_array / 3600.0  
    
    elif method == 'bins':
   
        bins = [0, 10, 30, 60, 180, 600, 3600]  
        normalized = np.digitize(dwell_array, bins) / len(bins)
    
    return normalized.tolist()

user_data['dwell_normalized'] = user_data['dwell_sequence'].apply(
    lambda x: normalize_dwell_time(x, method='log')
)

print("Dwell time normalizado (primeros 5 valores de un usuario):")
print(user_data['dwell_normalized'].iloc[0][:5])

Dwell time normalizado (primeros 5 valores de un usuario):
[0.0, 0.7759672430724603, 0.5535007734948387, 0.0, 0.38808971672060893]


In [None]:

all_items = df['product_id'].unique()
print(f"Items únicos: {len(all_items)}")

item2idx = {item: idx+2 for idx, item in enumerate(all_items)}
idx2item = {idx: item for item, idx in item2idx.items()}
item2idx['[PAD]'] = 0
item2idx['[MASK]'] = 1

n_items = len(all_items) + 2
print(f"Total items (incluyendo tokens): {n_items}")


user_data['item_sequence'] = user_data['item_sequence'].apply(
    lambda seq: [item2idx[item] for item in seq]
)


all_idx = [idx for seq in user_data['item_sequence'] for idx in seq]
print(f"Índices válidos: {max(all_idx) < n_items}")

Items únicos: 50000
Total items (incluyendo tokens): 50002
Índices válidos: True
Índices válidos: True


In [None]:
def split_sequences_with_dwell(user_data):
    """
    Split manteniendo item sequences y dwell time
    """
    train_data = []
    val_data = []
    test_data = []
    
    for _, row in user_data.iterrows():
        item_seq = row['item_sequence']
        dwell_seq = row['dwell_normalized']
        
        if len(item_seq) >= 5:
            train_items = item_seq[:-2]
            train_dwells = dwell_seq[:-2]
            
            val_item = item_seq[-2]
            test_item = item_seq[-1]
            
            if len(train_items) >= 3:
                train_data.append({
                    'items': train_items,
                    'dwells': train_dwells
                })
                
                val_data.append({
                    'items': item_seq[:-1],
                    'dwells': dwell_seq[:-1],
                    'target': val_item
                })
                
                test_data.append({
                    'items': item_seq[:-1],
                    'dwells': dwell_seq[:-1],
                    'target': test_item
                })
    
    return train_data, val_data, test_data

train_seqs, val_data, test_data = split_sequences_with_dwell(user_data)
print(f"Train: {len(train_seqs)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 1282243, Val: 1282243, Test: 1282243


In [None]:
import torch
import torch.nn as nn
import gc

torch.cuda.empty_cache()
gc.collect()

class BERT4RecWithDwellTime(nn.Module):
    def __init__(self, n_items, hidden_size=64, num_heads=2, num_layers=1, 
                 max_len=50, dropout=0.1, use_dwell_time=True):
        super().__init__()
        
        self.n_items = n_items
        self.hidden_size = hidden_size
        self.max_len = max_len
        self.use_dwell_time = use_dwell_time
        self.item_embedding = nn.Embedding(n_items, hidden_size, padding_idx=0)
        self.position_embedding = nn.Embedding(max_len, hidden_size)
        
     
        if use_dwell_time:

            self.dwell_projection = nn.Linear(1, hidden_size)
            self.dwell_gate = nn.Linear(hidden_size * 2, hidden_size)
        

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=hidden_size * 2,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.output_layer = nn.Linear(hidden_size, n_items)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, item_seq, dwell_seq=None):
        """
        Args:
            item_seq: (batch_size, seq_len)
            dwell_seq: (batch_size, seq_len) - dwell times normalizados
        """
        batch_size, seq_len = item_seq.shape
        
       
        position_ids = torch.arange(seq_len, device=item_seq.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
        
        
        item_emb = self.item_embedding(item_seq)
        pos_emb = self.position_embedding(position_ids)
        
      
        if self.use_dwell_time and dwell_seq is not None:
          
            dwell_emb = self.dwell_projection(dwell_seq.unsqueeze(-1))
            
           
            combined = torch.cat([item_emb, dwell_emb], dim=-1)
            item_emb = torch.sigmoid(self.dwell_gate(combined)) * item_emb + \
                       (1 - torch.sigmoid(self.dwell_gate(combined))) * dwell_emb
        
 
        sequence_emb = self.dropout(item_emb + pos_emb)
        
     
        attention_mask = (item_seq == 0)
        

        hidden = self.transformer(sequence_emb, src_key_padding_mask=attention_mask)
        
    
        output = self.output_layer(hidden)
        
        return output

device = torch.device('cuda')


model_dwell = BERT4RecWithDwellTime(
    n_items=n_items,
    hidden_size=64,
    num_heads=2,
    num_layers=1,
    max_len=50,
    dropout=0.1,
    use_dwell_time=True
).to(device)

print("\nModelo con dwell time:")
print(f"  Parámetros: {sum(p.numel() for p in model_dwell.parameters()):,}")

print(f"\nMemoria GPU: {torch.cuda.memory_allocated(0) / (1024**3):.2f} GB")


Modelo con dwell time:
  Parámetros: 6,495,314

Memoria GPU: 0.04 GB


In [None]:
from torch.utils.data import Dataset, DataLoader

def mask_sequence_dwell(seq, mask_prob=0.15, mask_token=1, n_items=None):
    """
    Enmascara aleatoriamente items en la secuencia
    """
    masked_seq = seq.copy()
    labels = [-100] * len(seq)
    
    for i in range(len(seq)):
        if np.random.random() < mask_prob:
            labels[i] = seq[i]
            
            prob = np.random.random()
            if prob < 0.8:
                masked_seq[i] = mask_token
            elif prob < 0.9:
                masked_seq[i] = np.random.randint(2, n_items)
    
    return masked_seq, labels

class BERT4RecDatasetWithDwell(Dataset):
    def __init__(self, sequences, max_len=50, n_items=None):
        self.sequences = sequences
        self.max_len = max_len
        self.mask_token = 1
        self.n_items = n_items
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq_data = self.sequences[idx]
        item_seq = seq_data['items']
        dwell_seq = seq_data['dwells']
        
       
        if len(item_seq) > self.max_len:
            item_seq = item_seq[-self.max_len:]
            dwell_seq = dwell_seq[-self.max_len:]
        
     
        masked_seq, labels = mask_sequence_dwell(
            item_seq, 
            mask_prob=0.15, 
            mask_token=self.mask_token,
            n_items=self.n_items
        )
        
   
        pad_len = self.max_len - len(masked_seq)
        masked_seq = [0] * pad_len + masked_seq
        dwell_seq = [0.0] * pad_len + dwell_seq
        labels = [-100] * pad_len + labels
        
        return {
            'input_ids': torch.tensor(masked_seq, dtype=torch.long),
            'dwell_times': torch.tensor(dwell_seq, dtype=torch.float),
            'labels': torch.tensor(labels, dtype=torch.long)
        }


train_dataset = BERT4RecDatasetWithDwell(train_seqs, max_len=50, n_items=n_items)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)

print(f"Dataset creado: {len(train_dataset)} secuencias")

Dataset creado: 1282243 secuencias


In [None]:
from torch.cuda.amp import autocast, GradScaler


optimizer = torch.optim.Adam(model_dwell.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
scaler = GradScaler()


torch.cuda.empty_cache()
gc.collect()

print("Iniciando entrenamiento de BERT4Rec con Dwell Time...\n")

model_dwell.train()
for epoch in range(10):
    total_loss = 0
    
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        dwell_times = batch['dwell_times'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
       
        with autocast():
            outputs = model_dwell(input_ids, dwell_times)
            loss = criterion(outputs.view(-1, n_items), labels.view(-1))
        
    
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
        
     
        if (i + 1) % 50 == 0:
            torch.cuda.empty_cache()
            print(f"  Batch {i+1}/{len(train_loader)}, Loss: {loss.item():.4f}")
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/10, Avg Loss: {avg_loss:.4f}")
    torch.cuda.empty_cache()

print("\n✓ Entrenamiento completado")

Iniciando entrenamiento de BERT4Rec con Dwell Time...



  scaler = GradScaler()
  with autocast():


  Batch 50/160281, Loss: 10.9180
  Batch 100/160281, Loss: 10.0361
  Batch 100/160281, Loss: 10.0361
  Batch 150/160281, Loss: 9.8583
  Batch 150/160281, Loss: 9.8583
  Batch 200/160281, Loss: 8.3138
  Batch 200/160281, Loss: 8.3138
  Batch 250/160281, Loss: 10.8777
  Batch 250/160281, Loss: 10.8777
  Batch 300/160281, Loss: 9.7793
  Batch 300/160281, Loss: 9.7793
  Batch 350/160281, Loss: 9.9327
  Batch 350/160281, Loss: 9.9327
  Batch 400/160281, Loss: 10.5286
  Batch 400/160281, Loss: 10.5286
  Batch 450/160281, Loss: 9.4228
  Batch 450/160281, Loss: 9.4228
  Batch 500/160281, Loss: 10.6888
  Batch 500/160281, Loss: 10.6888
  Batch 550/160281, Loss: 9.5106
  Batch 550/160281, Loss: 9.5106
  Batch 600/160281, Loss: 9.6367
  Batch 600/160281, Loss: 9.6367
  Batch 650/160281, Loss: 9.4446
  Batch 650/160281, Loss: 9.4446
  Batch 700/160281, Loss: 7.5515
  Batch 700/160281, Loss: 7.5515
  Batch 750/160281, Loss: 9.8971
  Batch 750/160281, Loss: 9.8971
  Batch 800/160281, Loss: 10.3281
 

In [None]:
@torch.no_grad()
def evaluate_with_dwell(model, test_data, k=10, max_len=50):
    """Evalúa modelo con dwell time"""
    model.eval()
    recalls = []
    mrrs = []
    ndcgs = []
    
    print(f"Evaluando {len(test_data)} muestras...")
    
    for i, sample in enumerate(test_data):
        item_seq = sample['items']
        dwell_seq = sample['dwells']
        target = sample['target']
        
 
        if len(item_seq) > max_len:
            item_seq = item_seq[-max_len:]
            dwell_seq = dwell_seq[-max_len:]
        
        pad_len = max_len - len(item_seq)
        input_seq = [0] * pad_len + item_seq
        dwell_input = [0.0] * pad_len + dwell_seq
        
        input_tensor = torch.tensor([input_seq], dtype=torch.long).to(device)
        dwell_tensor = torch.tensor([dwell_input], dtype=torch.float).to(device)
        
     
        output = model(input_tensor, dwell_tensor)
        scores = output[0, -1, :]
        
       
        _, top_k = torch.topk(scores, k)
        top_k = top_k.cpu().numpy()
  
        if target in top_k:
            recalls.append(1.0)
            rank = np.where(top_k == target)[0][0] + 1
            mrrs.append(1.0 / rank)
            ndcgs.append(1.0 / np.log2(rank + 1))
        else:
            recalls.append(0.0)
            mrrs.append(0.0)
            ndcgs.append(0.0)
        
        if (i + 1) % 1000 == 0:
            print(f"  Evaluadas {i+1}/{len(test_data)} muestras...")
            torch.cuda.empty_cache()
    
    return {
        'recall': np.mean(recalls),
        'mrr': np.mean(mrrs),
        'ndcg': np.mean(ndcgs)
    }


print("\n" + "="*50)
print("EVALUACIÓN EN TEST SET")
print("="*50)

results = evaluate_with_dwell(model_dwell, test_data, k=10, max_len=50)

print("\nBERT4Rec con Dwell Time:")
print(f"  Recall@10: {results['recall']:.4f}")
print(f"  MRR@10:    {results['mrr']:.4f}")
print(f"  NDCG@10:   {results['ndcg']:.4f}")


EVALUACIÓN EN TEST SET
Evaluando 1282243 muestras...
  Evaluadas 1000/1282243 muestras...
  Evaluadas 1000/1282243 muestras...
  Evaluadas 2000/1282243 muestras...
  Evaluadas 2000/1282243 muestras...
  Evaluadas 3000/1282243 muestras...
  Evaluadas 3000/1282243 muestras...
  Evaluadas 4000/1282243 muestras...
  Evaluadas 4000/1282243 muestras...
  Evaluadas 5000/1282243 muestras...
  Evaluadas 5000/1282243 muestras...
  Evaluadas 6000/1282243 muestras...
  Evaluadas 6000/1282243 muestras...
  Evaluadas 7000/1282243 muestras...
  Evaluadas 7000/1282243 muestras...
  Evaluadas 8000/1282243 muestras...
  Evaluadas 8000/1282243 muestras...
  Evaluadas 9000/1282243 muestras...
  Evaluadas 9000/1282243 muestras...
  Evaluadas 10000/1282243 muestras...
  Evaluadas 10000/1282243 muestras...
  Evaluadas 11000/1282243 muestras...
  Evaluadas 11000/1282243 muestras...
  Evaluadas 12000/1282243 muestras...
  Evaluadas 12000/1282243 muestras...
  Evaluadas 13000/1282243 muestras...
  Evaluadas 13

In [20]:
# Limpieza global en CPU
%reset -f
import gc
gc.collect()

# Limpieza GPU (PyTorch)
import torch
torch.cuda.empty_cache()

print("Memoria limpiada.")

Memoria limpiada.
