# NextIt - Oct

In [1]:
import pandas as pd

# Load only the columns needed to rebuild per-session item sequences.
df = pd.read_csv(
    "oct_reduced.csv",
    usecols=["event_time", "user_id", "product_id", "user_session"],
)

# Parse the raw timestamp column into pandas datetimes.
df = df.assign(event_time=pd.to_datetime(df["event_time"]))

# Sanity check: first rows, then the resulting column dtypes.
for report in (df.head(), df.dtypes):
    print(report)

                 event_time  product_id    user_id  \
0 2019-10-01 00:00:00+00:00    44600062  541312140   
1 2019-10-01 00:00:00+00:00     3900821  554748717   
2 2019-10-01 00:00:01+00:00    17200506  519107250   
3 2019-10-01 00:00:01+00:00     1307067  550050854   
4 2019-10-01 00:00:04+00:00     1004237  535871217   

                           user_session  
0  72d76fde-8bb3-4e00-8c23-a032dfed738c  
1  9333dfbd-b87a-4708-9857-6336556b0fcc  
2  566511c2-e2e3-422b-b695-cf8e6e792ca8  
3  7c90fc70-0e80-4590-96f3-13c02c18c713  
4  c6bd7419-2748-4c56-95b4-8cec9ff8b80d  
event_time      datetime64[ns, UTC]
product_id                    int64
user_id                       int64
user_session                 object
dtype: object


In [2]:
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter

# ================================================================
#   1. GLOBAL CONFIGURATION
# ================================================================
# Seed every RNG used in the notebook so runs are repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

MAX_LEN = 50           # longest sequence window handed to the model
MIN_SESSION_LEN = 5    # encoded sessions shorter than this are dropped
TOP_N_ITEMS = 2000     # vocabulary size (most frequent items); tune per dataset
PAD_IDX = 0            # index reserved for padding

# ===================================================================
#   2. BUILD SESSIONS + DWELL-BASED AUGMENTATION
# ===================================================================
# The banner names the model actually trained in this notebook (it used to
# say GRU4REC, which is not used anywhere here).
print("\n\n================ NextItNet CON DWELL ================")

# Seconds of dwell time that earn one extra repetition of an item.
DWELL_THRESHOLD = 75

def build_sessions_with_dwell_repetitions(df, dwell_threshold=None):
    """Build per-session item sequences, repeating items by dwell time.

    Events are ordered by ``user_session`` and ``event_time``. The dwell of an
    event is the number of whole seconds until the next event of the same
    session; the last event of a session is given a dwell of 1 second. Each
    item is emitted ``floor(max(dwell, 1) / threshold) + 1`` times.

    Args:
        df: DataFrame with ``user_session``, ``event_time`` and ``product_id``
            columns. It is not modified.
        dwell_threshold: Seconds of dwell that earn one extra repetition.
            ``None`` (default) uses the notebook-level ``DWELL_THRESHOLD``.

    Returns:
        A list of sessions (lists of product ids), in ascending
        ``user_session`` order. Sessions whose augmented sequence has fewer
        than 2 entries are dropped.

    Raises:
        ValueError: If the effective threshold is not positive.
    """
    threshold = DWELL_THRESHOLD if dwell_threshold is None else dwell_threshold
    if threshold <= 0:
        raise ValueError("dwell_threshold must be positive")

    # Sorting once by (session, time) is enough: groupby keeps the row order
    # inside each group, so the groups need no second sort.
    ordered = df.sort_values(["user_session", "event_time"])

    sessions = []
    for _, group in ordered.groupby("user_session"):
        items = group["product_id"].to_numpy()
        times = group["event_time"].values

        # Seconds between consecutive events; the final event has no successor
        # and gets a nominal dwell of 1 second (a single repetition).
        dwells = np.diff(times).astype("timedelta64[s]").astype(float)
        dwells = np.append(dwells, 1.0)

        # reps = floor(dwell / threshold) + 1, with dwell clamped to >= 1.
        reps = (np.maximum(dwells, 1.0) // threshold).astype(np.int64) + 1
        seq = np.repeat(items, reps).tolist()

        if len(seq) >= 2:
            sessions.append(seq)

    return sessions


# Dwell-augmented sessions for the whole dataset.
sessions = build_sessions_with_dwell_repetitions(df)
print("Sesiones creadas:", len(sessions))
print("Ejemplo sesi√≥n aumentada:", sessions[0][:25])

# ===================================================================
#   3. TRAIN/TEST SPLIT
# ===================================================================
# First 80% of the session list is used for training, the rest for testing.
split = int(0.8 * len(sessions))
train_sessions, test_sessions = sessions[:split], sessions[split:]

print("Train:", len(train_sessions), " Test:", len(test_sessions))

# ===================================================================
#   4. KEEP ONLY THE MOST FREQUENT ITEMS
# ===================================================================
# Item frequencies are counted on the training sessions only.
counter = Counter()
for sess in train_sessions:
    counter.update(sess)
top_items = [item for item, _ in counter.most_common(TOP_N_ITEMS)]

# Indices start at 1; 0 is left free for padding, hence the +1 in N_ITEMS.
item2idx = {item: position for position, item in enumerate(top_items, start=1)}
idx2item = {position: item for item, position in item2idx.items()}
N_ITEMS = len(item2idx) + 1

def encode_session(sess):
    """Translate raw product ids into vocabulary indices.

    Items that are not keys of the notebook-level ``item2idx`` mapping are
    dropped; the relative order of the remaining items is kept.
    """
    encoded = []
    for product in sess:
        if product in item2idx:
            encoded.append(item2idx[product])
    return encoded

# Encode every session and keep those that still have at least
# MIN_SESSION_LEN items once out-of-vocabulary products are removed.
train_encoded = [
    enc for enc in map(encode_session, train_sessions)
    if len(enc) >= MIN_SESSION_LEN
]
test_encoded = [
    enc for enc in map(encode_session, test_sessions)
    if len(enc) >= MIN_SESSION_LEN
]

print("Train encoded:", len(train_encoded))
print("Test encoded:", len(test_encoded))
print("N_ITEMS:", N_ITEMS)


Device: cuda


Sesiones creadas: 5974844
Ejemplo sesi√≥n aumentada: [54900011, 54900011]
Train: 4779875  Test: 1194969
Train encoded: 1467627
Test encoded: 366642
N_ITEMS: 2001


In [3]:
# ===================================================================
#   DATASET + DATALOADER (CON DWELL)
# ===================================================================

class GRUDataset(Dataset):
    """Next-item dataset over encoded sessions.

    Each sample is an ``(items, targets)`` pair of ``torch.long`` tensors where
    ``targets`` is ``items`` shifted one step to the left. Only the last
    ``MAX_LEN`` entries of a session are used.
    """

    def __init__(self, sequences):
        # Indexable collection of encoded sessions (lists of item indices).
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Slicing from the end keeps the most recent MAX_LEN interactions and
        # leaves shorter sessions untouched.
        window = self.sequences[idx][-MAX_LEN:]

        inputs = torch.tensor(window[:-1], dtype=torch.long)
        labels = torch.tensor(window[1:], dtype=torch.long)
        return inputs, labels


def collate_fn(batch):
    """Right-pad a batch of ``(items, targets)`` pairs to a common length.

    The common length is the longest ``items`` tensor in the batch. Both
    outputs are padded with ``PAD_IDX`` and stacked into 2-D tensors.
    """
    items_batch, targets_batch = zip(*batch)
    width = max(len(seq) for seq in items_batch)

    def pad_and_stack(seqs):
        # Append as many PAD_IDX entries as needed to reach `width`.
        rows = []
        for seq in seqs:
            filler = torch.full((width - len(seq),), PAD_IDX, dtype=seq.dtype)
            rows.append(torch.cat([seq, filler]))
        return torch.stack(rows)

    return pad_and_stack(items_batch), pad_and_stack(targets_batch)


# Training batches are shuffled; padding is applied per batch by collate_fn.
train_loader = DataLoader(
    dataset=GRUDataset(train_encoded),
    collate_fn=collate_fn,
    batch_size=32,
    shuffle=True,
)

# Evaluation goes through the test sessions one at a time, in order.
test_loader = DataLoader(
    dataset=GRUDataset(test_encoded),
    collate_fn=collate_fn,
    batch_size=1,
    shuffle=False,
)

# ===================================================================
#   1. DEFINICIÓN DEL MODELO NextItNet
# ===================================================================

class CausalConv1d(nn.Module):
    """1-D convolution whose output at step t only sees inputs up to step t.

    The wrapped ``nn.Conv1d`` pads both ends with ``(kernel_size - 1) *
    dilation`` zeros; the extra steps produced at the right end are cropped so
    the output has the same length as the input.
    """

    def __init__(self, in_ch, out_ch, kernel_size, dilation):
        super().__init__()
        receptive_pad = dilation * (kernel_size - 1)
        self.conv = nn.Conv1d(
            in_ch,
            out_ch,
            kernel_size,
            padding=receptive_pad,
            dilation=dilation,
        )
        # Number of trailing time steps to discard in forward().
        self.crop = receptive_pad

    def forward(self, x):
        # x: (B, C, T)
        full = self.conv(x)
        # Keep everything except the last `crop` steps (all of it if crop is 0).
        keep = full.size(2) - self.crop
        return full[:, :, :keep]

class ResidualBlock(nn.Module):
    """Two causal convolutions with batch norm and a skip connection.

    Input and output both have shape (B, C, T); the number of channels is
    unchanged so the input can be added back before the final ReLU.
    """

    def __init__(self, channels, kernel_size, dilation):
        super().__init__()
        self.conv1 = CausalConv1d(channels, channels, kernel_size, dilation)
        self.bn1 = nn.BatchNorm1d(channels)
        self.act = nn.ReLU()
        self.conv2 = CausalConv1d(channels, channels, kernel_size, dilation)
        self.bn2 = nn.BatchNorm1d(channels)

    def forward(self, x):
        # x: (B, C, T)
        hidden = self.act(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        # Residual connection, then the non-linearity.
        return self.act(hidden + x)

class NextItNet(nn.Module):
    """Next-item model built from a stack of dilated causal residual blocks.

    Args:
        n_items: Size of the item vocabulary, including the padding index.
        emb_size: Dimension of the item embeddings.
        channels: Number of channels used by the convolutional blocks.
        kernel_size: Kernel size of every causal convolution.
        dilations: One dilation per residual block, applied in order.
        max_len: Stored on the instance as ``max_len``; not used by
            ``forward``.
    """

    # The default for `dilations` is a tuple so it cannot be mutated and
    # shared between instances.
    def __init__(self, n_items, emb_size=128, channels=128, kernel_size=3, dilations=(1, 2, 4, 8), max_len=MAX_LEN):
        super().__init__()
        self.item_emb = nn.Embedding(n_items, emb_size, padding_idx=PAD_IDX)
        # Project the embedding to `channels` so it matches the conv blocks.
        self.proj = nn.Linear(emb_size, channels)
        self.blocks = nn.ModuleList([ResidualBlock(channels, kernel_size, d) for d in dilations])
        self.out = nn.Linear(channels, n_items)
        self.max_len = max_len

    def forward(self, items):
        """Return logits of shape (B, T, n_items) for items of shape (B, T)."""
        h = self.proj(self.item_emb(items))   # (B, T, channels)
        h = h.permute(0, 2, 1)                # (B, channels, T): Conv1d wants channels second
        for block in self.blocks:
            h = block(h)
        h = h.permute(0, 2, 1)                # (B, T, channels)
        return self.out(h)                    # (B, T, n_items)



# ===================================================================
#   2. ENTRENAMIENTO NextItNet
# ===================================================================

def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Every batch is moved to the notebook-level ``device``. The model's
    (B, T, C) logits and the (B, T) targets are flattened over batch and time
    before being handed to ``criterion``.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    running = 0

    for batch_items, batch_targets in loader:
        batch_items = batch_items.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        scores = model(batch_items)

        # One row per (sequence, time step) pair.
        n_classes = scores.shape[2]
        batch_loss = criterion(scores.reshape(-1, n_classes), batch_targets.reshape(-1))
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item()

    return running / len(loader)


# ===================================================================
#   3. EVALUACIÓN NextItNet
# ===================================================================

def evaluate_sasrec(model, loader, k=10):
    """Compute Recall@k, MRR@k and NDCG@k on the last step of each sequence.

    For every sequence the prediction made at its last non-padding position is
    compared with the target at that position. Batches of any size are
    supported; with ``batch_size=1`` (no padding) the last non-padding
    position is simply the last position.

    Assumes sequences are right-padded and that ``PAD_IDX`` never appears as a
    real target.

    Args:
        model: Module mapping (B, T) item indices to (B, T, n_items) logits.
        loader: Iterable of ``(items, targets)`` batches.
        k: Cut-off of the ranking metrics. It is capped at the number of
            classes produced by the model.

    Returns:
        ``(recall, mrr, ndcg)`` averaged over the evaluated sequences, or
        ``(0.0, 0.0, 0.0)`` when no sequence could be evaluated.
    """
    model.eval()

    hits = 0
    mrr_sum = 0.0
    ndcg_sum = 0.0
    total = 0

    # No gradients are needed for evaluation; this avoids building the
    # autograd graph for every test sequence.
    with torch.no_grad():
        for items, targets in loader:
            items, targets = items.to(device), targets.to(device)

            logits = model(items)                      # (B, T, n_items)

            # Index of the last real (non-padding) target of every row.
            lengths = (targets != PAD_IDX).sum(dim=1)  # (B,)
            rows = torch.nonzero(lengths > 0).flatten()
            if rows.numel() == 0:
                continue
            last = lengths[rows] - 1

            last_logits = logits[rows, last]           # (R, n_items)
            last_targets = targets[rows, last].tolist()

            top = min(k, last_logits.size(-1))
            ranked = torch.topk(last_logits, top, dim=-1).indices.tolist()

            for target, candidates in zip(last_targets, ranked):
                total += 1
                if target in candidates:
                    rank = candidates.index(target) + 1
                    hits += 1
                    mrr_sum += 1 / rank
                    ndcg_sum += 1 / math.log2(rank + 1)

    if total == 0:
        return 0.0, 0.0, 0.0

    return (
        hits / total,
        mrr_sum / total,
        ndcg_sum / total
    )


# ===================================================================
#   4. NextItNet WITH DWELL (uses train_loader and test_loader)
# ===================================================================
import time

print("\n\n================ NextItNet CON DWELL ================\n")

# Model, optimiser and loss for the dwell-augmented run. Padding positions
# are excluded from the loss through ignore_index.
nextit_dwell = NextItNet(N_ITEMS, emb_size=128, channels=128, dilations=[1,2,4,8]).to(device)
opt_dwell = torch.optim.Adam(nextit_dwell.parameters(), lr=1e-4)
crit_dwell = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

EPOCHS = 10

start = time.time()

# Train for one epoch, then evaluate on the test loader, EPOCHS times.
for ep in range(1, EPOCHS+1):
    loss = train_epoch(nextit_dwell, train_loader, opt_dwell, crit_dwell)
    recall, mrr, ndcg = evaluate_sasrec(nextit_dwell, test_loader)
    print(
        f"\nEpoch {ep}/{EPOCHS} (NextItNet con dwell)",
        f"Loss: {loss:.4f}",
        f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}",
        sep="\n",
    )

end = time.time()

print(f"\n‚è± Tiempo total: {end - start:.2f} segundos")





Epoch 1/10 (NextItNet con dwell)
Loss: 2.1408
Recall@10: 0.7830  MRR@10: 0.6585  NDCG@10: 0.6881

Epoch 2/10 (NextItNet con dwell)
Loss: 1.6907
Recall@10: 0.7906  MRR@10: 0.6615  NDCG@10: 0.6922

Epoch 3/10 (NextItNet con dwell)
Loss: 1.6513
Recall@10: 0.7926  MRR@10: 0.6608  NDCG@10: 0.6921

Epoch 4/10 (NextItNet con dwell)
Loss: 1.6306
Recall@10: 0.7938  MRR@10: 0.6590  NDCG@10: 0.6911

Epoch 5/10 (NextItNet con dwell)
Loss: 1.6150
Recall@10: 0.7909  MRR@10: 0.6529  NDCG@10: 0.6857

Epoch 6/10 (NextItNet con dwell)
Loss: 1.6047
Recall@10: 0.7887  MRR@10: 0.6463  NDCG@10: 0.6802

Epoch 7/10 (NextItNet con dwell)
Loss: 1.5954
Recall@10: 0.7836  MRR@10: 0.6360  NDCG@10: 0.6711

Epoch 8/10 (NextItNet con dwell)
Loss: 1.5886
Recall@10: 0.7843  MRR@10: 0.6390  NDCG@10: 0.6735

Epoch 9/10 (NextItNet con dwell)
Loss: 1.5819
Recall@10: 0.7806  MRR@10: 0.6278  NDCG@10: 0.6642

Epoch 10/10 (NextItNet con dwell)
Loss: 1.5768
Recall@10: 0.7626  MRR@10: 0.6025  NDCG@10: 0.6406

‚è± Tiempo tota

In [4]:
import gc
import torch

print("\n========== LIMPIANDO MEMORIA ==========\n")

# 1. Drop the large objects created by the dwell experiment. item2idx and
#    N_ITEMS are kept on purpose: the no-dwell run below reuses them.
del (
    sessions,
    train_sessions,
    test_sessions,
    train_encoded,
    test_encoded,
    train_loader,
    test_loader,
    nextit_dwell,
    opt_dwell,
    crit_dwell,
)

# 2. Force a garbage-collection pass.
gc.collect()

# 3. Release cached GPU memory, when a GPU is in use.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(">> Memoria limpiada.\n")



>> Memoria limpiada.



In [5]:
# ===================================================================
#   1. RECREAR SESIONES ORIGINALES (SIN REPETICIÓN POR DWELL)
# ===================================================================


def build_sessions_no_dwell(df, min_len=2):
    """Build plain per-session item sequences (no dwell repetition).

    Args:
        df: DataFrame with ``user_session``, ``event_time`` and ``product_id``
            columns. It is not modified.
        min_len: Minimum number of events a session needs to be kept.

    Returns:
        A list of sessions (lists of product ids ordered by ``event_time``),
        in ascending ``user_session`` order.
    """
    # Sorting once by (session, time) is enough: groupby keeps the row order
    # inside each group, so the groups need no second sort.
    ordered = df.sort_values(["user_session", "event_time"])

    sessions = []
    # Only the product column is needed, so iterate over that column alone.
    for _, products in ordered.groupby("user_session")["product_id"]:
        items = products.tolist()
        if len(items) >= min_len:
            sessions.append(items)

    return sessions


# Plain sessions for the baseline run.
sessions_no_dwell = build_sessions_no_dwell(df)

print("Total sesiones SIN dwell:", len(sessions_no_dwell))
print("Ejemplo sesi√≥n SIN dwell:", sessions_no_dwell[0][:20])


# ===================================================================
#   2. TRAIN / TEST SPLIT
# ===================================================================
# Same 80/20 positional cut as in the dwell run.
split = int(0.8 * len(sessions_no_dwell))
train_sessions_no_dwell, test_sessions_no_dwell = (
    sessions_no_dwell[:split],
    sessions_no_dwell[split:],
)

print("Train:", len(train_sessions_no_dwell), "Test:", len(test_sessions_no_dwell))


# ===================================================================
#   3. FILTRAR TOP ITEMS (SE REUSA item2idx DEL MODELO CON DWELL)
#      IMPORTANTE: usar el MISMO diccionario para comparación justa
# ===================================================================

def encode_session_no_dwell(sess):
    """Translate raw product ids into vocabulary indices.

    Uses the notebook-level ``item2idx`` mapping; products that are not in it
    are skipped and the order of the remaining ones is kept.
    """
    kept = []
    for product in sess:
        if product in item2idx:
            kept.append(item2idx[product])
    return kept

# Encode every session and keep those that still have at least
# MIN_SESSION_LEN items once out-of-vocabulary products are removed.
train_encoded_no_dwell = [
    enc for enc in map(encode_session_no_dwell, train_sessions_no_dwell)
    if len(enc) >= MIN_SESSION_LEN
]
test_encoded_no_dwell = [
    enc for enc in map(encode_session_no_dwell, test_sessions_no_dwell)
    if len(enc) >= MIN_SESSION_LEN
]

print("Train encoded (sin dwell):", len(train_encoded_no_dwell))
print("Test encoded  (sin dwell):", len(test_encoded_no_dwell))


# ===================================================================
#   4. DATASET + DATALOADER (SIN DWELL)
# ===================================================================

class NexitDatasetNoDwell(Dataset):
    """Next-item dataset over encoded sessions without dwell repetition.

    Each sample is an ``(items, targets)`` pair of ``torch.long`` tensors where
    ``targets`` is ``items`` shifted one step to the left. Only the last
    ``MAX_LEN`` entries of a session are used.
    """

    def __init__(self, sequences):
        # Indexable collection of encoded sessions (lists of item indices).
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Slicing from the end keeps the most recent MAX_LEN interactions and
        # leaves shorter sessions untouched.
        recent = self.sequences[idx][-MAX_LEN:]

        inputs = torch.tensor(recent[:-1], dtype=torch.long)
        labels = torch.tensor(recent[1:], dtype=torch.long)
        return inputs, labels


def collate_fn_no_dwell(batch):
    """Right-pad a batch of ``(items, targets)`` pairs to a common length.

    The common length is the longest ``items`` tensor in the batch. Both
    outputs are padded with ``PAD_IDX`` and stacked into 2-D tensors.
    """
    items_batch, targets_batch = zip(*batch)
    width = max(len(seq) for seq in items_batch)

    def pad_and_stack(seqs):
        # Append as many PAD_IDX entries as needed to reach `width`.
        rows = []
        for seq in seqs:
            filler = torch.full((width - len(seq),), PAD_IDX, dtype=seq.dtype)
            rows.append(torch.cat([seq, filler]))
        return torch.stack(rows)

    return pad_and_stack(items_batch), pad_and_stack(targets_batch)


# Training batches are shuffled; padding is applied per batch by the collate
# function.
train_loader_no_dwell = DataLoader(
    dataset=NexitDatasetNoDwell(train_encoded_no_dwell),
    collate_fn=collate_fn_no_dwell,
    batch_size=32,
    shuffle=True,
)

# Evaluation goes through the test sessions one at a time, in order.
test_loader_no_dwell = DataLoader(
    dataset=NexitDatasetNoDwell(test_encoded_no_dwell),
    collate_fn=collate_fn_no_dwell,
    batch_size=1,
    shuffle=False,
)


# ===================================================================

print("\n\n================ NEXTITNET SIN DWELL ================\n")

# Baseline run: same architecture defaults, loss and learning rate as the
# dwell run. EPOCHS, `time`, train_epoch and evaluate_sasrec come from the
# earlier training cell.

# 1. Model
nextit_no_dwell = NextItNet(N_ITEMS).to(device)

# 2. Loss (padding ignored) and optimiser
criterion_nextit_nd = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_nextit_nd = torch.optim.Adam(nextit_no_dwell.parameters(), lr=1e-4)

start = time.time()

# 3. Train for one epoch, then evaluate on the test loader, EPOCHS times.
for ep in range(1, EPOCHS + 1):
    loss = train_epoch(nextit_no_dwell, train_loader_no_dwell, optimizer_nextit_nd, criterion_nextit_nd)
    recall, mrr, ndcg = evaluate_sasrec(nextit_no_dwell, test_loader_no_dwell)

    print(
        f"\nEpoch {ep}/{EPOCHS} (NextItNet sin dwell)",
        f"Loss: {loss:.4f}",
        f"Recall@10: {recall:.4f}  MRR@10: {mrr:.4f}  NDCG@10: {ndcg:.4f}",
        sep="\n",
    )

end = time.time()

print(f"\n‚è± Tiempo total: {end - start:.2f} segundos")

Total sesiones SIN dwell: 5974844
Ejemplo sesi√≥n SIN dwell: [54900011, 54900011]
Train: 4779875 Test: 1194969
Train encoded (sin dwell): 980336
Test encoded  (sin dwell): 244745




Epoch 1/10 (NextItNet sin dwell)
Loss: 3.4023
Recall@10: 0.7462  MRR@10: 0.5921  NDCG@10: 0.6287

Epoch 2/10 (NextItNet sin dwell)
Loss: 2.9005
Recall@10: 0.7566  MRR@10: 0.5987  NDCG@10: 0.6362

Epoch 3/10 (NextItNet sin dwell)
Loss: 2.8440
Recall@10: 0.7606  MRR@10: 0.6014  NDCG@10: 0.6393

Epoch 4/10 (NextItNet sin dwell)
Loss: 2.8140
Recall@10: 0.7627  MRR@10: 0.5996  NDCG@10: 0.6384

Epoch 5/10 (NextItNet sin dwell)
Loss: 2.7931
Recall@10: 0.7633  MRR@10: 0.5994  NDCG@10: 0.6384

Epoch 6/10 (NextItNet sin dwell)
Loss: 2.7774
Recall@10: 0.7644  MRR@10: 0.5968  NDCG@10: 0.6367

Epoch 7/10 (NextItNet sin dwell)
Loss: 2.7652
Recall@10: 0.7599  MRR@10: 0.5902  NDCG@10: 0.6306

Epoch 8/10 (NextItNet sin dwell)
Loss: 2.7539
Recall@10: 0.7621  MRR@10: 0.5929  NDCG@10: 0.6332

Epoch 9/10 (NextItNet sin dwell)
