# Notebook GRU4Rec sin Dwell

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mkechinov/ecommerce-behavior-data-from-multi-category-store")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mkechinov/ecommerce-behavior-data-from-multi-category-store?dataset_version_number=8...


100%|██████████| 4.29G/4.29G [00:25<00:00, 182MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store/versions/8


In [4]:
import pandas as pd
import gc

path = "/root/.cache/kagglehub/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store/versions/8"

USE_COLS = ["event_time", "user_id", "product_id", "user_session"]

def reduce_month(month_filename, output_filename, chunksize=1_000_000):

    reader = pd.read_csv(
        f"{path}/{month_filename}",
        usecols=USE_COLS,
        chunksize=chunksize
    )

    first = True

    for chunk in reader:
        # Convertir fechas dentro del chunk
        chunk["event_time"] = pd.to_datetime(chunk["event_time"])

        # Guardar chunk reducido
        chunk.to_csv(output_filename, mode="w" if first else "a",
                     index=False, header=first)
        first = False

        del chunk
        gc.collect()

    print("OUTPUT creado:", output_filename)


In [5]:
reduce_month("2019-Oct.csv", "oct_reduced.csv")
reduce_month("2019-Nov.csv", "nov_reduced.csv")


OUTPUT creado: oct_reduced.csv
OUTPUT creado: nov_reduced.csv


In [6]:
oct = pd.read_csv("oct_reduced.csv")
#nov = pd.read_csv("nov_reduced.csv")

# evento_time como datetime
oct["event_time"] = pd.to_datetime(oct["event_time"])
#nov["event_time"] = pd.to_datetime(nov["event_time"])


In [7]:
def split_train_test(df):

    df = df.sort_values("event_time")
    cutoff = df["event_time"].quantile(0.9)

    train = df[df["event_time"] < cutoff]
    test  = df[df["event_time"] >= cutoff]

    return train, test


In [8]:
oct_train, oct_test = split_train_test(oct)
print(len(oct_train), len(oct_test))


38203886 4244878


In [None]:
#nov_train, nov_test = split_train_test(nov)
#print(len(nov_train), len(nov_test))

In [9]:
def build_sessions(df):
    df = df.sort_values(["user_session", "event_time"])
    sessions = df.groupby("user_session")["product_id"].apply(list)
    return sessions


In [10]:
oct_train_sessions = build_sessions(oct_train)
oct_test_sessions  = build_sessions(oct_test)

In [11]:
def filter_sessions_min_length(sessions, min_len=5):
    return sessions[sessions.apply(len) >= min_len]

In [12]:
oct_train_sessions = filter_sessions_min_length(oct_train_sessions, min_len=5)
oct_test_sessions  = filter_sessions_min_length(oct_test_sessions,  min_len=5)

In [15]:
from collections import defaultdict

def build_id_maps(train_sessions):
    all_items = set()
    for s in train_sessions:
        all_items.update(s)

    item2idx = {item: i+1 for i, item in enumerate(all_items)}
    idx2item = {i+1: item for i, item in enumerate(all_items)}

    return item2idx, idx2item


In [16]:
item2idx, idx2item = build_id_maps(oct_train_sessions)

In [17]:
def encode_sessions(sessions, item2idx, min_len=5):
    encoded = []
    for seq in sessions:
        seq = [item2idx[x] for x in seq if x in item2idx]
        if len(seq) >= min_len:
            encoded.append(seq)
    return encoded

train_encoded = encode_sessions(oct_train_sessions, item2idx)
test_encoded  = encode_sessions(oct_test_sessions, item2idx)


### Modelo GRU sin Dwell

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

MAX_LEN = 100

class GRUDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        if len(seq) > MAX_LEN:
            seq = seq[-MAX_LEN:]   # conservar las últimas interacciones
        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])


In [19]:
PAD_IDX = 0

def collate_fn(batch):
    X_batch, y_batch = zip(*batch)

    # Longitudes individuales
    lengths = [len(x) for x in X_batch]
    max_len = max(lengths)

    # Pading
    X_padded = [torch.cat([x, torch.full((max_len - len(x),), PAD_IDX)]) for x in X_batch]
    y_padded = [torch.cat([y, torch.full((max_len - len(y),), PAD_IDX)]) for y in y_batch]

    return torch.stack(X_padded), torch.stack(y_padded)

In [25]:
BATCH_SIZE = 32

train_loader = DataLoader(
    GRUDataset(train_encoded),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    GRUDataset(test_encoded),
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn
)



In [21]:
import torch.nn as nn

class GRU4Rec(nn.Module):
    def __init__(self, n_items, emb_size=128, hidden_size=128):
        super().__init__()
        self.embed = nn.Embedding(n_items+1, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, n_items+1)

    def forward(self, x):
        x = self.embed(x)
        out, _ = self.gru(x)
        logits = self.fc(out)
        return logits


In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"

n_items = len(item2idx)
model = GRU4Rec(n_items).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [23]:
def train_gru(model, loader, optimizer, criterion, epochs=10):
    model.train()

    for epoch in range(1, epochs+1):
        total_loss = 0

        for X, y in loader:
            X, y = X.to(device), y.to(device)

            optimizer.zero_grad()

            logits = model(X)

            logits = logits.reshape(-1, logits.size(-1))
            y = y.reshape(-1)

            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"[Epoch {epoch}] Loss = {total_loss/len(loader):.4f}")


In [26]:
train_gru(model, train_loader, optimizer, criterion)

[Epoch 1] Loss = 5.3946
[Epoch 2] Loss = 4.7251
[Epoch 3] Loss = 4.5912
[Epoch 4] Loss = 4.5242
[Epoch 5] Loss = 4.4851
[Epoch 6] Loss = 4.4573
[Epoch 7] Loss = 4.4373
[Epoch 8] Loss = 4.4204
[Epoch 9] Loss = 4.4078
[Epoch 10] Loss = 4.3979


In [27]:
import numpy as np

def recall_at_k(pred, target, k=10):
    return 1.0 if target in pred[:k] else 0.0

def ndcg_at_k(pred, target, k=10):
    if target in pred[:k]:
        idx = pred[:k].index(target)
        return 1 / np.log2(idx + 2)
    return 0.0


In [28]:
model.eval()

rec_list = []
ndcg_list = []

with torch.no_grad():
    for X, y in test_loader:
        X = X.to(device)
        logits = model(X)

        # último paso predice el próximo item
        last_logits = logits[0, -1]
        topk = torch.topk(last_logits, 10).indices.cpu().tolist()

        true_item = y[0, -1].item()

        rec_list.append(recall_at_k(topk, true_item))
        ndcg_list.append(ndcg_at_k(topk, true_item))

recall10 = np.mean(rec_list)
ndcg10 = np.mean(ndcg_list)

print("Recall@10:", recall10)
print("NDCG@10:", ndcg10)


Recall@10: 0.5706841180391444
NDCG@10: 0.4560607495303335
