In [1]:
import polars as pl
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm
import faiss
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from torch_geometric.nn import LGConv
from torch_geometric.loader import NeighborLoader
from torch_geometric.data import Data
from torch_scatter import scatter

from tqdm import tqdm

In [2]:
# Load the raw inputs from local parquet files.
# df_clickstream: one row per event; later cells read its cookie / node / event / event_date columns.
# df_event: event dictionary; later cells read its event / is_contact columns.
df_clickstream = pl.read_parquet("clickstream.pq")
df_event = pl.read_parquet('events.pq')

In [3]:
# Train/eval cutoff: 28 days before the newest event. Rows after it are held out
# and used as the evaluation window further down.
# (The name is spelled "treshold" throughout the notebook; kept as-is because later cells reference it.)
treshold = df_clickstream['event_date'].max() - timedelta(days=28)

In [4]:
# Keep only events at or before the cutoff. This overwrites df_clickstream;
# the evaluation cell re-reads the parquet file to get the full log back.
df_clickstream = df_clickstream.filter(df_clickstream['event_date'] <= treshold)

In [5]:
# Restrict the training data to the last 31 days before the cutoff.
df_clickstream = df_clickstream.filter(df_clickstream['event_date'] > treshold - timedelta(days=31))

In [7]:
def build_training_df(df_clickstream: "pl.DataFrame | pl.LazyFrame",
                      df_event: "pl.DataFrame | pl.LazyFrame",
                      recency_lambda=30, alpha=0.5,
                      min_node_count=100) -> pd.DataFrame:
    """Aggregate the clickstream into one weighted (cookie, node) edge per pair.

    Args:
        df_clickstream: events with at least ``cookie``, ``node``, ``event`` and
            ``event_date`` columns.
        df_event: event dictionary with ``event`` and ``is_contact`` columns.
            Both inputs must be of the same kind (both eager or both lazy).
        recency_lambda: decay constant, in days, of the exponential recency
            weight ``exp(-age_days / recency_lambda)``. Must be positive.
        alpha: multiplier applied to the recency weight in the final edge weight.
        min_node_count: nodes with fewer distinct cookies than this are dropped.

    Returns:
        pandas DataFrame with columns ``cookie`` (string, prefixed with ``_``),
        ``node`` (string), ``event_date``, ``is_contact`` (max flag + 0.5),
        ``age_days``, ``recency_w`` and ``edge_w``.

    Raises:
        ValueError: if ``recency_lambda`` is not positive.
    """
    if recency_lambda <= 0:
        raise ValueError("recency_lambda must be positive")

    df = (
        df_clickstream
        .join(df_event, on="event", how="inner")
        .group_by(["node", "cookie"])
        .agg([
            pl.col("event_date").max().alias("event_date"),
            pl.col("is_contact").max().alias("is_contact"),
        ])
        # Shift so that a plain view still carries a positive base weight (0.5)
        # and a contact carries 1.5.
        .with_columns((pl.col("is_contact") + 0.5).alias("is_contact"))
        # After the group_by each row is one distinct cookie, so the window
        # length per node is the number of cookies that touched it.
        # pl.len() replaces the deprecated pl.count().
        .with_columns(pl.len().over("node").alias("node_count"))
        .filter(pl.col("node_count") >= min_node_count)
        .drop("node_count")
        .select(["cookie", "node", "event_date", "is_contact"])
    )
    # .to_pandas() only exists on eager frames; materialise lazy pipelines first.
    if isinstance(df, pl.LazyFrame):
        df = df.collect()
    df_pd = df.to_pandas()
    df_pd["cookie"] = df_pd["cookie"].astype(str)
    df_pd["node"]   = df_pd["node"].astype(str)

    # Age is measured against the newest event that survived the filters.
    max_date = df_pd["event_date"].max()
    df_pd["age_days"]  = (max_date - df_pd["event_date"]).dt.days
    df_pd["recency_w"] = np.exp(-df_pd["age_days"] / recency_lambda)
    df_pd["edge_w"]    = df_pd["is_contact"] + alpha * df_pd["recency_w"]
    # The "_" prefix is stripped again by the post-processing cells.
    df_pd["cookie"]    = "_" + df_pd["cookie"]
    return df_pd

class WeightedLGConv(MessagePassing):
    """Parameter-free propagation layer with weight-aware symmetric normalisation.

    Every edge message is the neighbour embedding ``x_j`` scaled by
    ``w / sqrt(deg[a] * deg[b] + 1e-12)``, where ``a``/``b`` are the two
    endpoints of the edge and ``deg`` is the summed edge weight per node.
    Messages are aggregated by summation.
    """

    def __init__(self):
        super().__init__(aggr='add')

    def forward(self, x, edge_index, edge_weight):
        first, second = edge_index[0], edge_index[1]
        # Degrees are accumulated over the first row of edge_index only, so the
        # edge list is expected to contain both directions of every edge.
        weighted_deg = scatter(edge_weight, first, dim=0,
                               dim_size=x.size(0), reduce='sum')
        denom = torch.sqrt(weighted_deg[first] * weighted_deg[second] + 1e-12)
        return self.propagate(edge_index, x=x, norm=edge_weight / denom)

    def message(self, x_j, norm):
        # Reshape the per-edge coefficient so it broadcasts over the embedding dim.
        return x_j * norm.view(-1, 1)

class LightGCN(nn.Module):
    """Embedding table plus a stack of WeightedLGConv layers.

    The only learnable tensor is ``self.emb``. ``forward`` returns the mean of
    the raw table and the output of every propagation layer.
    """

    def __init__(self, num_nodes, emb_dim=64, num_layers=2):
        super().__init__()
        self.emb = nn.Embedding(num_nodes, emb_dim)
        self.convs = nn.ModuleList(WeightedLGConv() for _ in range(num_layers))

    def forward(self, edge_index, edge_weight):
        hidden = self.emb.weight
        per_layer = [hidden]          # layer 0 is the raw embedding table
        for layer in self.convs:
            hidden = layer(hidden, edge_index, edge_weight)
            per_layer.append(hidden)
        return torch.stack(per_layer).mean(dim=0)

def prepare_graph_weighted(df: pd.DataFrame):
    """Turn the interaction table into an undirected, weighted bipartite graph.

    Users receive node ids ``0 .. n_users-1`` (in order of first appearance of
    ``cookie``); items follow directly after them. Every interaction is stored
    twice, once per direction, with the same ``edge_w``.

    Side effect: the columns ``u_idx`` and ``i_idx`` are written onto ``df``.

    Returns:
        ``(data, user_map, item_map)`` where ``data`` is a ``Data`` object
        holding ``edge_index``, ``edge_weight`` and ``num_nodes``, and the maps
        translate original ids to graph node ids.
    """
    cookie_ids = df['cookie'].unique().tolist()
    node_ids = df['node'].unique().tolist()
    n_users = len(cookie_ids)

    user_map = dict(zip(cookie_ids, range(n_users)))
    item_map = {node: n_users + pos for pos, node in enumerate(node_ids)}

    df['u_idx'] = df['cookie'].map(user_map)
    df['i_idx'] = df['node'].map(item_map)

    # user -> item edges followed by the mirrored item -> user edges
    user_to_item = np.stack([df['u_idx'], df['i_idx']], axis=0)
    item_to_user = np.stack([df['i_idx'], df['u_idx']], axis=0)
    both_directions = np.concatenate([user_to_item, item_to_user], axis=1)
    doubled_weights = np.concatenate([df['edge_w'], df['edge_w']], axis=0)

    graph = Data(
        edge_index=torch.tensor(both_directions, dtype=torch.long),
        edge_weight=torch.tensor(doubled_weights, dtype=torch.float),
        num_nodes=n_users + len(node_ids),
    )
    return graph, user_map, item_map

def bpr_loss(u_emb, pos_emb, neg_emb):
    """Bayesian Personalised Ranking loss with several negatives per user.

    Args:
        u_emb: user embeddings, shape (B, D).
        pos_emb: embedding of one positive item per user, shape (B, D).
        neg_emb: embeddings of K negative items per user, shape (B, K, D).

    Returns:
        Scalar tensor: mean over all (user, negative) pairs of
        ``-log(sigmoid(pos_score - neg_score))``.
    """
    pos_score = (u_emb * pos_emb).sum(dim=1, keepdim=True)             # (B, 1)
    neg_score = torch.bmm(neg_emb, u_emb.unsqueeze(-1)).squeeze(-1)    # (B, K)
    # logsigmoid is the numerically stable form of log(sigmoid(x)); the naive
    # form underflows to log(0) = -inf when the margin is strongly negative.
    return -F.logsigmoid(pos_score - neg_score).mean()

def _sample_negatives(batch, seen_by_user, item_ids, k):
    """Draw ``k`` negatives per user, uniformly with replacement from unseen items.

    Candidates are drawn for the whole batch at once and only the few that
    collide with a user's training positives are redrawn.

    Raises:
        ValueError: if a user has every item as a training positive.
    """
    n_items = len(item_ids)
    picks = item_ids[np.random.randint(0, n_items, size=(len(batch), k))]
    for r, u in enumerate(batch):
        seen = seen_by_user[u]
        if len(seen) >= n_items and seen.issuperset(item_ids.tolist()):
            raise ValueError(f"user index {u} has interacted with every item; no negatives left")
        for c in range(k):
            # Rejection step: redraw until the candidate is not a training positive.
            while picks[r, c] in seen:
                picks[r, c] = item_ids[np.random.randint(n_items)]
    return picks


def train_lightgcn(data, df: pd.DataFrame, user_map, item_map,
                   emb_dim=128, num_layers=3, lr=1e-2,
                   weight_decay=1e-5, epochs=100,
                   K_neg=10, patience=5, val_k=100,
                   dropout=0.1):
    """Train LightGCN with BPR loss and early stopping on validation Recall@val_k.

    Args:
        data: graph object exposing ``edge_index``, ``edge_weight``, ``num_nodes``.
        df: interaction table that already carries the ``u_idx`` / ``i_idx``
            columns written by ``prepare_graph_weighted``.
        user_map, item_map: original id -> graph node id.
        emb_dim, num_layers: model size.
        lr, weight_decay: Adam settings.
        epochs: maximum number of epochs.
        K_neg: negatives sampled per (user, positive) pair.
        patience: epochs without recall improvement before stopping.
        val_k: cut-off for validation recall (clamped to the number of items).
        dropout: accepted for backward compatibility; currently unused.

    Returns:
        The model with the best-validation weights loaded. The weights are also
        written to ``best_lgcn.pth`` in the working directory.

    NOTE(review): ``data`` is used as-is for message passing. When it was built
    from the full ``df``, the held-out validation edges are still part of the
    graph, so the reported validation recall is optimistic.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = LightGCN(data.num_nodes, emb_dim, num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           factor=0.5,
                                                           patience=2,
                                                           verbose=True)

    num_users = len(user_map)
    num_items = len(item_map)
    item_ids = np.fromiter(item_map.values(), dtype=np.int64, count=num_items)

    # Leave-one-out split: the last listed item of every user with more than
    # one interaction is held out for validation.
    user_pos = df.groupby('u_idx')['i_idx'].apply(list).to_dict()
    train_pos, val_set = {}, []
    for u, items in user_pos.items():
        if len(items) > 1:
            train_pos[u] = items[:-1]
            val_set.append((u, items[-1]))
        else:
            train_pos[u] = items

    # Built once; the original rebuilt the full item set per user per batch.
    seen_by_user = {u: set(items) for u, items in train_pos.items()}
    users = list(train_pos.keys())

    # Loop-invariant tensors: move the graph to the device a single time.
    edge_index = data.edge_index.to(device)
    edge_weight = data.edge_weight.to(device)

    # Popularity penalty (interaction count ** eta), indexed by local item id.
    # bincount keeps the vector aligned with item_emb even if an item has no
    # rows in df; clamp avoids a division by zero for such items.
    eta = 0.2
    counts = np.bincount(df['i_idx'].to_numpy(dtype=np.int64) - num_users,
                         minlength=num_items)
    penalty = torch.tensor(counts, device=device, dtype=torch.float).clamp(min=1.0).pow(eta)

    eval_k = min(val_k, num_items)   # topk fails when k exceeds the item count
    val_users = np.fromiter((u for u, _ in val_set), dtype=np.int64, count=len(val_set))
    val_items = np.fromiter((v - num_users for _, v in val_set), dtype=np.int64,
                            count=len(val_set))

    ckpt_path = 'best_lgcn.pth'
    checkpoint_written = False
    # Start below any reachable recall so the first epoch always writes a
    # checkpoint; otherwise a run stuck at recall 0 would load a stale file.
    best_recall, no_improve = float('-inf'), 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        np.random.shuffle(users)
        for i in range(0, len(users), 2048):
            batch = users[i:i+2048]
            pos = np.array([np.random.choice(train_pos[u]) for u in batch], dtype=np.int64)
            neg = _sample_negatives(batch, seen_by_user, item_ids, K_neg)

            u_idx   = torch.tensor(batch, device=device)
            pos_idx = torch.tensor(pos, device=device)
            neg_idx = torch.tensor(neg, device=device)   # single ndarray -> fast conversion

            optimizer.zero_grad()
            emb_all = model(edge_index, edge_weight)
            u_emb   = emb_all[u_idx]
            pos_emb = emb_all[pos_idx]
            neg_emb = emb_all[neg_idx]

            loss = bpr_loss(u_emb, pos_emb, neg_emb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        with torch.no_grad():
            emb_all   = model(edge_index, edge_weight)
            user_emb  = emb_all[:num_users]
            item_emb  = emb_all[num_users:]
            scores    = user_emb @ item_emb.T

            adjusted = scores / penalty.unsqueeze(0)
            topk = adjusted.topk(eval_k, dim=1).indices.cpu().numpy()
            if len(val_set) > 0:
                hits = int((topk[val_users] == val_items[:, None]).any(axis=1).sum())
                recall = hits / len(val_set)
            else:
                # No user has more than one interaction: nothing to validate on.
                recall = 0.0

        print(f"Epoch {epoch} | Loss {total_loss:.4f} | Val Recall@{val_k} {recall:.4f}")
        scheduler.step(recall)
        if recall > best_recall:
            best_recall, no_improve = recall, 0
            torch.save(model.state_dict(), ckpt_path)
            checkpoint_written = True
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping.")
                break

    # Only reload a checkpoint this run produced (epochs=0 writes none).
    if checkpoint_written:
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
    return model

def recommend(model, data, df: pd.DataFrame, user_map, item_map,
               top_k=300, eta=0.5, exclude_seen=False) -> pd.DataFrame:
    """Produce the top-k items per user from the trained model.

    Args:
        model: trained model; called as ``model(edge_index, edge_weight)`` and
            expected to return one embedding row per graph node, users first.
        data: graph object exposing ``edge_index`` and ``edge_weight``.
        df: interaction table with an ``i_idx`` column (and ``u_idx`` when
            ``exclude_seen`` is set). It is not modified.
        user_map, item_map: original id -> graph node id.
        top_k: recommendations per user (clamped to the number of items).
        eta: exponent of the popularity penalty ``count ** eta``.
        exclude_seen: when True, (user, item) pairs present in ``df`` are never
            recommended. Defaults to False, which keeps the previous behaviour.

    Returns:
        DataFrame with columns ``cookie``, ``node``, ``rank`` (1-based),
        ordered by user index and then by rank.
    """
    device = next(model.parameters()).device
    num_users = len(user_map)
    num_items = len(item_map)
    k = min(top_k, num_items)   # topk fails when k exceeds the item count
    i_local = df['i_idx'].to_numpy(dtype=np.int64) - num_users

    model.eval()
    with torch.no_grad():
        emb_all  = model(data.edge_index.to(device), data.edge_weight.to(device))
        user_emb = emb_all[:num_users]
        item_emb = emb_all[num_users:]
        scores   = user_emb @ item_emb.T

        # Item frequencies indexed by local item id; clamp avoids dividing by
        # zero for an item that has no rows in df.
        counts = np.bincount(i_local, minlength=num_items)
        deg = torch.tensor(counts, device=device, dtype=torch.float).clamp(min=1.0)
        # NOTE(review): dividing a *negative* score by a penalty > 1 moves it
        # towards zero, i.e. it promotes popular items instead of demoting them.
        adjusted = scores / deg.pow(eta).unsqueeze(0)

        if exclude_seen:
            rows = torch.as_tensor(df['u_idx'].to_numpy(dtype=np.int64), device=device)
            cols = torch.as_tensor(i_local, device=device)
            adjusted[rows, cols] = float('-inf')

        top = adjusted.topk(k, dim=1)
        topk_idx = top.indices.cpu().numpy()
        topk_val = top.values.cpu().numpy()

    # Position -> original id lookup arrays (object dtype keeps ids untouched).
    user_ids = np.empty(num_users, dtype=object)
    for cookie, idx in user_map.items():
        user_ids[idx] = cookie
    item_ids = np.empty(num_items, dtype=object)
    for node, idx in item_map.items():
        item_ids[idx - num_users] = node

    # Columnar construction instead of one Python dict per (user, rank) row.
    recs = pd.DataFrame({
        'cookie': np.repeat(user_ids, k),
        'node': item_ids[topk_idx.ravel()],
        'rank': np.tile(np.arange(1, k + 1, dtype=np.int64), num_users),
    })
    if exclude_seen:
        # Users with fewer than k unseen items get masked entries in their
        # top-k; drop those rows.
        recs = recs[np.isfinite(topk_val.ravel())].reset_index(drop=True)
    return recs


def get_embeddings_polars(model, user_map, item_map, data=None):
    """
    Returns two Polars DataFrames: users and items embeddings with original IDs and embedding lists.

    Args:
        model: trained model exposing ``model.emb`` (the embedding table).
        user_map, item_map: original id -> graph node id; users occupy the
            first ``len(user_map)`` rows, items the rows after them.
        data: optional graph object with ``edge_index`` / ``edge_weight``.
            When omitted (default, previous behaviour) the raw layer-0 table
            ``model.emb.weight`` is exported. When given, the model is run on
            the graph and the propagated embeddings are exported instead;
            those are the vectors the model's forward pass produces.

    Returns:
        ``(user_df, item_df)``, each with columns ``id`` and ``embedding``.
    """
    num_users = len(user_map)

    with torch.no_grad():
        if data is None:
            emb_tensor = model.emb.weight
        else:
            device = model.emb.weight.device
            emb_tensor = model(data.edge_index.to(device), data.edge_weight.to(device))
    emb_all = emb_tensor.detach().cpu().numpy()

    user_emb = emb_all[:num_users]
    item_emb = emb_all[num_users:]

    # Position -> original id, in the same row order as the embedding slices.
    user_ids = [None] * num_users
    for uid, idx in user_map.items():
        user_ids[idx] = uid
    item_ids = [None] * len(item_map)
    for nid, idx in item_map.items():
        item_ids[idx - num_users] = nid

    # Columnar construction avoids building one dict per row.
    user_df = pl.DataFrame({'id': user_ids, 'embedding': user_emb.tolist()})
    item_df = pl.DataFrame({'id': item_ids, 'embedding': item_emb.tolist()})

    return user_df, item_df

In [8]:
# Build the weighted (cookie, node) interaction table from the filtered training window.
df_pd = build_training_df(df_clickstream, df_event)

  .with_columns(pl.count().over("node").alias("node_count"))


In [9]:
# Inspect the training table (cookie ids carry the "_" prefix added by build_training_df).
df_pd

Unnamed: 0,cookie,node,event_date,is_contact,age_days,recency_w,edge_w
0,_74314,71520,2025-01-23 13:38:24,1.5,2,0.935507,1.967753
1,_120524,3015,2025-01-14 21:03:22,0.5,11,0.693041,0.846520
2,_146748,152476,2025-01-23 23:40:39,0.5,2,0.935507,0.967753
3,_86072,122304,2025-01-23 18:28:44,0.5,2,0.935507,0.967753
4,_92049,71520,2025-01-11 12:27:24,0.5,14,0.627089,0.813545
...,...,...,...,...,...,...,...
4336577,_59659,152714,2025-01-19 19:31:51,0.5,6,0.818731,0.909365
4336578,_129938,229403,2025-01-12 16:43:08,0.5,13,0.648344,0.824172
4336579,_88987,51163,2025-01-25 11:18:04,0.5,0,1.000000,1.000000
4336580,_89332,320948,2025-01-17 16:57:53,0.5,8,0.765928,0.882964


In [10]:
# Build the bipartite graph. This also adds the u_idx / i_idx columns to df_pd,
# which train_lightgcn and recommend read.
data, u_map, i_map = prepare_graph_weighted(df_pd)

In [11]:
# Train with the default hyper-parameters; the best checkpoint is written to best_lgcn.pth.
model = train_lightgcn(data, df_pd, u_map, i_map)

  neg_idx = torch.tensor(neg, device=device)


Epoch 0 | Loss 40.1735 | Val Recall@100 0.0118
Epoch 1 | Loss 38.4979 | Val Recall@100 0.0171
Epoch 2 | Loss 37.8220 | Val Recall@100 0.0898
Epoch 3 | Loss 36.1555 | Val Recall@100 0.2337
Epoch 4 | Loss 33.4694 | Val Recall@100 0.2533
Epoch 5 | Loss 31.7225 | Val Recall@100 0.2528
Epoch 6 | Loss 30.8246 | Val Recall@100 0.2524
Epoch 7 | Loss 30.3837 | Val Recall@100 0.2518
Epoch 00008: reducing learning rate of group 0 to 5.0000e-03.
Epoch 8 | Loss 29.9733 | Val Recall@100 0.2521
Epoch 9 | Loss 30.0529 | Val Recall@100 0.2521
Early stopping.


In [12]:
# Top-300 recommendations per cookie (recommend's default top_k), popularity exponent eta=0.4.
submission = recommend(model, data, df_pd, u_map, i_map, eta=0.4)

In [13]:
# Export the user and item embedding tables as Polars frames (columns: id, embedding).
user_emb_df, item_emb_df = get_embeddings_polars(model, u_map, i_map)

In [17]:
# Convert the pandas recommendations to Polars for the post-processing below.
submission_df = pl.from_pandas(submission)

In [18]:
# Undo the string encoding used for training: strip the "_" cookie prefix and
# restore integer ids for both key columns.
df = submission_df.with_columns(
    pl.col('cookie').str.strip_chars('_').cast(pl.Int64),
    pl.col('node').cast(pl.UInt32),
)
df

cookie,node,rank
i64,u32,i64
74314,71546,1
74314,71549,2
74314,71511,3
74314,71547,4
74314,71514,5
…,…,…
134812,299995,296
134812,313801,297
134812,371454,298
134812,163826,299


In [19]:
# Replace the prefixed string id with an integer `cookie` key.
user_embeddings = (
    user_emb_df
    .with_columns(cookie=pl.col('id').str.strip_chars('_').cast(pl.Int64))
    .drop('id')
)

In [21]:
# Add an integer `node` key next to the string `id`
# (unlike user_embeddings, the `id` column is kept here).
item_embeddings = item_emb_df.with_columns(node=pl.col('id').cast(pl.Int64))

In [23]:
# Persist embeddings and top-300 candidates for the downstream retrieval stage.
# NOTE(review): assumes the retrieval_data/ directory already exists — TODO confirm.
user_embeddings.write_parquet('retrieval_data/user_graph_emb_28d.pq')
item_embeddings.write_parquet('retrieval_data/item_graph_emb_28d.pq')
df.write_parquet('retrieval_data/top300_graph_emb_28d.pq')

In [25]:
# Re-read the full log: df_clickstream was overwritten by the training-window filters above.
df_clickstream = pl.read_parquet("clickstream.pq")

# Everything up to the cutoff counts as history.
df_past = df_clickstream.filter(pl.col('event_date') <= treshold)

# Ground truth: contact events after the cutoff on (cookie, node) pairs that
# never occurred in the history, restricted to cookies and nodes that do
# appear in the history, one row per pair.
df_eval = (
    df_clickstream
    .filter(pl.col('event_date') > treshold)
    .select(['cookie', 'node', 'event'])
    .join(df_past, on=['cookie', 'node'], how='anti')
    .filter(
        pl.col('event').is_in(
            df_event.filter(pl.col('is_contact') == 1)['event'].unique()
        )
    )
    .filter(pl.col('cookie').is_in(df_past['cookie'].unique()))
    .filter(pl.col('node').is_in(df_past['node'].unique()))
    .unique(['cookie', 'node'])
)

In [26]:
# Score only cookies that actually received recommendations.
df_eval = df_eval.filter(pl.col('cookie').is_in(df['cookie'].unique()))

In [27]:
# recall_at comes from the project-local utils module (not shown in this notebook).
from utils import recall_at

# Recall of the full top-300 lists against the held-out contacts;
# presumably k is the list cut-off — verify against utils.recall_at.
recall_at(df_eval, df, k=300)

0.2662449329037006

In [28]:
# Same metric restricted to the 40 best-ranked recommendations per cookie.
recall_at(df_eval, df.filter(df['rank'] <=40), k=40)

0.0531678296018784