In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import polars as pl
import implicit
import lightgbm as lgb
from itertools import groupby

from utils import create_features, recall_at, fit_lgb_ranker

In [2]:
df_test_users = pl.read_parquet(f'test_users.pq')
df_clickstream = pl.read_parquet(f'clickstream.pq')

df_cat_features = pl.read_parquet(f'cat_features.pq')
df_text_features = pl.read_parquet(f'text_features.pq')
df_event = pl.read_parquet(f'events.pq')

In [3]:
treshold = df_clickstream['event_date'].max() - timedelta(days=7)

In [4]:
df_train = (
    df_clickstream
    .join(df_event, on='event', how='left')
    .filter(df_clickstream['event_date']<= treshold)
    # .filter(pl.col('is_contact')==1) # comment for i2i_seq_all and increase cuts for memory
    .group_by(['node', 'cookie'])
    .agg(pl.min('event_date'))
)

In [5]:
dt_min = df_clickstream['event_date'].min()
df_train = df_train.with_columns((pl.col("event_date") - dt_min).dt.total_seconds().alias("event_date"))

In [7]:
def i2i_model(df_train,
              max_pos_diff=45,
              w_pos=1.718759727352264,
              w_time=2.3933322742952314,
              w_recency=0.39059527613202566,
              pop_pow=0.36497800607640746,
              N_LAST=59,
              TOPK_USER=300,
              DECAY_BASE=0.9750338237166386
             ):
    interactions_df = df_train.sort(['cookie', 'event_date'])

    interactions_df = interactions_df.with_columns(
        pl.arange(0, pl.count(), step=1).over('cookie').alias('pos')
    )
    
    pair_df = interactions_df.join(
        interactions_df, 
        on='cookie', 
        how='inner', 
        suffix='_j' 
    ).filter(
        (pl.col('pos_j') > pl.col('pos')) &
        ((pl.col('pos_j') - pl.col('pos')) <= max_pos_diff)
    ).with_columns([
        (pl.col('pos_j') - pl.col('pos')).alias('pos_diff'),
        (pl.col('event_date_j') - pl.col('event_date')).alias('time_diff')
    ])

    max_ts = interactions_df['event_date'].max()

    pos   = w_pos / (1 + pl.col('pos_diff'))
    time  = w_time / (1 + pl.col('time_diff').log1p())
    recency = 1 + w_recency * (1 - (max_ts - pl.col('event_date_j')) / max_ts)
    
    pair_df = pair_df.with_columns(
        (pos * time * recency).alias('pair_weight')
    )

    i2i_scores = pair_df.group_by(['node', 'node_j']).agg(
        pl.sum('pair_weight').alias('score')
    )
    
    i2i_symm = pl.concat([
        i2i_scores,
        i2i_scores.rename({'node': 'node_j', 'node_j': 'node'}).select(['node', 'node_j', 'score'])
    ])
    i2i_scores = i2i_symm.group_by(['node','node_j']).agg(
        pl.sum('score').alias('score')    # sum if a pair appeared in both directions
    )
    
    item_freq = interactions_df.group_by('node').agg(pl.count().alias('freq'))
    i2i_scores = i2i_scores.join(item_freq, left_on='node_j', right_on='node', how='left')
    i2i_scores = i2i_scores.with_columns(
        (pl.col('score') / pl.col('freq').pow(pop_pow)).alias('i2i_score')   # e.g. divide by sqrt(freq of item_j)
    ).select(['node', 'node_j', 'i2i_score'])

    N = 500
    i2i_topN = (
        i2i_scores.sort('i2i_score', descending=True)
                  .group_by('node')
                  .head(N)
    )

    full_hist = (
        interactions_df
        .select(['cookie', 'node'])
        .unique()
    )
    
    lastN = (
        interactions_df
        .sort(['cookie', 'event_date'], descending=True)
        .group_by('cookie')
        .head(N_LAST)
        .with_columns(
            pl.int_range(1, pl.len() + 1).over('cookie').alias('recency_rank') #pl.cum_count().over('cookie').alias('recency_rank')
        )
    )
    
    lastN = (
        lastN
        .group_by(['cookie', 'node'])
        .agg(pl.min('recency_rank').alias('recency_rank'))
        .with_columns(
            (DECAY_BASE ** pl.col('recency_rank')).alias('recency_weight')
        )
    )
    
    user_cand = (
        lastN
        .join(i2i_topN,
              left_on='node',
              right_on='node',
              how='inner')
        .with_columns(
            (pl.col('i2i_score') * pl.col('recency_weight')).alias('weighted_i2i')
        )
    )
    
    user_cand = (
        user_cand
        .join(
            full_hist,
            how='anti',
            left_on=['cookie', 'node_j'], 
            right_on=['cookie', 'node']
        )
    )
    
    user_recs = (
        user_cand
        .group_by(['cookie', 'node_j'])
        .agg(pl.sum('weighted_i2i').alias('i2i_score'))
        .rename({'node_j': 'node'})
    )
    
    user_recs = (
        user_recs
        .with_columns(
            pl.col('i2i_score')
              .rank(method='dense', descending=True)
              .over('cookie')
              .alias('i2i_rank')
        )
        .filter(pl.col('i2i_rank') <= TOPK_USER)
        .sort(['cookie', 'i2i_rank'])
        .select(['cookie', 'node', 'i2i_score', 'i2i_rank'])
    )
    return user_recs

In [8]:
user_recs = i2i_model(df_train)

  pl.arange(0, pl.count(), step=1).over('cookie').alias('pos')
  item_freq = interactions_df.group_by('node').agg(pl.count().alias('freq'))


In [9]:
def compute_i2i_score_for_user(cookie_id, candidate_item):
    history_items = user_history_dict[cookie_id]
    score_sum = 0.0
    for h in history_items:
        score_sum += i2i_map.get(h, {}).get(candidate_item, 0.0)
    return score_sum

In [11]:
df_eval = df_clickstream.filter(df_clickstream['event_date']> treshold)[['cookie', 'node', 'event']]
df_eval = df_eval.join(df_train, on=['cookie', 'node'], how='anti')
df_eval = df_eval.filter(
    pl.col('event').is_in(
        df_event.filter(pl.col('is_contact')==1)['event'].unique()
    )
)
df_eval = df_eval.filter(
        pl.col('cookie').is_in(df_train['cookie'].unique())
    ).filter(
        pl.col('node').is_in(df_train['node'].unique())
    )
df_eval = df_eval.unique(['cookie', 'node'])

In [12]:
recall_at(df_eval, user_recs, k=300)

0.3907971781291905

In [13]:
recall_at(df_eval, user_recs.filter(pl.col('i2i_rank') <=40), k=40)

0.1372568479345053

In [14]:
# user_recs.select('cookie', 'node', 'i2i_score').write_parquet('retrieval_data/i2i_28d.pq')

In [15]:
user_recs

cookie,node,i2i_score,i2i_rank
i64,u32,f64,u64
0,130796,29.306155,1
0,115704,24.967123,2
0,116118,24.283344,3
0,130802,20.651484,4
0,170538,19.202336,5
…,…,…,…
149999,214199,4.167754,296
149999,15,4.143768,297
149999,152033,4.135192,298
149999,51164,4.134414,299


In [16]:
----

SyntaxError: invalid syntax (2133496677.py, line 1)

In [None]:
%matplotlib inline
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

# Optional: Configure logging to see Optuna messages
import logging
optuna.logging.set_verbosity(logging.INFO)

In [None]:
def objective(trial):
    # Suggest hyperparameters
    max_pos_diff = trial.suggest_int('max_pos_diff', 30, 50)
    w_pos        = trial.suggest_float('w_pos', 1.0, 5.0)
    w_time       = trial.suggest_float('w_time', 1.0, 5.0)
    w_recency    = trial.suggest_float('w_recency', 0.0, 2.0)
    pop_pow      = 0.365 # trial.suggest_float('pop_pow', 0.01, 0.99)
    N_LAST       = trial.suggest_int('N_LAST', 30, 60)
    TOPK_USER    = 300  # fixed evaluation cutoff
    DECAY_BASE   = trial.suggest_float('DECAY_BASE', 0.8, 1.0)

    user_recs = i2i_model(
        df_train,
        max_pos_diff=max_pos_diff,
        w_pos=w_pos,
        w_time=w_time,
        w_recency=w_recency,
        pop_pow=pop_pow,
        N_LAST=N_LAST,
        TOPK_USER=TOPK_USER,
        DECAY_BASE=DECAY_BASE
    )

    return recall_at(df_eval, user_recs, k=TOPK_USER)

In [None]:
study = optuna.create_study(direction='maximize')

def print_callback(study, trial):
    print(f"Trial {trial.number}: Value={trial.value:.5f}, Params={trial.params}")

study.optimize(
    objective,
    n_trials=50,
    callbacks=[print_callback],
    show_progress_bar=True  # requires Optuna >= 3.1.0
)

In [None]:
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best recall: {study.best_value:.5f}")
print("Best parameters:")
for key, val in study.best_params.items():
    print(f"- {key}: {val}")

# Plot optimization history
plot_optimization_history(study)

In [None]:
plot_param_importances(study)

In [None]:
Number of finished trials: 100
Best recall: 0.37856
Best parameters:
- max_pos_diff: 45
- w_pos: 1.718759727352264
- w_time: 2.3933322742952314
- w_recency: 0.39059527613202566
- pop_pow: 0.36497800607640746
- N_LAST: 59
- DECAY_BASE: 0.9750338237166386