In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import polars as pl
import implicit
import lightgbm as lgb
from itertools import groupby

from utils import create_features, recall_at, fit_lgb_ranker, fit_catboost_ranker
from constants import FILES, ORDER, read_any

In [3]:
DATA_DIR = 'retrieval_data/'
SUBSAMPLE = 0.03

In [4]:
df_test_users = pl.read_parquet(f'test_users.pq')
df_clickstream = pl.read_parquet(f'clickstream.pq')

df_cat_features = pl.read_parquet(f'cat_features.pq')
df_text_features = pl.read_parquet(f'text_features.pq')
df_event = pl.read_parquet(f'events.pq')

In [5]:
df_clickstream = df_clickstream.join(df_event, on='event', how='left')
df_clickstream = df_clickstream.join(df_cat_features, on='item', how='left')

In [6]:
def prepare_df_event_target(df_clickstream, threshold):
    treshold_train = df_clickstream['event_date'].max() - timedelta(days=threshold)
    df_events_train = df_clickstream.filter(df_clickstream['event_date']<= treshold_train)
    df_targets_train = df_clickstream.filter(
        (df_clickstream['event_date']> treshold_train) & (df_clickstream['event_date'] < treshold_train + timedelta(days=14))
    )[['cookie', 'node', 'event']]
    df_targets_train = df_targets_train.join(df_events_train, on=['cookie', 'node'], how='anti')

    df_targets_train = (
        df_targets_train
        .filter(
            pl.col('event').is_in(
                df_event.filter(pl.col('is_contact') == 1)['event'].unique()
            )
        )
        .with_columns(pl.lit(1).alias("target"))
        .filter(pl.col('cookie').is_in(df_events_train['cookie'].unique()))
        .filter(pl.col('node').is_in(df_events_train['node'].unique()))
        .unique(['cookie', 'node'])
    )
    return df_events_train, df_targets_train

In [7]:
df_events_14, df_targets_14 = prepare_df_event_target(df_clickstream, 14)
df_events_28, df_targets_28 = prepare_df_event_target(df_clickstream, 28)

In [8]:
als_all_14 = pl.read_csv(f'{DATA_DIR}/14d-back-als-all.csv')
als_contact_14 = pl.read_csv(f'{DATA_DIR}/14d-back-als-contact.csv')
als_user_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_user_emb_14d.pq')
als_item_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_item_emb_14d.pq')
als_17_user_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_17_user_emb_14d.pq')
als_17_item_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_17_item_emb_14d.pq')
i2i_seq_all_14 = pl.read_parquet(f'{DATA_DIR}/i2i_17_14d.pq')
i2i_seq_contact_14 = pl.read_parquet(f'{DATA_DIR}/i2i_11_14d.pq')
i2i_df_14 = pl.read_parquet(f'{DATA_DIR}/i2i_14d.pq')

avg_text_node_14 = pl.read_parquet(f'{DATA_DIR}/avg_text_node_14d.parquet')
avg_text_cookie_14 = pl.read_parquet(f'{DATA_DIR}/cluster_text_cookie_14d.parquet')
last_text_cookie_14 = pl.read_parquet(f'{DATA_DIR}/last_item_text_cookie_14d.parquet')
topk_avg_text_cookie_14 = pl.read_csv(f'{DATA_DIR}/cluster_text_pred_14d.csv')
topk_last_text_cookie_14 = pl.read_csv(f'{DATA_DIR}/last_item_text_pred_14d.csv')
graph_pred_14 = pl.read_parquet(f'{DATA_DIR}/top300_graph_emb_14d.pq')
graph_item_emb_14 = pl.read_parquet(f'{DATA_DIR}/item_graph_emb_14d.pq')
graph_user_emb_14 = pl.read_parquet(f'{DATA_DIR}/user_graph_emb_14d.pq')
tag_cosine_14 = pl.read_parquet(f'{DATA_DIR}/top300_tag_cosine1_14d.pq')
tag_emb_cookie_14 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_cookie_14d.pq')
tag_emb_node_14 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_node_14d.pq')
transformer_14 = pl.read_parquet(f'{DATA_DIR}/transformer2_14d.parquet')

In [9]:
als_all_28 = pl.read_csv(f'{DATA_DIR}/28d-back-als-all.csv')
als_contact_28 = pl.read_csv(f'{DATA_DIR}/28d-back-als-contact.csv')
als_user_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_user_emb_28d.pq')
als_item_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_item_emb_28d.pq')
als_17_user_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_17_user_emb_28d.pq')
als_17_item_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_17_item_emb_28d.pq')
i2i_seq_all_28 = pl.read_parquet(f'{DATA_DIR}/i2i_17_28d.pq')
i2i_seq_contact_28 = pl.read_parquet(f'{DATA_DIR}/i2i_11_28d.pq')
i2i_df_28 = pl.read_parquet(f'{DATA_DIR}/i2i_28d.pq')

avg_text_node_28 = pl.read_parquet(f'{DATA_DIR}/avg_text_node_28d.parquet')
avg_text_cookie_28 = pl.read_parquet(f'{DATA_DIR}/cluster_text_cookie_28d.parquet')
last_text_cookie_28 = pl.read_parquet(f'{DATA_DIR}/last_item_text_cookie_28d.parquet')
topk_avg_text_cookie_28 = pl.read_csv(f'{DATA_DIR}/cluster_text_pred_28d.csv')
topk_last_text_cookie_28 = pl.read_csv(f'{DATA_DIR}/last_item_text_pred_28d.csv')
graph_pred_28 = pl.read_parquet(f'{DATA_DIR}/top300_graph_emb_28d.pq')
graph_item_emb_28 = pl.read_parquet(f'{DATA_DIR}/item_graph_emb_28d.pq')
graph_user_emb_28 = pl.read_parquet(f'{DATA_DIR}/user_graph_emb_28d.pq')
tag_cosine_28 = pl.read_parquet(f'{DATA_DIR}/top300_tag_cosine1_28d.pq')
tag_emb_cookie_28 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_cookie_28d.pq')
tag_emb_node_28 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_node_28d.pq')
transformer_28 = pl.read_parquet(f'{DATA_DIR}/transformer2_28d.parquet')

In [10]:
target = 'target'
group_column = 'cookie'

In [None]:
DAYS = 14
dfs = {
    key: read_any(DATA_DIR / tpl.format(d=DAYS))
    for key, tpl in FILES.items()
}

# Build the arg list in the exact order create_features expects
feature_args = [dfs[name] for name in ORDER]

# Call as before
df_14 = create_features(
    df_events_14,
    df_targets_14,
    *feature_args,
    subsample=SUBSAMPLE
)

In [11]:
df_14 = create_features(df_events_14, 
                         df_targets_14, 
                         als_all_14, 
                         als_user_emb_14,
                         als_item_emb_14,
                         als_contact_14, 
                         als_17_user_emb_14,
                         als_17_item_emb_14,
                         i2i_seq_all_14,
                         i2i_seq_contact_14,
                         avg_text_node_14,
                         avg_text_cookie_14,
                         last_text_cookie_14,
                         topk_avg_text_cookie_14,
                         topk_last_text_cookie_14,
                         graph_pred_14,
                         graph_item_emb_14,
                         graph_user_emb_14,
                         i2i_df_14,
                         tag_cosine_14,
                         tag_emb_cookie_14,
                         tag_emb_node_14,
                         transformer_14,
                         subsample=SUBSAMPLE)

start scale
end scale
df built, retrieved 88493 items from 156112 overall, 0.5668558470841447% coverage1


In [12]:
df_28 = create_features(df_events_28, 
                         df_targets_28, 
                         als_all_28, 
                         als_user_emb_28,
                         als_item_emb_28,
                         als_contact_28, 
                         als_17_user_emb_28,
                         als_17_item_emb_28,
                         i2i_seq_all_28,
                         i2i_seq_contact_28,
                         avg_text_node_28,
                         avg_text_cookie_28,
                         last_text_cookie_28,
                         topk_avg_text_cookie_28,
                         topk_last_text_cookie_28,
                         graph_pred_28,
                         graph_item_emb_28,
                         graph_user_emb_28,
                         i2i_df_28,
                         tag_cosine_28,
                         tag_emb_cookie_28,
                         tag_emb_node_28,
                         transformer_28,
                         subsample=SUBSAMPLE)

start scale
end scale
df built, retrieved 89880 items from 154214 overall, 0.5828264619295265% coverage1


In [13]:
df_14 = df_14.with_columns(pl.lit(14).alias("lag"))
df_28 = df_28.with_columns(pl.lit(28).alias("lag"))

df = pl.concat([df_14, df_28], how="vertical")

In [14]:
df_14

node,cookie,als-all,als-17,rank_category,rank_location,count_all_pop,count_contact,i2i_11_score,i2i_11_score_right,rank,rank_last_item,rank_right,i2i_score,tag_score,proba,count_all,contact_all,contact_ratio,user_location,user_category,num_contacts,num_events,surface_unique_counts,location_unique_counts,count_location,count_category,event,target,dot_centroid,dot_last_item,dot_als,dot_als_17,dot_graph,dot_tag_score,last_node,avg_node,last_node_diff,avg_node_diff,last_contact_node,last_contact_node_diff,lag
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i32,f32,f32,f32,f32,f32,f64,u32,f64,i64,f64,u32,i64,i32
130853,72750,,,,,,,,,,,,,0.329281,,0.100178,0.100086,0.107226,7530,7,0.106363,0.124287,0.683333,0.126242,,,,0,12.332542,11.815451,0.003975,-0.000872,0.022317,0.229281,202602,72655.665871,71749,58197.334129,44437,86416,14
122327,100253,0.370635,0.146374,,,,,0.103489,,,,0.922742,0.124708,,0.106645,0.123015,0.150316,0.132813,4545,35,0.103181,0.166353,0.683333,0.145923,0.101303,,,0,12.43658,13.436519,0.911341,0.083877,1.5955,0.219215,71518,182848.848644,50809,60521.848644,234906,112579,14
345062,142064,,,,,,,,,,,,,0.295366,,0.100959,0.102257,0.135299,6199,7,0.101591,0.126437,0.683333,0.130928,0.100084,,,0,12.481132,11.023561,0.091073,0.012105,0.131555,0.195366,214253,151338.953947,130809,193723.046053,71515,273547,14
122326,53495,,0.108165,,0.128112,,,0.100987,,,,,,,,0.117282,0.16062,0.152645,5069,31,0.1,0.102382,0.433333,0.109372,0.100345,,,0,9.993291,7.513049,0.000472,0.014782,0.696836,0.0,284141,152187.547619,161815,29861.547619,,,14
71546,67272,,,,0.112048,0.36554,0.549695,0.111917,0.114198,,,0.1,0.192978,,0.111676,0.392964,0.617859,0.126531,8458,24,0.104772,0.109761,0.433333,0.114995,0.119095,,,0,12.936415,13.799674,0.042838,-0.00258,3.675938,0.001391,214316,140899.431953,142770,69353.431953,115659,44113,14
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1925,127429,,,,,,,,,,,,,,0.10356,0.105107,0.122997,0.167589,2348,24,0.112195,0.136256,0.683333,0.131865,0.115031,,10,1,10.457757,11.062087,0.079045,0.035876,1.093802,0.0,303463,133432.376,301538,131507.376,5742,3817,14
152705,117905,,,,,,,,,,,,,,0.10222,0.11449,0.115664,0.116225,3707,6,0.103181,0.104881,0.35,0.114995,0.100756,,10,1,9.423779,11.727062,0.042456,0.009466,0.404166,0.0,44019,89685.658824,108686,63019.341176,57716,94989,14
194841,57435,,,,,,,,,,,,,,0.101789,0.103536,0.124335,0.20329,4528,53,0.113786,0.180414,0.766667,0.162793,0.100227,,4,1,11.555828,14.310779,0.021335,0.001003,0.565358,0.000618,218478,216308.359567,23637,21467.359567,290456,95615,14
214221,98702,,,,,,,,,,,,,,0.113029,0.105242,0.169885,0.30007,8383,40,0.101591,0.127831,0.516667,0.161856,0.100732,,5,1,9.388001,13.153259,-0.075719,0.003423,0.546864,0.0,71521,132448.870833,142700,81772.129167,153380,60841,14


In [15]:
features = [i for i in df.columns if i not in ['event', 'target', 'lag']]
cat_features = ['user_location', 'user_category', 'node', 'cookie', 'last_node', 'last_contact_node']
features

['node',
 'cookie',
 'als-all',
 'als-17',
 'rank_category',
 'rank_location',
 'count_all_pop',
 'count_contact',
 'i2i_11_score',
 'i2i_11_score_right',
 'rank',
 'rank_last_item',
 'rank_right',
 'i2i_score',
 'tag_score',
 'proba',
 'count_all',
 'contact_all',
 'contact_ratio',
 'user_location',
 'user_category',
 'num_contacts',
 'num_events',
 'surface_unique_counts',
 'location_unique_counts',
 'count_location',
 'count_category',
 'dot_centroid',
 'dot_last_item',
 'dot_als',
 'dot_als_17',
 'dot_graph',
 'dot_tag_score',
 'last_node',
 'avg_node',
 'last_node_diff',
 'avg_node_diff',
 'last_contact_node',
 'last_contact_node_diff']

In [16]:
params_cb = {
    'loss_function': "YetiRank", 
    'iterations': 4000,
    'depth': 6,
    'learning_rate': 0.05,
    'verbose': False
}

In [17]:
als_all1 = pl.read_csv(f'{DATA_DIR}/als-all.csv')
als_contact1 = pl.read_csv(f'{DATA_DIR}/als-contact.csv')
als_user_emb1 = pl.read_parquet(f'{DATA_DIR}/als_user_emb.pq')
als_item_emb1 = pl.read_parquet(f'{DATA_DIR}/als_item_emb.pq')
als_17_user_emb1 = pl.read_parquet(f'{DATA_DIR}/als_user_emb.pq')
als_17_item_emb1 = pl.read_parquet(f'{DATA_DIR}/als_item_emb.pq')
i2i_df1 = pl.read_parquet(f'{DATA_DIR}/i2i.pq')

i2i_seq_all1 = pl.read_parquet(f'{DATA_DIR}/i2i_17.pq')
i2i_seq_contact1 = pl.read_parquet(f'{DATA_DIR}/i2i_11.pq')

avg_text_node1 = pl.read_parquet(f'{DATA_DIR}/avg_text_node.parquet')
avg_text_cookie1 = pl.read_parquet(f'{DATA_DIR}/cluster_text_cookie.parquet')
last_text_cookie1 = pl.read_parquet(f'{DATA_DIR}/last_item_text_cookie.parquet')
topk_avg_text_cookie1 = pl.read_csv(f'{DATA_DIR}/cluster_text_pred.csv')
topk_last_text_cookie1 = pl.read_csv(f'{DATA_DIR}/last_item_text_pred.csv')
graph_pred1 = pl.read_parquet(f'{DATA_DIR}/top300_graph_emb.pq')
graph_item_emb1 = pl.read_parquet(f'{DATA_DIR}/item_graph_emb.pq')
graph_user_emb1 = pl.read_parquet(f'{DATA_DIR}/user_graph_emb.pq')
tag_cosine1 = pl.read_parquet(f'{DATA_DIR}/top300_tag_cosine1.pq')
tag_emb_cookie1 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_cookie.pq')
tag_emb_node1 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_node.pq')
transformer1 = pl.read_parquet(f'{DATA_DIR}/transformer2.parquet')

In [None]:
sub_df = create_features(df_clickstream, 
                         df_test_users, 
                         als_all1, 
                         als_user_emb1,
                         als_item_emb1,
                         als_contact1, 
                         als_17_user_emb1,
                         als_17_item_emb1,
                         i2i_seq_all1,
                         i2i_seq_contact1,
                         avg_text_node1,
                         avg_text_cookie1,
                         last_text_cookie1,
                         topk_avg_text_cookie1,
                         topk_last_text_cookie1,
                         graph_pred1,
                         graph_item_emb1,
                         graph_user_emb1,
                         i2i_df1,
                         tag_cosine1,
                         tag_emb_cookie1,
                         tag_emb_node1,
                         transformer1,
                         is_submit=True)
sub_df

start scale
end scale


In [None]:
sub_df = sub_df.filter(pl.col('cookie').is_in(df_test_users))

In [27]:
for col in cat_features:
    df = df.with_columns(pl.col(col).fill_null(-1).cast(pl.Int64))
    sub_df = sub_df.with_columns(pl.col(col).fill_null(-1).cast(pl.Int64))
    df = df.with_columns(pl.col(col).cast(pl.Int64))
    sub_df = sub_df.with_columns(pl.col(col).cast(pl.Int64))

In [28]:
df

node,cookie,als-all,als-17,rank_category,rank_location,count_all_pop,count_contact,i2i_11_score,i2i_11_score_right,rank,rank_last_item,rank_right,i2i_score,tag_score,proba,count_all,contact_all,contact_ratio,user_location,user_category,num_contacts,num_events,surface_unique_counts,location_unique_counts,count_location,count_category,event,target,dot_centroid,dot_last_item,dot_als,dot_als_17,dot_graph,dot_tag_score,last_node,avg_node,last_node_diff,avg_node_diff,last_contact_node,last_contact_node_diff,lag
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i32,f32,f32,f32,f32,f32,f64,i64,f64,i64,f64,i64,i64,i32
130853,72750,,,,,,,,,,,,,0.329281,,0.100178,0.100086,0.107226,7530,7,0.106363,0.124287,0.683333,0.126242,,,,0,12.332542,11.815451,0.003975,-0.000872,0.022317,0.229281,202602,72655.665871,71749,58197.334129,44437,86416,14
122327,100253,0.370635,0.146374,,,,,0.103489,,,,0.922742,0.124708,,0.106645,0.123015,0.150316,0.132813,4545,35,0.103181,0.166353,0.683333,0.145923,0.101303,,,0,12.43658,13.436519,0.911341,0.083877,1.5955,0.219215,71518,182848.848644,50809,60521.848644,234906,112579,14
345062,142064,,,,,,,,,,,,,0.295366,,0.100959,0.102257,0.135299,6199,7,0.101591,0.126437,0.683333,0.130928,0.100084,,,0,12.481132,11.023561,0.091073,0.012105,0.131555,0.195366,214253,151338.953947,130809,193723.046053,71515,273547,14
122326,53495,,0.108165,,0.128112,,,0.100987,,,,,,,,0.117282,0.16062,0.152645,5069,31,0.1,0.102382,0.433333,0.109372,0.100345,,,0,9.993291,7.513049,0.000472,0.014782,0.696836,0.0,284141,152187.547619,161815,29861.547619,-1,,14
71546,67272,,,,0.112048,0.36554,0.549695,0.111917,0.114198,,,0.1,0.192978,,0.111676,0.392964,0.617859,0.126531,8458,24,0.104772,0.109761,0.433333,0.114995,0.119095,,,0,12.936415,13.799674,0.042838,-0.00258,3.675938,0.001391,214316,140899.431953,142770,69353.431953,115659,44113,14
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
161726,14833,,,,,,,,,,,,,,0.105899,0.100803,0.102431,0.144565,2348,59,0.1,0.101763,0.190909,0.109901,0.101,,10,1,11.73789,11.241702,0.004364,,0.073187,0.0,214216,175978.333333,52490,14252.333333,-1,,28
153137,58731,,,,,,,,,,,,,,0.104285,0.111154,0.110041,0.113272,6798,51,0.107114,0.147402,0.463636,0.223212,0.100598,,10,1,11.758527,13.268492,0.004505,-0.029466,0.153475,0.49999,120263,204075.862445,32874,50938.862445,230739,77602,28
43169,72842,,,,,,,,,,,,,,0.119954,0.109603,0.118571,0.128514,2348,40,0.113211,0.128213,0.463636,0.183608,0.105317,,15,1,9.220644,10.885252,0.005933,0.021991,0.219829,0.009468,243180,191530.197802,200011,148361.197802,152470,109301,28
122397,62571,,,,,,,,,,,,,,0.104936,0.112855,0.126241,0.130099,1033,57,0.115244,0.110269,0.372727,0.127503,0.100097,,10,1,10.449804,13.676293,-0.035758,0.010198,-0.040163,0.000001,114231,182555.28,8166,60158.28,113966,8431,28


In [29]:
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'num_boost_round': 150,
    'num_leaves': 63,
    'max_bin': 255,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 5.0,
    'is_enable_sparse': True,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'force_row_wise': True,    # Important for ranking tasks
}

ranker = fit_lgb_ranker(df, features, cat_features, params, group_column=['lag', 'cookie'])

predictions = ranker.predict(sub_df[features])

sub_df = sub_df.with_columns(pred=pl.Series(predictions))

top_40_per_cookie = (
    sub_df
    .sort(["cookie", "pred"], descending=[False, True])
    .group_by("cookie", maintain_order=True)
    .head(40)
)
top_40_per_cookie.select('cookie', 'node', 'pred')



[LightGBM] [Info] Total Bins 9148
[LightGBM] [Info] Number of data points in the train set: 4980370, number of used features: 39




cookie,node,pred
i64,i64,f64
1,214377,2.137132
1,214339,2.110812
1,243177,2.106701
1,229418,2.065049
1,229353,2.036372
…,…,…
149999,120273,1.524545
149999,5746,1.489668
149999,130598,1.478941
149999,1906,1.442367


In [32]:
top_40_per_cookie

cookie,node,als-all,als-17,rank_category,rank_location,count_all_pop,count_contact,i2i_11_score,i2i_11_score_right,rank,rank_last_item,rank_right,i2i_score,tag_score,proba,count_all,contact_all,contact_ratio,user_location,user_category,num_contacts,num_events,surface_unique_counts,location_unique_counts,count_location,count_category,dot_centroid,dot_last_item,dot_als,dot_als_17,dot_graph,dot_tag_score,last_node,avg_node,last_node_diff,avg_node_diff,last_contact_node,last_contact_node_diff,pred
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f32,f32,f32,f32,f32,f64,i64,f64,i64,f64,i64,i64,f64
1,214377,0.388399,0.155472,,1.059839,,,0.10074,0.11253,0.105025,,0.638462,0.153936,,,0.119365,0.196737,0.176642,5675,51,0.127017,0.300717,0.6,0.36383,0.100262,,9.602156,6.799148,0.79495,0.79495,1.777191,0.054455,238794,190473.194553,24417,23903.805447,230740,16363,2.137132
1,214339,0.482473,,,,,,,0.115062,,,0.501338,0.144688,,0.105784,0.109871,0.196271,0.249628,5675,51,0.127017,0.300717,0.6,0.36383,0.100066,,11.904276,8.087865,1.054257,1.054257,2.147449,0.025495,238794,190473.194553,24455,23865.805447,230740,16401,2.110812
1,243177,0.440659,0.155265,0.220482,,,,0.101277,0.119614,,,,0.166316,0.688926,,0.109232,0.125184,0.141854,5675,51,0.127017,0.300717,0.6,0.36383,0.100159,0.109232,9.823614,3.14852,0.939,0.939,1.239473,0.588926,238794,190473.194553,4383,52703.805447,230740,12437,2.106701
1,229418,0.431561,,0.184337,,,,0.101598,0.114017,0.336181,,,0.162843,,,0.111778,0.129303,0.138172,5675,51,0.127017,0.300717,0.6,0.36383,0.100093,0.111778,10.868161,6.319442,0.913922,0.913922,1.701562,0.230218,238794,190473.194553,9376,38944.805447,230740,1322,2.065049
1,229353,0.362099,0.179283,0.176305,,,,0.102027,0.121665,,,,0.140902,,,0.112266,0.133828,0.142312,5675,51,0.127017,0.300717,0.6,0.36383,0.100154,0.112266,11.037426,6.203959,0.722457,0.722457,1.164517,0.325752,238794,190473.194553,9441,38879.805447,230740,1387,2.036372
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
149999,120273,0.288416,0.118965,,0.537751,,,,0.121705,,,,0.158768,,0.101327,0.112002,0.124342,0.131118,8458,7,0.100386,0.134291,0.516667,0.125532,0.101797,,12.646255,14.89689,0.519358,0.519358,0.860745,0.067635,120251,101450.143949,22,18822.856051,51162,69111,1.524545
149999,5746,0.269109,0.125223,,1.051807,,,0.104391,0.117508,,,1.1,0.138763,,0.101625,0.128902,0.164994,0.134501,8458,7,0.100386,0.134291,0.516667,0.125532,0.100945,,11.08381,13.622387,0.46614,0.46614,0.924175,0.047818,120251,101450.143949,114505,95704.143949,51162,45416,1.489668
149999,130598,0.420031,0.125906,,,,,0.104992,0.132583,,,0.220401,0.248631,0.38492,0.106327,0.127536,0.118499,0.110307,8458,7,0.100386,0.134291,0.516667,0.125532,0.10031,,11.730777,10.958941,0.882142,0.882142,1.764718,0.28492,120251,101450.143949,10347,29147.856051,51162,79436,1.478941
149999,1906,0.221676,0.150715,,0.678313,0.11451,,0.108449,0.142569,,,0.344147,0.181022,,0.114851,0.150601,0.190655,0.127487,8458,7,0.100386,0.134291,0.516667,0.125532,0.101425,,10.790383,11.376149,0.335394,0.335394,1.699379,0.000751,120251,101450.143949,118345,99544.143949,51162,49256,1.442367


In [31]:
top_40_per_cookie.select('cookie', 'node', 'pred').write_csv('pred_14d_lgb1.csv')

In [None]:
# model_cb = fit_catboost_ranker(df, features, cat_features, params_cb, group_column=['lag', 'cookie']) # group_column=['lag', 'cookie']
# predictions_cb = model_cb.predict(sub_df[features].to_pandas())
# sub_df = sub_df.with_columns(pred=pl.Series(predictions_cb))

# top_40_per_cookie_cb = (
#     sub_df
#     .sort(["cookie", "pred"], descending=[False, True])
#     .group_by("cookie", maintain_order=True)
#     .head(40)
# )
# top_40_per_cookie_cb = top_40_per_cookie_cb.select('cookie', 'node', 'pred')
# top_40_per_cookie_cb.write_csv('pred_ranker_14d_cb1.csv')

In [None]:
top_40_per_cookie_cb

In [None]:
# 1	243177	3.408451
# 1	214377	3.349749
# 1	214297	3.18459
# 1	229316	3.144853
# 1	229353	2.988165
# …	…	…
# 149999	214260	2.581512
# 149999	214261	2.561505
# 149999	71546	2.553889
# 149999	214460	2.548078
# 149999	120504	2.544617

In [None]:
---

In [None]:
corr = df.select(features + ['target']).fill_null(0).corr()

# Convert to pandas DataFrame for Seaborn
corr_pd = corr.to_pandas()

# Create heatmap with proper axis labels
plt.figure(figsize=(10, 8))
heatmap = sns.heatmap(
    corr_pd, 
    annot=True, 
    cmap='coolwarm', 
    center=0,
    fmt=".2f",
    linewidths=.5,
    annot_kws={"size": 10}
)

# Explicitly set both x and y labels
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=45, horizontalalignment='right')
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0)  # y-axis labels horizontal

plt.title('Correlation Matrix Heatmap')
plt.tight_layout()  # Ensures labels don't get cut off
plt.show()

In [None]:
corr_pd = (
    df
    .select(features + ['target'])
    .drop('user_location', 'user_category')
    .fill_null(0)               # or .fillna(0) if pandas
    .corr()
    .to_pandas()
)

# Ensure the index really is your feature names:
labels = list(corr_pd.columns)
corr_pd.index = labels

# 2) Set up figure + axis
fig, ax = plt.subplots(figsize=(10, 8))

# 3) Plot heatmap with no built-in ticks
sns.heatmap(
    corr_pd,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    annot_kws={"size":10},
    xticklabels=False,
    yticklabels=False,
    ax=ax
)

# 4) Manually place ticks at cell centers
n = len(labels)
tick_positions = np.arange(n) + 0.5

ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)

# 5) Assign your labels
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_yticklabels(labels, rotation=0, va='center')

# 6) Force ticks to bottom & left
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()

ax.set_title('Correlation Matrix Heatmap')
plt.tight_layout()
plt.show()

In [None]:
df

In [None]:
1

In [22]:
1

1