In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import polars as pl
import implicit
import lightgbm as lgb
from itertools import groupby
from pathlib import Path

from utils import create_features, recall_at, fit_lgb_ranker, fit_catboost_ranker
from constants import FILES, ORDER, read_any

In [3]:
DATA_DIR = 'retrieval_data/'
SUBSAMPLE = 0.01

In [4]:
df_test_users = pl.read_parquet(f'test_users.pq')
df_clickstream = pl.read_parquet(f'clickstream.pq')

df_cat_features = pl.read_parquet(f'cat_features.pq')
df_text_features = pl.read_parquet(f'text_features.pq')
df_event = pl.read_parquet(f'events.pq')

In [5]:
df_clickstream = df_clickstream.join(df_event, on='event', how='left')
df_clickstream = df_clickstream.join(df_cat_features, on='item', how='left')

In [6]:
def prepare_df_event_target(df_clickstream, threshold):
    treshold_train = df_clickstream['event_date'].max() - timedelta(days=threshold)
    df_events_train = df_clickstream.filter(df_clickstream['event_date']<= treshold_train)
    df_targets_train = df_clickstream.filter(
        (df_clickstream['event_date']> treshold_train) & (df_clickstream['event_date'] < treshold_train + timedelta(days=14))
    )[['cookie', 'node', 'event']]
    df_targets_train = df_targets_train.join(df_events_train, on=['cookie', 'node'], how='anti')

    df_targets_train = (
        df_targets_train
        .filter(
            pl.col('event').is_in(
                df_event.filter(pl.col('is_contact') == 1)['event'].unique()
            )
        )
        .with_columns(pl.lit(1).alias("target"))
        .filter(pl.col('cookie').is_in(df_events_train['cookie'].unique()))
        .filter(pl.col('node').is_in(df_events_train['node'].unique()))
        .unique(['cookie', 'node'])
    )
    return df_events_train, df_targets_train

In [7]:
df_events_14, df_targets_14 = prepare_df_event_target(df_clickstream, 14)
df_events_28, df_targets_28 = prepare_df_event_target(df_clickstream, 28)

In [8]:
# als_all_14 = pl.read_csv(f'{DATA_DIR}/14d-back-als-all.csv')
# als_contact_14 = pl.read_csv(f'{DATA_DIR}/14d-back-als-contact.csv')
# als_user_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_user_emb_14d.pq')
# als_item_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_item_emb_14d.pq')
# als_17_user_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_17_user_emb_14d.pq')
# als_17_item_emb_14 = pl.read_parquet(f'{DATA_DIR}/als_17_item_emb_14d.pq')
# i2i_seq_all_14 = pl.read_parquet(f'{DATA_DIR}/i2i_17_14d.pq')
# i2i_seq_contact_14 = pl.read_parquet(f'{DATA_DIR}/i2i_11_14d.pq')
# i2i_df_14 = pl.read_parquet(f'{DATA_DIR}/i2i_14d.pq')

# avg_text_node_14 = pl.read_parquet(f'{DATA_DIR}/avg_text_node_14d.parquet')
# avg_text_cookie_14 = pl.read_parquet(f'{DATA_DIR}/cluster_text_cookie_14d.parquet')
# last_text_cookie_14 = pl.read_parquet(f'{DATA_DIR}/last_item_text_cookie_14d.parquet')
# topk_avg_text_cookie_14 = pl.read_csv(f'{DATA_DIR}/cluster_text_pred_14d.csv')
# topk_last_text_cookie_14 = pl.read_csv(f'{DATA_DIR}/last_item_text_pred_14d.csv')
# graph_pred_14 = pl.read_parquet(f'{DATA_DIR}/top300_graph_emb_14d.pq')
# graph_item_emb_14 = pl.read_parquet(f'{DATA_DIR}/item_graph_emb_14d.pq')
# graph_user_emb_14 = pl.read_parquet(f'{DATA_DIR}/user_graph_emb_14d.pq')
# tag_cosine_14 = pl.read_parquet(f'{DATA_DIR}/top300_tag_cosine1_14d.pq')
# tag_emb_cookie_14 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_cookie_14d.pq')
# tag_emb_node_14 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_node_14d.pq')
# transformer_14 = pl.read_parquet(f'{DATA_DIR}/transformer2_14d.parquet')

In [9]:
# als_all_28 = pl.read_csv(f'{DATA_DIR}/28d-back-als-all.csv')
# als_contact_28 = pl.read_csv(f'{DATA_DIR}/28d-back-als-contact.csv')
# als_user_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_user_emb_28d.pq')
# als_item_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_item_emb_28d.pq')
# als_17_user_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_17_user_emb_28d.pq')
# als_17_item_emb_28 = pl.read_parquet(f'{DATA_DIR}/als_17_item_emb_28d.pq')
# i2i_seq_all_28 = pl.read_parquet(f'{DATA_DIR}/i2i_17_28d.pq')
# i2i_seq_contact_28 = pl.read_parquet(f'{DATA_DIR}/i2i_11_28d.pq')
# i2i_df_28 = pl.read_parquet(f'{DATA_DIR}/i2i_28d.pq')

# avg_text_node_28 = pl.read_parquet(f'{DATA_DIR}/avg_text_node_28d.parquet')
# avg_text_cookie_28 = pl.read_parquet(f'{DATA_DIR}/cluster_text_cookie_28d.parquet')
# last_text_cookie_28 = pl.read_parquet(f'{DATA_DIR}/last_item_text_cookie_28d.parquet')
# topk_avg_text_cookie_28 = pl.read_csv(f'{DATA_DIR}/cluster_text_pred_28d.csv')
# topk_last_text_cookie_28 = pl.read_csv(f'{DATA_DIR}/last_item_text_pred_28d.csv')
# graph_pred_28 = pl.read_parquet(f'{DATA_DIR}/top300_graph_emb_28d.pq')
# graph_item_emb_28 = pl.read_parquet(f'{DATA_DIR}/item_graph_emb_28d.pq')
# graph_user_emb_28 = pl.read_parquet(f'{DATA_DIR}/user_graph_emb_28d.pq')
# tag_cosine_28 = pl.read_parquet(f'{DATA_DIR}/top300_tag_cosine1_28d.pq')
# tag_emb_cookie_28 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_cookie_28d.pq')
# tag_emb_node_28 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_node_28d.pq')
# transformer_28 = pl.read_parquet(f'{DATA_DIR}/transformer2_28d.parquet')

In [10]:
target = 'target'
group_column = 'cookie'

In [11]:
DAYS = '14'
dfs = {
    key: read_any(Path(DATA_DIR) / tpl.format(d=DAYS))
    for key, tpl in FILES.items()
}

feature_args = [dfs[name] for name in ORDER]

df_14 = create_features(
    df_events_14,
    df_targets_14,
    *feature_args,
    subsample=SUBSAMPLE
)

df built, retrieved 88502 items from 156112 overall, 0.5669134980014349% coverage


In [12]:
df_14

node,cookie,als-all,als-17,rank_category,rank_location,count_all_pop,count_contact,i2i_11_score,i2i_11_score_right,rank,rank_last_item,rank_right,i2i_score,tag_score,proba,count_all,contact_all,contact_ratio,user_location,user_category,num_contacts,num_events,surface_unique_counts,location_unique_counts,count_location,count_category,event,target,dot_centroid,dot_last_item,dot_als,dot_als_17,dot_graph,last_node,avg_node,last_node_diff,avg_node_diff,last_contact_node,last_contact_node_diff
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i32,f32,f32,f32,f32,f32,u32,f64,i64,f64,u32,i64
130821,80407,,,,,,,,,,,,0.133383,,,0.10237,0.104047,0.125627,2551,19,0.104772,0.103428,0.516667,0.112184,0.1,,,0,9.011393,11.210859,0.024172,-0.002015,0.219741,116995,116823.85,13826,13997.15,116986,13835
156363,69287,,,,,,,,,0.692965,,,,,,0.100389,0.100819,0.131569,4283,19,0.102651,0.103254,0.433333,0.110309,0.100019,,,0,12.028119,13.208828,0.002818,-0.000888,0.015816,116123,110769.157895,40240,45593.842105,116123,40240
255802,104572,,,,,,,,,,,,,0.713323,,0.100013,0.1,0.1,2348,51,0.109014,0.215856,0.6,0.277132,0.100061,0.100013,,0,,,0.005256,,,152109,199099.388471,103693,56702.611529,142970,112832
155668,100250,0.178002,0.119847,,,,,,,,,,,,,0.10201,0.106415,0.147898,2348,28,0.108484,0.149503,0.433333,0.186223,0.102976,,,0,10.67544,7.112644,0.262669,0.035907,0.170702,6489,99746.460727,149179,55921.539273,310791,155123
173381,145048,,,,,,,,,,,,,0.681536,,0.100078,0.100114,0.121787,166,49,0.119618,0.169258,0.516667,0.263074,,,,0,9.388408,9.195368,0.096364,0.000637,,130594,135599.307628,42787,37781.692372,106480,66901
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
214224,45427,,,,,,,,,,,,,,0.103807,0.102963,0.119791,0.200229,7530,57,0.10106,0.102324,0.183333,0.104686,0.100028,,5,1,10.56519,9.353305,0.008017,0.001134,0.64473,199251,191278.682927,14973,22945.317073,199251,14973
199254,48499,,,,,,,,,,,,,,0.101409,0.106502,0.123831,0.155009,4283,32,0.103712,0.123531,0.433333,0.159981,0.100604,,4,1,9.631543,10.54518,0.015623,-0.032867,0.288628,114229,137062.783251,85025,62191.216749,171356,27898
196488,21280,,,,,,,,,,,,,,0.102498,0.101203,0.105988,0.174692,6858,12,0.10106,0.103196,0.183333,0.124367,0.100011,,4,1,8.520979,9.908524,-0.005833,0.000133,0.059938,153748,153541.303571,42740,42946.696429,342310,145822
196646,94030,,,,,,,,,,,,,,0.101697,0.101944,0.107063,0.154531,6793,49,0.12386,0.117256,0.433333,0.135614,,,19,1,11.538765,9.56947,0.043247,0.011346,-0.128491,97686,104983.932886,98960,91662.067114,97686,98960


In [13]:
DAYS = '28'
dfs = {
    key: read_any(Path(DATA_DIR) / tpl.format(d=DAYS))
    for key, tpl in FILES.items()
}

feature_args = [dfs[name] for name in ORDER]

df_28 = create_features(
    df_events_28,
    df_targets_28,
    *feature_args,
    subsample=SUBSAMPLE
)

df built, retrieved 89893 items from 154214 overall, 0.5829107603719507% coverage


In [14]:
df_14 = df_14.with_columns(pl.lit(14).alias("lag"))
df_28 = df_28.with_columns(pl.lit(28).alias("lag"))

df = pl.concat([df_14, df_28], how="vertical")

In [15]:
features = [i for i in df.columns if i not in ['event', 'target', 'lag']]
cat_features = ['user_location', 'user_category', 'node', 'cookie', 'last_node', 'last_contact_node']

In [16]:
als_all1 = pl.read_csv(f'{DATA_DIR}/als-all.csv')
als_contact1 = pl.read_csv(f'{DATA_DIR}/als-contact.csv')
als_user_emb1 = pl.read_parquet(f'{DATA_DIR}/als_user_emb.pq')
als_item_emb1 = pl.read_parquet(f'{DATA_DIR}/als_item_emb.pq')
als_17_user_emb1 = pl.read_parquet(f'{DATA_DIR}/als_user_emb.pq')
als_17_item_emb1 = pl.read_parquet(f'{DATA_DIR}/als_item_emb.pq')
i2i_df1 = pl.read_parquet(f'{DATA_DIR}/i2i.pq')

i2i_seq_all1 = pl.read_parquet(f'{DATA_DIR}/i2i_17.pq')
i2i_seq_contact1 = pl.read_parquet(f'{DATA_DIR}/i2i_11.pq')

avg_text_node1 = pl.read_parquet(f'{DATA_DIR}/avg_text_node.parquet')
avg_text_cookie1 = pl.read_parquet(f'{DATA_DIR}/cluster_text_cookie.parquet')
last_text_cookie1 = pl.read_parquet(f'{DATA_DIR}/last_item_text_cookie.parquet')
topk_avg_text_cookie1 = pl.read_csv(f'{DATA_DIR}/cluster_text_pred.csv')
topk_last_text_cookie1 = pl.read_csv(f'{DATA_DIR}/last_item_text_pred.csv')
graph_pred1 = pl.read_parquet(f'{DATA_DIR}/top300_graph_emb.pq')
graph_item_emb1 = pl.read_parquet(f'{DATA_DIR}/item_graph_emb.pq')
graph_user_emb1 = pl.read_parquet(f'{DATA_DIR}/user_graph_emb.pq')
tag_cosine1 = pl.read_parquet(f'{DATA_DIR}/top300_tag_cosine1.pq')
tag_emb_cookie1 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_cookie.pq')
tag_emb_node1 = pl.read_parquet(f'{DATA_DIR}/tag_cosine_node.pq')
transformer1 = pl.read_parquet(f'{DATA_DIR}/transformer2.parquet')

In [17]:
sub_df = create_features(df_clickstream, 
                         df_test_users, 
                         als_all1, 
                         als_user_emb1,
                         als_item_emb1,
                         als_contact1, 
                         als_17_user_emb1,
                         als_17_item_emb1,
                         i2i_seq_all1,
                         i2i_seq_contact1,
                         avg_text_node1,
                         avg_text_cookie1,
                         last_text_cookie1,
                         topk_avg_text_cookie1,
                         topk_last_text_cookie1,
                         graph_pred1,
                         graph_item_emb1,
                         graph_user_emb1,
                         i2i_df1,
                         tag_cosine1,
                         tag_emb_cookie1,
                         tag_emb_node1,
                         transformer1,
                         is_submit=True)
sub_df

node,cookie,als-all,als-17,rank_category,rank_location,count_all_pop,count_contact,i2i_11_score,i2i_11_score_right,rank,rank_last_item,rank_right,i2i_score,tag_score,proba,count_all,contact_all,contact_ratio,user_location,user_category,num_contacts,num_events,surface_unique_counts,location_unique_counts,count_location,count_category,dot_centroid,dot_last_item,dot_als,dot_als_17,dot_graph,last_node,avg_node,last_node_diff,avg_node_diff,last_contact_node,last_contact_node_diff
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f32,f32,f32,f32,f32,u32,f64,i64,f64,u32,i64
229316,1,0.505412,0.195368,0.192369,,,,0.100931,0.112338,,,,0.153325,,,0.111226,0.130475,0.141649,5675,51,0.127017,0.300717,0.6,0.36383,0.100008,0.111226,10.493627,7.036992,1.117484,1.117484,1.43972,238794,190473.194553,9478,38842.805447,230740,1424
214198,1,0.497455,0.155515,,0.473494,0.213738,0.402549,0.101141,0.106435,,,0.434448,,,0.103222,0.246195,0.488616,0.140784,5675,51,0.127017,0.300717,0.6,0.36383,0.100594,,9.571742,7.381647,1.095552,1.095552,3.002574,238794,190473.194553,24596,23724.805447,230740,16542
214233,1,0.488634,,,0.887149,,,,0.106406,0.547236,,0.237124,0.184547,,0.110062,0.126851,0.167427,0.138527,5675,51,0.127017,0.300717,0.6,0.36383,0.100318,,9.351207,5.281264,1.071239,1.071239,4.134598,238794,190473.194553,24561,23759.805447,230740,16507
214339,1,0.482473,,,,,,,0.115062,,,0.501338,0.144688,,0.105784,0.109871,0.196271,0.249628,5675,51,0.127017,0.300717,0.6,0.36383,0.100066,,11.904276,8.087865,1.054257,1.054257,2.147449,238794,190473.194553,24455,23865.805447,230740,16401
152705,1,0.45853,0.155079,,,,,0.100746,0.110226,,,,0.213652,,,0.11416,0.115652,0.11696,5675,51,0.127017,0.300717,0.6,0.36383,0.10005,,8.708915,1.46323,0.988261,0.988261,1.417004,238794,190473.194553,86089,37768.194553,230740,78035
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
199254,38860,,,,,,,,,,,,,,0.102578,0.10588,0.120032,0.152266,6230,19,0.100772,0.105511,0.433333,0.111915,0.100022,,6.14558,7.154558,-0.031062,-0.031062,0.487246,225016,150727.755906,25762,48526.244094,117336,81918
197431,8908,,,,,,,,,,,,,,0.102641,0.10188,0.111764,0.196016,2348,52,0.10386,0.10678,0.516667,0.113617,0.10124,,9.571344,11.21903,-0.009889,-0.009889,0.290966,218480,170475.942308,21049,26955.057692,170538,26893
187771,80887,,,,,,,,,,,,,,0.102498,0.111819,0.139561,0.151356,2105,6,0.1,0.100875,0.183333,0.105957,0.100164,,10.553353,12.95696,0.014019,0.014019,0.097078,336400,116043.619048,148629,71727.380952,,
196329,50930,,,,,,,,,,,,,,0.106097,0.111314,0.144925,0.16092,8256,40,0.1,0.1014,0.35,0.105106,0.101125,,11.11009,11.014272,0.000093,0.000093,0.955452,214223,153859.212121,17894,42469.787879,,


In [23]:
DAYS = '0'
dfs = {
    key: read_any(Path(DATA_DIR) / tpl.format(d=DAYS))
    for key, tpl in FILES.items()
}

feature_args = [dfs[name] for name in ORDER]

sub_df = create_features(
    df_clickstream,
    df_test_users,
    *feature_args,
    subsample=SUBSAMPLE,
    is_submit=True
)

In [18]:
sub_df = sub_df.filter(pl.col('cookie').is_in(df_test_users))

In [19]:
for col in cat_features:
    df = df.with_columns(pl.col(col).fill_null(-1).cast(pl.Int64))
    sub_df = sub_df.with_columns(pl.col(col).fill_null(-1).cast(pl.Int64))
    df = df.with_columns(pl.col(col).cast(pl.Int64))
    sub_df = sub_df.with_columns(pl.col(col).cast(pl.Int64))

In [20]:
df

node,cookie,als-all,als-17,rank_category,rank_location,count_all_pop,count_contact,i2i_11_score,i2i_11_score_right,rank,rank_last_item,rank_right,i2i_score,tag_score,proba,count_all,contact_all,contact_ratio,user_location,user_category,num_contacts,num_events,surface_unique_counts,location_unique_counts,count_location,count_category,event,target,dot_centroid,dot_last_item,dot_als,dot_als_17,dot_graph,last_node,avg_node,last_node_diff,avg_node_diff,last_contact_node,last_contact_node_diff,lag
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i32,f32,f32,f32,f32,f32,i64,f64,i64,f64,i64,i64,i32
130821,80407,,,,,,,,,,,,0.133383,,,0.10237,0.104047,0.125627,2551,19,0.104772,0.103428,0.516667,0.112184,0.1,,,0,9.011393,11.210859,0.024172,-0.002015,0.219741,116995,116823.85,13826,13997.15,116986,13835,14
156363,69287,,,,,,,,,0.692965,,,,,,0.100389,0.100819,0.131569,4283,19,0.102651,0.103254,0.433333,0.110309,0.100019,,,0,12.028119,13.208828,0.002818,-0.000888,0.015816,116123,110769.157895,40240,45593.842105,116123,40240,14
255802,104572,,,,,,,,,,,,,0.713323,,0.100013,0.1,0.1,2348,51,0.109014,0.215856,0.6,0.277132,0.100061,0.100013,,0,,,0.005256,,,152109,199099.388471,103693,56702.611529,142970,112832,14
155668,100250,0.178002,0.119847,,,,,,,,,,,,,0.10201,0.106415,0.147898,2348,28,0.108484,0.149503,0.433333,0.186223,0.102976,,,0,10.67544,7.112644,0.262669,0.035907,0.170702,6489,99746.460727,149179,55921.539273,310791,155123,14
173381,145048,,,,,,,,,,,,,0.681536,,0.100078,0.100114,0.121787,166,49,0.119618,0.169258,0.516667,0.263074,,,,0,9.388408,9.195368,0.096364,0.000637,,130594,135599.307628,42787,37781.692372,106480,66901,14
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
120284,51356,,,,,,,,,,,,,,0.115558,0.105363,0.116738,0.146009,2348,28,0.106098,0.13288,0.463636,0.150605,0.104393,,10,1,10.677451,10.636786,0.032362,0.023426,0.623725,1908,97434.748428,118376,22849.251572,230737,110453,28
214268,39140,,,,,,,,,,,,,,0.112674,0.103553,0.134045,0.241258,8383,40,0.107114,0.176963,0.827273,0.181408,0.100756,,5,1,10.379482,5.852678,0.130703,-0.001613,0.43403,122317,148930.711978,91951,65337.288022,46418,167850,28
214304,81065,,,,,,,,,,,,,,0.102926,0.10063,0.108562,0.300259,2006,40,0.103049,0.107572,0.463636,0.117602,0.100092,,5,1,9.501436,13.627958,0.015235,0.002762,0.151781,152642,143856.5,61662,70447.5,51105,163199,28
685,55285,,,,,,,,,,,,,,0.118553,0.113145,0.130327,0.134018,7661,1,0.1,0.101763,0.190909,0.107701,0.100023,,10,1,9.941967,8.373608,0.000142,0.000024,,92388,65523.222222,91703,64838.222222,-1,,28


In [21]:
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'num_boost_round': 150,
    'num_leaves': 63,
    'max_bin': 255,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 5.0,
    'is_enable_sparse': True,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

ranker = fit_lgb_ranker(df, features, cat_features, params, group_column=['lag', 'cookie'])

predictions = ranker.predict(sub_df[features])

sub_df = sub_df.with_columns(pred=pl.Series(predictions))

top_40_per_cookie = (
    sub_df
    .sort(["cookie", "pred"], descending=[False, True])
    .group_by("cookie", maintain_order=True)
    .head(40)
)
top_40_per_cookie.select('cookie', 'node', 'pred')



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.511540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8895
[LightGBM] [Info] Number of data points in the train set: 1779043, number of used features: 38




cookie,node,pred
i64,i64,f64
1,229418,2.540486
1,243177,2.457443
1,214377,2.438571
1,229316,2.334626
1,214339,2.33387
…,…,…
149999,122326,1.702647
149999,214460,1.692605
149999,214290,1.677883
149999,120273,1.675348


In [22]:
top_40_per_cookie.select('cookie', 'node', 'pred').write_csv('pred_14d_lgb1.csv')

In [None]:
1