# Load Data

In [1]:
# Auto-reload imported modules before each cell runs, so edits made to local
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import timedelta
from pathlib import Path

import implicit
import numpy as np
import polars as pl
from scipy.sparse import csr_matrix

In [3]:
# Directory that receives the retrieval artifacts written further down
# (ALS candidates and embeddings).
DATA_DIR = 'retrieval_data/'
# Number of candidates requested per user from the retrieval model.
N_RETRIEVE = 300
# Number of trailing days held out for evaluation. With 0 the cutoff equals the
# latest event timestamp, so every event goes to training and the eval frame
# ends up empty.
EVAL_DAYS_TRESHOLD = 0

# Input tables are read from the current working directory. Plain string
# literals are used because the paths contain no placeholders.
df_test_users = pl.read_parquet('test_users.pq')
df_clickstream = pl.read_parquet('clickstream.pq')

df_cat_features = pl.read_parquet('cat_features.pq')
df_text_features = pl.read_parquet('text_features.pq')
df_event = pl.read_parquet('events.pq')

In [4]:
# Preview the raw clickstream table (the rendered output is truncated to the
# first and last rows).
df_clickstream

cookie,item,event,event_date,platform,surface,node
i64,i64,i64,datetime[ns],i64,i64,u32
0,19915558,17,2025-02-05 02:30:59,3,2,115659
0,2680232,17,2025-01-24 21:16:57,3,2,115829
1,4247649,17,2025-01-29 23:00:58,2,2,7
1,4247649,17,2025-02-17 14:55:17,2,2,7
1,2171135,17,2025-01-17 19:23:29,2,2,214458
…,…,…,…,…,…,…
149999,4999183,17,2025-01-20 12:23:47,2,2,71511
149999,25999164,17,2025-01-24 14:26:57,2,2,71514
149999,12138732,17,2025-02-12 13:11:42,2,2,51162
149999,28207042,17,2025-02-16 12:35:35,2,2,71511


# Prepare Train / Eval Split

In [5]:
# Cutoff timestamp: the most recent event minus the evaluation window.
treshhold = (
    df_clickstream.get_column('event_date').max()
    - timedelta(days=EVAL_DAYS_TRESHOLD)
)

In [6]:
# Events up to and including the cutoff form the training set ...
df_train = df_clickstream.filter(pl.col('event_date') <= treshhold)
# ... and everything strictly after it is kept for evaluation, reduced to the
# columns the evaluation needs.
df_eval = (
    df_clickstream
    .filter(pl.col('event_date') > treshhold)
    .select(['cookie', 'node', 'event'])
)

In [7]:
# Keep only (cookie, node) pairs that never occur in the training window.
df_eval = df_eval.join(
    df_train,
    how='anti',
    on=['cookie', 'node'],
)

In [8]:
# Restrict the evaluation rows to events flagged as contacts in df_event.
df_eval = df_eval.filter(
    pl.col('event').is_in(
        df_event.filter(pl.col('is_contact') == 1).get_column('event').unique()
    )
)

In [9]:
# Keep only rows whose cookie AND node both appear somewhere in the training
# data.
df_eval = df_eval.filter(
    pl.col('cookie').is_in(df_train.get_column('cookie').unique())
    & pl.col('node').is_in(df_train.get_column('node').unique())
)

In [10]:
# One row per (cookie, node) pair.
df_eval = df_eval.unique(subset=['cookie', 'node'])

# TRAIN MODEL

## ALS

In [11]:
def get_als_pred(users, nodes, user_to_pred, n_retrieve=None, factors=60,
                 iterations=10, random_state=42):
    """Fit an implicit-feedback ALS model and retrieve top nodes per user.

    Parameters
    ----------
    users, nodes : polars.Series
        Parallel series of equal length; element ``k`` of each describes one
        interaction (user ``users[k]`` interacted with node ``nodes[k]``).
        Repeated (user, node) pairs are summed by ``csr_matrix``, so their
        weight equals the number of occurrences.
    user_to_pred : iterable
        User ids to produce recommendations for. Every id must occur in
        ``users``.
    n_retrieve : int, optional
        Number of nodes requested per user. Defaults to the notebook-level
        ``N_RETRIEVE`` constant.
    factors, iterations : int
        Passed to ``implicit.als.AlternatingLeastSquares``.
    random_state : int or None
        Passed to ``AlternatingLeastSquares`` so repeated runs produce the
        same factors. Pass ``None`` for an unseeded model.

    Returns
    -------
    tuple of polars.DataFrame
        ``(df_pred, user_emb, item_emb)`` where ``df_pred`` has one row per
        recommended (cookie, node) pair with columns ``node``, ``cookie`` and
        ``als-all`` (the model score), ``user_emb`` has columns ``cookie`` and
        ``als_emb_user``, and ``item_emb`` has columns ``node`` and
        ``als_emb_node``.

    Raises
    ------
    KeyError
        If ``user_to_pred`` contains ids that never occur in ``users``.
    """
    # Materialise once: the ids are needed both for the index lookup and for
    # the 'cookie' column, and a one-shot iterable would be exhausted by then.
    user_to_pred = list(user_to_pred)
    if n_retrieve is None:
        n_retrieve = N_RETRIEVE

    user_ids = users.unique().to_list()
    item_ids = nodes.unique().to_list()

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(item_ids)}
    # Kept as a dict (not list indexing) so an out-of-range index returned by
    # the model raises instead of silently wrapping around.
    index_to_item_id = dict(enumerate(item_ids))

    unknown_users = [u for u in user_to_pred if u not in user_id_to_index]
    if unknown_users:
        raise KeyError(
            f"{len(unknown_users)} id(s) in user_to_pred never occur in "
            f"`users`, e.g. {unknown_users[:5]}"
        )

    rows = users.replace_strict(user_id_to_index).to_numpy()
    cols = nodes.replace_strict(item_id_to_index).to_numpy()
    values = np.ones(len(users), dtype=np.float32)

    sparse_matrix = csr_matrix(
        (values, (rows, cols)), shape=(len(user_ids), len(item_ids))
    )

    model = implicit.als.AlternatingLeastSquares(
        factors=factors, iterations=iterations, random_state=random_state
    )
    model.fit(sparse_matrix)

    user_embeddings = model.user_factors
    item_embeddings = model.item_factors

    user4pred = np.array(
        [user_id_to_index[user_id] for user_id in user_to_pred], dtype=np.int32
    )

    recommendations, scores = model.recommend(
        user4pred,
        sparse_matrix[user4pred],
        N=n_retrieve,
        filter_already_liked_items=True,
    )

    # One list of nodes / scores per user, exploded into one row per pair.
    df_pred = pl.DataFrame(
        {
            'node': [
                [index_to_item_id[item_idx] for item_idx in user_recs]
                for user_recs in recommendations.tolist()
            ],
            'cookie': user_to_pred,
            'als-all': scores.tolist(),
        }
    ).explode(['node', 'als-all'])

    # Row i of each factor matrix belongs to the id at position i of the
    # corresponding id list, because that position was used as matrix index.
    user_emb = pl.DataFrame({
        "cookie": user_ids,
        "als_emb_user": user_embeddings,
    })
    item_emb = pl.DataFrame({
        "node": item_ids,
        "als_emb_node": item_embeddings,
    })
    return df_pred, user_emb, item_emb

In [12]:
from scipy.sparse import csr_matrix
import numpy as np
import implicit


users = df_train["cookie"]
nodes = df_train["node"]
# Recommendations are produced for every user seen in training, not only for
# the users present in df_eval.
eval_users = df_train['cookie'].unique().to_list()

df_pred, user_emb, item_emb = get_als_pred(users, nodes, eval_users)

# DATA_DIR already ends with '/', so join with pathlib instead of string
# formatting, and make sure the directory exists before writing.
out_dir = Path(DATA_DIR)
out_dir.mkdir(parents=True, exist_ok=True)
df_pred.write_csv(out_dir / 'als-all.csv')

  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
# Persist the ALS user embeddings. DATA_DIR already ends with '/', so the path
# is joined with pathlib rather than string formatting.
user_emb.write_parquet(Path(DATA_DIR) / 'als_user_emb.pq')

In [14]:
# Persist the ALS node embeddings. DATA_DIR already ends with '/', so the path
# is joined with pathlib rather than string formatting.
item_emb.write_parquet(Path(DATA_DIR) / 'als_item_emb.pq')

In [15]:
# Preview the node embedding table: one row per node with its ALS factor
# vector.
item_emb

node,als_emb_node
i64,"array[f32, 60]"
1,"[0.003155, 0.00539, … 0.004884]"
2,"[0.0029, 0.005247, … 0.004868]"
3,"[0.002911, 0.005072, … 0.0046]"
4,"[0.001956, 0.003913, … 0.005578]"
5,"[0.002913, 0.005053, … 0.004405]"
…,…
424063,"[0.003173, 0.005433, … 0.00488]"
424064,"[0.00279, 0.004325, … 0.004215]"
424065,"[0.003564, 0.005496, … 0.004254]"
424067,"[0.002862, 0.004403, … 0.003971]"
