# Necessary Common Functions

Those functions should be ran before each part.

In [2]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import numba 
from numba import jit
import datasets
from datasets import Dataset as TFDataset
import multiprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_sessions(df: pd.DataFrame, test=False, list_item=False) -> list:
    
    all_item = []
    if 'next_item' in df and not test:
        if list_item:
            for i in trange(len(df)):
                all_item.append(np.concatenate([np.array(df.loc[i, 'prev_items']), np.array(df.loc[i, 'next_item'])], axis=0))
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items'][:-1]+f" '{df.loc[i, 'next_item']}']").replace(" ", ",")))
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items']).replace(" ", ",")))
    return all_item

In [4]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            
            if max_dis == None:
                e = len(sess)
            else:
                e = min(i + max_dis + 1, len(sess))

            for j in range(i+1, e):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res

In [5]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [6]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [7]:
def get_session_last_item(session_df):
    last_items = []
    num_sessions = len(session_df)
    for i in tqdm(range(num_sessions)):
        sess = session_df.iloc[i]
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        last_item = product_list[-1].strip("'\n")

        last_items.append(last_item)
    return last_items 

In [8]:
def get_co_graph_counts(session_last_items, merged_candidates_df, co_graph_dict):
    co_graph_count_list = []
    for idx, row in tqdm(merged_candidates_df.iterrows(), total=merged_candidates_df.shape[0]):
        sess_id = row['sess_id']
        product = row['product']
        last_item = session_last_items[sess_id]
        co_graph_count = co_graph_dict[last_item][product]
        co_graph_count_list.append(co_graph_count)
    return co_graph_count_list

In [9]:
def flatten_co_graph_dict(co_graph_dict):
    product_list = []
    neighbor_list = []
    counts_list = []
    for product in tqdm(co_graph_dict.keys(), total=len(co_graph_dict)):
        for neigh in co_graph_dict[product].keys():
            product_list.append(product)
            neighbor_list.append(neigh)
            counts_list.append(co_graph_dict[product][neigh])
    return pd.DataFrame({'product_' : product_list, 'neighbor' : neighbor_list, 'counts' : counts_list})

In [10]:
def normalize_co_graph_counts(merged_candidates_counts):
    # normalize co graph counts 
    # merged_candidates_counts_g = cudf.from_pandas(merged_candidates_counts)
    sessions_count_sum = merged_candidates_counts[['sess_id', 'counts']].groupby('sess_id').sum()
    sessions_count_sum.sort_index(inplace=True)

    sessions_count_sum = sessions_count_sum.to_pandas()

    candidates_count_sum = sessions_count_sum.loc[merged_candidates_counts['sess_id']].reset_index(drop=True)
    merged_candidates_counts['counts_cum'] = candidates_count_sum['counts']
    merged_candidates_counts['normalized_counts'] = merged_candidates_counts['counts'] / merged_candidates_counts['counts_cum']

    # del merged_candidates_counts_g
    # del sessions_count_sum_g
    

# Merge test co-graph counts 

In [73]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [74]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [75]:
merged_candidates_feature_test = read_merged_candidates_feature_test()

In [77]:
merged_candidates_test = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]

In [78]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [79]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, list_item=False)
test_sess_item = get_sessions(test_sess_data, test=True, list_item=False)

100%|██████████| 3557898/3557898 [05:40<00:00, 10442.37it/s]
100%|██████████| 361581/361581 [00:30<00:00, 11792.96it/s]
100%|██████████| 316971/316971 [00:17<00:00, 17814.50it/s]


# bidirection

In [80]:
# bidirection
co_occurence_dict_bi = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=True, weighted=False)

100%|██████████| 4236450/4236450 [03:57<00:00, 17837.07it/s]


In [81]:
# only one arg, can't use another arg
def get_bi_test_session_co_graph_candidates(sess_id):
    sess = test_sess_item[sess_id]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_bi and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_bi[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    return cand_counter

In [82]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [84]:
# about 1 mins
sess_ids = list(range(len(test_sess_item)))
pool = multiprocessing.Pool(10)
bi_test_sessions_counter = pool.map(get_bi_test_session_co_graph_candidates, sess_ids)

In [85]:
test_co_graph_candidates, len(bi_test_sessions_counter)

(Dataset({
     features: ['sess_id'],
     num_rows: 316971
 }),
 316971)

In [86]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(bi_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_0'] = all_items_co_graph_count_list

100%|██████████| 69428431/69428431 [04:22<00:00, 264832.67it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['all_items_co_graph_count_0'] = all_items_co_graph_count_list


In [87]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_0'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0'] / merged_candidates_test['normalized_all_items_co_graph_count_0']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates_test['sess_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0'] / merged_candidates_test['normalized_all_items_co_graph_count_0']


In [89]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['normalized_all_items_co_graph_count_0']
merged_candidates_feature_test['all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0']

## uni and weighted

In [90]:
# weight 
# co_occurence_dict_bi_weight = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=True, weighted=True)
co_occurence_dict_uni_weight = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=False, weighted=True)

100%|██████████| 4236450/4236450 [02:33<00:00, 27510.34it/s]


In [91]:
# only one arg, can't use another arg
def get_uni_wei_test_session_co_graph_candidates(sess_id):
    sess = test_sess_item[sess_id]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_weight and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_weight[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    return cand_counter

In [92]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [93]:
# about 1 mins
sess_ids = list(range(len(test_sess_item)))
pool = multiprocessing.Pool(10)
uni_wei_test_sessions_counter = pool.map(get_uni_wei_test_session_co_graph_candidates, sess_ids)

In [94]:
test_co_graph_candidates, len(uni_wei_test_sessions_counter)

(Dataset({
     features: ['sess_id'],
     num_rows: 316971
 }),
 316971)

In [95]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(uni_wei_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_1'] = all_items_co_graph_count_list

100%|██████████| 69428431/69428431 [05:06<00:00, 226645.16it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['all_items_co_graph_count_1'] = all_items_co_graph_count_list


In [96]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_1'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_1'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1'] / merged_candidates_test['normalized_all_items_co_graph_count_1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_1'] = count_sum_array[merged_candidates_test['sess_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1'] / merged_candidates_test['normalized_all_items_co_graph_count_1']


In [97]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['normalized_all_items_co_graph_count_1']
merged_candidates_feature_test['all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1']

## uni and dis=1

In [98]:
# max dis = 1 
# co_occurence_dict_bi_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=True, weighted=False, max_dis=1)
co_occurence_dict_uni_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=False, weighted=False, max_dis=1)

100%|██████████| 4236450/4236450 [01:34<00:00, 44821.91it/s] 


In [99]:
# only one arg, can't use another arg
def get_uni_dis1_test_session_co_graph_candidates(sess_id):
    sess = test_sess_item[sess_id]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_dis1 and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_dis1[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    return cand_counter

In [100]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [101]:
# about 1 mins
sess_ids = list(range(len(test_sess_item)))
pool = multiprocessing.Pool(10)
uni_dis1_test_sessions_counter = pool.map(get_uni_dis1_test_session_co_graph_candidates, sess_ids)

In [102]:
test_co_graph_candidates, len(uni_dis1_test_sessions_counter)

(Dataset({
     features: ['sess_id'],
     num_rows: 316971
 }),
 316971)

In [103]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(uni_dis1_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_2'] = all_items_co_graph_count_list

100%|██████████| 69428431/69428431 [05:26<00:00, 212734.94it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['all_items_co_graph_count_2'] = all_items_co_graph_count_list


In [104]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_2'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_2'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2'] / merged_candidates_test['normalized_all_items_co_graph_count_2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_2'] = count_sum_array[merged_candidates_test['sess_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2'] / merged_candidates_test['normalized_all_items_co_graph_count_2']


In [105]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['normalized_all_items_co_graph_count_2']
merged_candidates_feature_test['all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2']

In [106]:
cast_dtype(merged_candidates_feature_test, 
    ['all_items_co_graph_count_0', 'normalized_all_items_co_graph_count_0', 'all_items_co_graph_count_1', 'normalized_all_items_co_graph_count_1', 'all_items_co_graph_count_2', 'normalized_all_items_co_graph_count_2'])
merged_candidates_feature_test.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [109]:
merged_candidates_feature_test

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq,sess_avg_price,product_price,...,sasrec_normalized_scores_3,normalized_all_items_co_graph_count_0,all_items_co_graph_count_0,seqmlp_scores,seqmlp_normalized_scores,desc_BM25_scores,normalized_all_items_co_graph_count_1,all_items_co_graph_count_1,normalized_all_items_co_graph_count_2,all_items_co_graph_count_2
0,0,DE,4088833651,0.000000,2.975813e-09,0.000000,1.580065e-09,828,25.195269,36.761604,...,2.622550e-09,0.000000,0,0.000000,2.554478e-10,0.000000,0.000000,0.000000,0.000000,0
1,0,DE,B000H6W2GW,0.000000,2.975813e-09,0.000000,1.580065e-09,875,25.195269,36.761604,...,2.622550e-09,0.000000,0,0.000000,2.554478e-10,0.000000,0.000000,0.000000,0.000000,0
2,0,DE,B000JG2RAG,7.665308,6.347557e-06,8.104032,5.226502e-06,24,25.195269,23.190001,...,1.894552e-05,0.000000,0,8.786958,1.672744e-06,67.792648,0.000000,0.000000,0.000000,0
3,0,DE,B000RYSOUW,-2.951060,1.555882e-10,-2.857798,9.068785e-11,5,25.195269,6.900000,...,5.083796e-10,0.000000,0,-3.325048,9.188664e-12,170.360580,0.000000,0.000000,0.000000,0
4,0,DE,B000UGZVQM,3.977920,1.589257e-07,4.688567,1.717488e-07,4,25.195269,21.990000,...,3.784811e-07,0.000000,0,5.540127,6.506522e-08,71.169296,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,11.327528,1.041200e-04,10.629994,3.818184e-04,74,16.950001,5.800000,...,1.849500e-05,0.010237,16,11.838901,9.762144e-04,164.803131,0.002540,1.285714,0.003086,1
69428427,316970,UK,B0BJTQQWLG,5.604142,3.403292e-07,6.052083,3.923694e-06,6,16.950001,9.880000,...,5.044600e-07,0.000640,1,4.890683,9.375031e-07,303.665985,0.000494,0.250000,0.000000,0
69428428,316970,UK,B0BJV3RL4H,9.146974,1.176336e-05,7.667603,1.973815e-05,7,16.950001,22.097065,...,1.265775e-06,0.000640,1,10.187823,1.872800e-04,226.131516,0.001976,1.000000,0.003086,1
69428429,316970,UK,B0BK7SPC84,-10.383047,3.879279e-14,-6.356799,1.601719e-11,0,16.950001,5.960000,...,6.057512e-15,0.000000,0,-4.160688,1.099036e-10,312.603607,0.000000,0.000000,0.000000,0


In [108]:
merged_candidates_feature_test.query('sess_id==300000').sort_values(by=['sasrec_scores_2'], ascending=False)[['sasrec_scores_2', 'all_items_co_graph_count_0', 'all_items_co_graph_count_1', 'all_items_co_graph_count_2']]

Unnamed: 0,sasrec_scores_3,sasrec_normalized_scores_3,all_items_co_graph_count_0,all_items_co_graph_count_1,all_items_co_graph_count_2
65675691,18.784042,7.110926e-01,141,64.666664,41
65675602,17.250706,1.534640e-01,127,40.392857,27
65675649,16.779135,9.576479e-02,59,20.842857,7
65675546,15.148463,1.875056e-02,43,11.941667,6
65675714,14.737875,1.243649e-02,29,5.916667,3
...,...,...,...,...,...
65675666,-6.248106,9.563178e-12,0,0.000000,0
65675565,-6.320740,8.893195e-12,0,0.000000,0
65675554,-6.321655,8.885057e-12,0,0.000000,0
65675610,-8.322851,1.201025e-12,0,0.000000,0
