# Necessary Common Functions

Those functions should be ran before each part.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [2]:
def get_sessions(df: pd.DataFrame, test=False, list_item=False) -> list:
    
    all_item = []
    if 'next_item' in df and not test:
        if list_item:
            for i in trange(len(df)):
                all_item.append(np.concatenate([np.array(df.loc[i, 'prev_items']), np.array(df.loc[i, 'next_item'])], axis=0))
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items'][:-1]+f" '{df.loc[i, 'next_item']}']").replace(" ", ",")))
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items']).replace(" ", ",")))
    return all_item

In [3]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            
            if max_dis == None:
                e = len(sess)
            else:
                e = min(i + max_dis + 1, len(sess))

            for j in range(i+1, e):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res

In [4]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [26]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [6]:
def get_session_last_item(session_df):
    last_items = []
    num_sessions = len(session_df)
    for i in tqdm(range(num_sessions)):
        sess = session_df.iloc[i]
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        last_item = product_list[-1].strip("'\n")

        last_items.append(last_item)
    return last_items 

In [7]:
def get_co_graph_counts(session_last_items, merged_candidates_df, co_graph_dict):
    co_graph_count_list = []
    for idx, row in tqdm(merged_candidates_df.iterrows(), total=merged_candidates_df.shape[0]):
        sess_id = row['sess_id']
        product = row['product']
        last_item = session_last_items[sess_id]
        co_graph_count = co_graph_dict[last_item][product]
        co_graph_count_list.append(co_graph_count)
    return co_graph_count_list

In [8]:
def flatten_co_graph_dict(co_graph_dict):
    product_list = []
    neighbor_list = []
    counts_list = []
    for product in tqdm(co_graph_dict.keys(), total=len(co_graph_dict)):
        for neigh in co_graph_dict[product].keys():
            product_list.append(product)
            neighbor_list.append(neigh)
            counts_list.append(co_graph_dict[product][neigh])
    return pd.DataFrame({'product_' : product_list, 'neighbor' : neighbor_list, 'counts' : counts_list})

In [9]:
def normalize_co_graph_counts(merged_candidates_counts):
    # normalize co graph counts 
    # merged_candidates_counts_g = cudf.from_pandas(merged_candidates_counts)
    sessions_count_sum = merged_candidates_counts[['sess_id', 'counts']].groupby('sess_id').sum()
    sessions_count_sum.sort_index(inplace=True)

    # sessions_count_sum = sessions_count_sum.to_pandas()

    candidates_count_sum = sessions_count_sum.loc[merged_candidates_counts['sess_id']].reset_index(drop=True)
    merged_candidates_counts['counts_sum'] = candidates_count_sum['counts']
    merged_candidates_counts['normalized_counts'] = merged_candidates_counts['counts'] / merged_candidates_counts['counts_sum']

    # del merged_candidates_counts_g
    # del sessions_count_sum_g
    

# Merge Test co-graph counts 

In [10]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [11]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [12]:
merged_candidates_feature_test = read_merged_candidates_feature_test()

In [13]:
merged_candidates = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]

In [14]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [15]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, test=False, list_item=False)
test_sess_item = get_sessions(test_sess_data, test=True, list_item=False)

100%|██████████| 3557898/3557898 [03:16<00:00, 18137.11it/s]
100%|██████████| 361581/361581 [00:15<00:00, 23389.46it/s]


In [16]:
test_session_last_items = get_session_last_item(test_sess_data)

100%|██████████| 361581/361581 [00:18<00:00, 19780.92it/s]


In [17]:
test_session_last_items = np.array(test_session_last_items)
merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


## bidirection

In [18]:
co_occurence_dict_bi = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=True, weighted=False)

100%|██████████| 3919479/3919479 [02:24<00:00, 27146.95it/s]


In [19]:
merged_candidates_feature_test = read_merged_candidates_feature_test()
merged_candidates = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]
test_session_last_items = get_session_last_item(test_sess_data)
test_session_last_items = np.array(test_session_last_items)
merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]

100%|██████████| 361581/361581 [00:18<00:00, 19573.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [20]:
co_occurence_dict_bi_df = flatten_co_graph_dict(co_occurence_dict_bi)

100%|██████████| 1401599/1401599 [00:37<00:00, 37872.54it/s] 


In [None]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_occurence_dict_bi_df_g = cudf.from_pandas(co_occurence_dict_bi_df)

In [None]:
# merged_candidates_bi_g = merged_candidates_g.merge(co_occurence_dict_bi_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
# merged_candidates_bi_g.drop(columns=['product_', 'neighbor'], inplace=True)
# merged_candidates_bi_g = merged_candidates_bi_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_bi_g.reset_index(drop=True, inplace=True)
# merged_candidates_bi_g['counts'] = merged_candidates_bi_g['counts'].fillna(0)
# assert len(merged_candidates_bi_g['counts']) == len(merged_candidates)
# merged_candidates_bi = merged_candidates_bi_g.to_pandas()

In [21]:
merged_candidates_bi = merged_candidates.merge(co_occurence_dict_bi_df, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_bi.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_bi = merged_candidates_bi.sort_values(by=['sess_id', 'product'])
merged_candidates_bi.reset_index(drop=True, inplace=True)
merged_candidates_bi['counts'] = merged_candidates_bi['counts'].fillna(0)
assert len(merged_candidates_bi['counts']) == len(merged_candidates)
# merged_candidates_bi = merged_candidates_bi_g.to_pandas()

In [None]:
# del merged_candidates_g
# del co_occurence_dict_bi_df_g
# del merged_candidates_bi_g

In [22]:
normalize_co_graph_counts(merged_candidates_bi)

In [23]:
merged_candidates_feature_test['co_graph_counts_0'] = merged_candidates_bi['counts']
merged_candidates_feature_test['normalized_co_graph_counts_0'] = merged_candidates_bi['normalized_counts']

In [25]:
merged_candidates_feature_test.query('sess_id==20000').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_0', 'normalized_co_graph_counts_0']][:25]

Unnamed: 0,product,sasrec_scores_2,co_graph_counts_0,normalized_co_graph_counts_0
4672245,B07D98ZNJX,25.67992,6.0,0.24
4672246,B07D997JWG,24.778963,5.0,0.2
4672174,B007SQ3HGS,19.812937,1.0,0.04
4672267,B07QJB4MFD,18.794708,2.0,0.08
4672300,B081N8XPQH,17.467945,0.0,0.0
4672394,B09Y5GC5R7,17.269825,1.0,0.04
4672340,B08SVPH33Z,16.826963,2.0,0.08
4672297,B081N7R51C,16.567038,1.0,0.04
4672284,B07V4PBMWT,15.528075,1.0,0.04
4672288,B07X9Q39H6,14.468607,1.0,0.04


In [27]:
cast_dtype(merged_candidates_feature_test, ['co_graph_counts_0', 'normalized_co_graph_counts_0'])
merged_candidates_feature_test.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

## uni and weight

In [29]:
co_occurence_dict_uni_weight = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=False, weighted=True)

100%|██████████| 3919479/3919479 [01:51<00:00, 35200.63it/s]


In [30]:
co_graph_uni_weight_df = flatten_co_graph_dict(co_occurence_dict_uni_weight)

100%|██████████| 1401599/1401599 [00:27<00:00, 50797.52it/s] 


In [22]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_graph_uni_weight_df_g = cudf.from_pandas(co_graph_uni_weight_df)

In [None]:
merged_candidates_uni_weight = merged_candidates.merge(co_graph_uni_weight_df, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_uni_weight.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_uni_weight = merged_candidates_uni_weight.sort_values(by=['sess_id', 'product'])
merged_candidates_uni_weight.reset_index(drop=True, inplace=True)
merged_candidates_uni_weight['counts'] = merged_candidates_uni_weight['counts'].fillna(0)
assert len(merged_candidates_uni_weight['counts']) == len(merged_candidates)
# merged_candidates_uni_weight = merged_candidates_uni_weight_g.to_pandas()

In [25]:
# del merged_candidates_g
# del co_graph_uni_weight_df_g
# del merged_candidates_uni_weight_g

In [None]:
normalize_co_graph_counts(merged_candidates_uni_weight)

In [None]:
merged_candidates_feature_test['co_graph_counts_1'] = merged_candidates_uni_weight['counts']
merged_candidates_feature_test['normalized_co_graph_counts_1'] = merged_candidates_uni_weight['normalized_counts']

In [None]:
cast_dtype(merged_candidates_feature_test)
merged_candidates_feature_test.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [None]:
merged_candidates_uni_weight

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
0,0,UK,B000OPPVCS,B077XGDMD2,1,457,0.002188
1,0,UK,B000V599Y2,B077XGDMD2,0,457,0.000000
2,0,UK,B0018HH444,B077XGDMD2,1,457,0.002188
3,0,UK,B0079JI4DU,B077XGDMD2,1,457,0.002188
4,0,UK,B0079JI4EY,B077XGDMD2,1,457,0.002188
...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,B08427PFR5,0,128,0.000000
84407335,361580,DE,B0BB7YSRBX,B08427PFR5,0,128,0.000000
84407336,361580,DE,B0BB7ZMGY8,B08427PFR5,0,128,0.000000
84407337,361580,DE,B0BD4CP7N3,B08427PFR5,0,128,0.000000


In [None]:
merged_candidates_uni_weight.query("sess_id==0")['counts'].sum()

457

In [None]:
merged_candidates_uni_weight.query("sess_id==0")['normalized_counts'].max()

0.08971553610503283

In [23]:
co_graph_uni_weight_df

Unnamed: 0,product_,neighbor,counts
0,B005ZJTUXE,B005ZJTUXE,7
1,B005ZJTUXE,B00P8VIBBG,11
2,B005ZJTUXE,B07TVSL9TW,7
3,B005ZJTUXE,B014J7P4KU,4
4,B005ZJTUXE,B07HJCRPTB,9
...,...,...,...
29994681,B09KXXFJS7,B09KXXGQX8,1
29994682,B09KXVZ7YQ,B09KXVDK5F,1
29994683,B09KXVZ7YQ,B09KXXGQX8,1
29994684,B09KXVDK5F,B09KXXGQX8,1


In [None]:
merged_candidates_feature_test.query('sess_id==20000').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_1', 'normalized_co_graph_counts_1']][:25]

## uni and dis=1

In [None]:
co_occurence_dict_uni_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=False, weighted=False, max_dis=1)

In [41]:
merged_candidates_feature_test = read_merged_candidates_feature_test()
merged_candidates = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]
test_session_last_items = get_session_last_item(test_sess_data)
test_session_last_items = np.array(test_session_last_items)
merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]

100%|██████████| 361581/361581 [00:09<00:00, 37824.30it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [42]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,UK,B000OPPVCS,B077XGDMD2
1,0,UK,B000V599Y2,B077XGDMD2
2,0,UK,B0018HH444,B077XGDMD2
3,0,UK,B0079JI4DU,B077XGDMD2
4,0,UK,B0079JI4EY,B077XGDMD2
...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,B08427PFR5
84407335,361580,DE,B0BB7YSRBX,B08427PFR5
84407336,361580,DE,B0BB7ZMGY8,B08427PFR5
84407337,361580,DE,B0BD4CP7N3,B08427PFR5


In [38]:
co_occurence_dict_uni_dis1_df = flatten_co_graph_dict(co_occurence_dict_uni_dis1)

100%|██████████| 1401599/1401599 [00:14<00:00, 94514.80it/s] 


In [43]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_occurence_dict_uni_dis1_df_g = cudf.from_pandas(co_occurence_dict_uni_dis1_df)

In [44]:
# merged_candidates_uni_dis1_g = merged_candidates_g.merge(co_occurence_dict_uni_dis1_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
# merged_candidates_uni_dis1_g.drop(columns=['product_', 'neighbor'], inplace=True)
# merged_candidates_uni_dis1_g = merged_candidates_uni_dis1_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_uni_dis1_g.reset_index(drop=True, inplace=True)
# merged_candidates_uni_dis1_g['counts'] = merged_candidates_uni_dis1_g['counts'].fillna(0)
# assert len(merged_candidates_uni_dis1_g['counts']) == len(merged_candidates)
# merged_candidates_uni_dis1 = merged_candidates_uni_dis1_g.to_pandas()

In [None]:
merged_candidates_uni_dis1 = merged_candidates.merge(co_occurence_dict_uni_dis1_df, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_uni_dis1.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_uni_dis1 = merged_candidates_uni_dis1.sort_values(by=['sess_id', 'product'])
merged_candidates_uni_dis1.reset_index(drop=True, inplace=True)
merged_candidates_uni_dis1['counts'] = merged_candidates_uni_dis1['counts'].fillna(0)
assert len(merged_candidates_uni_dis1['counts']) == len(merged_candidates)
# merged_candidates_uni_dis1 = merged_candidates_uni_dis1_g.to_pandas()

In [45]:
# del merged_candidates_g
# del co_occurence_dict_uni_dis1_df_g
# del merged_candidates_uni_dis1_g

In [46]:
normalize_co_graph_counts(merged_candidates_uni_dis1)

In [53]:
merged_candidates_feature_test['co_graph_counts_2'] = merged_candidates_uni_dis1['counts']
merged_candidates_feature_test['normalized_co_graph_counts_2'] = merged_candidates_uni_dis1['normalized_counts']

In [55]:
cast_dtype(merged_candidates_feature_test)
merged_candidates_feature_test.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [48]:
merged_candidates_uni_dis1.query("sess_id==99")

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
23692,99,UK,B0045XDSZM,B0BC6DL1FW,0.0,13.666667,0.0
23693,99,UK,B004605SE8,B0BC6DL1FW,0.0,13.666667,0.0
23694,99,UK,B0053Y8M1W,B0BC6DL1FW,0.0,13.666667,0.0
23695,99,UK,B007PS6O28,B0BC6DL1FW,0.0,13.666667,0.0
23696,99,UK,B0081Q3YN0,B0BC6DL1FW,0.0,13.666667,0.0
...,...,...,...,...,...,...,...
23906,99,UK,B0BF15FRZT,B0BC6DL1FW,0.0,13.666667,0.0
23907,99,UK,B0BF15JRSV,B0BC6DL1FW,0.0,13.666667,0.0
23908,99,UK,B0BF62M27B,B0BC6DL1FW,0.0,13.666667,0.0
23909,99,UK,B0BF75P7MZ,B0BC6DL1FW,0.0,13.666667,0.0


In [49]:
merged_candidates_uni_dis1.query("sess_id==99")['counts'].sum()

13.666666666666666

In [52]:
merged_candidates_uni_dis1.query("sess_id==99")['counts'].max(), merged_candidates_uni_dis1.query("sess_id==99")['normalized_counts'].max()

(1.8333333333333333, 0.13414634146341464)

In [None]:
merged_candidates_feature_test.query('sess_id==20000').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_2', 'normalized_co_graph_counts_2']][:25]