# Necessary Common Functions

Those functions should be ran before each part.

In [3]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import numba 
from numba import jit

In [4]:
def get_sessions(df: pd.DataFrame, test=False, list_item=False) -> list:
    
    all_item = []
    if 'next_item' in df and not test:
        if list_item:
            for i in trange(len(df)):
                all_item.append(np.concatenate([np.array(df.loc[i, 'prev_items']), np.array(df.loc[i, 'next_item'])], axis=0))
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items'][:-1]+f" '{df.loc[i, 'next_item']}']").replace(" ", ",")))
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items']).replace(" ", ",")))
    return all_item

In [5]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            
            if max_dis == None:
                e = len(sess)
            else:
                e = min(i + max_dis + 1, len(sess))

            for j in range(i+1, e):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res

In [6]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [7]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [8]:
def get_session_last_item(session_df):
    last_items = []
    num_sessions = len(session_df)
    for i in tqdm(range(num_sessions)):
        sess = session_df.iloc[i]
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        last_item = product_list[-1].strip("'\n")

        last_items.append(last_item)
    return last_items 

In [9]:
def get_co_graph_counts(session_last_items, merged_candidates_df, co_graph_dict):
    co_graph_count_list = []
    for idx, row in tqdm(merged_candidates_df.iterrows(), total=merged_candidates_df.shape[0]):
        sess_id = row['sess_id']
        product = row['product']
        last_item = session_last_items[sess_id]
        co_graph_count = co_graph_dict[last_item][product]
        co_graph_count_list.append(co_graph_count)
    return co_graph_count_list

In [10]:
def flatten_co_graph_dict(co_graph_dict):
    product_list = []
    neighbor_list = []
    counts_list = []
    for product in tqdm(co_graph_dict.keys(), total=len(co_graph_dict)):
        for neigh in co_graph_dict[product].keys():
            product_list.append(product)
            neighbor_list.append(neigh)
            counts_list.append(co_graph_dict[product][neigh])
    return pd.DataFrame({'product_' : product_list, 'neighbor' : neighbor_list, 'counts' : counts_list})

In [11]:
def normalize_co_graph_counts(merged_candidates_counts):
    # normalize co graph counts 
    merged_candidates_counts_g = cudf.from_pandas(merged_candidates_counts)
    sessions_count_sum_g = merged_candidates_counts_g[['sess_id', 'counts']].groupby('sess_id').sum()
    sessions_count_sum_g.sort_index(inplace=True)

    sessions_count_sum = sessions_count_sum_g.to_pandas()

    candidates_count_sum = sessions_count_sum.loc[merged_candidates_counts['sess_id']].reset_index(drop=True)
    merged_candidates_counts['counts_cum'] = candidates_count_sum['counts']
    merged_candidates_counts['normalized_counts'] = merged_candidates_counts['counts'] / merged_candidates_counts['counts_cum']

    del merged_candidates_counts_g
    del sessions_count_sum_g
    

# Merge valid co-graph counts 

In [10]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_no_hist_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [11]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [12]:
merged_candidates_feature = read_merged_candidates_feature()

In [13]:
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]

In [14]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [15]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, test=True, list_item=False)

100%|██████████| 3557898/3557898 [01:51<00:00, 31962.46it/s]
100%|██████████| 361581/361581 [00:07<00:00, 50744.43it/s]


In [16]:
co_occurence_dict_uni = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=False)
co_occurence_dict_dis_3_uni_wgt = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=True, max_dis=3)
co_occurence_dict_dis_3_bi = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=False, max_dis=3)

100%|██████████| 3919479/3919479 [00:57<00:00, 68245.23it/s]
100%|██████████| 3919479/3919479 [01:00<00:00, 65124.68it/s]
100%|██████████| 3919479/3919479 [01:26<00:00, 45283.56it/s]


In [17]:
valid_session_last_items = get_session_last_item(valid_sess_data)

100%|██████████| 361581/361581 [00:09<00:00, 37754.58it/s]


In [18]:
valid_session_last_items = np.array(valid_session_last_items)
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [19]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,UK,B000OPPVCS,B077XGDMD2
1,0,UK,B000V599Y2,B077XGDMD2
2,0,UK,B0018HH444,B077XGDMD2
3,0,UK,B0079JI4DU,B077XGDMD2
4,0,UK,B0079JI4EY,B077XGDMD2
...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,B08427PFR5
84407335,361580,DE,B0BB7YSRBX,B08427PFR5
84407336,361580,DE,B0BB7ZMGY8,B08427PFR5
84407337,361580,DE,B0BD4CP7N3,B08427PFR5


In [20]:
valid_sess_data.iloc[0]['prev_items']

"['B09VSN9GLS' 'B09VSG9DCG' 'B0BJ5L1ZPH' 'B09VSN9GLS' 'B0BJ6V797Y'\n 'B09VSG9DCG' 'B077XGDMD2']"

## merge co graph uni

In [21]:
co_graph_uni_df = flatten_co_graph_dict(co_occurence_dict_uni)

100%|██████████| 1401599/1401599 [00:13<00:00, 102670.30it/s]


In [22]:
merged_candidates_g = cudf.from_pandas(merged_candidates)
co_graph_uni_df_g = cudf.from_pandas(co_graph_uni_df)

In [24]:
merged_candidates_uni_g = merged_candidates_g.merge(co_graph_uni_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_uni_g.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_uni_g = merged_candidates_uni_g.sort_values(by=['sess_id', 'product'])
merged_candidates_uni_g.reset_index(drop=True, inplace=True)
merged_candidates_uni_g['counts'] = merged_candidates_uni_g['counts'].fillna(0)
assert len(merged_candidates_uni_g['counts']) == len(merged_candidates)
merged_candidates_uni = merged_candidates_uni_g.to_pandas()

In [25]:
del merged_candidates_g
del co_graph_uni_df_g
del merged_candidates_uni_g

In [26]:
normalize_co_graph_counts(merged_candidates_uni)

In [33]:
merged_candidates_feature['co_graph_counts_0'] = merged_candidates_uni['counts']
merged_candidates_feature['normalized_co_graph_counts_0'] = merged_candidates_uni['normalized_counts']

In [35]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [27]:
merged_candidates_uni

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
0,0,UK,B000OPPVCS,B077XGDMD2,1,457,0.002188
1,0,UK,B000V599Y2,B077XGDMD2,0,457,0.000000
2,0,UK,B0018HH444,B077XGDMD2,1,457,0.002188
3,0,UK,B0079JI4DU,B077XGDMD2,1,457,0.002188
4,0,UK,B0079JI4EY,B077XGDMD2,1,457,0.002188
...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,B08427PFR5,0,128,0.000000
84407335,361580,DE,B0BB7YSRBX,B08427PFR5,0,128,0.000000
84407336,361580,DE,B0BB7ZMGY8,B08427PFR5,0,128,0.000000
84407337,361580,DE,B0BD4CP7N3,B08427PFR5,0,128,0.000000


In [31]:
merged_candidates_uni.query("sess_id==0")['counts'].sum()

457

In [32]:
merged_candidates_uni.query("sess_id==0")['normalized_counts'].max()

0.08971553610503283

In [30]:
merged_candidates_uni[(merged_candidates_uni['sess_id'] == 100) & (merged_candidates_uni['product'] == 'B09MZ76DTR')]

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
24014,100,UK,B09MZ76DTR,B09T32YPZ8,67,509,0.131631


In [94]:
co_occurence_dict_uni['B09T32YPZ8'].most_common(5)

[('B09MZ76DTR', 67),
 ('B0B2VMQ213', 54),
 ('B09C5WR4KL', 39),
 ('B0B8ZLM77C', 39),
 ('B09T32YPZ8', 31)]

In [23]:
co_graph_uni_df

Unnamed: 0,product_,neighbor,counts
0,B005ZJTUXE,B005ZJTUXE,7
1,B005ZJTUXE,B00P8VIBBG,11
2,B005ZJTUXE,B07TVSL9TW,7
3,B005ZJTUXE,B014J7P4KU,4
4,B005ZJTUXE,B07HJCRPTB,9
...,...,...,...
29994681,B09KXXFJS7,B09KXXGQX8,1
29994682,B09KXVZ7YQ,B09KXVDK5F,1
29994683,B09KXVZ7YQ,B09KXXGQX8,1
29994684,B09KXVDK5F,B09KXXGQX8,1


In [30]:
valid_sess_data.loc[361580, 'prev_items']

"['B07N6J2M3K' 'B081YZVG5L' 'B011XK46F0' 'B08427PFR5' 'B08427PFR5']"

In [97]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,UK,B000OPPVCS,B077XGDMD2
1,0,UK,B000V599Y2,B077XGDMD2
2,0,UK,B0018HH444,B077XGDMD2
3,0,UK,B0079JI4DU,B077XGDMD2
4,0,UK,B0079JI4EY,B077XGDMD2
...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,B08427PFR5
84407335,361580,DE,B0BB7YSRBX,B08427PFR5
84407336,361580,DE,B0BB7ZMGY8,B08427PFR5
84407337,361580,DE,B0BD4CP7N3,B08427PFR5


## merge distance 3 uni weighted 

In [41]:
merged_candidates_feature = read_merged_candidates_feature()
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]
valid_session_last_items = get_session_last_item(valid_sess_data)
valid_session_last_items = np.array(valid_session_last_items)
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

100%|██████████| 361581/361581 [00:09<00:00, 37824.30it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [42]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,UK,B000OPPVCS,B077XGDMD2
1,0,UK,B000V599Y2,B077XGDMD2
2,0,UK,B0018HH444,B077XGDMD2
3,0,UK,B0079JI4DU,B077XGDMD2
4,0,UK,B0079JI4EY,B077XGDMD2
...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,B08427PFR5
84407335,361580,DE,B0BB7YSRBX,B08427PFR5
84407336,361580,DE,B0BB7ZMGY8,B08427PFR5
84407337,361580,DE,B0BD4CP7N3,B08427PFR5


In [38]:
co_graph_dis_3_uni_wgt_df = flatten_co_graph_dict(co_occurence_dict_dis_3_uni_wgt)

100%|██████████| 1401599/1401599 [00:14<00:00, 94514.80it/s] 


In [43]:
merged_candidates_g = cudf.from_pandas(merged_candidates)
co_graph_dis_3_uni_wgt_df_g = cudf.from_pandas(co_graph_dis_3_uni_wgt_df)

In [44]:
merged_candidates_dis_3_uni_wgt_g = merged_candidates_g.merge(co_graph_dis_3_uni_wgt_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_dis_3_uni_wgt_g.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_dis_3_uni_wgt_g = merged_candidates_dis_3_uni_wgt_g.sort_values(by=['sess_id', 'product'])
merged_candidates_dis_3_uni_wgt_g.reset_index(drop=True, inplace=True)
merged_candidates_dis_3_uni_wgt_g['counts'] = merged_candidates_dis_3_uni_wgt_g['counts'].fillna(0)
assert len(merged_candidates_dis_3_uni_wgt_g['counts']) == len(merged_candidates)
merged_candidates_dis_3_uni_wgt = merged_candidates_dis_3_uni_wgt_g.to_pandas()

In [45]:
del merged_candidates_g
del co_graph_dis_3_uni_wgt_df_g
del merged_candidates_dis_3_uni_wgt_g

In [46]:
normalize_co_graph_counts(merged_candidates_dis_3_uni_wgt)

In [53]:
merged_candidates_feature['co_graph_counts_1'] = merged_candidates_dis_3_uni_wgt['counts']
merged_candidates_feature['normalized_co_graph_counts_1'] = merged_candidates_dis_3_uni_wgt['normalized_counts']

In [55]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [48]:
merged_candidates_dis_3_uni_wgt.query("sess_id==99")

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
23692,99,UK,B0045XDSZM,B0BC6DL1FW,0.0,13.666667,0.0
23693,99,UK,B004605SE8,B0BC6DL1FW,0.0,13.666667,0.0
23694,99,UK,B0053Y8M1W,B0BC6DL1FW,0.0,13.666667,0.0
23695,99,UK,B007PS6O28,B0BC6DL1FW,0.0,13.666667,0.0
23696,99,UK,B0081Q3YN0,B0BC6DL1FW,0.0,13.666667,0.0
...,...,...,...,...,...,...,...
23906,99,UK,B0BF15FRZT,B0BC6DL1FW,0.0,13.666667,0.0
23907,99,UK,B0BF15JRSV,B0BC6DL1FW,0.0,13.666667,0.0
23908,99,UK,B0BF62M27B,B0BC6DL1FW,0.0,13.666667,0.0
23909,99,UK,B0BF75P7MZ,B0BC6DL1FW,0.0,13.666667,0.0


In [49]:
merged_candidates_dis_3_uni_wgt.query("sess_id==99")['counts'].sum()

13.666666666666666

In [52]:
merged_candidates_dis_3_uni_wgt.query("sess_id==99")['counts'].max(), merged_candidates_dis_3_uni_wgt.query("sess_id==99")['normalized_counts'].max()

(1.8333333333333333, 0.13414634146341464)

## merge distance 3 bi

In [56]:
merged_candidates_feature = read_merged_candidates_feature()
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]
valid_session_last_items = get_session_last_item(valid_sess_data)
valid_session_last_items = np.array(valid_session_last_items)
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

100%|██████████| 361581/361581 [00:09<00:00, 37084.40it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [57]:
co_graph_dis_3_bi_df = flatten_co_graph_dict(co_occurence_dict_dis_3_bi)

100%|██████████| 1401599/1401599 [00:17<00:00, 78011.07it/s] 


In [61]:
merged_candidates_g = cudf.from_pandas(merged_candidates)
co_graph_dis_3_bi_df_g = cudf.from_pandas(co_graph_dis_3_bi_df)

In [62]:
merged_candidates_dis_3_bi_g = merged_candidates_g.merge(co_graph_dis_3_bi_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_dis_3_bi_g.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_dis_3_bi_g = merged_candidates_dis_3_bi_g.sort_values(by=['sess_id', 'product'])
merged_candidates_dis_3_bi_g.reset_index(drop=True, inplace=True)
merged_candidates_dis_3_bi_g['counts'] = merged_candidates_dis_3_bi_g['counts'].fillna(0)
assert len(merged_candidates_dis_3_bi_g['counts']) == len(merged_candidates)
merged_candidates_dis_3_bi = merged_candidates_dis_3_bi_g.to_pandas()

In [63]:
del merged_candidates_g
del co_graph_dis_3_bi_df_g
del merged_candidates_dis_3_bi_g

In [64]:
normalize_co_graph_counts(merged_candidates_dis_3_bi)

In [69]:
merged_candidates_feature['co_graph_counts_2'] = merged_candidates_dis_3_bi['counts']
merged_candidates_feature['normalized_co_graph_counts_2'] = merged_candidates_dis_3_bi['normalized_counts']

In [71]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [66]:
merged_candidates_dis_3_bi.query('sess_id==200')

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
47800,200,UK,1472249453,B09XHPY9PG,2,137,0.014599
47801,200,UK,1472249496,B09XHPY9PG,1,137,0.007299
47802,200,UK,B003BS34FG,B09XHPY9PG,0,137,0.000000
47803,200,UK,B004XAKAYE,B09XHPY9PG,1,137,0.007299
47804,200,UK,B0056ZS2H2,B09XHPY9PG,0,137,0.000000
...,...,...,...,...,...,...,...
48020,200,UK,B0BG522W4X,B09XHPY9PG,1,137,0.007299
48021,200,UK,B0BGX4863Y,B09XHPY9PG,1,137,0.007299
48022,200,UK,B0BHL979KY,B09XHPY9PG,0,137,0.000000
48023,200,UK,B0BK9VN833,B09XHPY9PG,2,137,0.014599


In [67]:
merged_candidates_dis_3_bi.query('sess_id==200')['counts'].sum()

137

In [68]:
merged_candidates_dis_3_bi.query('sess_id==200')['counts'].max(), merged_candidates_dis_3_bi.query('sess_id==200')['normalized_counts'].max()

(7, 0.051094890510948905)

In [70]:
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_scores_2,sasrec_normalized_scores_2,product_freq,gru4rec_scores,gru4rec_normalized_scores,sess_avg_price,product_price,gru4rec_scores_2,gru4rec_normalized_scores_2,co_graph_counts_0,normalized_co_graph_counts_0,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2
0,0,UK,B000OPPVCS,0.0,11.972421,2.286162e-04,104,6.484859,3.816029e-05,7.388571,7.280000,12.291418,5.528012e-05,1,0.002188,1.0,0.004819,2,0.004525
1,0,UK,B000V599Y2,0.0,13.152878,7.443427e-04,37,4.342063,4.477209e-06,7.388571,5.200000,12.142086,4.761183e-05,0,0.000000,0.0,0.000000,2,0.004525
2,0,UK,B0018HH444,0.0,5.606023,3.928400e-07,7,3.220763,1.458925e-06,7.388571,15.800000,8.919555,1.897524e-06,1,0.002188,1.0,0.004819,1,0.002262
3,0,UK,B0079JI4DU,0.0,0.000000,1.443945e-09,67,0.000000,5.824698e-08,7.388571,22.097065,0.000000,2.537897e-10,1,0.002188,0.5,0.002410,2,0.004525
4,0,UK,B0079JI4EY,0.0,0.000000,1.443945e-09,77,0.000000,5.824698e-08,7.388571,22.097065,0.000000,2.537897e-10,1,0.002188,1.0,0.004819,2,0.004525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,0.0,9.117821,6.077226e-05,56,9.268379,1.396883e-05,32.424000,47.990002,14.038595,8.992638e-05,0,0.000000,0.0,0.000000,0,0.000000
84407335,361580,DE,B0BB7YSRBX,0.0,9.163816,6.363281e-05,58,7.047796,1.516259e-06,32.424000,43.990002,13.342258,4.482001e-05,0,0.000000,0.0,0.000000,0,0.000000
84407336,361580,DE,B0BB7ZMGY8,0.0,11.256460,5.158278e-04,452,9.359167,1.529639e-05,32.424000,41.990002,12.778135,2.549625e-05,0,0.000000,0.0,0.000000,0,0.000000
84407337,361580,DE,B0BD4CP7N3,0.0,-3.778687,1.523433e-10,1,-0.593306,7.282568e-10,32.424000,24.990000,-3.986487,1.335653e-12,0,0.000000,0.0,0.000000,0,0.000000


# Merge test co-graph counts 

In [12]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test_no_hist_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [13]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [14]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [15]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, list_item=False)
test_sess_item = get_sessions(valid_sess_data, test=True, list_item=False)

100%|██████████| 3557898/3557898 [01:47<00:00, 33142.47it/s]
100%|██████████| 361581/361581 [00:10<00:00, 35160.11it/s]
100%|██████████| 361581/361581 [00:08<00:00, 40965.08it/s]


In [16]:
co_occurence_dict_uni = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=False, weighted=False)
co_occurence_dict_dis_3_uni_wgt = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=False, weighted=True, max_dis=3)
co_occurence_dict_dis_3_bi = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=True, weighted=False, max_dis=3)

100%|██████████| 4281060/4281060 [00:50<00:00, 84076.70it/s] 
100%|██████████| 4281060/4281060 [01:08<00:00, 62192.56it/s] 
100%|██████████| 4281060/4281060 [01:11<00:00, 59961.36it/s]


## Merge co-graph uni counts 

In [17]:
merged_candidates_feature = read_merged_candidates_feature_test()
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]
test_session_last_items = get_session_last_item(test_sess_data)
test_session_last_items = np.array(test_session_last_items)
merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]

100%|██████████| 316971/316971 [00:08<00:00, 38643.12it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]


In [18]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,DE,4088833651,B099NQFMG7
1,0,DE,B000H6W2GW,B099NQFMG7
2,0,DE,B000JG2RAG,B099NQFMG7
3,0,DE,B000RYSOUW,B099NQFMG7
4,0,DE,B000UGZVQM,B099NQFMG7
...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,B0BG2LZQSL
69428427,316970,UK,B0BJTQQWLG,B0BG2LZQSL
69428428,316970,UK,B0BJV3RL4H,B0BG2LZQSL
69428429,316970,UK,B0BK7SPC84,B0BG2LZQSL


In [20]:
test_sess_data.iloc[-1]['prev_items']

"['B0B8JX92YJ' 'B09TN4MP6V' 'B0BG2LZQSL']"

In [21]:
co_graph_uni_df = flatten_co_graph_dict(co_occurence_dict_uni)

100%|██████████| 1405385/1405385 [00:13<00:00, 104549.97it/s]


In [22]:
merged_candidates_g = cudf.from_pandas(merged_candidates)
co_graph_uni_df_g = cudf.from_pandas(co_graph_uni_df)

In [23]:
merged_candidates_uni_g = merged_candidates_g.merge(co_graph_uni_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_uni_g.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_uni_g = merged_candidates_uni_g.sort_values(by=['sess_id', 'product'])
merged_candidates_uni_g.reset_index(drop=True, inplace=True)
merged_candidates_uni_g['counts'] = merged_candidates_uni_g['counts'].fillna(0)
assert len(merged_candidates_uni_g['counts']) == len(merged_candidates)
merged_candidates_uni = merged_candidates_uni_g.to_pandas()

In [24]:
del merged_candidates_g
del co_graph_uni_df_g
del merged_candidates_uni_g

In [25]:
normalize_co_graph_counts(merged_candidates_uni)

In [34]:
merged_candidates_feature['co_graph_counts_0'] = merged_candidates_uni['counts']
merged_candidates_feature['normalized_co_graph_counts_0'] = merged_candidates_uni['normalized_counts']

In [36]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [27]:
merged_candidates_uni

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
0,0,DE,4088833651,B099NQFMG7,0,125,0.0
1,0,DE,B000H6W2GW,B099NQFMG7,0,125,0.0
2,0,DE,B000JG2RAG,B099NQFMG7,0,125,0.0
3,0,DE,B000RYSOUW,B099NQFMG7,0,125,0.0
4,0,DE,B000UGZVQM,B099NQFMG7,0,125,0.0
...,...,...,...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,B0BG2LZQSL,0,19,0.0
69428427,316970,UK,B0BJTQQWLG,B0BG2LZQSL,0,19,0.0
69428428,316970,UK,B0BJV3RL4H,B0BG2LZQSL,0,19,0.0
69428429,316970,UK,B0BK7SPC84,B0BG2LZQSL,0,19,0.0


In [35]:
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq,sess_avg_price,product_price,gru4rec_scores_2,gru4rec_normalized_scores_2,co_graph_counts_0,normalized_co_graph_counts_0
0,0,DE,4088833651,0.000000,2.975813e-09,0.000000,1.580065e-09,828,25.195269,36.761604,0.000000,1.326730e-09,0,0.0
1,0,DE,B000H6W2GW,0.000000,2.975813e-09,0.000000,1.580065e-09,875,25.195269,36.761604,0.000000,1.326730e-09,0,0.0
2,0,DE,B000JG2RAG,7.665308,6.347557e-06,8.104032,5.226502e-06,24,25.195269,23.190001,11.372551,1.152972e-04,0,0.0
3,0,DE,B000RYSOUW,-2.951060,1.555882e-10,-2.857798,9.068785e-11,5,25.195269,6.900000,-2.205641,1.461790e-10,0,0.0
4,0,DE,B000UGZVQM,3.977920,1.589257e-07,4.688567,1.717488e-07,4,25.195269,21.990000,8.559400,6.919625e-06,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,11.327528,1.041200e-04,10.629994,3.818184e-04,74,16.950001,5.800000,11.301320,2.638649e-04,0,0.0
69428427,316970,UK,B0BJTQQWLG,5.604142,3.403292e-07,6.052083,3.923694e-06,6,16.950001,9.880000,8.246040,1.243056e-05,0,0.0
69428428,316970,UK,B0BJV3RL4H,9.146974,1.176336e-05,7.667603,1.973815e-05,7,16.950001,22.097065,9.860847,6.248733e-05,0,0.0
69428429,316970,UK,B0BK7SPC84,-10.383047,3.879279e-14,-6.356799,1.601719e-11,0,16.950001,5.960000,-7.227418,2.368389e-12,0,0.0


In [30]:
merged_candidates_uni.query("sess_id==0")['counts'].sum(), merged_candidates_uni.query("sess_id==0")['counts'].max()

(125, 24)

In [32]:
merged_candidates_uni.query("sess_id==0")['normalized_counts'].max()

0.192

## Merge co-graph dis 3 uni weighted 

In [76]:
merged_candidates_feature = read_merged_candidates_feature_test()
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]
test_session_last_items = get_session_last_item(test_sess_data)
test_session_last_items = np.array(test_session_last_items)
merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]

100%|██████████| 316971/316971 [00:08<00:00, 37421.19it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]


In [77]:
co_graph_dis_3_uni_wgt_df = flatten_co_graph_dict(co_occurence_dict_dis_3_uni_wgt)

100%|██████████| 1405385/1405385 [00:14<00:00, 94989.34it/s] 


In [78]:
merged_candidates_g = cudf.from_pandas(merged_candidates)
co_graph_dis_3_uni_wgt_df_g = cudf.from_pandas(co_graph_dis_3_uni_wgt_df)

In [79]:
merged_candidates_dis_3_uni_wgt_g = merged_candidates_g.merge(co_graph_dis_3_uni_wgt_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_dis_3_uni_wgt_g.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_dis_3_uni_wgt_g = merged_candidates_dis_3_uni_wgt_g.sort_values(by=['sess_id', 'product'])
merged_candidates_dis_3_uni_wgt_g.reset_index(drop=True, inplace=True)
merged_candidates_dis_3_uni_wgt_g['counts'] = merged_candidates_dis_3_uni_wgt_g['counts'].fillna(0)
assert len(merged_candidates_dis_3_uni_wgt_g['counts']) == len(merged_candidates)
merged_candidates_dis_3_uni_wgt = merged_candidates_dis_3_uni_wgt_g.to_pandas()

In [80]:
del merged_candidates_g
del co_graph_dis_3_uni_wgt_df_g
del merged_candidates_dis_3_uni_wgt_g

In [81]:
normalize_co_graph_counts(merged_candidates_dis_3_uni_wgt)

In [90]:
merged_candidates_feature['co_graph_counts_1'] = merged_candidates_dis_3_uni_wgt['counts']
merged_candidates_feature['normalized_co_graph_counts_1'] = merged_candidates_dis_3_uni_wgt['normalized_counts']

In [92]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [82]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,DE,4088833651,B099NQFMG7
1,0,DE,B000H6W2GW,B099NQFMG7
2,0,DE,B000JG2RAG,B099NQFMG7
3,0,DE,B000RYSOUW,B099NQFMG7
4,0,DE,B000UGZVQM,B099NQFMG7
...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,B0BG2LZQSL
69428427,316970,UK,B0BJTQQWLG,B0BG2LZQSL
69428428,316970,UK,B0BJV3RL4H,B0BG2LZQSL
69428429,316970,UK,B0BK7SPC84,B0BG2LZQSL


In [84]:
test_sess_data.iloc[0]['prev_items']

"['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC5PKN5' 'B09V7KG931'\n 'B09PY75FWM' 'B09PXYT6BT' 'B08V12CT4C' 'B08V1KXBQD' 'B08496TCCQ'\n 'B01BVG1XJS' 'B099NQFMG7']"

In [86]:
merged_candidates_dis_3_uni_wgt.query("sess_id==99")['counts'].sum()

19.5

In [87]:
merged_candidates_dis_3_uni_wgt.query("sess_id==99")['counts'].max(), merged_candidates_dis_3_uni_wgt.query("sess_id==99")['normalized_counts'].max()

(1.3333333333333333, 0.06837606837606837)

In [91]:
merged_candidates_feature.query("sess_id==0").sort_values(by='co_graph_counts_0', ascending=False)

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq,sess_avg_price,product_price,gru4rec_scores_2,gru4rec_normalized_scores_2,co_graph_counts_0,normalized_co_graph_counts_0,co_graph_counts_1,normalized_co_graph_counts_1
126,0,DE,B099NS1XPG,19.174606,6.324576e-01,19.822590,6.419724e-01,169,25.195269,39.990002,19.420296,3.605038e-01,24,0.192,14.5,0.378261
125,0,DE,B099NR3X6D,17.401634,1.074087e-01,18.709232,2.108586e-01,30,25.195269,32.990002,18.237829,1.105023e-01,12,0.096,5.5,0.143478
43,0,DE,B07JG9TFSB,13.167296,1.556288e-03,14.466280,3.029009e-03,1353,25.195269,13.490000,14.652622,3.064330e-03,6,0.048,0.5,0.013043
172,0,DE,B0B7S7LBMB,11.132316,2.033803e-04,13.073363,7.522540e-04,555,25.195269,7.990000,12.708579,4.385795e-04,5,0.040,0.5,0.013043
167,0,DE,B0B53KBXR8,11.583756,3.194237e-04,12.190853,3.112399e-04,134,25.195269,19.990000,12.743567,4.541958e-04,4,0.032,1.0,0.026087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,0,DE,B08KWDBZDV,9.599676,4.392300e-05,7.961596,4.532652e-06,28,25.195269,14.900000,11.804455,1.775792e-04,0,0.000,0.0,0.000000
89,0,DE,B08LR3G17D,-0.067086,2.782728e-09,3.360452,4.550936e-08,763,25.195269,59.990002,0.218839,1.651289e-09,0,0.000,0.0,0.000000
90,0,DE,B08LSDKWRW,-1.809689,4.871554e-10,2.047616,1.224457e-08,648,25.195269,16.490000,-4.405801,1.619449e-11,0,0.000,0.0,0.000000
91,0,DE,B08NZPL9KS,9.761469,5.163661e-05,9.572823,2.270382e-05,60,25.195269,11.990000,12.746300,4.554390e-04,0,0.000,0.0,0.000000


## merge distance 3 bi

In [93]:
merged_candidates_feature_test = read_merged_candidates_feature_test()
merged_candidates = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]
test_session_last_items = get_session_last_item(test_sess_data)
test_session_last_items = np.array(test_session_last_items)
merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]

100%|██████████| 316971/316971 [00:08<00:00, 37625.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = test_session_last_items[merged_candidates['sess_id']]


In [94]:
co_graph_dis_3_bi_df = flatten_co_graph_dict(co_occurence_dict_dis_3_bi)

100%|██████████| 1405385/1405385 [00:15<00:00, 88260.69it/s] 


In [95]:
merged_candidates_g = cudf.from_pandas(merged_candidates)
co_graph_dis_3_bi_df_g = cudf.from_pandas(co_graph_dis_3_bi_df)

In [96]:
merged_candidates_dis_3_bi_g = merged_candidates_g.merge(co_graph_dis_3_bi_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_dis_3_bi_g.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_dis_3_bi_g = merged_candidates_dis_3_bi_g.sort_values(by=['sess_id', 'product'])
merged_candidates_dis_3_bi_g.reset_index(drop=True, inplace=True)
merged_candidates_dis_3_bi_g['counts'] = merged_candidates_dis_3_bi_g['counts'].fillna(0)
assert len(merged_candidates_dis_3_bi_g['counts']) == len(merged_candidates)
merged_candidates_dis_3_bi = merged_candidates_dis_3_bi_g.to_pandas()

In [97]:
del merged_candidates_g
del co_graph_dis_3_bi_df_g
del merged_candidates_dis_3_bi_g

In [98]:
normalize_co_graph_counts(merged_candidates_dis_3_bi)

In [102]:
merged_candidates_feature['co_graph_counts_2'] = merged_candidates_dis_3_bi['counts']
merged_candidates_feature['normalized_co_graph_counts_2'] = merged_candidates_dis_3_bi['normalized_counts']

In [None]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [99]:
merged_candidates_dis_3_bi.query('sess_id==200')

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
43065,200,DE,B000E8NUFK,B08G1C686F,1,182,0.005495
43066,200,DE,B000E8NUFU,B08G1C686F,1,182,0.005495
43067,200,DE,B00FNX9Z00,B08G1C686F,0,182,0.000000
43068,200,DE,B00FZWG9RG,B08G1C686F,1,182,0.005495
43069,200,DE,B015CY1MP0,B08G1C686F,0,182,0.000000
...,...,...,...,...,...,...,...
43264,200,DE,B0BBMJGV76,B08G1C686F,0,182,0.000000
43265,200,DE,B0BD3MB7MG,B08G1C686F,0,182,0.000000
43266,200,DE,B0BD3TLB1D,B08G1C686F,0,182,0.000000
43267,200,DE,B0BDNXC1F8,B08G1C686F,1,182,0.005495


In [100]:
merged_candidates_dis_3_bi.query('sess_id==200')['counts'].sum()

182

In [101]:
merged_candidates_dis_3_bi.query('sess_id==200')['counts'].max(), merged_candidates_dis_3_bi.query('sess_id==200')['normalized_counts'].max()

(43, 0.23626373626373626)

In [105]:
merged_candidates_feature.query("sess_id==100").sort_values(by='co_graph_counts_0', ascending=False)

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq,sess_avg_price,product_price,gru4rec_scores_2,gru4rec_normalized_scores_2,co_graph_counts_0,normalized_co_graph_counts_0,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2
21660,100,DE,B07JGCM9QM,13.390882,4.028459e-03,12.226933,2.363916e-03,33,28.24,172.570007,13.614773,1.096700e-03,9,0.28125,3.000000,0.181818,5,0.147059
21741,100,DE,B097NR5C9X,17.187954,1.795505e-01,16.838047,2.378009e-01,62,28.24,39.990002,19.579325,4.270313e-01,4,0.12500,1.833333,0.111111,3,0.088235
21740,100,DE,B097NQVSDG,15.171331,2.389893e-02,17.256281,3.612852e-01,51,28.24,36.761604,18.281357,1.166163e-01,4,0.12500,2.333333,0.141414,4,0.117647
21632,100,DE,B01M8M8G23,8.198200,2.238649e-05,11.267111,9.052879e-04,11,28.24,129.990005,11.747287,1.694528e-04,2,0.06250,0.666667,0.040404,2,0.058824
21597,100,DE,B0053PNSZQ,-2.807978,3.715898e-10,-5.755242,3.664989e-11,10,28.24,13.990000,-3.220155,5.355145e-11,2,0.06250,0.500000,0.030303,1,0.029412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21684,100,DE,B07X8XB5LB,-0.806321,2.750250e-09,2.306378,1.161956e-07,4,28.24,9.990000,-1.563927,2.805836e-10,0,0.00000,0.000000,0.000000,0,0.000000
21685,100,DE,B07XC8HC86,13.961270,7.126161e-03,11.870783,1.655610e-03,34,28.24,30.990000,14.364724,2.321600e-03,0,0.00000,0.000000,0.000000,0,0.000000
21686,100,DE,B07Y2N32DX,9.479466,8.061816e-05,9.659750,1.814340e-04,4,28.24,26.990000,10.840804,6.844920e-05,0,0.00000,0.000000,0.000000,0,0.000000
21687,100,DE,B07YWHC5T4,-1.087764,2.075597e-09,2.724977,1.765978e-07,676,28.24,14.490000,3.044332,2.814514e-08,0,0.00000,0.000000,0.000000,0,0.000000
