# Necessary Common Functions

Those functions should be ran before each part.

In [3]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import numba 
from numba import jit
import datasets
from datasets import Dataset as TFDataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_sessions(df: pd.DataFrame, test=False, list_item=False) -> list:
    
    all_item = []
    if 'next_item' in df and not test:
        if list_item:
            for i in trange(len(df)):
                all_item.append(np.concatenate([np.array(df.loc[i, 'prev_items']), np.array(df.loc[i, 'next_item'])], axis=0))
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items'][:-1]+f" '{df.loc[i, 'next_item']}']").replace(" ", ",")))
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items']).replace(" ", ",")))
    return all_item

In [5]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            
            if max_dis == None:
                e = len(sess)
            else:
                e = min(i + max_dis + 1, len(sess))

            for j in range(i+1, e):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res

In [6]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [7]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [8]:
def get_session_last_item(session_df):
    last_items = []
    num_sessions = len(session_df)
    for i in tqdm(range(num_sessions)):
        sess = session_df.iloc[i]
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        last_item = product_list[-1].strip("'\n")

        last_items.append(last_item)
    return last_items 

In [9]:
def get_co_graph_counts(session_last_items, merged_candidates_df, co_graph_dict):
    co_graph_count_list = []
    for idx, row in tqdm(merged_candidates_df.iterrows(), total=merged_candidates_df.shape[0]):
        sess_id = row['sess_id']
        product = row['product']
        last_item = session_last_items[sess_id]
        co_graph_count = co_graph_dict[last_item][product]
        co_graph_count_list.append(co_graph_count)
    return co_graph_count_list

In [10]:
def flatten_co_graph_dict(co_graph_dict):
    product_list = []
    neighbor_list = []
    counts_list = []
    for product in tqdm(co_graph_dict.keys(), total=len(co_graph_dict)):
        for neigh in co_graph_dict[product].keys():
            product_list.append(product)
            neighbor_list.append(neigh)
            counts_list.append(co_graph_dict[product][neigh])
    return pd.DataFrame({'product_' : product_list, 'neighbor' : neighbor_list, 'counts' : counts_list})

In [11]:
def normalize_co_graph_counts(merged_candidates_counts):
    # normalize co graph counts 
    merged_candidates_counts_g = cudf.from_pandas(merged_candidates_counts)
    sessions_count_sum_g = merged_candidates_counts_g[['sess_id', 'counts']].groupby('sess_id').sum()
    sessions_count_sum_g.sort_index(inplace=True)

    sessions_count_sum = sessions_count_sum_g.to_pandas()

    candidates_count_sum = sessions_count_sum.loc[merged_candidates_counts['sess_id']].reset_index(drop=True)
    merged_candidates_counts['counts_cum'] = candidates_count_sum['counts']
    merged_candidates_counts['normalized_counts'] = merged_candidates_counts['counts'] / merged_candidates_counts['counts_cum']

    del merged_candidates_counts_g
    del sessions_count_sum_g
    

# Merge valid co-graph counts 

In [10]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_no_hist_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [11]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [12]:
merged_candidates_feature = read_merged_candidates_feature()

In [13]:
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]

In [14]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [15]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, test=True, list_item=False)

100%|██████████| 3557898/3557898 [05:05<00:00, 11650.08it/s]
100%|██████████| 361581/361581 [00:12<00:00, 29146.51it/s]


In [21]:
# bidirection
co_occurence_dict_bi = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=False)
# weight 
# co_occurence_dict_bi_weight = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=True)
co_occurence_dict_uni_weight = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=True)
# max dis = 1 
# co_occurence_dict_bi_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=False, max_dis=1)
co_occurence_dict_uni_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=False, max_dis=1)

100%|██████████| 3919479/3919479 [01:46<00:00, 36829.39it/s]


## bidirection

In [None]:
bi_valid_sessions_counter = {}

In [34]:
# only one arg, can't use another arg
def get_bi_valid_session_co_graph_candidates(sess_id_example):
    sess = valid_sess_item[sess_id_example['sess_id']]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_bi and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_bi[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    bi_valid_sessions_counter[sess_id_example['sess_id']] = cand_counter

In [32]:
valid_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(valid_sess_data.shape[0]))})

In [35]:
# about 1 mins
datasets.set_progress_bar_enabled(False)
valid_co_graph_candidates = valid_co_graph_candidates.map(get_bi_valid_session_co_graph_candidates, num_proc=8, batched=False)
datasets.set_progress_bar_enabled(True)

In [33]:
valid_co_graph_candidates, len(bi_valid_sessions_counter)

Dataset({
    features: ['sess_id'],
    num_rows: 361581
})

In [38]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates.itertuples(), total=merged_candidates.shape[0]):
    all_items_co_graph_count_list.append(bi_valid_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates.shape[0]
merged_candidates['all_items_co_graph_count_0'] = all_items_co_graph_count_list

100%|██████████| 84407339/84407339 [01:57<00:00, 719960.20it/s]


In [55]:
count_sum_array = merged_candidates.groupby(by='sess_id')['all_items_co_graph_count_0'].sum().to_numpy()
assert len(count_sum_array[merged_candidates['sess_id']]) == merged_candidates.shape[0]
merged_candidates['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates['sess_id']]
merged_candidates['normalized_all_items_co_graph_count_0'] = merged_candidates['all_items_co_graph_count_0'] / merged_candidates['normalized_all_items_co_graph_count_0']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates['sess_id']]


In [66]:
merged_candidates_feature['normalized_all_items_co_graph_count_0'] = merged_candidates['normalized_all_items_co_graph_count_0']
merged_candidates_feature['all_items_co_graph_count_0'] = merged_candidates['all_items_co_graph_count_0']

## uni and weighted

In [None]:
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]

In [None]:
uni_wei_valid_sessions_counter = {}

In [None]:
# only one arg, can't use another arg
def get_uni_wei_valid_session_co_graph_candidates(sess_id_example):
    sess = valid_sess_item[sess_id_example['sess_id']]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_weight and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_weight[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    uni_wei_valid_sessions_counter[sess_id_example['sess_id']] = cand_counter

In [None]:
valid_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(valid_sess_data.shape[0]))})

In [None]:
# about 1 mins
datasets.set_progress_bar_enabled(False)
valid_co_graph_candidates = valid_co_graph_candidates.map(get_uni_wei_valid_session_co_graph_candidates, num_proc=8, batched=False)
datasets.set_progress_bar_enabled(True)

In [None]:
valid_co_graph_candidates, len(uni_wei_valid_sessions_counter)

Dataset({
    features: ['sess_id'],
    num_rows: 361581
})

In [None]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates.itertuples(), total=merged_candidates.shape[0]):
    all_items_co_graph_count_list.append(uni_wei_valid_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates.shape[0]
merged_candidates['all_items_co_graph_count_1'] = all_items_co_graph_count_list

100%|██████████| 84407339/84407339 [01:57<00:00, 719960.20it/s]


In [None]:
count_sum_array = merged_candidates.groupby(by='sess_id')['all_items_co_graph_count_1'].sum().to_numpy()
assert len(count_sum_array[merged_candidates['sess_id']]) == merged_candidates.shape[0]
merged_candidates['normalized_all_items_co_graph_count_1'] = count_sum_array[merged_candidates['sess_id']]
merged_candidates['normalized_all_items_co_graph_count_1'] = merged_candidates['all_items_co_graph_count_1'] / merged_candidates['normalized_all_items_co_graph_count_1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates['sess_id']]


In [None]:
merged_candidates_feature['normalized_all_items_co_graph_count_1'] = merged_candidates['normalized_all_items_co_graph_count_1']
merged_candidates_feature['all_items_co_graph_count_1'] = merged_candidates['all_items_co_graph_count_1']

## uni and dis=1

In [None]:
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]

In [None]:
uni_dis1_valid_sessions_counter = {}

In [None]:
# only one arg, can't use another arg
def get_uni_dis1_valid_session_co_graph_candidates(sess_id_example):
    sess = valid_sess_item[sess_id_example['sess_id']]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_dis1 and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_dis1[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    uni_dis1_valid_sessions_counter[sess_id_example['sess_id']] = cand_counter

In [None]:
valid_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(valid_sess_data.shape[0]))})

In [None]:
# about 1 mins
datasets.set_progress_bar_enabled(False)
valid_co_graph_candidates = valid_co_graph_candidates.map(get_uni_dis1_valid_session_co_graph_candidates, num_proc=8, batched=False)
datasets.set_progress_bar_enabled(True)

In [None]:
valid_co_graph_candidates, len(uni_dis1_valid_sessions_counter)

Dataset({
    features: ['sess_id'],
    num_rows: 361581
})

In [None]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates.itertuples(), total=merged_candidates.shape[0]):
    all_items_co_graph_count_list.append(uni_dis1_valid_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates.shape[0]
merged_candidates['all_items_co_graph_count_2'] = all_items_co_graph_count_list

100%|██████████| 84407339/84407339 [01:57<00:00, 719960.20it/s]


In [None]:
count_sum_array = merged_candidates.groupby(by='sess_id')['all_items_co_graph_count_2'].sum().to_numpy()
assert len(count_sum_array[merged_candidates['sess_id']]) == merged_candidates.shape[0]
merged_candidates['normalized_all_items_co_graph_count_2'] = count_sum_array[merged_candidates['sess_id']]
merged_candidates['normalized_all_items_co_graph_count_2'] = merged_candidates['all_items_co_graph_count_2'] / merged_candidates['normalized_all_items_co_graph_count_2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates['sess_id']]


In [None]:
merged_candidates_feature['normalized_all_items_co_graph_count_2'] = merged_candidates['normalized_all_items_co_graph_count_2']
merged_candidates_feature['all_items_co_graph_count_2'] = merged_candidates['all_items_co_graph_count_2']

In [64]:
merged_candidates.query('sess_id==300')['normalized_all_items_co_graph_count_0'].max()

0.28651685393258425

In [71]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [67]:
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_scores_2,sasrec_normalized_scores_2,product_freq,gru4rec_scores,gru4rec_normalized_scores,sess_avg_price,...,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,roberta_scores,roberta_normalized_scores,title_BM25_scores,sasrec_scores_3,sasrec_normalized_scores_3,normalized_all_items_co_graph_count_0,all_items_co_graph_count_0
0,0,UK,B000OPPVCS,0.0,11.972421,2.286162e-04,104,6.484859,3.816029e-05,7.388571,...,0.004819,2,0.004525,265.826630,1.087245e-03,298.915375,10.891474,2.517129e-04,0.002635,2
1,0,UK,B000V599Y2,0.0,13.152878,7.443427e-04,37,4.342063,4.477209e-06,7.388571,...,0.000000,2,0.004525,259.157867,1.380768e-06,111.069756,10.677187,2.031618e-04,0.003953,3
2,0,UK,B0018HH444,0.0,5.606023,3.928400e-07,7,3.220763,1.458925e-06,7.388571,...,0.004819,1,0.002262,257.331421,2.222824e-07,0.000000,6.074605,2.036883e-06,0.001318,1
3,0,UK,B0079JI4DU,0.0,0.000000,1.443945e-09,67,0.000000,5.824698e-08,7.388571,...,0.002410,2,0.004525,0.000000,0.000000e+00,0.000000,0.000000,4.685961e-09,0.002635,2
4,0,UK,B0079JI4EY,0.0,0.000000,1.443945e-09,77,0.000000,5.824698e-08,7.388571,...,0.004819,2,0.004525,0.000000,0.000000e+00,0.000000,0.000000,4.685961e-09,0.002635,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,0.0,9.117821,6.077226e-05,56,9.268379,1.396883e-05,32.424000,...,0.000000,0,0.000000,263.574158,1.378417e-03,118.126396,9.635838,3.403967e-05,0.003356,2
84407335,361580,DE,B0BB7YSRBX,0.0,9.163816,6.363281e-05,58,7.047796,1.516259e-06,32.424000,...,0.000000,0,0.000000,263.523743,1.310646e-03,124.881615,9.159988,2.115080e-05,0.001678,1
84407336,361580,DE,B0BB7ZMGY8,0.0,11.256460,5.158278e-04,452,9.359167,1.529639e-05,32.424000,...,0.000000,0,0.000000,263.567017,1.368608e-03,124.881615,10.119755,5.522656e-05,0.038591,23
84407337,361580,DE,B0BD4CP7N3,0.0,-3.778687,1.523433e-10,1,-0.593306,7.282568e-10,32.424000,...,0.000000,0,0.000000,265.401611,8.571040e-03,192.540955,-1.612869,4.433373e-10,0.000000,0


In [69]:
merged_candidates_feature.query('sess_id==10000').sort_values(by=['sasrec_normalized_scores_3'], ascending=False)[['sasrec_scores_3', 'sasrec_normalized_scores_3', 'all_items_co_graph_count_0', 'normalized_all_items_co_graph_count_0']]

Unnamed: 0,sasrec_scores_3,sasrec_normalized_scores_3,all_items_co_graph_count_0,normalized_all_items_co_graph_count_0
2334309,22.575985,9.955325e-01,3,0.008929
2334430,15.050841,5.369416e-04,12,0.035714
2334460,14.871281,4.486885e-04,6,0.017857
2334338,14.773024,4.066983e-04,3,0.008929
2334316,14.391475,2.776948e-04,2,0.005952
...,...,...,...,...
2334215,-8.178933,4.378892e-14,0,0.000000
2334306,-9.413960,1.273500e-14,2,0.005952
2334357,-9.928643,7.611580e-15,4,0.011905
2334409,-10.111639,6.338708e-15,0,0.000000


# Merge test co-graph counts 

In [1]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test_no_hist_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [12]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [14]:
merged_candidates_feature_test = read_merged_candidates_feature_test()

In [15]:
merged_candidates_test = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]

In [16]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [17]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, list_item=False)
test_sess_item = get_sessions(test_sess_data, test=True, list_item=False)

100%|██████████| 3557898/3557898 [01:49<00:00, 32538.92it/s]
100%|██████████| 361581/361581 [00:12<00:00, 29689.67it/s]
100%|██████████| 316971/316971 [00:06<00:00, 49106.24it/s]


In [18]:
# bidirection
co_occurence_dict_bi = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=False)
# weight 
# co_occurence_dict_bi_weight = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=True)
co_occurence_dict_uni_weight = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=True)
# max dis = 1 
# co_occurence_dict_bi_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=False, max_dis=1)
co_occurence_dict_uni_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=False, max_dis=1)

100%|██████████| 4236450/4236450 [01:34<00:00, 44774.78it/s]


# bidirection

In [None]:
bi_test_sessions_counter = {}

In [None]:
# only one arg, can't use another arg
def get_bi_test_session_co_graph_candidates(sess_id_example):
    sess = test_sess_item[sess_id_example['sess_id']]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_bi and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_bi[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    bi_test_sessions_counter[sess_id_example['sess_id']] = cand_counter

In [None]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [None]:
# about 1 mins
datasets.set_progress_bar_enabled(False)
test_co_graph_candidates = test_co_graph_candidates.map(get_bi_test_session_co_graph_candidates, num_proc=8, batched=False)
datasets.set_progress_bar_enabled(True)

In [None]:
test_co_graph_candidates, len(bi_test_sessions_counter)

Dataset({
    features: ['sess_id'],
    num_rows: 361581
})

In [None]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(bi_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_0'] = all_items_co_graph_count_list

100%|██████████| 84407339/84407339 [01:57<00:00, 719960.20it/s]


In [None]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_0'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0'] / merged_candidates_test['normalized_all_items_co_graph_count_0']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates['sess_id']]


In [None]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['normalized_all_items_co_graph_count_0']
merged_candidates_feature_test['all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0']

## uni and weighted

In [None]:
uni_wei_test_sessions_counter = {}

In [None]:
# only one arg, can't use another arg
def get_uni_wei_test_session_co_graph_candidates(sess_id_example):
    sess = test_sess_item[sess_id_example['sess_id']]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_weight and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_weight[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    uni_wei_test_sessions_counter[sess_id_example['sess_id']] = cand_counter

In [None]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [None]:
# about 1 mins
datasets.set_progress_bar_enabled(False)
test_co_graph_candidates = test_co_graph_candidates.map(get_uni_wei_test_session_co_graph_candidates, num_proc=8, batched=False)
datasets.set_progress_bar_enabled(True)

In [None]:
test_co_graph_candidates, len(uni_wei_test_sessions_counter)

Dataset({
    features: ['sess_id'],
    num_rows: 361581
})

In [None]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(uni_wei_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_1'] = all_items_co_graph_count_list

100%|██████████| 84407339/84407339 [01:57<00:00, 719960.20it/s]


In [None]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_1'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_1'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1'] / merged_candidates_test['normalized_all_items_co_graph_count_1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates['sess_id']]


In [None]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['normalized_all_items_co_graph_count_1']
merged_candidates_feature_test['all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1']

## uni and dis=1

In [None]:
uni_dis1_test_sessions_counter = {}

In [None]:
# only one arg, can't use another arg
def get_uni_dis1_test_session_co_graph_candidates(sess_id_example):
    sess = test_sess_item[sess_id_example['sess_id']]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_dis1 and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_dis1[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    uni_dis1_test_sessions_counter[sess_id_example['sess_id']] = cand_counter

In [None]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [None]:
# about 1 mins
datasets.set_progress_bar_enabled(False)
test_co_graph_candidates = test_co_graph_candidates.map(get_uni_dis1_test_session_co_graph_candidates, num_proc=8, batched=False)
datasets.set_progress_bar_enabled(True)

In [None]:
test_co_graph_candidates, len(uni_dis1_test_sessions_counter)

Dataset({
    features: ['sess_id'],
    num_rows: 361581
})

In [None]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(uni_dis1_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_2'] = all_items_co_graph_count_list

100%|██████████| 84407339/84407339 [01:57<00:00, 719960.20it/s]


In [None]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_2'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_2'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2'] / merged_candidates_test['normalized_all_items_co_graph_count_2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates['sess_id']]


In [None]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['normalized_all_items_co_graph_count_2']
merged_candidates_feature_test['all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2']

In [25]:
merged_candidates_test.query('sess_id==200')['normalized_all_items_co_graph_count_0'].max()

0.17692307692307693

In [37]:
cast_dtype(merged_candidates_feature_test)
merged_candidates_feature_test.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [63]:
test_sess_data

Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE
...,...,...
316966,['B077SZ2C3Y' 'B0B14M3VZX'],UK
316967,['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851...,UK
316968,['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1...,UK
316969,['B01MCQMORK' 'B09JYZ325W'],UK


In [29]:
merged_candidates_test.query('sess_id==0')['normalized_all_items_co_graph_count_0'].max()

0.07752746488666389

In [32]:
merged_candidates_feature_test

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq,sess_avg_price,product_price,...,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,roberta_scores,roberta_normalized_scores,title_BM25_scores,sasrec_scores_3,sasrec_normalized_scores_3,normalized_all_items_co_graph_count_0,all_items_co_graph_count_0
0,0,DE,4088833651,0.000000,2.975813e-09,0.000000,1.580065e-09,828,25.195269,36.761604,...,0.0,0,0.0,0.000000,0.000000,0.000000,0.000000,2.622550e-09,0.000000,0
1,0,DE,B000H6W2GW,0.000000,2.975813e-09,0.000000,1.580065e-09,875,25.195269,36.761604,...,0.0,0,0.0,0.000000,0.000000,0.000000,0.000000,2.622550e-09,0.000000,0
2,0,DE,B000JG2RAG,7.665308,6.347557e-06,8.104032,5.226502e-06,24,25.195269,23.190001,...,0.0,0,0.0,267.192719,0.004943,287.809601,8.885176,1.894552e-05,0.000000,0
3,0,DE,B000RYSOUW,-2.951060,1.555882e-10,-2.857798,9.068785e-11,5,25.195269,6.900000,...,0.0,0,0.0,267.322815,0.005629,321.394653,-1.640674,5.083796e-10,0.000000,0
4,0,DE,B000UGZVQM,3.977920,1.589257e-07,4.688567,1.717488e-07,4,25.195269,21.990000,...,0.0,0,0.0,267.242462,0.005195,285.328705,4.972019,3.784811e-07,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,11.327528,1.041200e-04,10.629994,3.818184e-04,74,16.950001,5.800000,...,0.0,0,0.0,270.043762,0.014921,449.867401,10.968081,1.849500e-05,0.010237,16
69428427,316970,UK,B0BJTQQWLG,5.604142,3.403292e-07,6.052083,3.923694e-06,6,16.950001,9.880000,...,0.0,0,0.0,269.350769,0.007462,431.585815,7.366314,5.044600e-07,0.000640,1
69428428,316970,UK,B0BJV3RL4H,9.146974,1.176336e-05,7.667603,1.973815e-05,7,16.950001,22.097065,...,0.0,0,0.0,269.313751,0.007191,419.572662,8.286265,1.265775e-06,0.000640,1
69428429,316970,UK,B0BK7SPC84,-10.383047,3.879279e-14,-6.356799,1.601719e-11,0,16.950001,5.960000,...,0.0,0,0.0,270.200653,0.017456,420.993561,-10.871386,6.057512e-15,0.000000,0


In [36]:
merged_candidates_feature_test.query('sess_id==300000').sort_values(by=['sasrec_normalized_scores_3'], ascending=False)[['sasrec_scores_3', 'sasrec_normalized_scores_3', 'all_items_co_graph_count_0', 'normalized_all_items_co_graph_count_0']]

Unnamed: 0,sasrec_scores_3,sasrec_normalized_scores_3,all_items_co_graph_count_0,normalized_all_items_co_graph_count_0
65675691,18.784042,7.110926e-01,141,0.220657
65675602,17.250706,1.534640e-01,127,0.198748
65675649,16.779135,9.576479e-02,59,0.092332
65675546,15.148463,1.875056e-02,43,0.067293
65675714,14.737875,1.243649e-02,29,0.045383
...,...,...,...,...
65675666,-6.248106,9.563178e-12,0,0.000000
65675565,-6.320740,8.893195e-12,0,0.000000
65675554,-6.321655,8.885057e-12,0,0.000000
65675610,-8.322851,1.201025e-12,0,0.000000
