## Part0: Necessary Common Functions

Those functions should be ran before each part.

In [2]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import numba 
from numba import jit



In [3]:
# Cache loading of data for multiple calls
train_data_dir = '../data_for_recstudio/'
test_data_dir = '../raw_data/'
task = 'task1'
PREDS_PER_SESSION = 100


@lru_cache(maxsize=1)
def read_product_data():
    print(os.getcwd())
    print(os.path.join(train_data_dir, 'products_train.csv'))
    return pd.read_csv(os.path.join(train_data_dir, 'products_train.csv'))

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(os.path.join(train_data_dir, 'task1_data/task13_4_task1_train_sessions.csv'))

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(os.path.join(train_data_dir, 'task1_data/task13_4_task1_valid_sessions.csv'))

@lru_cache(maxsize=3)
def read_test_data(task):
    return pd.read_csv(os.path.join(test_data_dir, f'sessions_test_{task}.csv'))

In [4]:
def map_id(id: str, id_dict: dict) -> int:
    return id_dict[id]

In [5]:
def get_sessions(df: pd.DataFrame, id_dict: dict, test=False, list_item=False) -> list:
    
    all_item = []
    if 'next_item' in df and not test:
        if list_item:
            for i in trange(len(df)):
                all_item.append(np.concatenate([np.array(df.loc[i, 'prev_items']), np.array(df.loc[i, 'next_item'])], axis=0))
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items'][:-1]+f" '{df.loc[i, 'next_item']}']").replace(" ", ",")))
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items']).replace(" ", ",")))
    all_item_id = []
    for x in all_item:
        all_item_id.append([map_id(y, id_dict) for y in x])
    return all_item_id

In [6]:
def get_item_pop(train_item):
    all_item_id = []
    for s in train_item:
        all_item_id += s
    return Counter(all_item_id)

In [27]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            
            if max_dis == None:
                e = len(sess)
            else:
                e = min(i + max_dis + 1, len(sess))

            for j in range(i+1, e):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res

In [8]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [9]:
def predict_with_co_occurence_dict(sess: list, sess_locale: list, id2locale: dict, co_occurence_dict: dict, topk: int, pred_with_last: bool=False, remove_hist: bool=False) -> list:
    sorted_dict = sort_co_occurence_dict(co_occurence_dict)
    res = []
    for i, s in tqdm(enumerate(sess)):
        locale = sess_locale[i]
        if not pred_with_last:
            cand = Counter()
            for i,id in enumerate(s):
                if id in sorted_dict:
                    if id in sorted_dict:
                        cand = cand + Counter({ k:sorted_dict[id][k] * (i / len(s)) for k in list(sorted_dict[id].keys()) } )
            cand = sorted(cand.items(), key=lambda x: -x[1])
            cand = [i[0] for i in cand]
        else:
            if s[-1] not in sorted_dict:
                cand = []
            else:
                cand = list(sorted_dict[s[-1]].keys())

        if remove_hist:
            cand = [x for x in cand if x not in s]

        cand = [x for x in cand if locale in id2locale[x]]

        cand = cand[: min(len(cand), topk)]
        res.append(cand)
    return res

In [10]:
def pad_topk_with_popular_items(rec_list: list, topk: int, pop_items: list) -> list:
    # TODO: remove duplicated items; add locale constraint
    res = [None] * len(rec_list)
    for i, l in enumerate(rec_list):
        pad_len = topk - len(l)
        if pad_len > 0:
            pad_items = random.sample(pop_items, pad_len)
            res[i] = l + pad_items
        else:
            res[i] = l
    return res

In [11]:
def id2predictions(pred: list, test_df: cudf.DataFrame, id_dict: dict) -> cudf.DataFrame:
    id2product = {v:k for k,v in id_dict.items()}
    assert len(pred) == test_df.shape[0]
    product_pred = [None] * len(pred)
    for i, l in enumerate(pred):
        product_pred[i] = [id2product[x] for x in l]
    res = cudf.DataFrame()
    res['locale'] = test_df['locale']
    res['next_item_prediction'] = product_pred
    return res

In [39]:
def get_neighbors_for_each_product(co_occurence_dict: dict, productid_map: dict, pop_products: list, k: int=300):
    res = {'id': [], 'pid': [], 'candidates_id': [], 'counts' : [], 'normalized_counts' : [], 'state': []}
    for pid, id in tqdm(productid_map.items(), total=len(productid_map)):
        if id in co_occurence_dict and len(co_occurence_dict[id]) > 0:
            neighbors, counts = list(zip(*co_occurence_dict[id].most_common()))
            neighbors, counts = list(neighbors), list(counts)
            if len(neighbors) >= k:
                cands = neighbors[:k]
                counts_k = counts[:k]
                normalized_counts_k = (np.array(counts_k) / np.array(counts).sum()).tolist()
                state = 'Full'
            else:
                num_pop = k - len(neighbors)
                _pop_cands = random.sample(pop_products, k-len(neighbors))
                cands = neighbors + _pop_cands
                counts_k = counts + [0 for i in range(num_pop)]
                normalized_counts_k = (np.array(counts) / np.array(counts).sum()).tolist() + [0.0 for i in range(num_pop)]
                state = 'Pad'
        else:
            cands = random.sample(pop_products, k)
            counts_k = [0 for i in range(k)]
            normalized_counts_k = [0.0 for i in range(k)]
            state = 'No'
        res['id'].append(id)
        res['pid'].append(pid)
        res['candidates_id'].append(cands)
        res['counts'].append(counts_k)
        res['normalized_counts'].append(normalized_counts_k)
        res['state'].append(state)
    return pd.DataFrame(res)

In [13]:
def collect_all_counts(cand_df, co_occ_dict_list):
    for i, co_occ_dict in enumerate(co_occ_dict_list):
        i_counts, i_normalized_counts = [], []
        for j in trange(cand_df.shape[0]):
            item_id = cand_df.iloc[j]['id']
            item_candidates = cand_df.iloc[j]['candidates_id']
            if (item_id in co_occ_dict) and len(co_occ_dict[item_id]) > 0:
                cur_counts = [co_occ_dict[item_id][cand] for cand in item_candidates]
                all_counts = sum(co_occ_dict[item_id].values())
                cur_norm_counts = (np.array(cur_counts) / all_counts).tolist()
            else:
                cur_counts = [0.0 for cand in item_candidates]
                cur_norm_counts = [0.0 for cand in item_candidates]
            i_counts.append(cur_counts), i_normalized_counts.append(cur_norm_counts)
        cand_df[f'counts_{i}'] = i_counts
        cand_df[f'normalized_counts_{i}'] = i_normalized_counts

In [14]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [15]:
def get_session_last_item(session_df):
    last_items = []
    num_sessions = len(session_df)
    for i in tqdm(range(num_sessions)):
        sess = session_df.iloc[i]
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        last_item = product_list[-1].strip("'\n")

        last_items.append(last_item)
    return last_items 

## Part1: Generate each item's neighbors in co-occurence graph

In [16]:
data_type = 'all' # ['all', 'tune']
train_sess_data = pd.read_csv(f'../data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv')
test_sess_data = pd.read_csv(f'../data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv')
product = read_product_data()

/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost
../data_for_recstudio/products_train.csv


In [17]:
# Map product id
id_dict = { id: i+1 for i, id in enumerate(product['id'].unique()) }    # 0 saved for padding

In [18]:
train_sess_itemid = get_sessions(train_sess_data, id_dict, list_item=False)
test_sess_itemid = get_sessions(test_sess_data, id_dict, test=True, list_item=False)

100%|██████████| 3557898/3557898 [01:50<00:00, 32273.17it/s]
100%|██████████| 361581/361581 [00:06<00:00, 52842.86it/s]


In [19]:
train_pop = get_item_pop(train_sess_itemid)
test_pop = get_item_pop(test_sess_itemid)
total_pop = train_pop + test_pop
sorted_pop = sorted(total_pop.items(), key=lambda x:-x[1])
pop_items_500 = [x[0] for x in sorted_pop[:500]]

In [28]:
co_occurence_dict_uni = get_co_occurence_dict(train_sess_itemid + test_sess_itemid, bidirection=False, weighted=False)

100%|██████████| 3919479/3919479 [00:51<00:00, 76213.06it/s] 


In [29]:
co_occurence_dict_dis_3_uni_wgt = get_co_occurence_dict(train_sess_itemid + test_sess_itemid, bidirection=False, weighted=True, max_dis=3)

100%|██████████| 3919479/3919479 [00:47<00:00, 81835.86it/s] 


In [30]:
co_occurence_dict_dis_3_bi = get_co_occurence_dict(train_sess_itemid + test_sess_itemid, bidirection=True, weighted=False, max_dis=3)

100%|██████████| 3919479/3919479 [00:51<00:00, 75837.25it/s] 


In [31]:
co_occurence_dict_merged = {}
co_occurence_dict_list = [co_occurence_dict_uni, co_occurence_dict_dis_3_uni_wgt, co_occurence_dict_dis_3_bi]
for co_occ_dict in co_occurence_dict_list:
    for item_id in tqdm(co_occ_dict.keys(), total=len(co_occ_dict)):
        if item_id not in co_occurence_dict_merged:
            co_occurence_dict_merged[item_id] = Counter()
        for neigh in co_occ_dict[item_id].keys():
            co_occurence_dict_merged[item_id][neigh] += co_occ_dict[item_id][neigh]

100%|██████████| 1401599/1401599 [00:32<00:00, 43200.39it/s] 
100%|██████████| 1401599/1401599 [00:15<00:00, 88310.97it/s] 
100%|██████████| 1401599/1401599 [00:20<00:00, 68111.74it/s] 


In [40]:
neighbors_df = get_neighbors_for_each_product(co_occurence_dict_merged, id_dict, pop_items_500, 300)

100%|██████████| 1410675/1410675 [05:02<00:00, 4670.64it/s]


In [41]:
collect_all_counts(neighbors_df, co_occurence_dict_list)

100%|██████████| 1410675/1410675 [07:54<00:00, 2972.38it/s]
100%|██████████| 1410675/1410675 [07:22<00:00, 3189.12it/s]
100%|██████████| 1410675/1410675 [08:47<00:00, 2675.40it/s]


In [42]:
id2pid = {v:k for k,v in id_dict.items()}
neighbors_df['candidates'] = neighbors_df['candidates_id'].apply(lambda x: [id2pid[_] for _ in x])

In [43]:
neighbors_df = neighbors_df[['pid', 'candidates', 'counts', 'normalized_counts', 'state', \
                             'counts_0', 'normalized_counts_0', 'counts_1', 'normalized_counts_1', 'counts_2', 'normalized_counts_2',]]
neighbors_df.rename(columns={'pid' : 'id'}, inplace=True)
neighbors_df.head(5)

Unnamed: 0,id,candidates,counts,normalized_counts,state,counts_0,normalized_counts_0,counts_1,normalized_counts_1,counts_2,normalized_counts_2
0,B005ZSSN10,"[B005ZSSMO8, B005ZSSNXS, B07B3XSHWN, B074X4W71...","[4.0, 3.0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.5, 0.375, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",Pad,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
1,B08PRYN6LD,"[B08PSBK59Y, B005PKZK7S, B075WSVXS3, B08PRYN6L...","[21.0, 8.833333333333332, 5.5, 3.3333333333333...","[0.3073170731707317, 0.12926829268292683, 0.08...",Pad,"[7, 6, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, ...","[0.23333333333333334, 0.2, 0.03333333333333333...","[6.0, 0.8333333333333333, 0.5, 0.3333333333333...","[0.5806451612903225, 0.08064516129032256, 0.04...","[8, 2, 4, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0.2857142857142857, 0.07142857142857142, 0.14..."
2,B09MBZJ48V,"[B07VR16HF9, B089QVZBWM, B005HWEZGG, B07FY5LCH...","[91.33333333333333, 13.0, 8.0, 7.3333333333333...","[0.37127371273712734, 0.052845528455284556, 0....",Pad,"[28, 6, 3, 2, 2, 3, 2, 1, 1, 1, 1, 1, 0, 0, 0,...","[0.3684210526315789, 0.07894736842105263, 0.03...","[27.333333333333332, 2.0, 1.0, 1.3333333333333...","[0.6212121212121211, 0.04545454545454545, 0.02...","[36, 5, 4, 4, 4, 3, 4, 3, 2, 2, 1, 1, 3, 3, 3,...","[0.2857142857142857, 0.03968253968253968, 0.03..."
3,B08ZN6F26S,"[B08PVG787Z, B08ZN6F26S, B00VMUSFMW, B07JKP79B...","[15.0, 14.333333333333332, 5.5, 5.333333333333...","[0.16453382084095067, 0.15722120658135283, 0.0...",Pad,"[5, 4, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, ...","[0.1388888888888889, 0.1111111111111111, 0.055...","[4.0, 2.333333333333333, 1.5, 1.33333333333333...","[0.24742268041237112, 0.14432989690721645, 0.0...","[6, 8, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 0, 0, ...","[0.15384615384615385, 0.20512820512820512, 0.0..."
4,B094DGRV7D,"[B08KTP6517, B06WVS91VM, B094DGRV7D, B09JSLJZK...","[21.5, 13.0, 12.333333333333332, 11.0, 10.6666...","[0.08279845956354302, 0.05006418485237485, 0.0...",Pad,"[5, 5, 4, 2, 4, 3, 2, 2, 3, 2, 3, 3, 3, 2, 3, ...","[0.060240963855421686, 0.060240963855421686, 0...","[2.5, 3.0, 2.333333333333333, 2.0, 0.666666666...","[0.06302521008403361, 0.07563025210084033, 0.0...","[14, 5, 6, 7, 6, 6, 4, 4, 3, 4, 3, 3, 2, 2, 2,...","[0.10218978102189781, 0.0364963503649635, 0.04..."


In [44]:
neighbors_df['candidates'].apply(len).describe()

count    1410675.0
mean         300.0
std            0.0
min          300.0
25%          300.0
50%          300.0
75%          300.0
max          300.0
Name: candidates, dtype: float64

In [45]:
type(neighbors_df['counts_0'].iloc[0][0])

int

In [46]:
cast_dtype(neighbors_df)

In [47]:
neighbors_df.to_parquet(f'../candidates/co_graph_item_candidates_300_with_normalized_score_2.parquet', engine='pyarrow')

In [48]:
neighbors_df['candidates'] = neighbors_df['candidates'].apply(lambda x : x[:100])
neighbors_df['counts'] = neighbors_df['counts'].apply(lambda x : x[:100])
neighbors_df['normalized_counts'] = neighbors_df['normalized_counts'].apply(lambda x : x[:100])
neighbors_df['counts_0'] = neighbors_df['counts_0'].apply(lambda x : x[:100])
neighbors_df['normalized_counts_0'] = neighbors_df['normalized_counts_0'].apply(lambda x : x[:100])
neighbors_df['counts_1'] = neighbors_df['counts_1'].apply(lambda x : x[:100])
neighbors_df['normalized_counts_1'] = neighbors_df['normalized_counts_1'].apply(lambda x : x[:100])
neighbors_df['counts_2'] = neighbors_df['counts_2'].apply(lambda x : x[:100])
neighbors_df['normalized_counts_2'] = neighbors_df['normalized_counts_2'].apply(lambda x : x[:100])
neighbors_df['candidates'].apply(len).describe()
neighbors_df['counts'].apply(len).describe()

count    1410675.0
mean         100.0
std            0.0
min          100.0
25%          100.0
50%          100.0
75%          100.0
max          100.0
Name: counts, dtype: float64

In [49]:
neighbors_df.to_parquet(f'../candidates/co_graph_item_candidates_100_with_normalized_score_2.parquet', engine='pyarrow')

In [50]:
neighbors_df = neighbors_df.set_index('id', drop=True)
neighbors_df.head(5)

Unnamed: 0_level_0,candidates,counts,normalized_counts,state,counts_0,normalized_counts_0,counts_1,normalized_counts_1,counts_2,normalized_counts_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B005ZSSN10,"[B005ZSSMO8, B005ZSSNXS, B07B3XSHWN, B074X4W71...","[4.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.5, 0.375, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",Pad,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
B08PRYN6LD,"[B08PSBK59Y, B005PKZK7S, B075WSVXS3, B08PRYN6L...","[21.0, 8.833333, 5.5, 3.3333333, 2.5, 2.5, 2.5...","[0.30731708, 0.12926829, 0.0804878, 0.04878048...",Pad,"[7, 6, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, ...","[0.23333333, 0.2, 0.033333335, 0.033333335, 0....","[6.0, 0.8333333, 0.5, 0.33333334, 0.5, 0.5, 0....","[0.58064514, 0.08064516, 0.048387095, 0.032258...","[8, 2, 4, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0.2857143, 0.071428575, 0.14285715, 0.0714285..."
B09MBZJ48V,"[B07VR16HF9, B089QVZBWM, B005HWEZGG, B07FY5LCH...","[91.333336, 13.0, 8.0, 7.3333335, 7.3333335, 7...","[0.37127373, 0.05284553, 0.032520324, 0.029810...",Pad,"[28, 6, 3, 2, 2, 3, 2, 1, 1, 1, 1, 1, 0, 0, 0,...","[0.36842105, 0.078947365, 0.039473683, 0.02631...","[27.333334, 2.0, 1.0, 1.3333334, 1.3333334, 1....","[0.6212121, 0.045454547, 0.022727273, 0.030303...","[36, 5, 4, 4, 4, 3, 4, 3, 2, 2, 1, 1, 3, 3, 3,...","[0.2857143, 0.03968254, 0.031746034, 0.0317460..."
B08ZN6F26S,"[B08PVG787Z, B08ZN6F26S, B00VMUSFMW, B07JKP79B...","[15.0, 14.333333, 5.5, 5.3333335, 5.3333335, 4...","[0.16453382, 0.15722121, 0.06032907, 0.0585009...",Pad,"[5, 4, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, ...","[0.1388889, 0.11111111, 0.055555556, 0.0555555...","[4.0, 2.3333333, 1.5, 1.3333334, 1.3333334, 0....","[0.24742268, 0.14432989, 0.0927835, 0.08247422...","[6, 8, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 0, 0, ...","[0.15384616, 0.20512821, 0.051282052, 0.051282..."
B094DGRV7D,"[B08KTP6517, B06WVS91VM, B094DGRV7D, B09JSLJZK...","[21.5, 13.0, 12.333333, 11.0, 10.666667, 10.33...","[0.08279846, 0.050064184, 0.047496792, 0.04236...",Pad,"[5, 5, 4, 2, 4, 3, 2, 2, 3, 2, 3, 3, 3, 2, 3, ...","[0.060240965, 0.060240965, 0.04819277, 0.02409...","[2.5, 3.0, 2.3333333, 2.0, 0.6666667, 1.333333...","[0.063025214, 0.075630255, 0.05882353, 0.05042...","[14, 5, 6, 7, 6, 6, 4, 4, 3, 4, 3, 3, 2, 2, 2,...","[0.10218978, 0.03649635, 0.04379562, 0.0510948..."


In [51]:
test_last_items = get_session_last_item(test_sess_data)
test_candidates = neighbors_df.loc[test_last_items]

100%|██████████| 361581/361581 [00:09<00:00, 38156.06it/s]


In [52]:
test_candidates.reset_index(drop=True, inplace=True)
test_candidates['sess_id'] = np.arange(test_candidates.shape[0], dtype=np.int32)

In [56]:
test_candidates.to_parquet('../candidates/co_graph/co_graph_valid_100_with_normalized_score_2.parquet', engine='pyarrow')

: 

In [55]:
test_candidates['counts'].apply(len).describe()

count    361581.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: counts, dtype: float64

## Part2 : Use co-graph to predict candidates in test

In [14]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data(task)
product = read_product_data()

/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost
../data_for_recstudio/products_train.csv


In [15]:
# Map product id
id_dict = { id: i+1 for i, id in enumerate(product['id'].unique()) }    # 0 saved for padding

In [16]:
locale_info = product[['id', 'locale']].groupby('id')['locale'].agg(list)
id2locale = {id_dict[x]: locale_info[x] for x in product['id'].unique()}
test_locale = test_sess_data['locale'].tolist()

In [17]:
train_sess_itemid = get_sessions(train_sess_data, id_dict)
valid_sess_itemid = get_sessions(valid_sess_data, id_dict)
test_sess_itemid = get_sessions(test_sess_data, id_dict, test=True)

100%|██████████| 3557898/3557898 [01:43<00:00, 34384.93it/s]
100%|██████████| 361581/361581 [00:10<00:00, 36152.94it/s]
100%|██████████| 316971/316971 [00:05<00:00, 53814.24it/s]


In [18]:
# train and valid include data in test.
co_occurence_dict_uni = get_co_occurence_dict(train_sess_itemid + valid_sess_itemid, bidirection=False, weighted=False)

100%|██████████| 3919479/3919479 [00:45<00:00, 86325.86it/s]


In [19]:
co_occurence_dict_dis_3_uni_wgt = get_co_occurence_dict(train_sess_itemid + valid_sess_itemid, bidirection=False, weighted=True, max_dis=3)

100%|██████████| 3919479/3919479 [00:48<00:00, 81390.88it/s]


In [20]:
co_occurence_dict_dis_3_bi = get_co_occurence_dict(train_sess_itemid + valid_sess_itemid, bidirection=True, weighted=False, max_dis=3)

100%|██████████| 3919479/3919479 [01:00<00:00, 65292.90it/s]


In [21]:
train_pop = get_item_pop(train_sess_itemid)
valid_pop = get_item_pop(valid_sess_data)
total_pop = train_pop + valid_pop
sorted_pop = sorted(total_pop.items(), key=lambda x:-x[1])
pop_items_500 = [x[0] for x in sorted_pop[:500]]

In [22]:
co_occurence_dict_merged = {}
co_occurence_dict_list = [co_occurence_dict_uni, co_occurence_dict_dis_3_uni_wgt, co_occurence_dict_dis_3_bi]
for co_occ_dict in co_occurence_dict_list:
    for item_id in tqdm(co_occ_dict.keys(), total=len(co_occ_dict)):
        if item_id not in co_occurence_dict_merged:
            co_occurence_dict_merged[item_id] = Counter()
        for neigh in co_occ_dict[item_id].keys():
            co_occurence_dict_merged[item_id][neigh] += co_occ_dict[item_id][neigh]

100%|██████████| 1405385/1405385 [00:22<00:00, 61433.73it/s] 
100%|██████████| 1405385/1405385 [00:16<00:00, 87183.63it/s] 
100%|██████████| 1405385/1405385 [00:21<00:00, 65469.18it/s] 


In [23]:
neighbors_df = get_neighbors_for_each_product(co_occurence_dict_merged, id_dict, pop_items_500, 300)

100%|██████████| 1410675/1410675 [05:38<00:00, 4163.89it/s]


In [24]:
collect_all_counts(neighbors_df, co_occurence_dict_list)

100%|██████████| 1410675/1410675 [07:43<00:00, 3046.15it/s]
100%|██████████| 1410675/1410675 [09:23<00:00, 2502.38it/s]
100%|██████████| 1410675/1410675 [09:46<00:00, 2405.08it/s]


In [26]:
neighbors_df

Unnamed: 0,id,pid,candidates_id,counts,normalized_counts,state,counts_0,normalized_counts_0,counts_1,normalized_counts_1,counts_2,normalized_counts_2
0,1,B005ZSSN10,"[467115, 304519, 35778, 170156, 356145, 267156...","[4.0, 3.0, 2.5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.38095238095238093, 0.2857142857142857, 0.23...",Pad,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3333333333333333, 0.3333333333333333, 0.333...","[1.0, 1.0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.4, 0.4, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.4, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,B08PRYN6LD,"[474553, 134034, 345580, 2, 215789, 49563, 202...","[21.0, 8.833333333333332, 5.5, 3.3333333333333...","[0.3073170731707317, 0.12926829268292683, 0.08...",Pad,"[7, 6, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, ...","[0.23333333333333334, 0.2, 0.03333333333333333...","[6.0, 0.8333333333333333, 0.5, 0.3333333333333...","[0.5806451612903225, 0.08064516129032256, 0.04...","[8, 2, 4, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0.2857142857142857, 0.07142857142857142, 0.14..."
2,3,B09MBZJ48V,"[481603, 116190, 119093, 436535, 74637, 85944,...","[91.33333333333333, 13.0, 8.0, 7.3333333333333...","[0.37127371273712734, 0.052845528455284556, 0....",Pad,"[28, 6, 3, 2, 2, 3, 2, 1, 1, 1, 1, 1, 0, 0, 0,...","[0.3684210526315789, 0.07894736842105263, 0.03...","[27.333333333333332, 2.0, 1.0, 1.3333333333333...","[0.6212121212121211, 0.04545454545454545, 0.02...","[36, 5, 4, 4, 4, 3, 4, 3, 2, 2, 1, 1, 3, 3, 3,...","[0.2857142857142857, 0.03968253968253968, 0.03..."
3,4,B08ZN6F26S,"[1050130, 4, 1118660, 215297, 1036575, 246172,...","[15.0, 14.333333333333332, 5.5, 5.333333333333...","[0.16453382084095067, 0.15722120658135283, 0.0...",Pad,"[5, 4, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, ...","[0.1388888888888889, 0.1111111111111111, 0.055...","[4.0, 2.333333333333333, 1.5, 1.33333333333333...","[0.24742268041237112, 0.14432989690721645, 0.0...","[6, 8, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 0, 0, ...","[0.15384615384615385, 0.20512820512820512, 0.0..."
4,5,B094DGRV7D,"[977, 426878, 5, 53514, 40200, 398380, 364139,...","[21.5, 13.0, 12.333333333333332, 11.0, 10.6666...","[0.08113207547169811, 0.04905660377358491, 0.0...",Pad,"[5, 5, 4, 2, 4, 3, 4, 2, 2, 2, 3, 3, 3, 2, 3, ...","[0.05952380952380952, 0.05952380952380952, 0.0...","[2.5, 3.0, 2.333333333333333, 2.0, 0.666666666...","[0.062499999999999986, 0.07499999999999998, 0....","[14, 5, 6, 7, 6, 6, 4, 4, 4, 4, 3, 3, 2, 2, 2,...","[0.09929078014184398, 0.03546099290780142, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1410670,1410671,B09XN5CXDM,"[182517, 1345321, 1343232, 1403874, 142970, 62...","[7.333333333333333, 4.5, 3.0, 2.5, 1, 0, 0, 0,...","[0.4, 0.24545454545454548, 0.16363636363636364...",Pad,"[4, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.4, 0.3, 0.1, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, ...","[1.3333333333333333, 0.5, 1.0, 0.5, 0, 0, 0, 0...","[0.4, 0.15000000000000002, 0.30000000000000004...","[2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.4, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1410671,1410672,B09S3KGLG6,"[163250, 1037502, 1400924, 517279, 97583, 1386...","[5, 4.5, 3.0, 3.0, 3, 2.5, 2, 2, 1, 0, 0, 0, 0...","[0.19230769230769232, 0.17307692307692307, 0.1...",Pad,"[0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.25, 0.25, 0.25, 0.0, 0.25, 0.0, 0.0, 0...","[0, 0.5, 1.0, 1.0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0...","[0.0, 0.16666666666666666, 0.3333333333333333,...","[5, 3, 1, 1, 3, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, ...","[0.2631578947368421, 0.15789473684210525, 0.05..."
1410672,1410673,B00E4L5YPW,"[1393542, 1385589, 1391088, 1393291, 1404397, ...","[16.833333333333332, 16.5, 14.833333333333332,...","[0.1337748344370861, 0.13112582781456955, 0.11...",Pad,"[4, 5, 3, 4, 3, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.0784313725490196, 0.09803921568627451, 0.05...","[1.8333333333333333, 2.5, 1.8333333333333333, ...","[0.09734513274336283, 0.13274336283185842, 0.0...","[11, 9, 10, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[0.19642857142857142, 0.16071428571428573, 0.1..."
1410673,1410674,B08B4DFWCR,"[1386437, 1410636, 1402535, 1404727, 1400680, ...","[32.333333333333336, 22.0, 10.333333333333334,...","[0.23658536585365852, 0.16097560975609754, 0.0...",Pad,"[3, 2, 4, 1, 2, 1, 0, 1, 1, 1, 3, 1, 1, 1, 0, ...","[0.12, 0.08, 0.16, 0.04, 0.08, 0.04, 0.0, 0.04...","[2.333333333333333, 2.0, 2.3333333333333335, 0...","[0.17073170731707313, 0.14634146341463414, 0.1...","[27, 18, 4, 9, 3, 2, 4, 1, 1, 1, 0, 1, 1, 1, 2...","[0.2755102040816326, 0.1836734693877551, 0.040..."


In [27]:
id2pid = {v:k for k,v in id_dict.items()}
neighbors_df['candidates'] = neighbors_df['candidates_id'].apply(lambda x: [id2pid[_] for _ in x])
neighbors_df = neighbors_df[['pid', 'candidates', 'counts', 'normalized_counts', 'state', \
                             'counts_0', 'normalized_counts_0', 'counts_1', 'normalized_counts_1', 'counts_2', 'normalized_counts_2',]]
neighbors_df.rename(columns={'pid' : 'id'}, inplace=True)
neighbors_df.head(5)
cast_dtype(neighbors_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighbors_df.rename(columns={'pid' : 'id'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))


In [31]:
neighbors_df.to_parquet(f'../candidates/co_graph_item_candidates_300_test_with_normalized_score.parquet', engine='pyarrow')

In [32]:
neighbors_df['candidates'] = neighbors_df['candidates'].apply(lambda x : x[:100])
neighbors_df['counts'] = neighbors_df['counts'].apply(lambda x : x[:100])
neighbors_df['normalized_counts'] = neighbors_df['normalized_counts'].apply(lambda x : x[:100])
neighbors_df['counts_0'] = neighbors_df['counts_0'].apply(lambda x : x[:100])
neighbors_df['normalized_counts_0'] = neighbors_df['normalized_counts_0'].apply(lambda x : x[:100])
neighbors_df['counts_1'] = neighbors_df['counts_1'].apply(lambda x : x[:100])
neighbors_df['normalized_counts_1'] = neighbors_df['normalized_counts_1'].apply(lambda x : x[:100])
neighbors_df['counts_2'] = neighbors_df['counts_2'].apply(lambda x : x[:100])
neighbors_df['normalized_counts_2'] = neighbors_df['normalized_counts_2'].apply(lambda x : x[:100])
neighbors_df['candidates'].apply(len).describe()
neighbors_df['counts'].apply(len).describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighbors_df['candidates'] = neighbors_df['candidates'].apply(lambda x : x[:100])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighbors_df['counts'] = neighbors_df['counts'].apply(lambda x : x[:100])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighbors_df['normalized_counts'] = neighbors_df[

count    1410675.0
mean         100.0
std            0.0
min          100.0
25%          100.0
50%          100.0
75%          100.0
max          100.0
Name: counts, dtype: float64

In [35]:
neighbors_df.to_parquet(f'../candidates/co_graph/co_graph_item_candidates_100_test_with_normalized_score.parquet', engine='pyarrow')

In [36]:
neighbors_df = neighbors_df.set_index('id', drop=True)
neighbors_df.head(5)

Unnamed: 0_level_0,candidates,counts,normalized_counts,state,counts_0,normalized_counts_0,counts_1,normalized_counts_1,counts_2,normalized_counts_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B005ZSSN10,"[B005ZSSMO8, B005ZSSNXS, B00I41DP18, B07B3XSHW...","[4.0, 3.0, 2.5, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.3809524, 0.2857143, 0.23809524, 0.0952381, ...",Pad,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.33333334, 0.33333334, 0.33333334, 0.0, 0.0,...","[1.0, 1.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.4, 0.4, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.4, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
B08PRYN6LD,"[B08PSBK59Y, B005PKZK7S, B075WSVXS3, B08PRYN6L...","[21.0, 8.833333, 5.5, 3.3333333, 2.5, 2.5, 2.5...","[0.30731708, 0.12926829, 0.0804878, 0.04878048...",Pad,"[7, 6, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, ...","[0.23333333, 0.2, 0.033333335, 0.033333335, 0....","[6.0, 0.8333333, 0.5, 0.33333334, 0.5, 0.5, 0....","[0.58064514, 0.08064516, 0.048387095, 0.032258...","[8, 2, 4, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0.2857143, 0.071428575, 0.14285715, 0.0714285..."
B09MBZJ48V,"[B07VR16HF9, B089QVZBWM, B005HWEZGG, B07FY5LCH...","[91.333336, 13.0, 8.0, 7.3333335, 7.3333335, 7...","[0.37127373, 0.05284553, 0.032520324, 0.029810...",Pad,"[28, 6, 3, 2, 2, 3, 2, 1, 1, 1, 1, 1, 0, 0, 0,...","[0.36842105, 0.078947365, 0.039473683, 0.02631...","[27.333334, 2.0, 1.0, 1.3333334, 1.3333334, 1....","[0.6212121, 0.045454547, 0.022727273, 0.030303...","[36, 5, 4, 4, 4, 3, 4, 3, 2, 2, 1, 1, 3, 3, 3,...","[0.2857143, 0.03968254, 0.031746034, 0.0317460..."
B08ZN6F26S,"[B08PVG787Z, B08ZN6F26S, B00VMUSFMW, B07JKP79B...","[15.0, 14.333333, 5.5, 5.3333335, 5.3333335, 4...","[0.16453382, 0.15722121, 0.06032907, 0.0585009...",Pad,"[5, 4, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, ...","[0.1388889, 0.11111111, 0.055555556, 0.0555555...","[4.0, 2.3333333, 1.5, 1.3333334, 1.3333334, 0....","[0.24742268, 0.14432989, 0.0927835, 0.08247422...","[6, 8, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 0, 0, ...","[0.15384616, 0.20512821, 0.051282052, 0.051282..."
B094DGRV7D,"[B08KTP6517, B06WVS91VM, B094DGRV7D, B09JSLJZK...","[21.5, 13.0, 12.333333, 11.0, 10.666667, 10.33...","[0.08113208, 0.049056605, 0.04654088, 0.041509...",Pad,"[5, 5, 4, 2, 4, 3, 4, 2, 2, 2, 3, 3, 3, 2, 3, ...","[0.05952381, 0.05952381, 0.04761905, 0.0238095...","[2.5, 3.0, 2.3333333, 2.0, 0.6666667, 1.333333...","[0.0625, 0.075, 0.058333334, 0.05, 0.016666668...","[14, 5, 6, 7, 6, 6, 4, 4, 4, 4, 3, 3, 2, 2, 2,...","[0.09929078, 0.035460994, 0.04255319, 0.049645..."


In [39]:
test_last_items = get_session_last_item(test_sess_data)
test_candidates = neighbors_df.loc[test_last_items]

100%|██████████| 316971/316971 [00:08<00:00, 37297.14it/s]


In [42]:
test_candidates.reset_index(drop=True, inplace=True)
test_candidates['sess_id'] = np.arange(test_candidates.shape[0], dtype=np.int32)

In [46]:
test_candidates.to_parquet('../candidates/co_graph/co_graph_test_100_with_normalized_score.parquet', engine='pyarrow')

In [44]:
test_candidates['candidates'].apply(len).describe()

count    316971.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: candidates, dtype: float64