## Part0: Necessary Common Functions

Those functions should be ran before each part.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm
from collections import Counter, defaultdict

In [2]:
# Cache loading of data for multiple calls
train_data_dir = '../raw_data/'
test_data_dir = '../raw_data/'
task = 'task1'
PREDS_PER_SESSION = 100


@lru_cache(maxsize=1)
def read_product_data():
    print(os.getcwd())
    print(os.path.join(train_data_dir, 'products_train.csv'))
    return pd.read_csv(os.path.join(train_data_dir, 'products_train.csv'))

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(os.path.join(train_data_dir, 'sessions_train.csv'))

@lru_cache(maxsize=3)
def read_test_data(task):
    return pd.read_csv(os.path.join(test_data_dir, f'sessions_test_{task}.csv'))

In [3]:
def map_id(id: str, id_dict: dict) -> int:
    return id_dict[id]

In [4]:

def get_sessions(df: pd.DataFrame, id_dict: dict, test=False, list_item=False) -> list:
    if 'next_item' in df and not test:
        if list_item:
            all_item = df['prev_items']
        else:
            all_item = df.apply(lambda x: eval((x['prev_items'][:-1]+f" '{x['next_item']}']").replace(" ", ",")), axis=1)
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            all_item = df.apply(lambda x: eval(x['prev_items'].replace(" ", ",")), axis=1)
    all_item_id = []
    for x in all_item:
        all_item_id.append([map_id(y, id_dict) for y in x])
    return all_item_id

In [5]:
def get_item_pop(sess: list, ):
    all_item_id = []
    for s in train_sess_itemid:
        all_item_id += s
    return Counter(all_item_id)

In [6]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            for j in range(i+1, len(sess)):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res


def convert_co_occurence_dict_to_matrix(co_occurence_dict: dict) -> ssp.csr_matrix:
    pass


In [7]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [8]:
def predict_with_co_occurence_dict(sess: list, sess_locale: list, id2locale: dict, co_occurence_dict: dict, topk: int, pred_with_last: bool=False, remove_hist: bool=False) -> list:
    sorted_dict = sort_co_occurence_dict(co_occurence_dict)
    res = []
    for i, s in tqdm(enumerate(sess)):
        locale = sess_locale[i]
        if not pred_with_last:
            cand = Counter()
            for i,id in enumerate(s):
                if id in sorted_dict:
                    if id in sorted_dict:
                        cand = cand + Counter({ k:sorted_dict[id][k] * (i / len(s)) for k in list(sorted_dict[id].keys()) } )
            cand = sorted(cand.items(), key=lambda x: -x[1])
            cand = [i[0] for i in cand]
        else:
            if s[-1] not in sorted_dict:
                cand = []
            else:
                cand = list(sorted_dict[s[-1]].keys())

        if remove_hist:
            cand = [x for x in cand if x not in s]

        cand = [x for x in cand if locale in id2locale[x]]

        cand = cand[: min(len(cand), topk)]
        res.append(cand)
    return res

In [9]:
def pad_topk_with_popular_items(rec_list: list, topk: int, pop_items: list) -> list:
    # TODO: remove duplicated items; add locale constraint
    res = [None] * len(rec_list)
    for i, l in enumerate(rec_list):
        pad_len = topk - len(l)
        if pad_len > 0:
            pad_items = random.sample(pop_items, pad_len)
            res[i] = l + pad_items
        else:
            res[i] = l
    return res

In [10]:
def id2predictions(pred: list, test_df: pd.DataFrame, id_dict: dict) -> pd.DataFrame:
    id2product = {v:k for k,v in id_dict.items()}
    assert len(pred) == test_df.shape[0]
    product_pred = [None] * len(pred)
    for i, l in enumerate(pred):
        product_pred[i] = [id2product[x] for x in l]
    res = pd.DataFrame()
    res['locale'] = test_df['locale']
    res['next_item_prediction'] = product_pred
    return res

## Part1: Predict with Co-Occurrence Graph

In [204]:
train_sess_data = read_train_data()
test_sess_data = read_test_data(task)
product = read_product_data()

/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/simple_method
../raw_data/products_train.csv


In [205]:
# Map product id
id_dict = { id: i+1 for i, id in enumerate(product['id'].unique()) }    # 0 saved for padding

In [206]:
locale_info = product[['id', 'locale']].groupby('id')['locale'].agg(list)
id2locale = {id_dict[x]: locale_info[x] for x in product['id'].unique()}
test_locale = test_sess_data['locale'].tolist()

In [207]:
train_sess_itemid = get_sessions(train_sess_data, id_dict)
test_sess_itemid = get_sessions(test_sess_data, id_dict, test=True)

In [208]:
train_pop = get_item_pop(train_sess_itemid)
test_pop = get_item_pop(test_sess_data)
total_pop = train_pop + test_pop
sorted_pop = sorted(total_pop.items(), key=lambda x:-x[1])
pop_items_200 = [x[0] for x in sorted_pop[:200]]

In [209]:
co_occurence_dict_train = get_co_occurence_dict(train_sess_itemid, bidirection=False)
co_occurence_dict_test = get_co_occurence_dict(test_sess_itemid, bidirection=False)

100%|██████████| 3606249/3606249 [00:50<00:00, 71406.69it/s] 
100%|██████████| 316971/316971 [00:03<00:00, 105193.21it/s]


In [210]:
co_occurence_dict = get_co_occurence_dict(train_sess_itemid + test_sess_itemid, bidirection=False, weighted=False)
res_test = predict_with_co_occurence_dict(test_sess_itemid, test_locale, id2locale, co_occurence_dict_train, 100, True, True)
res = pad_topk_with_popular_items(res_test, 100, pop_items_200)
pred_res = id2predictions(res, test_sess_data, id_dict)

100%|██████████| 3923220/3923220 [00:53<00:00, 73556.13it/s] 


In [219]:
pred_res['next_item_prediction'].apply(len).describe()

count    316971.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: next_item_prediction, dtype: float64

In [220]:
pred_res.to_parquet(f'submission_co_occurence_{task}.parquet', engine='pyarrow')

## Part2: Use neighbors in co-occurence graph to compute coverage
- First we need a validation dataset
- Then we should compare several stategies below:
    - Undirected graph + predict with history items 
    - Undirected graph + predict with last item
    - Directed graph + predict with history items 
    - Directed graph + predict with last item

In [11]:
def predict_with_neighbors(sess: list, co_occurence_dict: dict, pred_with_last: bool=False) -> list:
    sorted_dict = sort_co_occurence_dict(co_occurence_dict)
    res = []
    for i, s in tqdm(enumerate(sess), total=len(sess)):
        if not pred_with_last:
            cand = Counter()
            for i,id in enumerate(s):
                if id in sorted_dict:
                    if id in sorted_dict:
                        cand = cand + Counter({ k:sorted_dict[id][k] * (i / len(s)) for k in list(sorted_dict[id].keys()) } )
            cand = sorted(cand.items(), key=lambda x: -x[1])
            cand = [i[0] for i in cand]
        else:
            if s[-1] not in sorted_dict:
                cand = []
            else:
                cand = list(sorted_dict[s[-1]].keys())

        res.append(cand)
    return res

In [12]:
def cal_hit(pred, ground_truth) -> float:
    hit_num = 0
    for i, l in enumerate(pred):
        if ground_truth[i] in l:
            hit_num += 1
    return hit_num / len(ground_truth)

In [58]:
trn_df = pd.read_csv("../raw_data/sampled_train_data.csv")
test_df = pd.read_csv("../raw_data/sampled_test_data.csv")
product_info = read_product_data()

In [13]:
def recstudio_data_to_ori_data(rec_df):
    res = rec_df.groupby("sess_id").agg(list)
    res['next_item'] = res['product_id'].apply(lambda x: x[-1])
    res['prev_items'] = res['product_id'].apply(lambda x: x[: -1])
    res['locale'] = res['locale'].apply(lambda x:x[0])
    res = res[['prev_items', 'next_item', 'locale']]
    return res

In [14]:
data_type = 'all' # ['all', 'tune']
train_sess_data_0 = pd.read_csv(f'../data_for_recstudio/{data_type}_task_1_train_inter_feat.csv')
test_sess_data_0 = pd.read_csv(f'../data_for_recstudio/{data_type}_task_1_valid_inter_feat.csv')
product = read_product_data()

/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/simple_method
../raw_data/products_train.csv


In [15]:
train_sess_data_0

Unnamed: 0,sess_id,product_id,timestamp,locale
0,0,B005ZJTUXE,0,FR
1,0,B005ZJTUXE,1,FR
2,0,B00P8VIBBG,2,FR
3,0,B07TVSL9TW,3,FR
4,1,B09M8HSN22,0,DE
...,...,...,...,...
18321870,3557896,B09J4G565S,2,UK
18321871,3557896,B08K8LLFQ6,3,UK
18321872,3557897,B09ZLBFC7L,0,UK
18321873,3557897,B09ZL9PK6Q,1,UK


In [16]:
trn_df = recstudio_data_to_ori_data(train_sess_data_0)
test_df = recstudio_data_to_ori_data(test_sess_data_0)

In [17]:
id_dict = { id: i+1 for i, id in enumerate(product['id'].unique()) }    # 0 saved for padding

In [18]:
train_sess_itemid = get_sessions(trn_df, id_dict, list_item=True)
test_sess_itemid = get_sessions(test_df, id_dict, test=True, list_item=True)

In [19]:
ground_truth = [id_dict[x] for x in test_df['next_item'].tolist()]

### Undirected graph

In [20]:
co_occurence_dict_train = get_co_occurence_dict(train_sess_itemid, bidirection=True)

100%|██████████| 3557898/3557898 [00:51<00:00, 69489.03it/s]


In [21]:
# predict with all items in session
pred_res = predict_with_neighbors(test_sess_itemid, co_occurence_dict_train, pred_with_last=False)
print('Hit Ratio', (cal_hit(pred_res, ground_truth)))

100%|██████████| 361581/361581 [03:28<00:00, 1734.45it/s]


Hit Ratio 0.555621008847257


In [33]:
length = np.array([len(r) for r in pred_res])
print("Length of neighbors: Mean={}, Min={}, Max={}".format(length.mean(), length.min(), length.max()))

Length of neighbors: Mean=251.27782433258383, Min=0, Max=11263


In [34]:
# predict with last item
pred_res = predict_with_neighbors(test_sess_itemid, co_occurence_dict_train, pred_with_last=True)
print('Hit Ratio', (cal_hit(pred_res, ground_truth)))

100%|██████████| 361581/361581 [00:07<00:00, 45882.60it/s]


Hit Ratio 0.5127813684900479


In [66]:
length = np.array([len(r) for r in pred_res])
print("Length of neighbors: Mean={}, Min={}, Max={}".format(length.mean(), length.min(), length.max()))

Length of neighbors: Mean=147.21013299170326, Min=0, Max=2989


### Directed graph

In [None]:
co_occurence_dict_train = get_co_occurence_dict(train_sess_itemid, bidirection=False)

100%|██████████| 3245625/3245625 [00:54<00:00, 60052.64it/s] 


In [None]:
# predict with all items in session
pred_res = predict_with_neighbors(test_sess_itemid, co_occurence_dict_train, pred_with_last=False)
print('Hit Ratio', (cal_hit(pred_res, ground_truth)))

360624it [03:38, 1651.61it/s]


Hit Ratio 0.5712487244332046


In [None]:
# predict with last item
pred_res = predict_with_neighbors(test_sess_itemid, co_occurence_dict_train, pred_with_last=True)
print('Hit Ratio', (cal_hit(pred_res, ground_truth)))

360624it [00:04, 74615.62it/s]


Hit Ratio 0.5062364124406584


## Part3: Generate each item's neighbors in co-occurence graph

In [13]:
# train_sess_data = read_train_data()
# test_sess_data = read_test_data('task1')
# product = read_product_data()

/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/simple_method
../raw_data/products_train.csv


In [2]:
data_type = 'all' # ['all', 'tune']
train_sess_data_0 = pd.read_csv(f'../data_for_recstudio/{data_type}_task_1_train_inter_feat.csv')
test_sess_data_0 = pd.read_csv(f'../data_for_recstudio/{data_type}_task_1_valid_inter_feat.csv')
product = read_product_data()

NameError: name 'read_product_data' is not defined

In [4]:
test_sess_data_0.head(10)

Unnamed: 0,sess_id,product_id,timestamp,locale
0,0,B09VSN9GLS,0,UK
1,0,B09VSG9DCG,1,UK
2,0,B0BJ5L1ZPH,2,UK
3,0,B09VSN9GLS,3,UK
4,0,B0BJ6V797Y,4,UK
5,0,B09VSG9DCG,5,UK
6,0,B077XGDMD2,6,UK
7,0,B06XG1LZ6Z,7,UK
8,1,B00390YWXE,0,JP
9,1,B00390YWXE,1,JP


In [55]:
train_sess_data = train_sess_data_0.groupby("sess_id").agg(list)
test_sess_data = test_sess_data_0.groupby("sess_id").agg(list)
train_sess_data['locale'] = train_sess_data["locale"].apply(lambda x: x[0])
test_sess_data['locale'] = test_sess_data["locale"].apply(lambda x: x[0])

In [56]:
train_sess_data = train_sess_data.rename(columns={'product_id': 'prev_items'})
test_sess_data = test_sess_data.rename(columns={'product_id': 'prev_items'})

In [57]:
# Map product id
id_dict = { id: i+1 for i, id in enumerate(product['id'].unique()) }    # 0 saved for padding

In [58]:
train_sess_itemid = get_sessions(train_sess_data, id_dict, list_item=True)
test_sess_itemid = get_sessions(test_sess_data, id_dict, list_item=True)

In [59]:
train_pop = get_item_pop(train_sess_itemid)
# test_pop = get_item_pop(test_sess_data)
total_pop = train_pop
sorted_pop = sorted(total_pop.items(), key=lambda x:-x[1])
pop_items_500 = [x[0] for x in sorted_pop[:500]]

### Undirected Graph

In [60]:
co_occurence_dict = get_co_occurence_dict(train_sess_itemid, bidirection=False, weighted=False)

100%|██████████| 3557898/3557898 [00:40<00:00, 88601.73it/s]


In [61]:
def sort_dict_by_values(d: dict, desc=True) -> dict:
    if desc:
        res = sorted(d.items(), key=lambda x: -x[1])
    else:
        res = sorted(d.items(), key=lambda x: x[1])
    return dict(res)

In [62]:
def get_neighbors_for_each_product(co_occurence_dict: dict, productid_map: dict, pop_products: list, k: int=300):
    res = {'id': [], 'candidates_id': [], 'state': []}
    for pid, id in tqdm(productid_map.items(), total=len(productid_map)):
        if id in co_occurence_dict:
            neighbors = list(sort_dict_by_values(co_occurence_dict[id]).keys())
            if len(neighbors) >= k:
                cands = neighbors[:k]
                state = 'Full'
            else:
                _pop_cands = random.sample(pop_products, k-len(neighbors))
                cands = neighbors + _pop_cands
                state = 'Pad'
        else:
            cands = random.sample(pop_products, k)
            state = 'No'
        res['id'].append(pid)
        res['candidates_id'].append(cands)
        res['state'].append(state)
    df = pd.DataFrame(res)
    return df


In [63]:
neighbors_df = get_neighbors_for_each_product(co_occurence_dict, id_dict, pop_items_500, 300)

100%|██████████| 1410675/1410675 [02:33<00:00, 9209.77it/s]


In [64]:
id2pid = {v:k for k,v in id_dict.items()}
neighbors_df['candidates'] = neighbors_df['candidates_id'].apply(lambda x: [id2pid[_] for _ in x])

In [65]:
neighbors_df = neighbors_df[['id', 'candidates', 'state']]
neighbors_df.head(5)

Unnamed: 0,id,candidates,state
0,B005ZSSN10,"[B005ZSSNXS, B07WD58H6R, B00NTCHCU2, B07QQZD49...",pad
1,B08PRYN6LD,"[B08PSBK59Y, B005PKZK7S, B01M9EYRD1, B09K4HVP7...",pad
2,B09MBZJ48V,"[B07VR16HF9, B089QVZBWM, B005HWEZGG, B07FL7GVZ...",pad
3,B08ZN6F26S,"[B08PVG787Z, B08ZN6F26S, B07JKP79B6, B0002HR7W...",pad
4,B094DGRV7D,"[B08KTP6517, B06WVS91VM, B094DGRV7D, B08KTN66X...",pad


In [66]:
neighbors_df['candidates'].apply(len).describe()

count    1410675.0
mean         300.0
std            0.0
min          300.0
25%          300.0
50%          300.0
75%          300.0
max          300.0
Name: candidates, dtype: float64

In [26]:
neighbors_df.to_parquet(f'../co-orrurrence_graph/{data_type}_item_candidates.parquet', engine='pyarrow')

In [53]:
neighbors_df.to_feather('../co-orrurrence_graph/{data_type}_item_candidates.ftr')

In [39]:
neighbors_df.to_csv('../co-orrurrence_graph/{data_type}_item_candidates.csv', index=None)