In [63]:
import os
import pandas as pd

from datetime import datetime
from tqdm import tqdm

from collections import defaultdict
import math
import numpy as np

In [2]:
# Relative path to the dataset directory; the CSV files are loaded from here.
data_dir = '../data/dressipi_recsys2022/'

In [None]:
# Read the three CSV files from data_dir, in the order the names are bound.
train_sessions, train_purchases, test_sessions = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        'train_sessions.csv',
        'train_purchases.csv',
        'test_leaderboard_sessions.csv',
    )
)

In [78]:
# Fraction of distinct test-session items that also appear in train_sessions.
# (Fix: the frame loaded above is `test_sessions`; `test_session` was undefined.)
common_item = len(set(train_sessions['item_id'].unique()) & set(test_sessions['item_id'].unique()))
common_item / test_sessions['item_id'].nunique()

1.0

In [73]:
# Stack viewed items, purchased items and test-session items into one frame so
# that co-occurrence statistics are computed over all of them.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same
# row-wise stacking (original index labels are kept, as `append` did).
# NOTE: this rebinds `train_sessions`, so re-running the cell without reloading
# the CSVs stacks the purchase/test rows a second time.
train_sessions = pd.concat([train_sessions, train_purchases, test_sessions])

In [74]:
def get_sim_item(df, session_col, item_col, use_iif=False):
    """Build an item-to-item similarity table from session co-occurrences.

    Rows of ``df`` are grouped by ``session_col``; every ordered pair of
    distinct items that appears in the same group adds to the pair's
    co-occurrence weight. The accumulated weight is then normalised by
    ``sqrt(count(i) * count(j))``, where ``count`` is the number of rows in
    which an item occurs (repeated occurrences inside one session are counted
    each time).

    Args:
        df: DataFrame containing ``session_col`` and ``item_col``.
        session_col: Name of the column identifying a session.
        item_col: Name of the column identifying an item.
        use_iif: If False, each co-occurrence adds 1. If True, it adds
            ``1 / log(1 + n)`` where ``n`` is the number of rows in the
            session, so long sessions contribute less per pair.

    Returns:
        dict mapping each item to a dict ``{related_item: similarity}``.
        Items that never co-occur with a different item map to an empty dict.
    """
    session_items = df.groupby(session_col)[item_col].agg(list).to_dict()
    sim_item = {}
    item_cnt = defaultdict(int)

    for items in tqdm(session_items.values()):
        # The per-pair weight depends only on the session length, so compute
        # it once per session instead of once per pair. The `items` guard
        # avoids log(1) == 0 for a (hypothetical) empty group.
        weight = 1 / math.log(1 + len(items)) if (use_iif and items) else 1
        for i in items:
            item_cnt[i] += 1
            related = sim_item.setdefault(i, {})
            for relate_item in items:
                if relate_item != i:
                    related[relate_item] = related.get(relate_item, 0) + weight

    # Normalise in place. Only values of existing keys are replaced, which is
    # safe while iterating. (The previous shallow `.copy()` shared the inner
    # dicts anyway, so it never preserved the raw weights.)
    for i, related in tqdm(sim_item.items()):
        for j, cij in related.items():
            related[j] = cij / math.sqrt(item_cnt[i] * item_cnt[j])

    return sim_item

In [75]:
# Items ordered from most to least frequent in train_sessions; used as the
# fallback list when a session yields too few similarity-based candidates.
# `value_counts()` already sorts by descending count, so no extra sort is needed.
item_counts = train_sessions['item_id'].value_counts()
# The column names produced by `value_counts().reset_index()` differ between
# pandas 1.x ('index'/'item_id') and 2.x ('item_id'/'count'). Name them
# explicitly so `order` has the same layout on every version:
# 'index' holds the item id, 'item_id' holds its occurrence count.
order = item_counts.rename_axis('index').reset_index(name='item_id')
popular_items = list(order['index'])

In [109]:
# Item-item similarities over the combined frame, with session-length damping.
sim_item_corr = get_sim_item(
    train_sessions,
    session_col='session_id',
    item_col='item_id',
    use_iif=True,
)

100%|███████████████████████████████████████████████████████████| 1050000/1050000 [00:33<00:00, 31480.98it/s]
100%|████████████████████████████████████████████████████████████████| 23683/23683 [00:09<00:00, 2606.82it/s]


In [110]:
def recommend(sim_item_corr, popular_items, top_k, session_item_list, item_num=300):
    """Recommend up to ``top_k`` items for one session.

    For every item in the session, its ``item_num`` most similar neighbours
    are taken from ``sim_item_corr`` and their similarities are summed per
    candidate. Items that are already part of the session are never
    recommended. When fewer than ``top_k`` candidates were scored, the result
    is padded with entries of ``popular_items`` in the order given.

    Args:
        sim_item_corr: dict ``{item: {related_item: similarity}}``.
        popular_items: Fallback item ids, best first.
        top_k: Maximum number of items to return.
        session_item_list: Item ids of the session to recommend for.
        item_num: Number of nearest neighbours considered per session item.

    Returns:
        ``(item_list, score_list)``. ``item_list`` holds the ranked candidates
        (best first) followed by any popularity fallbacks; it is shorter than
        ``top_k`` only if ``popular_items`` runs out. ``score_list`` is a
        float NumPy array with the scores of the ranked candidates only, so it
        can be shorter than ``item_list`` (padding items carry no score).
    """
    seen = set(session_item_list)  # O(1) membership instead of scanning the list
    scores = {}
    for i in session_item_list:
        neighbours = sim_item_corr.get(i)
        if not neighbours:
            continue
        nearest = sorted(neighbours.items(), key=lambda d: d[1], reverse=True)[:item_num]
        for j, wij in nearest:
            if j not in seen:
                scores[j] = scores.get(j, 0) + wij

    ranked = sorted(scores.items(), key=lambda d: d[1], reverse=True)[:top_k]
    # Split ids and scores directly rather than via a 2-D array: that indexing
    # fails when nothing was scored, and it pushes item ids through floats.
    item_list = [j for j, _ in ranked]
    score_list = np.array([score for _, score in ranked], dtype=float)

    if len(item_list) < top_k:
        # Append fallbacks behind the ranked items, preserving rank order and
        # skipping duplicates and items already seen in the session.
        taken = seen.union(item_list)
        for candidate in popular_items:
            if len(item_list) >= top_k:
                break
            if candidate in taken:
                continue
            item_list.append(candidate)
            taken.add(candidate)

    return item_list, score_list

In [111]:
# Try the recommender on one hand-picked session of three item ids.
item_list, score_list = recommend(
    sim_item_corr,
    popular_items,
    top_k=100,
    session_item_list=[9655, 15654, 2507],
)

In [112]:
# Produce recommendations for every test session and flatten them into three
# parallel lists (session id, item id, 1-based rank).
top_k = 100
test_session_dict = test_sessions.groupby('session_id')['item_id'].agg(list).to_dict()

session_id_list, item_id_list, rank_list = [], [], []
for session_id, session_item_list in tqdm(test_session_dict.items()):
    item_list, score_list = recommend(sim_item_corr, popular_items, top_k, session_item_list)
    n_items = len(item_list)

    session_id_list.extend([session_id] * n_items)
    item_id_list.extend(item_list)
    rank_list.extend(range(1, n_items + 1))

100%|█████████████████████████████████████████████████████████████████| 50000/50000 [02:43<00:00, 304.92it/s]


In [113]:
# Assemble the flattened recommendation lists into the result table.
res_df = pd.DataFrame({
    'session_id': session_id_list,
    'item_id': item_id_list,
    'rank': rank_list,
})

In [None]:
# Write res_df to 'baseline.csv' in the current working directory, without the index column.
res_df.to_csv('baseline.csv',index=False)