In [1]:
import pandas as pd

from tqdm import tqdm

from collections import defaultdict

import numpy as np

In [2]:
train_sessions = pd.read_csv('datasets/train_sessions.csv')
train_purchases = pd.read_csv('datasets/train_purchases.csv')
test_sessions = pd.read_csv('datasets/test_leaderboard_sessions.csv')
test_leaderboard = pd.read_csv('datasets/test_final_sessions.csv')

In [3]:
train_sessions = pd.concat([train_sessions,train_purchases,test_sessions,test_leaderboard], ignore_index=True)

In [4]:
train_sessions

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
6199307,4439648,7154,2021-06-14 08:03:19.024
6199308,4439675,23067,2021-06-01 12:21:07.959
6199309,4439868,26085,2021-06-16 22:18:27.509
6199310,4439966,19483,2021-06-06 20:05:06.457


In [6]:
train_sessions['ts'] = pd.to_datetime(train_sessions.date).astype('int64') / 1000
train_sessions = train_sessions.sort_values(by = ['session_id','ts'], ascending = True)

In [7]:
#Recall function
def get_sim2(df, user_col, item_col,alpha):
    user_item = df.groupby("session_id")[item_col].agg(list).reset_index()
    user_item_dict = user_item[[user_col, item_col]].values
    user_time = df.groupby("session_id")['ts'].agg(list).reset_index()
    user_time_dict = dict(user_time.set_index("session_id")["ts"])
    
    sim_item_corr = {}
    item_cnt = defaultdict(int)
    three_days = 3600 * 24 * 3
    
    for user, items in tqdm(user_item_dict):
        for loc1, item in enumerate(items):
            item_cnt[item] += 1
            sim_item.setdefault(item, {})
            for loc2, relate_item in enumerate(items):
                if item != relate_item:
                    t1 = user_time_dict[user][loc1] #time
                    t2 = user_time_dict[user][loc2] #time

                    loc_alpha = 1.0 if loc2 > loc1 else 0.7
                    loc_weight = loc_alpha * (0.8**(abs(loc2-loc1)-1))
                    time_weight = np.exp(- abs(t2 - t1)/three_days)


                    sim_item[item].setdefault(relate_item, 0)

                    sim_item[item][relate_item] += loc_weight*time_weight/np.log(1+len(items))

    return sim_item_corr, item_cnt

sim_item_corr,item_cnt = get_sim2(train_sessions,'session_id','item_id',1)

100%|███████████████████████████████████████████████████████████████████████████| 1100000/1100000 [02:56<00:00, 6220.03it/s]


In [13]:
order = train_sessions['item_id'].value_counts().reset_index()
order = order.sort_values('item_id', ascending=False)
popular_items = list(order['index'])

In [14]:
def recommend(sim_item_corr, popular_items, top_k, session_item_list):  
    rank = {}  
    for i in session_item_list:  
        if i in sim_item_corr:
            for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1], reverse=True)[:300]:  
                if j not in session_item_list:  
                    rank[j] = rank.get(j,0) + wij
                    
    rank_list = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:top_k]
    rank_array = np.array(rank_list)
    item_list = list(rank_array[:,0].astype('int32'))

    index = 0
    while(len(item_list) < top_k):
        if popular_items[index] not in item_list:
            item_list.append(popular_items[index])
        else:
            index += 1
            
    return item_list

In [25]:
top_k = 100
session_id_list = []
item_id_list = []
rank_list = []
test_session_dict = test_sessions.groupby('session_id')['item_id'].agg(list).to_dict()

for session_id, session_item_list in tqdm(test_session_dict.items()):
    item_list = recommend(sim_item_corr, popular_items, top_k, session_item_list)
    session_id_list += [session_id] * len(item_list)
    item_id_list += list(item_list)

    rank_list += range(1,len(item_list)+1)

leaderboard = pd.DataFrame({"session_id":session_id_list, "item_id":item_id_list, "rank":rank_list})
leaderboard.to_csv('leaderboard_result.csv',index=False)
leaderboard

100%|████████████████████████████████████████████████████████████████████████████████| 50000/50000 [03:43<00:00, 223.54it/s]


Unnamed: 0,session_id,item_id,rank
0,26,23612,1
1,26,5383,2
2,26,3650,3
3,26,3425,4
4,26,3260,5
...,...,...,...
4999995,4439757,26691,96
4999996,4439757,26942,97
4999997,4439757,11871,98
4999998,4439757,10931,99
