In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [29]:
train_sessions = pd.read_csv('datasets/train_sessions.csv')
train_purchases = pd.read_csv('datasets/train_purchases.csv')
test_sessions = pd.read_csv('datasets/test_leaderboard_sessions.csv')
item_features = pd.read_csv('datasets/item_features.csv')

In [32]:
train_sessions.head()

Unnamed: 0,session_id,item_id,date,ts
1,3,9655,2020-12-18 21:19:48.093,1608326000.0
0,3,9655,2020-12-18 21:25:00.373,1608327000.0
2,13,15654,2020-03-13 19:35:27.136,1584128000.0
5,18,4026,2020-08-26 19:15:47.232,1598469000.0
4,18,2507,2020-08-26 19:16:31.211,1598469000.0


In [33]:
item_features.head()

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75


In [31]:
train_sessions['ts'] = pd.to_datetime(train_sessions.date).astype('int64') / 1E9
train_sessions = train_sessions.sort_values(by = ['session_id','ts'], ascending = True)
train_sessions.head()

Unnamed: 0,session_id,item_id,date,ts
1,3,9655,2020-12-18 21:19:48.093,1608326000.0
0,3,9655,2020-12-18 21:25:00.373,1608327000.0
2,13,15654,2020-03-13 19:35:27.136,1584128000.0
5,18,4026,2020-08-26 19:15:47.232,1598469000.0
4,18,2507,2020-08-26 19:16:31.211,1598469000.0


In [44]:
# Fonction de similarité
# Renvoie {item_id: {item_id: score de similarité, ...}, ...}

def similarity(df, alpha, log = lambda *x: None):
    user_item = df.groupby("session_id")["item_id"].agg(list).reset_index()
    user_item_tuple = user_item[["session_id", "item_id"]].values
    user_time = df.groupby("session_id")['ts'].agg(list).reset_index()
    user_time_dict = user_time.set_index("session_id")["ts"].to_dict()
    
    sim_item = {}
    three_days = 3600 * 24 * 3
    
    for user, items in tqdm(user_item_tuple):
        log("For session", user, "and its items", items)
        
        for loc1, item in enumerate(items):
            log("   For index", loc1, "and item", item)
            
            sim_item.setdefault(item, {})
            for loc2, relate_item in enumerate(items):
                log("       For index2", loc2, "and relate_item", relate_item)
                if item != relate_item:
                    log("            item", item, "different than related_item", relate_item, "so:")
                    t1 = user_time_dict[user][loc1]
                    t2 = user_time_dict[user][loc2]
                    
                    delta_index = abs(loc2 - loc1)
                    delta_time = abs(t2 - t1) / 3600
                    
                    log("            delta_index =", delta_index)
                    log("            delta_time =", delta_time)

                    loc_weight = 3 / (abs(loc2 - loc1) + 1)
                    time_weight =  np.exp( - (delta_time^2/25))
                    
                    log("            loc_weight =", loc_weight)
                    log("            time_weight =", time_weight)

                    sim_item[item][relate_item] = sim_item[item].get(relate_item,0) + (loc_weight*time_weight/len(items))
                    log("            similarity =", sim_item[item][relate_item])

    return sim_item

similarity(train_sessions[:10], 1, print)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 22.67it/s]

For session 3 and its items [9655, 9655]
   For index 0 and item 9655
       For index2 0 and relate_item 9655
       For index2 1 and relate_item 9655
   For index 1 and item 9655
       For index2 0 and relate_item 9655
       For index2 1 and relate_item 9655
For session 13 and its items [15654]
   For index 0 and item 15654
       For index2 0 and relate_item 15654
For session 18 and its items [4026, 2507, 18316]
   For index 0 and item 4026
       For index2 0 and relate_item 4026
       For index2 1 and relate_item 2507
            item 4026 different than related_item 2507 so:
            delta_index = 1
            delta_time = 0.012216388848092822
            loc_weight = 1.5
            time_weight = 0.022232597506952338
            similarity = 29.511116298753475
       For index2 2 and relate_item 18316
            item 4026 different than related_item 18316 so:
            delta_index = 2
            delta_time = 0.04544472217559815
            loc_weight = 1.0
           




{9655: {},
 15654: {},
 4026: {2507: 29.511116298753475, 18316: 26.502025099080036},
 2507: {4026: 29.511116298753475, 18316: 32.00414518081351},
 18316: {4026: 26.502025099080036, 2507: 32.00414518081351},
 19896: {27937: 28.529230648775616,
  12804: 28.508231536649305,
  25772: 28.50434379703765},
 27937: {19896: 28.529230648775616,
  12804: 28.520224355716923,
  25772: 29.50797804443049},
 12804: {19896: 28.508231536649305,
  27937: 28.520224355716923,
  25772: 26.527185732634152},
 25772: {19896: 28.50434379703765,
  27937: 29.50797804443049,
  12804: 26.527185732634152}}

In [45]:
sim_item = similarity(train_sessions, 1)

  0%|                                                                               | 99/1000000 [00:24<70:01:10,  3.97it/s]


KeyboardInterrupt: 

In [11]:
import itertools

dict(itertools.islice(sim_item.items(), 1))

{9655: {1660: 0.0019976684215748932,
  25025: 0.004948143471269951,
  27187: 0.0012940474542861267,
  1024: 0.012822839857717573,
  24802: 0.04133135477568773,
  9273: 0.0015917650275266025,
  13052: 0.005665276877106524,
  18664: 5.0539560346207916e-05,
  22719: 2.2674215053380065e-05,
  5572: 3.879210904677344e-05,
  22886: 0.0005906150982059506,
  5450: 3.891365523286038e-05,
  2991: 0.0023525633565329193,
  3600: 7.467412210719601e-05,
  22813: 0.00010385969188114487,
  22729: 0.00019570363509758323,
  13857: 0.0003757599922337258,
  26512: 0.000726350042317358,
  1758: 0.0012977574759825024,
  425: 0.0007302955547208254,
  6257: 0.028270763028507388,
  8923: 0.00013092306884489443,
  3268: 0.0001016014063365699,
  1096: 5.4096459973025196e-05,
  27256: 3.7381604988068445e-05,
  24187: 3.1898720293660544e-05,
  10946: 2.7195087283603996e-05,
  2176: 2.3860612022142498e-05,
  23854: 8.022883034369138e-05,
  26835: 3.5354984061544586e-05,
  19115: 1.6695451109876527e-05,
  17620: 1.5

In [12]:
order = train_sessions['item_id'].value_counts().reset_index()
popular_items = list(order['index'])
popular_items[:10]

[8060, 26853, 2447, 1644, 19882, 7963, 8622, 17089, 11742, 18156]

In [13]:
def recommend(sim_item_corr, popular_items, top_k, session_item_list):  
    rank = {}  
    for i in session_item_list:  
        if i in sim_item_corr:
            for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1], reverse=True)[:300]:  
                if j not in session_item_list:  
                    rank[j] = rank.get(j,0) + wij
                    
    rank_list = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:top_k]
    item_list = [item for item, rank in rank_list]

    index = 0
    while(len(item_list) < top_k):
        if popular_items[index] not in item_list:
            item_list.append(popular_items[index])
        else:
            index += 1
            
    return item_list

In [14]:
def mean_reciprocal_rank(df_pred, df_true):
    mrr = 0
    for session, item, rank in df_pred.values:
        if len(df_true[(df_true.item_id == item) & (df_true.session_id == session)]) != 0:
            mrr += 1/rank
    return mrr / len(df_true)

In [19]:
session_id_list = []
item_id_list = []
rank_list = []

train_sessions_dict = train_sessions.groupby("session_id")["item_id"].agg(list).reset_index()
train_sessions_dict = train_sessions_dict[:100]

for session_id, session_item_list in train_sessions_dict.values:
    item_list = recommend(sim_item, popular_items, 100, session_item_list)
    session_id_list += [session_id] * len(item_list)
    item_id_list += list(item_list)
    rank_list += range(1, len(item_list) + 1)

df_pred = pd.DataFrame({"session_id":session_id_list, "item_id":item_id_list, "rank":rank_list})
df_true = train_purchases[np.isin(train_purchases.session_id, train_sessions_dict["session_id"].unique())]
mean_reciprocal_rank(df_pred, df_true)

0.148397322996576

In [17]:
session_id_list = []
item_id_list = []
rank_list = []
test_session_dict = test_sessions.groupby('session_id')['item_id'].agg(list).reset_index()

for session_id, session_item_list in tqdm(test_session_dict.values):
    item_list = recommend(sim_item, popular_items, 100, session_item_list)
    session_id_list += [session_id] * len(item_list)
    item_id_list += list(item_list)
    rank_list += range(1, len(item_list) + 1)

leaderboard = pd.DataFrame({"session_id":session_id_list, "item_id":item_id_list, "rank":rank_list})
leaderboard.to_csv('leaderboard_result.csv',index=False)
leaderboard

Unnamed: 0,session_id,item_id,rank
0,26,3260,1
1,26,5383,2
2,26,23612,3
3,26,3650,4
4,26,2213,5
...,...,...,...
4999995,4439757,4983,96
4999996,4439757,7153,97
4999997,4439757,6780,98
4999998,4439757,4722,99
