In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

# Datasets

Nous chargeons les datasets `train_sessions` et `train_purchases` pour entrainer les données. Nous ignorons `item_features` car les calculs de similarités relatifs aux features prennent beaucoup de temps.

In [2]:
train_sessions = pd.read_csv('datasets/train_sessions.csv')
train_purchases = pd.read_csv('datasets/train_purchases.csv')
df_train = pd.concat([train_sessions, train_purchases], ignore_index=True)

In [3]:
df_train.head()

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211


# Preprocessing

Ici nous tranformons les dates en timestamp unix pour pouvoir plus tard calculer des différences de temps.

In [4]:
df_train["ts"] = pd.to_datetime(df_train.date).astype("int64") / 1E9 # timestamp en secondes
df_train = df_train.sort_values(by = ["session_id", "ts"], ascending = True)
df_train.head()

Unnamed: 0,session_id,item_id,date,ts
1,3,9655,2020-12-18 21:19:48.093,1608326000.0
0,3,9655,2020-12-18 21:25:00.373,1608327000.0
4743820,3,15085,2020-12-18 21:26:47.986,1608327000.0
2,13,15654,2020-03-13 19:35:27.136,1584128000.0
4743821,13,18626,2020-03-13 19:36:15.507,1584128000.0


Ici nous préparons les datasets de manière à optimiser les étapes de calcul de similarité.

In [5]:
train_purchases['ts'] = pd.to_datetime(train_purchases.date).astype('int64') / 1E9
session_time = df_train.groupby("session_id")["ts"].agg(list).reset_index()
session_time_dict = session_time.set_index("session_id")["ts"].to_dict()

# Fonction de similarité

Nous choisissons de définir un score de similarité pour chaque couple d'items

Pour chaque couple d'item $(i,j)$ dans chaque session $s$ nous calculons le score de la sorte

$\displaystyle{score_{ij} = \sum_s {e^{- \varphi_1( \Delta T )} * e^{- \varphi_2 (\Delta items) } \over |s|}}$

Avec $\varphi_1$ et $\varphi_2$ des applications linéaires arbitraires définies manuellement pour pondérer l'influence $\Delta T$ (distance temporelle) et $\Delta items$ (distance en quantité d'items qui les séparent dans la session $s$)

$|s|$ étant la taille de la session


In [6]:
# Fonction de similarité
# Renvoie {item_id: {item_id: score de similarité, ...}, ...}
def similarity(df, log = lambda *x: None):
    session_item_tuple = df.groupby("session_id")["item_id"]\
                        .agg(list)\
                        .reset_index()[["session_id", "item_id"]]\
                        .values
    
    sim_item = {}
    
    for session, items in tqdm(session_item_tuple):
        log("For session", session, "and its items", items)
        
        for i, item1 in enumerate(items):
            log("   For index", i, "and item", item1)
            sim_item[item1] = sim_item.get(item1, {})
            
            for j, item2 in enumerate(items):
                log("       For index2", j, "and item2", item2)
                if item1 != item2:
                    sim_item[item1][item2] = sim_item[item1].get(item2, 0)
                    log("            item1", item1, "different than item2", item2, "so:")
                    
                    delta_index = abs(j - i)
                    delta_time = abs(session_time_dict[session][i] - session_time_dict[session][j]) / 3600
                    
                    log("            delta_index =", delta_index)
                    log("            delta_time =", delta_time)

                    time_coeff = np.exp(- delta_time / 72)
                    index_coeff =  np.exp(- (delta_index - 1) / 5)

                    sim_item[item1][item2] += index_coeff * time_coeff/len(items)
                    log("            similarity =", sim_item[item1][item2])
    return sim_item

similarity(train_sessions[:10], print)

100%|███████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 3416.25it/s]

For session 3 and its items [9655, 9655]
   For index 0 and item 9655
       For index2 0 and item2 9655
       For index2 1 and item2 9655
   For index 1 and item 9655
       For index2 0 and item2 9655
       For index2 1 and item2 9655
For session 13 and its items [15654]
   For index 0 and item 15654
       For index2 0 and item2 15654
For session 18 and its items [18316, 2507, 4026]
   For index 0 and item 18316
       For index2 0 and item2 18316
       For index2 1 and item2 2507
            item1 18316 different than item2 2507 so:
            delta_index = 1
            delta_time = 0.012216388848092822
            similarity = 0.33327678077538553
       For index2 2 and item2 4026
            item1 18316 different than item2 4026 so:
            delta_index = 2
            delta_time = 0.04544472217559815
            similarity = 0.27273805078531216
   For index 1 and item 2507
       For index2 0 and item2 18316
            item1 2507 different than item2 18316 so:
         




{9655: {},
 15654: {},
 18316: {2507: 0.33327678077538553, 4026: 0.27273805078531216},
 2507: {18316: 0.33327678077538553, 4026: 0.33317953394911076},
 4026: {18316: 0.27273805078531216, 2507: 0.33317953394911076},
 25772: {6341: 0.24998859111658836,
  25555: 0.20465949616049253,
  20033: 0.16755275288034804},
 6341: {25772: 0.24998859111658836,
  25555: 0.2499830812054097,
  20033: 0.2046587342159378},
 25555: {25772: 0.20465949616049253,
  6341: 0.2499830812054097,
  20033: 0.24998766041242207},
 20033: {25772: 0.16755275288034804,
  6341: 0.2046587342159378,
  25555: 0.24998766041242207}}

In [7]:
sim_item = similarity(df_train)

100%|███████████████████████████████████████████████████████████████████████████| 1000000/1000000 [02:55<00:00, 5707.87it/s]


# Recommandations

Pour générer des recommandations à une session nous calculons la somme des score de similarité des items de la session avec tous les autres items du dataset

Les 100 items ayant la meilleure similarité totale avec les items de la session seront conservés
Si nous n'obtenons pas 100 items, nous piochons parmi les 100 items les plus populaires (les plus achetés)

In [8]:
popular_items = train_purchases['item_id'].value_counts().reset_index()
popular_items = list(popular_items['index'][:100])
popular_items[:10]

[8060, 26853, 19882, 8622, 2447, 4193, 20770, 18156, 17089, 1644]

In [9]:
def recommend(sim_item, popular_items, top_k, session_item_list):  
    rank = {}
    for i in session_item_list:
        if i in sim_item:
            for j, score in sorted(sim_item[i].items(), key=lambda d: d[1], reverse=True):  
                if j not in session_item_list:  
                    rank[j] = rank.get(j,0) + score
                    
    rank_list = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:top_k]
    item_list = [item for item, rank in rank_list]

    index = 0
    if len(item_list) < top_k:
        item_list += popular_items[:top_k - len(item_list)]

    return item_list

# Evaluation

Nous évaluons la qualité de nos recommandations grâce à la *Mean Reciprocal Rank*

$\displaystyle{MRR = |Q|\sum_i \frac{1}{rank_i}}$

nous générons des recommandation pour `train_sessions` et vérifions si nous avons prédit l'achat sur `train_purchases`.
Etant donné que nous vérifions nos résultats sur nos données d'entraînement, cela peut entraîner des résultats hautement biaisés.

In [10]:
def mean_reciprocal_rank(df_pred, df_true):
    mrr = 0
    for session, item, rank in df_pred.values:
        if len(df_true[(df_true.item_id == item) & (df_true.session_id == session)]) != 0:
            mrr += 1/rank
    return mrr / len(df_true)

In [11]:
item_id_list = []

train_sessions_dict = train_sessions.groupby("session_id")["item_id"].agg(list).reset_index()
train_sessions_dict = train_sessions_dict[:100]

for session_id, session_item_list in train_sessions_dict.values:
    item_id_list += recommend(sim_item, popular_items, 100, session_item_list)
    
rank_list = np.resize(np.arange(1,101), len(train_sessions_dict) * 100)
session_id_list = np.repeat(train_sessions_dict["session_id"].values, 100)


df_pred = pd.DataFrame({"session_id":session_id_list, "item_id":item_id_list, "rank":rank_list})
df_true = train_purchases[np.isin(train_purchases.session_id, train_sessions_dict["session_id"].values)]
mean_reciprocal_rank(df_pred, df_true)

0.1683978190475883

In [12]:
leaderboard_sessions = pd.read_csv('datasets/test_leaderboard_sessions.csv')

item_id_list = []

leaderboard_sessions_dict = leaderboard_sessions.groupby("session_id")["item_id"].agg(list).reset_index()
leaderboard_sessions_dict = leaderboard_sessions_dict

for session_id, session_item_list in tqdm(leaderboard_sessions_dict.values):
    item_id_list += recommend(sim_item, popular_items, 100, session_item_list)
    
rank_list = np.resize(np.arange(1,101), len(leaderboard_sessions_dict) * 100)
session_id_list = np.repeat(leaderboard_sessions_dict["session_id"].values, 100)

leaderboard = pd.DataFrame({"session_id":session_id_list, "item_id":item_id_list, "rank":rank_list})
leaderboard.to_csv('leaderboard_result.csv',index=False)
leaderboard

100%|████████████████████████████████████████████████████████████████████████████████| 50000/50000 [06:56<00:00, 120.00it/s]


Unnamed: 0,session_id,item_id,rank
0,26,3260,1
1,26,5383,2
2,26,2213,3
3,26,27921,4
4,26,26538,5
...,...,...,...
4999995,4439757,17059,96
4999996,4439757,4839,97
4999997,4439757,107,98
4999998,4439757,9524,99
