In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

# Datasets

Nous chargeons les datasets `train_sessions` et `train_purchases` pour entrainer les données. Nous ignorons `item_features` car les calculs de similarités relatifs aux features prennent beaucoup de temps.

In [2]:
train_sessions = pd.read_csv('datasets/train_sessions.csv')
train_purchases = pd.read_csv('datasets/train_purchases.csv')
df_train = pd.concat([train_sessions, train_purchases], ignore_index=True)
item_features = pd.read_csv('datasets/item_features.csv')
candidate_items = pd.read_csv('datasets/candidate_items.csv')["item_id"]

In [3]:
df_train

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
5743815,4439986,2915,2021-05-13 11:56:37.464
5743816,4439990,8786,2020-08-22 14:28:22.382
5743817,4439994,21630,2020-11-27 20:10:28.961
5743818,4439999,16962,2020-11-27 11:01:41.356


In [4]:
item_features

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [5]:
candidate_items

0           4
1           8
2           9
3          19
4          20
        ...  
4985    28128
4986    28131
4987    28132
4988    28133
4989    28137
Name: item_id, Length: 4990, dtype: int64

# Preprocessing

Ici nous tranformons les dates en timestamp unix pour pouvoir plus tard calculer des différences de temps.

In [6]:
df_train["ts"] = pd.to_datetime(df_train.date).astype("int64") / 1E9 # timestamp en secondes
df_train = df_train.sort_values(by = ["session_id", "ts"], ascending = True)
df_train.head()

Unnamed: 0,session_id,item_id,date,ts
1,3,9655,2020-12-18 21:19:48.093,1608326000.0
0,3,9655,2020-12-18 21:25:00.373,1608327000.0
4743820,3,15085,2020-12-18 21:26:47.986,1608327000.0
2,13,15654,2020-03-13 19:35:27.136,1584128000.0
4743821,13,18626,2020-03-13 19:36:15.507,1584128000.0


Ici nous préparons les datasets de manière à optimiser les étapes de calcul de similarité.

In [7]:
train_purchases['ts'] = pd.to_datetime(train_purchases.date).astype('int64') / 1E9
session_time = df_train.groupby("session_id")["ts"].agg(list).reset_index()
session_time_dict = session_time.set_index("session_id")["ts"].to_dict()
features_dict = item_features.groupby("item_id")[["feature_category_id", "feature_value_id"]]\
                   .agg(list)\
                   .apply(lambda x: dict(zip(x.feature_category_id, x.feature_value_id)), axis=1).to_dict()

# Fonction de similarité

Nous choisissons de définir un score de similarité pour chaque couple d'items

Pour chaque couple d'item $(i,j)$ dans chaque session $s$ nous calculons le score de la sorte

$\displaystyle{score_{ij} = ( \sum_s {\exp (- (\varphi_1( \Delta T ) + \varphi_2 (\Delta items) ) ) \over |s|}} ) * feature\_sim(i,j)$

Avec $\varphi_1$ et $\varphi_2$ des applications linéaires arbitraires définies manuellement pour pondérer l'influence $\Delta T$ (distance temporelle) et $\Delta items$ (distance en quantité d'items qui les séparent dans la session $s$)

$|s|$ étant la taille de la session et $feature\_sim$ étant la similarité dans les caractéristiques des items.


In [8]:
def feature_similarity(item1, item2):
    score = 1
    for feature in features_dict[item1]:
        if feature in features_dict[item2]:
            score *= 1.2
            if features_dict[item2][feature] == features_dict[item2][feature]:
                score *= 1.5
    return score

In [9]:
# Fonction de similarité
# Renvoie {item_id: {item_id: score de similarité, ...}, ...}
def similarity(df, log = lambda *x: None):
    session_item_tuple = df.groupby("session_id")["item_id"]\
                        .agg(list)\
                        .reset_index()[["session_id", "item_id"]]\
                        .values
    
    sim_item = {}
    
    for session, items in tqdm(session_item_tuple):
        log("For session", session, "and its items", items)
        
        for i, item1 in enumerate(items):
            log("   For index", i, "and item", item1)
            sim_item[item1] = sim_item.get(item1, {})
            
            for j, item2 in enumerate(items):
                log("       For index2", j, "and item2", item2)
                if item1 != item2:
                    sim_item[item1][item2] = sim_item[item1].get(item2, 0)
                    log("            item1", item1, "different than item2", item2, "so:")
                    
                    delta_index = abs(j - i)
                    delta_time = abs(session_time_dict[session][i] - session_time_dict[session][j]) / 3600
                    
                    log("            delta_index =", delta_index)
                    log("            delta_time =", delta_time)

                    time_coeff = np.exp(- delta_time / 72)
                    index_coeff =  np.exp(- (delta_index - 1) / 5)

                    sim_item[item1][item2] += index_coeff * time_coeff/len(items)
                    log("            similarity =", sim_item[item1][item2])
    
    for item1 in tqdm(candidate_items.values):
        sim_item[item1] = sim_item.get(item1, {})
        for item2 in candidate_items.values:
            if item1 != item2:
                if item2 in sim_item[item1]:
                    sim_item[item1][item2] = sim_item[item1].get(item2, 1) * feature_similarity(item1, item2)
    
    return sim_item

#similarity(train_sessions[:10], print)

In [10]:
sim_item = similarity(df_train)

100%|███████████████████████████████████████████████████████████████████████████| 1000000/1000000 [02:46<00:00, 5988.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 4990/4990 [01:00<00:00, 82.68it/s]


# Recommandations

Pour générer des recommandations à une session nous calculons la somme des score de similarité des items de la session avec tous les autres items du dataset

Les 100 items ayant la meilleure similarité totale avec les items de la session seront conservés
Si nous n'obtenons pas 100 items, nous piochons parmi les 100 items les plus populaires (les plus achetés)

In [11]:
popular_items = train_purchases['item_id'].value_counts().reset_index()
popular_items = list(popular_items['index'][:100])
popular_items[:10]

[8060, 26853, 19882, 8622, 2447, 4193, 20770, 18156, 17089, 1644]

In [12]:
def recommend(sim_item, popular_items, top_k, session_item_list):  
    rank = {}
    for i in session_item_list:
        if i in sim_item:
            for j, score in sorted(sim_item[i].items(), key=lambda d: d[1], reverse=True):  
                if j not in session_item_list:  
                    rank[j] = rank.get(j,0) + score
                    
    rank_list = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:top_k]
    item_list = [item for item, rank in rank_list]

    index = 0
    if len(item_list) < top_k:
        item_list += popular_items[:top_k - len(item_list)]

    return item_list

# Evaluation

Nous évaluons la qualité de nos recommandations grâce à la *Mean Reciprocal Rank*

$\displaystyle{MRR = |Q|\sum_i \frac{1}{rank_i}}$

nous générons des recommandation pour `train_sessions` et vérifions si nous avons prédit l'achat sur `train_purchases`.
Etant donné que nous vérifions nos résultats sur nos données d'entraînement, cela peut entraîner des résultats hautement biaisés.

In [13]:
def mean_reciprocal_rank(df_pred, df_true):
    mrr = 0
    for session, item, rank in df_pred.values:
        if len(df_true[(df_true.item_id == item) & (df_true.session_id == session)]) != 0:
            mrr += 1/rank
    return mrr / len(df_true)

In [14]:
item_id_list = []

train_sessions_dict = train_sessions.groupby("session_id")["item_id"].agg(list).reset_index()
train_sessions_dict = train_sessions_dict[:100]

for session_id, session_item_list in train_sessions_dict.values:
    item_id_list += recommend(sim_item, popular_items, 100, session_item_list)
    
rank_list = np.resize(np.arange(1,101), len(train_sessions_dict) * 100)
session_id_list = np.repeat(train_sessions_dict["session_id"].values, 100)


df_pred = pd.DataFrame({"session_id":session_id_list, "item_id":item_id_list, "rank":rank_list})
df_true = train_purchases[np.isin(train_purchases.session_id, train_sessions_dict["session_id"].values)]
mean_reciprocal_rank(df_pred, df_true)

0.17164886091715956

In [15]:
leaderboard_sessions = pd.read_csv('datasets/test_leaderboard_sessions.csv')

item_id_list = []

leaderboard_sessions_dict = leaderboard_sessions.groupby("session_id")["item_id"].agg(list).reset_index()
leaderboard_sessions_dict = leaderboard_sessions_dict

for session_id, session_item_list in tqdm(leaderboard_sessions_dict.values):
    item_id_list += recommend(sim_item, popular_items, 100, session_item_list)
    
rank_list = np.resize(np.arange(1,101), len(leaderboard_sessions_dict) * 100)
session_id_list = np.repeat(leaderboard_sessions_dict["session_id"].values, 100)

leaderboard = pd.DataFrame({"session_id":session_id_list, "item_id":item_id_list, "rank":rank_list})
leaderboard.to_csv('leaderboard_result.csv',index=False)
leaderboard

100%|████████████████████████████████████████████████████████████████████████████████| 50000/50000 [06:08<00:00, 135.60it/s]


Unnamed: 0,session_id,item_id,rank
0,26,3260,1
1,26,5383,2
2,26,2213,3
3,26,27921,4
4,26,26538,5
...,...,...,...
4999995,4439757,19170,96
4999996,4439757,7237,97
4999997,4439757,16097,98
4999998,4439757,24931,99
