# Chargement et préparation des données 

2 type de système de recommandation :
- content-based 
- collaboratif-filtering


In [1]:
from utils import DataClicks
import pandas as pd
import numpy as np

In [2]:
%%time
interaction_df, metadata_df, article_matrice_df = DataClicks(
                                                        interaction_path_dir="./data/clicks/",
                                                        metadata_path="./data/articles_metadata.csv",
                                                        article_embedding_path="./data/articles_embeddings.pickle").load_data()

CPU times: user 40.4 s, sys: 36.5 s, total: 1min 16s
Wall time: 1min 25s


In [3]:
interaction_df.head()

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,2017-10-01 02:37:03,2,157541,2017-10-01 03:00:28.020,4,3,20,1,20,2
1,0,1506825423271737,2017-10-01 02:37:03,2,68866,2017-10-01 03:00:58.020,4,3,20,1,20,2
2,1,1506825426267738,2017-10-01 02:37:06,2,235840,2017-10-01 03:03:37.951,4,1,17,1,16,2
3,1,1506825426267738,2017-10-01 02:37:06,2,96663,2017-10-01 03:04:07.951,4,1,17,1,16,2
4,2,1506825435299739,2017-10-01 02:37:15,2,119592,2017-10-01 03:04:50.575,4,1,17,1,24,2


### Création d'un rating implicite pour la modélisation collaboratif filtering

In [11]:
df = interaction_df.merge(metadata_df,left_on="click_article_id",right_on="article_id")
# On retire les valeurs inutile
df = df[['user_id', 'article_id', 'category_id']]
df

Unnamed: 0,user_id,article_id,category_id
0,0,157541,281
1,20,157541,281
2,44,157541,281
3,45,157541,281
4,76,157541,281
...,...,...,...
2988176,195186,2221,1
2988177,75658,271117,399
2988178,217129,20204,9
2988179,217129,70196,136


In [12]:
rating  = df.groupby(["user_id","category_id"]).size()
rating = rating.to_frame().reset_index()
rating.rename(columns={0:"rating"},inplace=True)
rating.head()

Unnamed: 0,user_id,category_id,rating
0,0,136,1
1,0,186,2
2,0,209,1
3,0,281,2
4,0,375,1


In [49]:
all_clicks_by_user = dict(rating.groupby(by="user_id")["rating"].sum())

In [56]:
rating["all_click_user"] = rating["user_id"].map(all_clicks_by_user)

In [61]:
rating["score"] = (rating["rating"] / rating["all_click_user"])*10

In [62]:
rating["score"].value_counts()

10.000000    100102
5.000000      36897
3.333333      22833
2.500000      15845
2.000000      12480
              ...  
2.873563          1
5.934066          1
1.012146          1
2.988506          1
1.024096          1
Name: score, Length: 3420, dtype: int64

on retire les interaction si l'utilisateur n'a regarder que 1 seul fois la catégorie (faible impact sur cette catégorie)

In [63]:
rating.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,503616.0,100591.298803,78296.001115,0.0,37985.0,80436.5,150442.75,322887.0
category_id,503616.0,313.580869,105.291249,1.0,252.0,327.0,409.0,458.0
rating,503616.0,3.195877,3.204592,2.0,2.0,2.0,3.0,535.0
all_click_user,503616.0,24.445784,43.774245,2.0,5.0,12.0,28.0,1210.0
score,503616.0,3.825752,3.461102,0.016529,1.052632,2.5,5.0,10.0


In [28]:
from surprise import Reader, Dataset
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import KNNWithZScore
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import SVD, accuracy
from surprise.model_selection import cross_validate

from scipy.spatial import distance

from random import randint


In [64]:
reader = Reader()
data_surprise = Dataset.load_from_df(rating[['user_id', 'category_id', 'score']], reader)

train_set, test_set = train_test_split(data_surprise, test_size=0.2)

In [23]:
from sklearn.model_selection import train_test_split

train, cros_val = train_test_split(df, test_size = 0.2)

## Content-based filtering model

In [37]:
def recommandationArticle(embedding, userId, n=5):
    
    # on récupére tout les article lu par l'utilisateurs
    var = interaction_df.loc[interaction_df['user_id']==userId]['click_article_id'].tolist()
    
    # on choisi un article simillaire au article lu par l'utilisateurs
    value = randint(0, len(var))
    
    # On supprime les article déjà lu par l'utilisateur
    for i in range(0, len(var)):
        if i != value:
            embedding = np.delete(embedding, [i], 0)
    
    arr = []
    
    # on supprime l'article selectionné
    f = np.delete(embedding, [value], 0)
    
    # on récupére les n articles les plus similaire à celui selectionné
    for i in range(0, n):
        # On récupére la matrice de distance
        distances = distance.cdist([embedding[value]], f, "cosine")[0]
        min_index = np.argmin(distances)
        f = np.delete(f, [min_index], 0)
        result = np.where(e==f[min_index])
        arr.append(result[0][0])
        
    return arr



In [39]:
recommandationArticle(article_matrice_df, 0)

[14, 1729, 53901, 796, 784]

## Collaborative Filtering model