# Approches collaboratives : utilisateur-utilisateur, item-item

Pierrick DOSSIN  
Guillaume RIU

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import random

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

### Charger les Données

In [2]:
# Load the ratings from the CSV file into a DataFrame
votes = pd.read_csv('Data/votes.csv')

In [3]:
## User x item matrix: one row per user.id, one column per item.id,
## each cell holding the matching rating (NaN when `votes` has no such pair).
MUI = votes.pivot(values="rating", columns="item.id", index="user.id")

# NumPy copy of the matrix and its row-major flattening; the flat version lets
# a single integer address one (user, item) cell.
MUI_numpy = MUI.to_numpy()
MUI_numpy_flat = MUI_numpy.ravel()

MUI.head()

item.id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


### Définition des métriques

In [4]:
# Erreur quadratique moyenne
def MSE_mat(y_pred, y_true):
    """Mean squared error between `y_pred` and `y_true`, skipping NaN entries.

    The element-wise differences are squared and averaged with `np.nanmean`,
    so any position whose difference is NaN does not contribute to the mean.
    """
    ecarts = y_pred - y_true
    return np.nanmean(np.square(ecarts))

# Erreur absolue moyenne
def MAE_mat(y_pred, y_true):
    """Mean absolute error between `y_pred` and `y_true`, skipping NaN entries.

    The element-wise absolute differences are averaged with `np.nanmean`,
    so any position whose difference is NaN does not contribute to the mean.
    """
    ecarts_absolus = np.absolute(y_pred - y_true)
    return np.nanmean(ecarts_absolus)

### Prédictions avec validation croisée

In [5]:
## Flat positions of the matrix cells that actually hold a rating.
# `indices` enumerates every cell of the flattened matrix.
indices = np.arange(MUI_numpy.size)
# Despite its name, `indices_na` keeps the positions whose value is NOT NaN.
indices_na = indices[np.logical_not(np.isnan(MUI_numpy_flat))]

In [6]:
## Split the positions of the observed ratings into cross-validation folds
nbre_replis = 5

# Fixed seed: without it the folds, and therefore every error reported from
# them, change on each run of the notebook.
SEED = 42
np.random.seed(SEED)

# The shuffle below works in place. Sorting first restores the ascending order
# the positions had before any shuffle, so re-running this cell alone gives
# the same folds as a fresh run.
indices_na.sort()
np.random.shuffle(indices_na)
print(indices_na.shape)

# np.split raises ValueError unless the number of positions is an exact
# multiple of nbre_replis.
idx_split = np.split(indices_na, nbre_replis)

(100000,)


In [7]:
# Per-fold MSE and MAE of every method.
# Suffix _u = user-user approach, suffix _i = item-item approach.

MSE_votes_sans_biais_u = []
MSE_votes_avec_biais_u = []
MSE_votes_voisins_rapproches_u = []

MSE_votes_sans_biais_i = []
MSE_votes_avec_biais_i = []
MSE_votes_voisins_rapproches_i = []

MAE_votes_sans_biais_u = []
MAE_votes_avec_biais_u = []
MAE_votes_voisins_rapproches_u = []

MAE_votes_sans_biais_i = []
MAE_votes_avec_biais_i = []
MAE_votes_voisins_rapproches_i = []

# Size of the neighbourhood kept by the nearest-neighbour variants.
nbre_voisins = 100


def _garder_plus_proches_voisins(poids, k):
    """Return a copy of `poids` keeping only the k largest weights of each row.

    Args:
        poids: 2-D NumPy array of similarity weights.
        k: positive number of weights to keep per row.

    Returns:
        A new array of the same shape in which, on every row, all entries
        except the k largest are set to 0. Rows with at most k entries are
        returned unchanged. `poids` itself is not modified.
    """
    filtre = poids.copy()
    # Clamp k so that argpartition never receives an out-of-bounds position.
    k = min(k, filtre.shape[1])
    for ligne in filtre:
        # argpartition puts the k largest values in the last k slots, so
        # everything before them lies outside the neighbourhood.
        hors_voisinage = np.argpartition(ligne, -k)[:-k]
        # `ligne` is a view on `filtre`: this writes into the copy.
        ligne[hors_voisinage] = 0
    return filtre


# One iteration per fold actually present in idx_split (no hard-coded count).
for pli in range(len(idx_split)):
    ## Test positions are the current fold, train positions are all the others.
    # concatenate (rather than np.delete on the list) does not require the
    # folds to have the same length.
    idx_test = idx_split[pli]
    idx_train = np.concatenate([bloc for k, bloc in enumerate(idx_split) if k != pli])

    ## Blank out the test ratings in the train copy, and vice versa
    MUI_numpy_flat_train = MUI_numpy_flat.copy()
    MUI_numpy_flat_test  = MUI_numpy_flat.copy()
    MUI_numpy_flat_train[idx_test] = np.nan
    MUI_numpy_flat_test[idx_train] = np.nan

    # Give the train and test sets their (user x item) matrix shape back
    MUI_train = pd.DataFrame(MUI_numpy_flat_train.reshape(MUI_numpy.shape))
    MUI_test  = pd.DataFrame(MUI_numpy_flat_test.reshape(MUI_numpy.shape))

    # 0/1 indicator: NaN becomes 0, every strictly positive rating becomes 1
    MUI_train_zero_one = MUI_train.replace(np.nan, 0)
    MUI_train_zero_one[MUI_train_zero_one > 0] = 1

    # Train ratings with the missing ones replaced by 0
    MUI_train_zero = MUI_train.replace(np.nan, 0)

    # Prediction without bias correction

    # Cosine similarity (1 - cosine distance) between users (rows) and
    # between items (columns), computed on the zero-filled train matrix.
    w_u = 1 - pairwise_distances(MUI_train_zero, metric="cosine")
    w_i = 1 - pairwise_distances(MUI_train_zero.T, metric="cosine")

    # Similarity-weighted sum of the train ratings, divided by the sum of
    # |weights| restricted (through the 0/1 indicator) to the neighbours that
    # rated the cell. A zero denominator yields NaN, which the nan-aware
    # metrics skip.
    pred_sans_biais_u = w_u.dot(MUI_train_zero) / abs(w_u).dot(MUI_train_zero_one)
    pred_sans_biais_i = np.array(MUI_train_zero.dot(w_i) / MUI_train_zero_one.dot(abs(w_i)))

    # Prediction with bias correction

    # Mean train rating of each user, as a column vector, and the ratings
    # centred on it (missing ratings become 0 after centring).
    MUI_train_means_U = np.expand_dims(np.nanmean(MUI_train, axis=1), axis=-1)
    MUI_train_norm_U = (MUI_train - MUI_train_means_U).replace(np.nan, 0)

    # Same for items: mean train rating of each item, as a row vector.
    MUI_train_means_I = np.expand_dims(np.nanmean(MUI_train, axis=0), axis=0)
    MUI_train_norm_I = (MUI_train - MUI_train_means_I).replace(np.nan, 0)

    # Weighted average of the centred ratings, then the mean is added back.
    pred_avec_biais_u = w_u.dot(MUI_train_norm_U) / abs(w_u).dot(MUI_train_zero_one) + MUI_train_means_U
    pred_avec_biais_i = np.array(MUI_train_norm_I.dot(w_i) / MUI_train_zero_one.dot(abs(w_i)) + MUI_train_means_I)

    # Prediction restricted to the nbre_voisins most similar neighbours

    w_u_100_neighboors = _garder_plus_proches_voisins(w_u, nbre_voisins)
    # The filtering is done row by row; the transpose puts the neighbours of
    # item j in column j, which is what the right-multiplication below needs.
    w_i_100_neighboors = _garder_plus_proches_voisins(w_i, nbre_voisins).T

    pred_100_neighboors_u = w_u_100_neighboors.dot(MUI_train_norm_U) / abs(w_u_100_neighboors).dot(MUI_train_zero_one) + MUI_train_means_U
    pred_100_neighboors_i = np.array(MUI_train_norm_I.dot(w_i_100_neighboors) / MUI_train_zero_one.dot(abs(w_i_100_neighboors)) + MUI_train_means_I)

    # Errors on the test fold only: MUI_test is NaN everywhere else, and the
    # metrics ignore NaN differences.
    MSE_votes_sans_biais_u.append(MSE_mat(pred_sans_biais_u, MUI_test))
    MAE_votes_sans_biais_u.append(MAE_mat(pred_sans_biais_u, MUI_test))

    MSE_votes_avec_biais_u.append(MSE_mat(pred_avec_biais_u, MUI_test))
    MAE_votes_avec_biais_u.append(MAE_mat(pred_avec_biais_u, MUI_test))

    MSE_votes_voisins_rapproches_u.append(MSE_mat(pred_100_neighboors_u, MUI_test))
    MAE_votes_voisins_rapproches_u.append(MAE_mat(pred_100_neighboors_u, MUI_test))

    MSE_votes_sans_biais_i.append(MSE_mat(pred_sans_biais_i, MUI_test))
    MAE_votes_sans_biais_i.append(MAE_mat(pred_sans_biais_i, MUI_test))

    MSE_votes_avec_biais_i.append(MSE_mat(pred_avec_biais_i, MUI_test))
    MAE_votes_avec_biais_i.append(MAE_mat(pred_avec_biais_i, MUI_test))

    MSE_votes_voisins_rapproches_i.append(MSE_mat(pred_100_neighboors_i, MUI_test))
    MAE_votes_voisins_rapproches_i.append(MAE_mat(pred_100_neighboors_i, MUI_test))

  pred_sans_biais_u = w_u.dot(MUI_train_zero) / abs(w_u).dot(MUI_train_zero_one)
  MUI_train_means_I = np.expand_dims(np.nanmean(MUI_train, axis=0), axis=0)
  pred_avec_biais_u = w_u.dot(MUI_train_norm_U) / abs(w_u).dot(MUI_train_zero_one) + MUI_train_means_U
  pred_100_neighboors_u = w_u_100_neighboors.dot(MUI_train_norm_U) / abs(w_u_100_neighboors).dot(MUI_train_zero_one) + MUI_train_means_U
  pred_sans_biais_u = w_u.dot(MUI_train_zero) / abs(w_u).dot(MUI_train_zero_one)
  MUI_train_means_I = np.expand_dims(np.nanmean(MUI_train, axis=0), axis=0)
  pred_avec_biais_u = w_u.dot(MUI_train_norm_U) / abs(w_u).dot(MUI_train_zero_one) + MUI_train_means_U
  pred_100_neighboors_u = w_u_100_neighboors.dot(MUI_train_norm_U) / abs(w_u_100_neighboors).dot(MUI_train_zero_one) + MUI_train_means_U
  pred_sans_biais_u = w_u.dot(MUI_train_zero) / abs(w_u).dot(MUI_train_zero_one)
  MUI_train_means_I = np.expand_dims(np.nanmean(MUI_train, axis=0), axis=0)
  pred_avec_biais_u = w_u.dot(MUI_train_norm_U) /

In [8]:
# One entry per method: (header line, per-fold MSE list, per-fold MAE list).
# The order of the entries is the order of the printed report.
resultats = [
    ("Erreur approche utilisateur-utilisateur sans correction de biais :",
     MSE_votes_sans_biais_u, MAE_votes_sans_biais_u),
    ("Erreur approche item-item sans correction de biais :",
     MSE_votes_sans_biais_i, MAE_votes_sans_biais_i),
    ("Erreur approche utilisateur-utilisateur avec correction de biais :",
     MSE_votes_avec_biais_u, MAE_votes_avec_biais_u),
    ("Erreur approche item-item avec correction de biais :",
     MSE_votes_avec_biais_i, MAE_votes_avec_biais_i),
    ("Erreur approche utilisateur-utilisateur avec 100 voisins ajoutés :",
     MSE_votes_voisins_rapproches_u, MAE_votes_voisins_rapproches_u),
    ("Erreur approche item-item avec 100 voisins ajoutés :",
     MSE_votes_voisins_rapproches_i, MAE_votes_voisins_rapproches_i),
]

# Print, for each method, the error averaged over the folds.
for titre, erreurs_mse, erreurs_mae in resultats:
    print(titre)
    print("MSE: ", np.mean(erreurs_mse))
    print("MAE: ", np.mean(erreurs_mae), "\n")

Erreur approche utilisateur-utilisateur sans correction de biais :
MSE:  1.033702302515278
MAE:  0.8104806822136075 

Erreur approche item-item sans correction de biais :
MSE:  1.0271691878408606
MAE:  0.8065505855908629 

Erreur approche utilisateur-utilisateur avec correction de biais :
MSE:  0.9079065409662619
MAE:  0.7501180484462318 

Erreur approche item-item avec correction de biais :
MSE:  0.8708140397021109
MAE:  0.7353311136702557 

Erreur approche utilisateur-utilisateur avec 100 voisins ajoutés :
MSE:  0.9024352850559314
MAE:  0.7432765025113431 

Erreur approche item-item avec 100 voisins ajoutés :
MSE:  0.8435690268864656
MAE:  0.7210969116442109 

