In [1]:
import pandas as pd

# Load the book metadata and the user ratings from the local data directory.
books, ratings = map(pd.read_csv, ("data/books.csv", "data/ratings.csv"))

In [2]:
# Number of ratings submitted by each user, most active users first.
ratings["user_id"].value_counts(sort=True, ascending=False)

12874    200
30944    200
52036    199
12381    199
28158    199
        ... 
32128     21
40753     21
51725     21
43675     20
34590     19
Name: user_id, Length: 53424, dtype: int64

Nombre minimum de notes par utilisateur : 19

In [3]:
# Number of ratings received by each book, most rated books first.
ratings["book_id"].value_counts(sort=True, ascending=False)

1       22806
2       21850
4       19088
3       16931
5       16604
        ...  
9315       36
1935       33
9486       24
9345       11
7803        8
Name: book_id, Length: 10000, dtype: int64

Nombre minimum de notes par livre : 8

In [4]:
# Count the distinct items and the distinct users present in the ratings table.
# NOTE(review): the variable name and the printed label say "movies"/"films",
# but the value is the number of distinct `book_id`s — consider renaming once
# any later cells using `num_movies` have been checked.
num_movies = ratings['book_id'].nunique()
num_users = ratings['user_id'].nunique()

print(f'Nombre de films uniques : {num_movies}')
print(f"Nombre d'utilisateurs uniques : {num_users}")

Nombre de films uniques : 10000
Nombre d'utilisateurs uniques : 53424


In [5]:
# Distinct book ids as plain Python ints, in ascending order.
sorted_books_id = sorted(map(int, ratings['book_id'].unique()))
print(sorted_books_id[:30])

# Same for the user ids; printing the first 30 of each shows where the ids start.
sorted_user_id = sorted(map(int, ratings['user_id'].unique()))
print(sorted_user_id[:30])



[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


## Model-based Collaborative Filtering (SVD)

In [6]:
# Reshape the long ratings table into a wide matrix with one row per user and
# one column per book; (user, book) pairs without a rating are filled with 0.
R_df = (
    ratings
    .pivot(index="user_id", columns="book_id", values="rating")
    .fillna(value=0)
)

In [8]:
# Preview the first five users of the user x book rating matrix.
R_df.head(n=5)

book_id,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,5.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,5.0,0.0,4.0,4.0,0.0,4.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
import numpy as np

# Dense user x book rating matrix as a plain NumPy array.
R = R_df.to_numpy()

# Mean of each row of R, taken over every column (one value per user).
user_ratings_mean = R.mean(axis=1)
print(user_ratings_mean[:10])
# Down-cast the per-user means to half precision (this is lossy).
# NOTE(review): presumably done so that adding the means back to a float16
# prediction matrix later does not up-cast it — confirm.
user_ratings_mean = user_ratings_mean.astype(np.float16)

# Centre every user's row on that user's mean; the column-vector view of the
# means broadcasts across all book columns.
R_demeaned = R - user_ratings_mean[:, np.newaxis]

[0.042  0.0287 0.0158 0.0505 0.0404 0.0389 0.0592 0.035  0.0455 0.048 ]


In [19]:
# Inspect the dtype of the per-user means.
# NOTE(review): this cell's execution count (19) is out of sequence with its
# neighbours, and the float64 recorded as its output does not match the
# np.float16 cast applied to user_ratings_mean in the previous cell —
# presumably stale output; re-run from a fresh kernel to confirm.
user_ratings_mean.dtype

dtype('float64')

In [10]:
from scipy.sparse.linalg import svds

# Number of singular values / latent factors kept by the truncated SVD.
latent_dimension = 30

# Partial SVD of the demeaned rating matrix: only `latent_dimension`
# singular triplets are computed.
U, sigma, Vt = svds(R_demeaned, k=latent_dimension)

# Report the shape of each factor.
print(
    f"Dimensions de U : {U.shape}",
    f"Dimensions de sigma : {sigma.shape}",
    f"Dimensions de Vt : {Vt.shape}",
    sep="\n",
)

Dimensions de U : (53424, 30)
Dimensions de sigma : (30,)
Dimensions de Vt : (30, 10000)


In [11]:
from scipy.linalg import sqrtm

# svds returns the singular values as a 1-D vector; expand it into the
# (k, k) diagonal matrix — (30, 30) for latent_dimension = 30.
# The ndim guard keeps this cell idempotent: calling np.diag on an already
# 2-D matrix would extract its diagonal back into a vector, and sqrtm would
# then fail when the cell is run a second time.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

# Matrix square root of sigma; it is multiplied into both U and Vt in the
# next cell so that each factor carries half of the singular values.
s_root = sqrtm(sigma)

In [12]:
# Lower-dimensional latent representations of the users (Usk) and of the
# items (skV), each scaled by the square root of sigma.
# Both are stored in half precision.
# NOTE(review): presumably to limit the memory used by the dense product
# below — confirm.
Usk = U.dot(s_root).astype(np.float16)
skV = s_root.dot(Vt).astype(np.float16)

# The product of the two factor matrices (which together contain the whole
# of sigma) gives the predicted, still demeaned, ratings.
predicted_rating = np.dot(Usk, skV)

# Add back the per-user means that were subtracted before the factorization.
predicted_rating = predicted_rating + user_ratings_mean[:, np.newaxis]

In [13]:
# Wrap the prediction matrix in a DataFrame that reuses the user (row) and
# book (column) labels of R_df, then preview the first five users.
preds_df = pd.DataFrame(
    data=predicted_rating,
    index=R_df.index,
    columns=R_df.columns,
)
preds_df.head(n=5)

book_id,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.666504,0.19812,0.813477,2.953125,2.050781,0.53125,0.680176,1.865234,2.109375,3.0,...,0.049988,0.004425,-0.000183,-0.011963,0.00528,0.003601,0.005035,-0.003021,-9.2e-05,-0.009766
2,-0.018143,4.476562,0.716797,2.089844,1.785156,0.296631,0.795898,1.518555,1.746094,2.404297,...,0.003067,0.004349,0.012192,-0.008041,0.011398,0.00531,0.036255,0.011765,0.033936,0.038452
3,-0.141235,-0.148804,-0.155396,1.412109,1.046875,-0.087952,0.110229,0.982422,-0.109558,0.581543,...,0.006416,0.005508,0.007866,0.008713,0.01683,0.005814,0.011314,0.008377,0.01149,0.012161
4,-0.088989,4.441406,0.809082,5.652344,4.027344,0.094116,2.558594,4.21875,1.912109,2.576172,...,-0.001587,-0.031952,-0.027618,-0.012482,0.00705,-0.017731,0.001312,-0.004761,-0.004822,0.001099
5,-0.035889,0.152832,-0.192627,0.380371,0.29541,1.227539,-0.090332,0.131348,-0.133667,0.318848,...,0.027832,0.026062,0.026276,0.023468,0.018417,0.029633,0.023346,0.021622,0.032532,0.019104
