# 10. Collaborative Filtering - Matrix Factorization

In [10]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

In [25]:
np.set_printoptions(suppress=True)

In [2]:
%run "01.Recommendation_Loading.ipynb"

################  Recommendations (all data, positive recommendations only) ################
Users: 12,636,209
Games: 37,419
Number of users 10,000
First five users id [11203022 12827342 11719330  9553563 11051045]
################  Recommendations (Sample) ################
Minimum number of recommendations to included in the sample: 20
Number of randomly selected users: 10000
Users: 9,688
Games: 20,615
################ User-Game Matrix ################
Number of Rows: 9,689
Numbeer of Columns: 20,616
Number of stored values: 365,753


In [69]:
NUMBER_OF_FACTORS = 100

In [70]:
user_game_matrix = user_game_matrix.astype("float")

In [71]:
user_game_matrix

<9689x20616 sparse matrix of type '<class 'numpy.float64'>'
	with 365753 stored elements in Dictionary Of Keys format>

In [72]:
U, sigma, Vt = svds(user_game_matrix, k = NUMBER_OF_FACTORS)

In [73]:
U.shape

(9689, 100)

In [74]:
Vt.shape

(100, 20616)

In [75]:
sigma = np.diag(sigma)
sigma.shape

(100, 100)

Let's reconstruct the original matrix by multiplying its factors. The resulting matrix is not sparse anymore

In [76]:
all_user_predicted_rating = np.dot(np.dot(U, sigma), Vt)

In [77]:
all_user_predicted_rating

array([[ 0.26476862,  0.13575434,  0.15843898, ..., -0.00495697,
         0.00044043,  0.        ],
       [ 0.04590524,  0.02980932,  0.02213353, ...,  0.00032821,
         0.00001036,  0.        ],
       [-0.02146728,  0.03849372,  0.01747572, ...,  0.0001989 ,
        -0.0000167 ,  0.        ],
       ...,
       [-0.10411044, -0.00696651, -0.00416716, ..., -0.00082547,
         0.0000398 ,  0.        ],
       [ 0.01923727, -0.00401907,  0.00460032, ...,  0.00087787,
        -0.00009203,  0.        ],
       [ 0.        , -0.        , -0.        , ..., -0.        ,
        -0.        ,  0.        ]])

In [78]:
all_user_predicted_rating.shape

(9689, 20616)

In [79]:
all_user_predicted_rating_norms = (all_user_predicted_rating - all_user_predicted_rating.min()) / (all_user_predicted_rating.max() - all_user_predicted_rating.min())

In [80]:
all_user_predicted_rating_norms

array([[0.49445361, 0.46477366, 0.4699923 , ..., 0.4324028 , 0.43364448,
        0.43354316],
       [0.44410373, 0.44040084, 0.43863501, ..., 0.43361866, 0.43354554,
        0.43354316],
       [0.42860457, 0.4423987 , 0.43756347, ..., 0.43358891, 0.43353931,
        0.43354316],
       ...,
       [0.40959238, 0.4319405 , 0.43258449, ..., 0.43335325, 0.43355231,
        0.43354316],
       [0.43796872, 0.43261856, 0.43460147, ..., 0.43374511, 0.43352198,
        0.43354316],
       [0.43354316, 0.43354316, 0.43354316, ..., 0.43354316, 0.43354316,
        0.43354316]])

In [81]:
%run "Get_Training_and_Test_Instances.ipynb"

################  Get Test Data ################
x_test.shape: (9688, 2)
y_test.shape: (9688, 1)
Number of Users: 9,689
Number of Games: 20,616
Number of Negatives Sample per User-Game: 2
################  Get Training Data ################
x_train.shape: (1094959, 2)
y_train.shape: (1094959, 2)


In [82]:
def eval_hit_ratio_SVD(n_users2val, K, x_test):
    counter = 0
    choosen_user = np.random.choice(x_test[:,0], size = n_users2val)
    for user_game in x_test[choosen_user,:]:
        # print(user_game[0])
        y_pred = np.argsort(-all_user_predicted_rating_norms[user_game[0],:][:K])
        # print(y_pred)
        if(user_game[1] in y_pred):
            counter += 1
    return counter / n_users2val

In [58]:
np.argsort(-all_user_predicted_rating_norms[0,:])  

array([ 3067,   103,     9, ...,   120,  4426, 12226])

In [83]:
eval_hit_ratio_SVD(1000, 10000, x_test)

0.051

In [None]:
sorted_indices = np.argsort(-user_games_preds[:, -1])