# 10. Collaborative Filtering - Matrix Factorization

In [3]:
%run "1.Recommendation_Loading.ipynb"

  chunk_df = pd.read_csv(file_path, header=None, skiprows=skiprow)


In [4]:
import scipy as sp

In [5]:
recommendations["date"] = pd.to_datetime(recommendations["date"])

## Data Preparation

### Filtering For Users and Games

In [6]:
def users_at_least_k_recs(df, K, n_users):
    """
    Return a list of users who perform at least K recommendations. Also, filtering for n_users randomòy selected
    Args::
        df: dataframe, recommendation dataset
        K: integer, representing the minimum number of recommendations for a user to be included in the final list
        n_users: integer, number of randomly selected users
    Return:
        users: a list of randomly selected users who perform at least K recommendations
    """
    series = df.groupby("user_id_categorical")["user_id_categorical"].count()>=K
    series1 = series[series]
    users = list(series1.index)
    return np.random.choice(users, size = n_users)

users_to_keep = users_at_least_k_recs(recommendations, 20, 5000)
print("Number of users",  '{0:,.0f}'.format(len(users_to_keep)))
print("First five users id", users_to_keep[:5])


Number of users 5,000
First five users id [11343657  2977524  2145654 10764314  6764024]


### Loading the matrix and Apply Filtering

In [8]:
# LOADING THE MATRIX
# Due to the large memory requirements, the user-game matrix has been created in the notebook "User-Games Matrix - Building"
import gzip
import pickle

with open("matrix/user_game_matrix.pkl", 'rb') as file:
    user_game_matrix = pickle.load(file)

# Convert this array/matrix to Dictionary Of Keys format
# user_game_matrix = user_game_matrix.todok()
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix.size))


  user_game_matrix = pickle.load(file)


Number of Rows: 12,663,134
Numbeer of Columns: 37,420
Number of stored values: 47,967,516


In [9]:
# FILTERING the MATRIX for USERS to KEEP
# Matrix with only the subset of users who did at least k recs
mask = np.isin(np.array(user_game_matrix[:,0].todense()).reshape(-1), users_to_keep)
user_game_matrix_k_rec = user_game_matrix[mask]
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix_k_rec.size))
user_game_matrix_k_rec

Number of Rows: 4,911
Numbeer of Columns: 37,420
Number of stored values: 19,823


<4911x37420 sparse matrix of type '<class 'numpy.int32'>'
	with 19823 stored elements in Compressed Sparse Column format>

In [10]:
# INSERTING a GAME ID ROW to the user-game matrix
games_id = np.arange(0, user_game_matrix_k_rec.shape[1]).reshape(1,-1)
games_id = sp.sparse.csc_matrix(games_id)
user_game_matrix_k_rec = sp.sparse.vstack((games_id, user_game_matrix_k_rec))
user_game_matrix_k_rec.todense()


matrix([[       0,        1,        2, ...,    37417,    37418,    37419],
        [    1753,        0,        0, ...,        0,        0,        0],
        [    1768,        0,        0, ...,        0,        0,        0],
        ...,
        [12609922,        0,        0, ...,        0,        0,        0],
        [12627813,        0,        0, ...,        0,        0,        0],
        [12637580,        0,        0, ...,        0,        0,        0]])

In [11]:
# dropping games with no recommendations
mask = list(np.array(np.sum(user_game_matrix_k_rec[1:,1:], axis=0)>=1).reshape(-1))
mask.insert(0,True)
user_game_matrix_k_rec = user_game_matrix_k_rec.T[mask].T
print("Number of Rows:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[0]))
print("Numbeer of Columns:", '{0:,.0f}'.format(user_game_matrix_k_rec.shape[1]))
print("Number of stored values:", '{0:,.0f}'.format(user_game_matrix_k_rec.size))
user_game_matrix_k_rec.todense()

Number of Rows: 4,912
Numbeer of Columns: 3,553
Number of stored values: 23,375


matrix([[       0,        1,        2, ...,    37121,    37132,    37324],
        [    1753,        0,        0, ...,        0,        0,        0],
        [    1768,        0,        0, ...,        0,        0,        0],
        ...,
        [12609922,        0,        0, ...,        0,        0,        0],
        [12627813,        0,        0, ...,        0,        0,        0],
        [12637580,        0,        0, ...,        0,        0,        0]])

In [12]:
# GAMES MAPPING 
# keys: integer, starting from 1 which are the columns of the filtered matrix
# values: original app_id_categorical
all_games = np.array(user_game_matrix_k_rec[0,1:].todense()).reshape(-1)
games_mapping = {k:i for k,i in enumerate(all_games)}
swapped_games_dict = {v: k for k, v in games_mapping.items()}
# swapped_games_dict[3481]

In [13]:
# USERS mapping
users_to_keep = np.array(user_game_matrix_k_rec[1:,0].todense()).reshape(-1)
users_mapping = {k:i for k,i in enumerate(users_to_keep)}
swapped_users_dict = {v: k for k, v in users_mapping.items()}


In [14]:
def build_recommendations_sample(df):
    # recommendations_sample = df[(df["user_id_categorical"].isin(users_to_keep)) & (df["app_id_categorical"].isin(games_to_keep))]
    recommendations_sample = df[(df["user_id_categorical"].isin(users_to_keep))]
    return recommendations_sample

recommendations_sample = build_recommendations_sample(recommendations)
# recommendations_sample[recommendations_sample["user_id_categorical"]==731702]


In [15]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
import pickle

# Load the user-game matrix
with open("matrix/user_game_matrix.pkl", 'rb') as file:
    user_game_matrix = pickle.load(file)

# Filtering the matrix for users to keep
mask = np.isin(np.array(user_game_matrix[:, 0].todense()).reshape(-1), users_to_keep)
user_game_matrix_k_rec = user_game_matrix[mask]

# Convert the matrix to a dense array
user_game_matrix_dense = user_game_matrix_k_rec.toarray()

# Perform matrix factorization using SVD
k = 50  # Number of latent factors
model = TruncatedSVD(n_components=k)
latent_factors = model.fit_transform(user_game_matrix_dense)

# Example: Predict ratings for a user (index 0) based on the latent factors
user_index = 0
user_ratings = user_game_matrix_dense[user_index, :]
predicted_ratings = np.dot(latent_factors, model.components_)[user_index, :]

# Calculate RMSE
actual_ratings = user_ratings
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print("RMSE:", rmse)

  user_game_matrix = pickle.load(file)


RMSE: 0.005169280487673951
