# Baseline estimator implementation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def gradient_J(R, B_i, B_u, mu, lambda_1=0.02):
    """Compute the gradient of the regularised Baseline cost over all ratings.

    Every observed (non-zero) rating ``R[u, i]`` contributes
    ``e - lambda_1 * b`` to the accumulator of its user and of its movie,
    where ``e = R[u, i] - (B_i[i] + B_u[u] + mu)`` and ``b`` is the current
    deviation of that user / movie. The accumulated sums are multiplied by
    -2 before being returned. This is one full pass over the ratings, not a
    single stochastic sample.

    Parameters:
        R (sparse matrix or 2-D array): Rating matrix (users x movies).
            Zero entries are treated as "not rated". Any SciPy sparse
            format is accepted, including ``coo_matrix``.
        B_i (1-D np.array): Observed deviation for each movie.
        B_u (1-D np.array): Observed deviation for each user.
        mu (float): Overall average rating.
        lambda_1 (float): Penalty coefficient.

    Returns:
        b_i (1-D np.array): Gradient of observed deviation for each movie.
        b_u (1-D np.array): Gradient of observed deviation for each user.

    """
    # Imported locally so the function does not depend on a later cell
    # having imported scipy.sparse.
    from scipy.sparse import find

    B_i = np.asarray(B_i, dtype=float)
    B_u = np.asarray(B_u, dtype=float)

    # Row indices, column indices and values of the non-zero ratings.
    # `find` works for every sparse format, whereas `R[u, i]` is not
    # available on COO matrices and is slow on the others.
    users, items, ratings = find(R)

    # Prediction error of every observed rating, computed in one shot.
    e = ratings - (B_i[items] + B_u[users] + mu)

    # Sum the per-rating contributions per user and per movie; `minlength`
    # keeps a zero entry for users / movies that have no rating at all.
    b_u = np.bincount(users, weights=e - lambda_1 * B_u[users],
                      minlength=len(B_u))
    b_i = np.bincount(items, weights=e - lambda_1 * B_i[items],
                      minlength=len(B_i))

    return -2 * b_i, -2 * b_u

In [3]:
def rmse(R, B_i, B_u, mu):
    """Calculate the root-mean-square error of the Baseline predictions.

    The error is measured on the observed (non-zero) ratings only: the
    squared residuals ``R[u, i] - mu - B_u[u] - B_i[i]`` are averaged over
    the number of observed ratings and the square root of that mean is
    returned.

    Parameters:
        R (sparse matrix or 2-D array): Rating matrix (users x movies).
            Zero entries are treated as "not rated".
        B_i (1-D np.array): Observed deviation for each movie.
        B_u (1-D np.array): Observed deviation for each user.
        mu (float): Overall average rating.

    Returns:
        error (float): RMSE error value (0.0 when there is no rating).

    """
    # Imported locally so the function does not depend on a later cell
    # having imported scipy.sparse.
    from scipy.sparse import find

    B_i = np.asarray(B_i, dtype=float)
    B_u = np.asarray(B_u, dtype=float)

    users, items, ratings = find(R)
    if ratings.size == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    residuals = ratings - mu - B_u[users] - B_i[items]

    # Mean over the number of ratings (not the number of rows), then root.
    error = float(np.sqrt(np.mean(residuals ** 2)))

    return error

In [4]:
def recreate_matrix(B_i, B_u, mu):
    """Recreate the dense rating matrix from the estimated parameters.

    Entry ``[u, i]`` of the result is ``mu + B_i[i] + B_u[u]``.

    Parameters:
        B_i (1-D np.array): Observed deviation for each movie.
        B_u (1-D np.array): Observed deviation for each user.
        mu (float): Overall average rating.

    Returns:
        R_estimated (2-D np.array): Recreated rating matrix, with one row
            per user and one column per movie.

    """
    B_i = np.asarray(B_i)
    B_u = np.asarray(B_u)

    # Movies vary along the columns (row vector), users along the rows
    # (column vector); broadcasting expands the sum to the full matrix.
    R_estimated = mu + B_i[np.newaxis, :] + B_u[:, np.newaxis]

    return R_estimated

In [14]:
def recommend_k_movies(R, B_i, B_u, mu, u, k):
    """Recommend k movies to the user u.

    Only movies the user has not rated yet (a zero in row ``u`` of ``R``)
    are candidates. They are ranked by the estimated rating
    ``B_i[i] + B_u[u] + mu``, highest first; movies with equal estimates
    keep their index order.

    Parameters:
        R (sparse matrix): Rating matrix (users x movies); must provide
            ``tocsr()``.
        B_i (1-D np.array): Observed deviation for each movie (one entry
            per column of ``R``).
        B_u (1-D np.array): Observed deviation for each user.
        mu (float): Overall average rating.
        u (int): Index of the user.
        k (int): Number of movies to recommend.

    Returns:
        R_u_k (list): List of recommended movies' indices (at most k;
            empty when k <= 0).

    """
    if k <= 0:
        # A negative k would otherwise slice "all but the last |k|" movies.
        return []

    B_i = np.asarray(B_i)

    # Extract the user's whole row once instead of one sparse lookup
    # per movie.
    ratings_u = R.tocsr()[u, :].toarray().ravel()
    unseen = np.flatnonzero(ratings_u == 0)

    estimated = B_i[unseen] + B_u[u] + mu

    # Stable sort on the negated scores: descending order, ties resolved
    # by ascending movie index.
    best = np.argsort(-estimated, kind="stable")[:k]
    R_u_k = unseen[best].tolist()

    return R_u_k

In [5]:
def baseline_estimator(R, N, delta=0.01, lambda_1=0.02, verbose=False, plot=False):
    """Estimate Baseline model parameters by gradient descent.

    Starting from random deviations, each iteration evaluates
    ``gradient_J`` once, moves both deviation vectors by ``-delta`` times
    the gradient, and scores the new parameters with ``rmse``. The
    parameters of the iteration with the lowest score are returned.

    Parameters:
        R (coo_matrix): Rating matrix (users x movies).
        N (int): Number of iterations (must be at least 1).
        delta (float): Learning rate.
        lambda_1 (float): Penalty coefficient.
        verbose (boolean): Print number of the current iteration at execution.
        plot (boolean): Plot the error track if True.

    Returns:
        B_i_opt (1-D array): Optimal observed deviation for each movie.
        B_u_opt (1-D array): Optimal observed deviation for each user.

    Raises:
        ValueError: If N is smaller than 1.

    """
    if N < 1:
        raise ValueError("N must be at least 1, got {}".format(N))

    # Weight initialization (users first, then movies, so that a seeded
    # random generator yields the same starting point as before)
    B_u = 2.5 * np.random.rand(R.shape[0])
    B_i = 2.5 * np.random.rand(R.shape[1])

    # Error log, one value per iteration
    mse_track = np.zeros(N)

    # Calculate overall average rating value from the stored entries
    mu = R.data.mean()

    # Convert R into a csr_matrix
    R = R.tocsr()

    # Only the best parameters seen so far are kept: storing every
    # iteration's vectors would cost N * (users + movies) floats.
    idx_min = 0
    B_i_opt, B_u_opt = B_i, B_u

    # Gradient descent on N iterations
    for i in range(N):
        if verbose:
            print("Iteration "+ str(i))

        # One descent step; the updates create new arrays, so references
        # kept in B_i_opt / B_u_opt are never modified afterwards.
        grad_i, grad_u = gradient_J(R, B_i, B_u, mu, lambda_1)
        B_i = B_i - delta * grad_i
        B_u = B_u - delta * grad_u

        # Saving error values
        mse_track[i] = rmse(R, B_i, B_u, mu)

        # Strict comparison keeps the earliest iteration on ties; a NaN
        # score (diverged run) never replaces an earlier finite best.
        if i == 0 or mse_track[i] < mse_track[idx_min]:
            idx_min = i
            B_i_opt, B_u_opt = B_i, B_u

    # Plotting the error track if plot is True
    if plot:
        mse_track = np.round(mse_track, 3)
        plt.figure(figsize = (20,8))
        # NOTE: this changes the global matplotlib default, not just this figure.
        plt.rcParams['axes.facecolor'] = 'orange'
        plt.grid(c='white')
        plt.plot(np.arange(0,N), mse_track, 'blue')
        plt.xlabel('Iteration')
        plt.ylabel('MSE')
        # NOTE(review): the title reports the iteration 1-based while the
        # x axis and the vertical marker are 0-based -- confirm which is wanted.
        plt.title('Minimum MSE = {:.4f} reached at iteration {}'.format(min(mse_track),idx_min+1), fontsize = 15)
        plt.axvline(x = idx_min, color = 'green')

    return B_i_opt, B_u_opt

# Dummy example

In [6]:
from scipy.sparse import coo_matrix
# Small hand-written rating matrix: 8 users (rows) x 7 movies (columns).
# A 0 stands for "not rated"; coo_matrix only stores the non-zero entries.
R = coo_matrix(np.array([
                [0, 3, 0, 4, 0, 5, 2],
                [1, 0, 0, 2, 1, 2, 0],
                [5, 2, 4, 5, 0, 0, 3],
                [3, 0, 0, 3, 5, 5, 1],
                [1, 0, 5, 0, 1, 1, 0],
                [2, 4, 1, 4, 2, 3, 0],
                [0, 0, 0, 2, 0, 0, 4],
                [1, 4, 4, 3, 3, 0, 2]]
))

# Fit the movie / user deviations on the toy matrix and plot the error track.
B_i_opt, B_u_opt = baseline_estimator(R, N=1000, delta=0.01, lambda_1=0.02, verbose=False, plot=True)

In [7]:
# Rebuild the dense matrix of estimated ratings from the fitted deviations;
# mu is the mean of the stored ratings of R.
R_estimate = recreate_matrix(B_i_opt, B_u_opt, mu=R.data.mean())
# Bare name: the notebook displays the value.
R_estimate

array([[3.00329394, 3.55543772, 4.2134912 , 3.76795962, 3.46780058,
        4.13773956, 2.4869375 ],
       [0.93743666, 1.48958043, 2.14763391, 1.70210234, 1.40194329,
        2.07188228, 0.42108021],
       [3.37768985, 3.92983362, 4.5878871 , 4.14235553, 3.84219648,
        4.51213546, 2.8613334 ],
       [3.01757173, 3.5697155 , 4.22776898, 3.78223741, 3.48207836,
        4.15201734, 2.50121528],
       [1.31843209, 1.87057587, 2.52862935, 2.08309777, 1.78293873,
        2.45287771, 0.80207565],
       [1.98683564, 2.53897941, 3.19703289, 2.75150132, 2.45134227,
        3.12128125, 1.47047919],
       [2.86416236, 3.41630613, 4.07435961, 3.62882804, 3.32866899,
        3.99860797, 2.34780591],
       [2.41997242, 2.97211619, 3.63016968, 3.1846381 , 2.88447905,
        3.55441804, 1.90361597]])

In [8]:
# Show the original ratings as a dense array, for comparison with R_estimate.
R.toarray()

array([[0, 3, 0, 4, 0, 5, 2],
       [1, 0, 0, 2, 1, 2, 0],
       [5, 2, 4, 5, 0, 0, 3],
       [3, 0, 0, 3, 5, 5, 1],
       [1, 0, 5, 0, 1, 1, 0],
       [2, 4, 1, 4, 2, 3, 0],
       [0, 0, 0, 2, 0, 0, 4],
       [1, 4, 4, 3, 3, 0, 2]])

In [20]:
# Indices of the 2 best-estimated movies that user 1 has not rated yet.
recommend_k_movies(R, B_i_opt, B_u_opt, mu=R.data.mean(), u = 1, k = 2)

[2, 1]

# Real application

In [9]:
# Mount Google Drive under /content/drive. The google.colab module is only
# importable inside a Colab runtime; elsewhere this cell fails on the import.
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

***Running this part takes a long time on Colab!***

In [None]:
from scipy.sparse import save_npz, load_npz, coo_matrix

def load_matrix(
    M_name,
    base_dir="/content/drive/My Drive/Factorisation matricielle - dataset/training_set_csv/",
):
    """Load a previously saved sparse matrix M.

    The file read is ``<base_dir>/<M_name>.npz``.

    Parameters:
        M_name (string): Matrix name as saved in drive (R, T or D).
        base_dir (string): Folder containing the ``.npz`` files. Defaults
            to the dataset folder on the mounted Google Drive.

    Returns:
        M: Saved sparse matrix, in the format it was stored with by
            ``save_npz``.

    """
    # Accept the folder with or without a trailing slash.
    folder = base_dir if base_dir.endswith("/") else base_dir + "/"

    M = load_npz(folder + M_name + ".npz")

    return M

In [None]:
# Load the saved rating matrix "R" from the mounted drive; it replaces the
# toy matrix R defined in the example above.

R = load_matrix("R")

In [None]:
import datetime
start = datetime.datetime.now()

# Fit the deviations on the loaded matrix and time the run.
# NOTE(review): this call used to pass `T`, a name that is never defined in
# this notebook. `R` is the only matrix loaded, and it is also the one whose
# mean is used when the ratings are rebuilt. If a separate training matrix
# was intended, load it first with load_matrix("T") and pass it here.
B_i_opt, B_u_opt = baseline_estimator(R, N=500, delta=0.01, lambda_1=0.02, verbose=False, plot=True)

print("Parameters estimated in : {}".format(datetime.datetime.now() - start))

In [None]:
import datetime
start = datetime.datetime.now()

# Rebuild the dense matrix of estimated ratings and time the operation.
R_estimate = recreate_matrix(B_i_opt, B_u_opt, mu=R.data.mean())

# The message now names the step actually timed (it was copied from the
# parameter-estimation cell).
print("Matrix recreated in : {}".format(datetime.datetime.now() - start))