In [1]:
import numpy as np
import pandas as pd
import webbrowser
import os
from scipy.optimize import fmin_cg

In [2]:
def normalize_ratings(ratings):
    """
    Center each product's ratings by removing that product's mean rating.

    Missing ratings (NaN) are ignored when the means are computed and stay
    NaN in the centered result.

    :param ratings: 2d array of user ratings (one column per product)
    :return: (centered ratings array, per-product mean ratings)
    """
    product_means = np.nanmean(ratings, axis=0)
    centered = ratings - product_means
    return centered, product_means

In [3]:
def cost(X, *args):
    """
    Regularized squared-error cost for low rank matrix factorization.

    :param X: The matrices being factored (P and Q) rolled up as one flat array,
              P's entries first, then Q's
    :param args: (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The cost for the P and Q encoded in X
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Split the flat parameter vector back into the two factor matrices
    split_at = num_users * num_features
    user_features = X[:split_at].reshape(num_users, num_features)
    product_features = X[split_at:].reshape(num_products, num_features)

    # Prediction error, counted only where the mask is set
    masked_error = mask * (np.dot(user_features, product_features.T) - ratings)
    error_term = np.sum(np.square(masked_error)) / 2

    # L2 penalty on every entry of both factor matrices
    half_lambda = regularization_amount / 2.0
    product_penalty = half_lambda * np.sum(np.square(product_features))
    user_penalty = half_lambda * np.sum(np.square(user_features))

    return error_term + product_penalty + user_penalty


In [4]:
def gradient(X, *args):
    """
    Calculate the gradient of the factorization cost with respect to P and Q.

    :param X: The matrices being factored (P and Q) rolled up as one flat array,
              P's entries first, then Q's
    :param args: (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The gradients for P and Q rolled up into one flat array, laid out like X
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P (num_users x num_features) and Q (num_products x num_features)
    split_at = num_users * num_features
    P = X[:split_at].reshape(num_users, num_features)
    Q = X[split_at:].reshape(num_products, num_features)

    # The masked prediction error feeds both gradients, so compute it once
    # instead of repeating the P.Q^T product for each of them.
    masked_error = mask * (np.dot(P, Q.T) - ratings)

    # Data term plus the L2 regularization term for each factor matrix
    P_grad = np.dot(masked_error, Q) + (regularization_amount * P)
    Q_grad = np.dot(masked_error.T, P) + (regularization_amount * Q)

    # Return the gradients as one rolled-up array as expected by fmin_cg
    return np.concatenate((P_grad.ravel(), Q_grad.ravel()))

In [17]:
def low_rank_matrix_factorization(ratings, mask=None, num_features=15, regularization_amount=0.01,
                                  maxiter=1000, seed=0):
    """
    Factor a ratings array into two latent feature arrays (user features and product features)

    The initial guess is drawn from a private random generator, so calling this
    function does not reseed or advance NumPy's global random state. With the
    default seed of 0 the starting values are the same ones produced by seeding
    the global generator with 0.

    :param ratings: Matrix with user ratings to factor (num_users x num_products)
    :param mask: A binary mask of which ratings are present in the ratings array to factor.
                 If omitted, every non-NaN entry of ratings is treated as present.
    :param num_features: Number of latent features to generate for users and products
    :param regularization_amount: How much regularization to apply
    :param maxiter: Maximum number of conjugate-gradient iterations passed to fmin_cg
    :param seed: Seed for the random initial values of P and Q
    :return: (P, Q) - the factored latent feature arrays; P is num_users x num_features and
             Q is num_features x num_products, so np.dot(P, Q) has the shape of ratings
    """
    num_users, num_products = ratings.shape

    # If no mask is provided, consider all 'NaN' elements as missing and create a mask.
    if mask is None:
        mask = ~np.isnan(ratings)

    # Replace NaN values with zero
    ratings = np.nan_to_num(ratings)

    # Create P and Q and fill with random numbers to start
    rng = np.random.RandomState(seed)
    P = rng.randn(num_users, num_features)
    Q = rng.randn(num_products, num_features)

    # Roll up P and Q into a contiguous array as fmin_cg expects
    initial = np.concatenate((P.ravel(), Q.ravel()))

    # Create an args tuple as fmin_cg expects
    args = (num_users, num_products, num_features, ratings, mask, regularization_amount)

    # Call fmin_cg to minimize the cost function and thus find the best values for P and Q
    X = fmin_cg(cost, initial, fprime=gradient, args=args, maxiter=maxiter)

    # Unroll the new P and new Q arrays out of the contiguous array returned by fmin_cg
    split_at = num_users * num_features
    nP = X[:split_at].reshape(num_users, num_features)
    nQ = X[split_at:].reshape(num_products, num_features)

    # Q is handed back transposed so callers can multiply P by it directly
    return nP, nQ.T

In [6]:
def RMSE(real, predicted):
    """
    Root mean squared error between real ratings and predicted ratings.

    Positions where the difference is NaN (e.g. missing real ratings) are left
    out of the average.

    :param real: A matrix containing the real ratings (with 'NaN' for any missing elements)
    :param predicted: A matrix of predictions
    :return: The RMSE as a float
    """
    squared_errors = np.square(real - predicted)
    mean_squared_error = np.nanmean(squared_errors)
    return np.sqrt(mean_squared_error)

In [7]:
# Load the raw ratings table and the movies table (indexed by movie_id) with pandas
raw_dataset_df = pd.read_csv("movie_ratings_data_set.csv")
movies = pd.read_csv("movies.csv", index_col="movie_id")

# Preview the first ten rating rows
raw_dataset_df.head(10)

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4
3,1,1,4
4,1,14,4
5,1,13,5
6,2,2,5
7,2,15,4
8,2,1,5
9,2,21,5


In [9]:
# Reshape the long-format ratings into a table with one row per user_id and one
# column per movie_id; user/movie pairs with no rating are left as NaN, and
# repeated pairs keep their maximum value.
# The aggregation is given by name ("max") rather than as np.max, which newer
# pandas versions deprecate for this purpose.
ratings_df = pd.pivot_table(raw_dataset_df,
                            index='user_id',
                            columns='movie_id',
                            aggfunc='max')
ratings_df.head(10)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,,,,,,,4.0,,...,,4.0,,4.0,,,,,,
2,5.0,5.0,,,,,,,,,...,,,,,,,3.0,,,4.0
3,4.0,4.0,5.0,,,,,,,,...,,,,,,,,,,
4,5.0,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,3.0,2.0,5.0,5.0
6,5.0,5.0,,,,,,,,,...,,,,,,,2.0,5.0,4.0,3.0
7,5.0,,,2.0,,,,,,,...,,,,,,,,,,4.0
8,4.0,,5.0,,,,,,,5.0,...,,,5.0,5.0,,,,,,
9,5.0,,5.0,,,,,,,,...,,,,5.0,4.0,,,,,
10,4.0,,4.0,,,,,4.0,,,...,,,,,5.0,,,,,


In [20]:
# Factor the ratings into a U matrix for the users and an M matrix for the movies.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same NumPy array.
U, M = low_rank_matrix_factorization(ratings_df.values,
                                     num_features=15,
                                     regularization_amount=0.1)

M

         Current function value: 32.509226
         Iterations: 1000
         Function evaluations: 1494
         Gradient evaluations: 1494


array([[ 8.23378933e-01,  8.44298813e-01,  4.41732467e-01,
         5.66855851e-01,  1.00800056e+00,  4.98468176e-01,
        -1.12273373e-03,  7.59546930e-01,  4.40341883e-01,
         5.96865459e-01,  6.73493101e-01,  6.02954717e-01,
         5.43164022e-01,  7.50460173e-01,  1.07179707e-01,
         1.07115461e+00, -3.62995928e-01,  1.16787727e+00,
         1.11159026e-01, -2.83550193e-01,  1.30990702e+00,
         5.82329364e-01, -4.44444028e-02,  3.90094409e-01,
         1.32751402e+00,  1.00776932e+00,  1.58254895e-01,
         7.29444209e-01,  3.46367790e-01,  6.06658118e-01,
         3.13340258e-01, -2.18148882e-02, -6.43624387e-02,
         7.45076967e-01],
       [-8.16095481e-01, -3.83432673e-01, -1.19741638e+00,
        -4.57336067e-01, -9.53470060e-01, -8.15923972e-01,
        -5.48154425e-02, -9.67467750e-01, -5.83027698e-01,
        -6.00804465e-01, -2.19882798e-02, -6.43606221e-01,
        -9.48753703e-01, -6.78797387e-01, -1.10876740e+00,
        -2.84003256e-01, -7.63

In [21]:
# Use NumPy's matmul (a true matrix product, not the element-wise `*` operator)
# to multiply U by M and get a predicted rating for every user/movie pair
predicted_ratings = np.matmul(U, M)

In [22]:
# Wrap the predictions in a DataFrame that reuses the row index and column
# labels of ratings_df, so both tables line up cell for cell
predicted_ratings_df = pd.DataFrame(
    data=predicted_ratings,
    index=ratings_df.index,
    columns=ratings_df.columns,
)

In [23]:
# Preview the first ten rows of the predicted ratings
predicted_ratings_df.head(10)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.002811,3.914904,4.155675,3.542763,3.92753,4.154358,2.624725,4.100723,4.010188,3.809425,...,3.794577,3.984966,3.349802,3.990724,3.524186,2.97222,2.56578,3.297513,3.150224,4.457186
2,4.970294,4.968506,4.441134,3.893871,4.865342,4.701697,3.382183,4.74205,4.544581,4.832425,...,4.342834,4.492776,4.426322,4.200323,3.471386,4.406693,2.994058,4.333038,3.960352,3.999625
3,4.016865,3.984347,4.956194,4.128198,4.538604,4.277047,2.812225,4.377018,4.491508,4.362601,...,4.081162,4.313134,3.911173,4.477426,3.237622,3.110892,1.972839,2.49784,3.403797,4.501351
4,4.989817,4.987847,4.956342,4.962324,5.010878,4.971985,3.878575,5.086879,4.9532,5.05391,...,4.078907,4.748125,4.495332,4.918242,3.103673,3.601891,2.05729,4.182979,4.162926,4.622219
5,4.980654,4.19691,5.411997,4.272397,5.181419,4.492897,3.233642,4.6041,4.98169,4.939415,...,4.475328,4.672454,3.741458,5.073483,3.01743,3.774006,2.996459,2.031604,4.980801,4.987608
6,4.971338,4.964164,4.189038,3.869993,4.511525,4.55111,3.522515,4.723966,4.308157,4.613336,...,4.039826,4.220929,4.739673,3.86089,3.507975,3.8387,2.010703,4.97927,4.00077,3.017148
7,4.978836,4.178918,4.175565,2.041438,4.158906,4.289204,2.825275,4.124074,4.003952,4.08183,...,4.593186,3.959423,3.81045,3.609623,5.27469,4.388624,4.427768,4.674718,4.917094,4.002547
8,4.032198,4.64279,4.988807,4.132703,4.828575,4.388166,3.374843,4.480751,4.884792,5.001875,...,4.272206,4.686852,4.961121,4.964297,3.378273,3.411152,2.115965,3.152195,3.819875,4.479939
9,4.986024,4.657508,4.997741,4.14864,4.850753,4.667128,3.342402,4.813473,4.779158,4.756287,...,4.427131,4.744827,4.054766,4.975313,3.997969,3.524436,2.667715,3.767727,4.443682,4.917949
10,4.00899,3.966851,3.999916,2.503546,3.903739,4.212549,2.766617,4.019975,4.007456,3.879311,...,3.600048,3.550093,3.915929,3.250126,4.998323,4.676766,2.62561,4.506833,3.476676,3.862124
