## Creating an Explicit Latent Matrix Factorization Recommender System for Books10k Dataset

In [42]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
import time

First we load the data set:






In [43]:
#insert your path to ratings.csv here:
path = '/home/robot-tumas/Desktop/projects/python/recSysTutorial/datasets/goodBooks10k/'
ratingsDataMatrix = pd.read_csv(path+'ratings.csv',engine='python')

In [44]:
#we should  drop duplicates to clean the data:
ratingsDataMatrix = ratingsDataMatrix.drop_duplicates()
#lets look at some of the ratings:
print('we have ',ratingsDataMatrix.shape[0], 'observations and ', ratingsDataMatrix.shape[1], 'columns')
print('the columns are:', ratingsDataMatrix.columns)
print('the number of unique users we have is:', len(ratingsDataMatrix.user_id.unique()))
print('the number of unique books we have is:', len(ratingsDataMatrix.book_id.unique()))
print("The median user rated 8 books. We'll look at users that have rated at least 10 books.",ratingsDataMatrix.user_id.value_counts().median())


we have  980112 observations and  3 columns
the columns are: Index(['book_id', 'user_id', 'rating'], dtype='object')
the number of unique users we have is: 53424
the number of unique books we have is: 10000
The median user rated 8 books. We'll look at users that have rated at least 10 books. 8.0


In [45]:
ratingsDataMatrix = ratingsDataMatrix.pivot_table(index = 'user_id', columns = 'book_id', values = 'rating')

We clean the data, remove all users who have rated less than 10 books, and then fill all the nans with zeros to prepare for matrix factorization. 

In [46]:
userReviewCount = ratingsDataMatrix.count(axis=1)
itemReviewCount = ratingsDataMatrix.count(axis=0)

threshold=20
usersToKeep = userReviewCount[userReviewCount >= threshold]
itemsToKeep = itemReviewCount[itemReviewCount >= threshold]
print('users kept:%d'%usersToKeep.shape[0] + "\n items kept: %d "%itemsToKeep.shape[0])
print("by limiting users, we keep {0:.3%},of the total user base.".format(usersToKeep.shape[0]/userReviewCount.shape[0]))
print("by limiting items, we keep {0:.3%},of the total user base.".format(itemsToKeep.shape[0]/itemReviewCount.shape[0]))

#print(f'Only {"{:.2%}".format(usersToKeep.shape[0]/userReviewCount.shape[0])

ratingsDataMatrix=ratingsDataMatrix.loc[usersToKeep.index, :]
ratingsDataMatrix=ratingsDataMatrix.loc[:,itemsToKeep.index]

ratingsDataMatrix.fillna(0,inplace=True)

users kept:14612
 items kept: 9998 
by limiting users, we keep 27.351%,of the total user base.
by limiting items, we keep 99.980%,of the total user base.


In [47]:
#from scipy we can get a set up a sparse coordinate matrix with the ratings and users. 
ratingsMatrix = coo_matrix((ratingsDataMatrix.values)) # rows are books, columns are users


In [48]:
ratingsMatrix

<14612x9998 sparse matrix of type '<class 'numpy.float64'>'
	with 720208 stored elements in COOrdinate format>

## we are now ready for FunkSVD. This is where we approximate Singular Value decomposition using stochastic gradient descent. Lets first define our error function then use it to derive the algorithm.


In [74]:
def evaluate(Rating,P,Q,lambd=0.05):#error to minimize
    values = Rating.data
    row = Rating.row
    col = Rating.col
    error=0
    print(len(values))
    for ui in range(len(values)): #iterate through rating for user u, item i
        r_ui = values[ui]
        pu = row[ui]
        qi = col[ui]
        
        #now we add regularization:
        mean = np.mean(values) #global average of books
        uBias = np.mean(P[pu,:]) #average rating of user
        iBias = np.mean(Q[:,qi]) #average rating of book
        biasUI = np.add(iBias, np.add(mean,uBias)) #total bias for user u, item i
        
        if r_ui > 0: #i.e. if rating makes sense(sometimes latent factorization gets weird)
            est_r_ui = np.matmul(P[pu,:],Q[:,qi] ) + biasUI
            terms = [uBias, iBias, np.linalg.norm(P[pu,:],2),np.linalg.norm(Q[:,qi],2)]
            error = error + (r_ui -  est_r_ui)**2 + lambd * np.sum(np.power(terms,2))
    return error

In [77]:
def SGDoptimizer(Rating,k=10,lambd=0.05,epochs=10, alpha=0.005, verbose=True):
    M,N = Rating.shape
    
    nRatings = len(Rating.data)
    row = Rating.row
    col = Rating.col   
    P = np.random.normal(0,.1,(M,k))
    Q = np.random.normal(0,.1,(k,N))
    print(nRatings)
    rmse = np.sqrt(np.divide(SSError(Rating,P,Q,lambd),nRatings))
    print('initial rsme:',rsme)
    for epoch in range(epochs):
        for ui in range(nRatings):
            r_ui = values[ui]
            pu = row[ui]
            qi = col[ui] # item i, used in context to index matrix Q
            mean = np.mean(values) #global average of books
            uBias = np.mean(P[pu,:]) #average rating of user
            iBias = np.mean(Q[:,qi]) #average rating of book
            biasUI = np.add(iBias, np.add(mean,uBias)) #total bias for user u, item i
            
            #get residuals
            pred_r_ui = np.dot(P[pu,:],Q[:,qi]) + biasUI
            resdiual = r_ui - pred_r_ui

            #update P,Q
            P[pu,:] = P[pu,:] + alpha * np.subtract(residual * Q[:,qi],lambd * P[pu,:] ) 
            Q[:,qi] = Q[:,qi] +alpha * np.substract(residual * P[pu,:],lambd * Q[:,qi])
        
        rsme = np.sqrt(np.divide(SSError(Rating,P,Q,lambd),nRatings))
        if verbose:
            print("epoch :"+(epoch+1)+" ----Current RSME: %.2f"%rsme)
        
    if verbose:
        print("epoch :"+(epoch+1)+" ----Current RSME: %.2f"%rsme)
    
    return P,Q,rsme
            
    

In [78]:
starttime = time.time()
P,Q,rsme = SGDoptimizer(ratingsMatrix)
duration = time.time() - starttime
print("Process time: duration: %.2f"%duration)

720208
720208


UnboundLocalError: local variable 'rsme' referenced before assignment

In [60]:
starttime = time.time()

M,N = ratingsMatrix.shape
P = np.random.rand(M,5)
Q = np.random.rand(5,N)
SSError(ratingsMatrix,P,Q)

duration = time.time() - starttime
print("Process time: duration: %.2f"%duration)

720208


IndexError: index 584 is out of bounds for axis 0 with size 5

array([[0., 0., 0., ..., 0., 0., 0.]])