In [69]:
import pandas as pd
import numpy as np
np.random.seed(0)
#read in the data
data = pd.read_csv("../data/ml-latest-small/ratings.csv")

#get the necessary dimensions
n_users = data['userId'].nunique()
n_movies = data['movieId'].nunique()

In [70]:
#split the data into different bins for the temporal function
def get_bin(num):
    #subtracts 25 because they started collecting the data 25 years after january 1970
    return int((num)/(60*60*24*365)) - 25
data["bin"] = data["timestamp"].apply(get_bin)
data_groups = data.groupby("bin")

In [71]:
#create R matrix
n_users = data.userId.unique().shape[0]
n_items = data.movieId.unique().shape[0]
movieIds = sorted(data.movieId.unique())

R = np.zeros((n_users, n_items))
bin_match = np.zeros((n_users, n_items)) #stored the bin number for each user movie pair
for row in data.itertuples():
    #print(row)
    R[row[1]-1, movieIds.index(row[2])] = row[3]
    bin_match[row[1]-1, movieIds.index(row[2])] = row[5]

In [72]:
#get initial B_i_bin_t dimensions: n_bins by n_movies
movieIds = sorted(data.movieId.unique())
B_i_bint = np.zeros((data['bin'].nunique(),n_movies))
for i in range(len(B_i_bint)):
    sub_frame = data[data.bin==i+1]
    #for movie in movieId, then access 
    count = 0
    total = 0
    for j in range(len(movieIds)):
        r = sub_frame['rating'][sub_frame['movieId']==movieIds[j]].values
        B_i_bint[i,j] = (r.sum())/((r!=0).sum()).astype(float)
B_i_bint  = B_i_bint - (R.sum()/(R!=0).sum().astype(float))
B_i_bint[np.isnan(B_i_bint)] = 0   
    

  # This is added back by InteractiveShellApp.init_path()


In [73]:
def get_error(R, R_hat):
    #only for the that had ratings in the training matrix
    mask = (R>0)
    R_hat[mask] = 0 #set the values we don't need predictions for to 0 
    return np.sqrt(((R - R_hat)**2).mean()) #return the RMSE

def predict(U, M, mu,b_i, b_u, bin_match ,B_i_bint):
    R = np.zeros((len(U),len(M)))
    for u in range(len(U)):#iterate over users
        for i in range(len(M)):#iterate over movies
            bin_num = int(bin_match[u,i]-1) #get the bin number for user u and movie i
            R[u,i] = mu+b_u[u]+b_i[i]+B_i_bint[bin_num,i]+np.dot(U[u,:].T,M[i,:])
    return R
            
def train(Mat, bin_match, f, Lambda, n_iter, B_i_bint):
    R = Mat
    # Step 1 Initialize matrix M by assigning the average rating for that movie as the first row, and small random numbers for the remaining entries.
    # set the first row of M as the average ratings of each movie(Probably from just the training matrix)
    M = 5 *np.random.rand(R.shape[1], f)
    M[:,0] = R.sum(0)/(R!=0).sum(0).astype(float)
    
    U = 5 *np.random.rand(R.shape[0], f)#initialize U matrix
    
    mu = R.sum()/(R!=0).sum().astype(float)#average rating
    b_i = (R.sum(0)/(R!=0).sum(0).astype(float)) - mu #initialize b_i = len(movies)
    b_u = (R.sum(1)/(R!=0).sum(1).astype(float)) - mu #initialize b_u = len(users)
    
    #function to minimize: sum(r_ui - ^r_ui) + lambda(b_i^2 + b_u^2+ |q|^2 + |p|^2 )
    for epoch in range(n_iter):
        #Step 2 Fix M, Solve U by minimizing the objective function
        for i in range(len(U)):
            I_i = np.nonzero(R[i,:])[0] #set of movies that user i rated
            n_ui = len(I_i) #number of ratings user i has given matrix 
            M_Ii = M[I_i,:] #denotes the sub-matrix of M where rows j in I_i are selected
            E = np.identity(f)
            R_i_I = R[i,I_i]
            U[i,:] = np.dot(np.linalg.inv(np.dot(M_Ii.T,M_Ii) + (Lambda*n_ui*E)),np.dot(R_i_I,M_Ii))

        #Step 3 Fix U, solve M by minimizing the objective function similarly; 
        for j in range(len(M)):
            I_j = np.nonzero(R[:,j])[0] #set of users that have rated movie j
            n_mj = len(I_j) #number of ratings movie j has received 
            U_Ij = U[I_j,:] #denotes the sub-matrix of U where rows i in I_j are selected
            R_j_I = R[I_j, j]
            M[j,:] = np.dot(np.linalg.inv(np.dot(U_Ij.T,U_Ij) + (Lambda*n_mj*E)),np.dot(R_j_I,U_Ij))
        
        #get predictions
        R_hat = predict(U, M, mu,b_i, b_u, bin_match ,B_i_bint)
    
        #get the error
        error = get_error(R_hat, R)
        print(error)
        
        b_i = b_i +  
        #update all the biases
        
    #Step 4 Repeat Steps 2 and 3 until a stopping criterion is satisfied.
    return R, U, M, b_u, b_i, B_i_bint

In [75]:
A,B,C,D,E,F = train(R, B, 30, 0.2, 50, B_i_bint)




LinAlgError: Singular matrix