In [10]:
import pandas as pd
import numpy as np

# Load the ratings table from disk.
data = pd.read_csv("../data/ml-latest-small/ratings.csv")

# Matrix dimensions: the number of distinct ids in each id column.
n_users, n_movies = (data[column].nunique() for column in ("userId", "movieId"))

In [24]:
#split the data into different bins for the temporal function
def get_bin(num):
    """Convert a timestamp in seconds into a year-sized bin index.

    The timestamp is divided by the number of seconds in a 365-day year,
    truncated toward zero with ``int`` and then shifted down by 25.
    Per the original author's note, the shift is there because data
    collection started 25 years after January 1970.
    """
    seconds_per_year = 60 * 60 * 24 * 365
    whole_years = int(num / seconds_per_year)
    return whole_years - 25
# Tag every rating with the bin of its timestamp (element-wise get_bin).
data["bin"] = data["timestamp"].map(get_bin)

In [60]:
#R matrix
# Ids are translated to matrix positions through dictionaries: each lookup is
# O(1) (the previous list.index() call scanned the whole movie list for every
# rating) and user ids no longer have to be exactly 1..n_users.
user_ids = sorted(data.userId.unique())
movieId = sorted(data.movieId.unique())
n_users = len(user_ids)
n_items = len(movieId)
user_pos = {uid: pos for pos, uid in enumerate(user_ids)}
movie_pos = {mid: pos for pos, mid in enumerate(movieId)}

R = np.zeros((n_users, n_items)) #rating for each user movie pair, 0 where there is none
B = np.zeros((n_users, n_items)) #stores the bin number for each user movie pair
# Columns are read by name, so the result does not depend on column order.
for uid, mid, rating, time_bin in zip(data["userId"], data["movieId"],
                                      data["rating"], data["bin"]):
    u, m = user_pos[uid], movie_pos[mid]
    R[u, m] = rating
    B[u, m] = time_bin

    

In [137]:
# Seed NumPy's global random generator.
np.random.seed(0)


def get_error(U, M, R, b_i, b_u, Bi_bin_t):
    """Return the residual R - np.dot(U, M.T).

    b_i, b_u and Bi_bin_t are accepted but currently ignored.
    """
    # TODO: modify this to include the biases
    predicted = np.dot(U, M.T)
    residual = R - predicted
    return residual

def train(Mat, B, f, Lambda, n_iter):
    """Factorise a rating matrix by alternating least squares and return the predictions.

    Parameters
    ----------
    Mat : 2-D array, users x movies. A value of 0 means "not rated".
    B : per-rating bin matrix. Currently unused (the bias / temporal terms are
        not implemented yet); kept so existing callers keep working.
    f : number of latent factors, i.e. columns of the user matrix U and the
        movie matrix M.
    Lambda : regularisation weight. Each ridge term is scaled by the number of
        ratings the user (or movie) has.
    n_iter : number of alternating passes (solve U, then solve M).

    Returns
    -------
    2-D array with the same shape as Mat holding np.dot(U, M.T).

    Notes
    -----
    Users or movies without any rating have no least-squares system to solve.
    Their factor rows are set to zero and skipped, so their predictions are 0
    instead of raising a singular-matrix error.
    Random initial values come from NumPy's global generator (np.random).
    """
    R = np.asarray(Mat, dtype=float)
    n_users, n_items = R.shape
    rated = R != 0

    # Step 1: initialise M with random numbers, then overwrite its first
    # column with each movie's average rating (only where the movie has
    # ratings, which avoids a 0/0 division).
    M = 5 * np.random.rand(n_items, f)
    item_counts = rated.sum(0)
    has_raters = item_counts > 0
    M[has_raters, 0] = R.sum(0)[has_raters] / item_counts[has_raters]

    # initialize U matrix
    U = 5 * np.random.rand(n_users, f)

    # Rows that can never be fitted carry no information: predict 0 for them.
    U[rated.sum(1) == 0, :] = 0.0
    M[~has_raters, :] = 0.0

    # Loop-invariant pieces: the identity used by the ridge term and the
    # index sets of observed ratings (R never changes between epochs).
    E = np.identity(f)
    user_items = [np.nonzero(rated[i, :])[0] for i in range(n_users)]
    item_users = [np.nonzero(rated[:, j])[0] for j in range(n_items)]

    for _ in range(n_iter):
        # Step 2: fix M, solve every user's regularised least-squares problem.
        for i, I_i in enumerate(user_items):
            n_ui = len(I_i)  # number of ratings user i has given
            if n_ui == 0:
                continue
            M_Ii = M[I_i, :]  # factor rows of the movies user i rated
            A = np.dot(M_Ii.T, M_Ii) + Lambda * n_ui * E
            # solve() is used instead of forming the explicit inverse.
            U[i, :] = np.linalg.solve(A, np.dot(R[i, I_i], M_Ii))

        # Step 3: fix U, solve every movie's problem the same way.
        for j, I_j in enumerate(item_users):
            n_mj = len(I_j)  # number of ratings movie j has received
            if n_mj == 0:
                continue
            U_Ij = U[I_j, :]  # factor rows of the users who rated movie j
            A = np.dot(U_Ij.T, U_Ij) + Lambda * n_mj * E
            M[j, :] = np.linalg.solve(A, np.dot(R[I_j, j], U_Ij))

    # Step 4: the stopping criterion is simply the fixed number of epochs.
    return np.dot(U, M.T)

In [138]:
# Fit with 10 factors, Lambda = 0.2 and 100 epochs; the returned matrix is the cell output.
train(R, B, f=10, Lambda=0.2, n_iter=100)


array([[4.51342272, 3.96331536, 3.76936356, ..., 3.56748037, 3.56748037,
        4.34488352],
       [3.6342106 , 3.26164648, 3.14776617, ..., 3.03733029, 3.03733029,
        3.48877487],
       [1.32305573, 1.37847761, 1.22442083, ..., 1.56731623, 1.56731623,
        1.66472635],
       ...,
       [3.29457954, 2.93800857, 2.75426003, ..., 2.75748768, 2.75748768,
        3.32306046],
       [3.38624005, 3.04959075, 2.93000389, ..., 2.7133109 , 2.7133109 ,
        3.18036299],
       [3.87298509, 3.37473961, 3.18324419, ..., 3.2185075 , 3.2185075 ,
        3.91974041]])