In [3]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the three '::'-delimited tables from the ml-1m directory.
# A separator longer than one character can only be handled by pandas' Python
# parsing engine; requesting it explicitly on every read avoids the
# ParserWarning that the implicit fallback emits and keeps the three calls
# consistent with each other.
users = pd.read_table('ml-1m/users.dat',  sep = '::', engine='python', header = None, names= ['ID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
ratings = pd.read_table('ml-1m/ratings.dat',  sep = '::', engine='python', header = None, names= ['UserID', 'MovieID', 'Rating', 'Timestamp'])
movies = pd.read_table('ml-1m/movies.dat',  sep = '::', engine='python', encoding = "ISO-8859-1", header = None, names= ['ID', 'Title', 'Genre'])

In [4]:
def compute_RMSE(U,V, M):
    """Print and return the RMSE between M and the product U.V.

    Only entries of M that are not NaN contribute: squared residuals are
    summed with NaNs ignored and divided by the number of non-NaN entries
    of M.

    Parameters
    ----------
    U, V : arrays whose product np.dot(U, V) has the same shape as M.
    M : array in which NaN marks a blank (unobserved) entry.

    Returns
    -------
    The root-mean-square error as a float; it is also printed, followed by
    the label "RMSE".
    """
    residual = np.subtract(M, np.dot(U, V))
    squared_error_total = np.nansum(np.square(residual))
    observed_count = np.count_nonzero(~np.isnan(M))
    RMSE = np.sqrt(squared_error_total / observed_count)
    print(RMSE, "RMSE")
    return RMSE

def UV_dec(U, V, M, d):
    """Run one coordinate-descent sweep of the UV decomposition of M.

    Every element of U and then every element of V is replaced by the value
    that minimises the squared error over the non-NaN entries of M while all
    other elements are held fixed.

    Parameters
    ----------
    U : float array of shape (n_rows, d); updated in place.
    V : float array of shape (d, n_cols); updated in place.
    M : array of shape (n_rows, n_cols) in which NaN marks a blank entry.
    d : unused here (the latent dimension is taken from the shapes of U and
        V); kept so existing calls keep working.

    Returns
    -------
    (U, V) -- the same array objects that were passed in.

    Notes
    -----
    * A row/column of M with no observed entry is skipped, leaving the
      corresponding factor values untouched.
    * An element whose update would divide by zero (all relevant factor
      values are 0) is skipped as well; otherwise 0/0 would write NaN into
      the factor and poison every later update.
    """
    #U_r_s
    for r in range(U.shape[0]):
        # The observed columns of row r do not depend on s: compute them once.
        idx = np.argwhere(~np.isnan(M[r,:])).reshape(-1)
        if len(idx) == 0:
            continue
        m = M[r, idx]
        # V is not modified during the U-phase, so this slice stays valid.
        V_obs = V[:, idx]
        for s in range(U.shape[1]):
            v = V_obs[s]
            den = np.sum(np.square(v))
            if den == 0:
                continue
            # Prediction for the observed entries without the k == s term.
            p = np.dot(U[r,:], V_obs) - (U[r,s] * v)
            U[r,s] = np.sum(v*(m-p))/den
    #V_r_s
    # Columns are visited in the outer loop so the observed-row lookup is done
    # once per column. An update of V[r,s] only reads U and column s of V, and
    # within a column r still runs 0..d-1, so the result is the same as with
    # the loops nested the other way round.
    for s in range(V.shape[1]):
        idx = np.argwhere(~np.isnan(M[:,s])).reshape(-1)
        if len(idx) == 0:
            continue
        m = M[idx, s]
        # U is not modified during the V-phase, so this slice stays valid.
        U_obs = U[idx, :]
        for r in range(V.shape[0]):
            u = U_obs[:, r]
            den = np.sum(np.square(u))
            if den == 0:
                continue
            # Prediction for the observed entries without the k == r term.
            p = np.dot(U_obs, V[:,s]) - (u * V[r, s])
            V[r,s] = np.sum(u*(m-p))/den
    return U,V



In [8]:
# 5-fold cross-validation of the UV decomposition with d latent factors.
kf = KFold(n_splits = 5, random_state=42, shuffle = True)
d = 2

# Blanking a rating means writing NaN, which an integer column cannot hold;
# work on a copy whose "Rating" column is already float.
ratings_float = ratings.astype({"Rating": "float64"})
# KFold.split yields *positional* row indices, so they are applied with .iloc
# (position-based) rather than .loc (label-based). The two only coincide when
# the frame carries a default 0..n-1 index.
rating_col = ratings_float.columns.get_loc("Rating")

for trainid, testid in kf.split(ratings):
    # Training matrix: the held-out ratings are blanked out.
    train = ratings_float.copy()
    train.iloc[testid, rating_col] = np.nan
    u_m = train.pivot(index = "UserID", columns = "MovieID", values = "Rating")
    M = u_m.to_numpy()
    a = u_m.stack().mean(skipna = True)
    columns = list(u_m.columns)

    # Test matrix: the complement. Both frames keep every (UserID, MovieID)
    # pair of `ratings`, so M and M_test have the same shape and ordering.
    test = ratings_float.copy()
    test.iloc[trainid, rating_col] = np.nan
    u_m_test = test.pivot(index = "UserID", columns = "MovieID", values = "Rating")
    M_test = u_m_test.to_numpy()

    # Constant initialisation: every entry of U.V starts at d * (a/d) = a,
    # the mean of the observed training ratings.
    U = np.full([u_m.shape[0],d], np.sqrt(a/d))
    V = np.full([d,u_m.shape[1]], np.sqrt(a/d))
    compute_RMSE(U,V, M)
    for i in range(15):
        U, V = UV_dec(U,V,M,d)
        compute_RMSE(U,V,M)

    test_score = compute_RMSE(U,V, M_test)
    print(test_score, "test RMSE")

1.116443331346803 RMSE
0.9276663707078154 RMSE
0.9039406867687133 RMSE
0.9004163086982866 RMSE
0.8976807130379336 RMSE
0.8949125549374837 RMSE
0.8918709019133886 RMSE
0.888526885671501 RMSE
0.885004144455155 RMSE
0.8815340496576012 RMSE
0.8783308278713075 RMSE
0.875521125127221 RMSE
0.8731345239070177 RMSE
0.8711381864964292 RMSE
0.8694748428729406 RMSE
0.8680855414943004 RMSE
0.8945450115621454 RMSE
0.8945450115621454 test RMSE
1.1176293301815807 RMSE
0.9290564831501205 RMSE
0.9053447351410632 RMSE
0.9017972926953788 RMSE
0.8990470993916038 RMSE
0.8962661737557673 RMSE
0.8932014452817714 RMSE
0.8898123649504247 RMSE
0.8862384522504574 RMSE
0.8827184732677593 RMSE
0.8794777157436865 RMSE
0.8766484369596189 RMSE
0.8742605921372152 RMSE
0.8722781419890262 RMSE
0.8706391523456245 RMSE
0.8692802441021684 RMSE
0.8975279541660915 RMSE
0.8975279541660915 test RMSE
1.1173222639636156 RMSE
0.9289881370731948 RMSE
0.90542729111088 RMSE
0.9018780705117546 RMSE
0.8991343582720198 RMSE
0.8963689489

In [None]:
# Give the movie table the same key column name as the ratings table
# (the rename is applied to `movies` in place).
movies.rename(columns={'ID': 'MovieID'}, inplace=True)
# Join ratings with movie metadata on the shared key (pandas' default inner
# join), then spread the ratings into a UserID x MovieID table.
merged = ratings.merge(movies, on='MovieID')
utility_matrix = merged.pivot(
    index="UserID",
    columns="MovieID",
    values="Rating",
)
#print(utility_matrix)
#print(utility_matrix.iloc[0,:].dropna())

In [None]:
# Fit a UV decomposition with d latent factors on `utility_matrix`, which is
# built in an earlier cell.
d = 6

# The previous `M = ratings.pivot(...)` was overwritten on the very next line,
# so that pivot was wasted work; M is taken from utility_matrix only.
M = utility_matrix.to_numpy()
a = utility_matrix.stack().mean(skipna = True)

# Constant initialisation: every entry of U.V starts at d * (a/d) = a, the
# mean of the observed ratings.
U = np.full([utility_matrix.shape[0],d], np.sqrt(a/d))
V = np.full([d,utility_matrix.shape[1]], np.sqrt(a/d))
# Same number that len(np.dot(U,V)) printed, without materialising the dense
# product just to count its rows.
print(U.shape[0])
compute_RMSE(U,V, M)
for i in range(100):
    U, V = UV_dec(U,V,M,d)
    compute_RMSE(U,V,M)

In [None]:
def compute_RMSE(U,V, M):
    """Print and return the RMSE between M and np.dot(U, V).

    NaN entries of M are treated as blanks: squared residuals are summed
    with NaNs ignored, and the sum is divided by the count of non-NaN
    entries of M before taking the square root.

    Returns the RMSE; as a side effect it is printed followed by "RMSE".
    """
    prediction = np.dot(U, V)
    squared_residuals = np.square(np.subtract(M, prediction))
    n_observed = np.count_nonzero(~np.isnan(M))
    RMSE = np.sqrt(np.nansum(squared_residuals) / n_observed)
    print(RMSE, "RMSE")
    return RMSE
def UV_dec(U, V, M, d):
    """Run one coordinate-descent sweep of the UV decomposition of M.

    Every element of U and then every element of V is replaced by the value
    that minimises the squared error over the non-NaN entries of M while all
    other elements are held fixed.

    Parameters
    ----------
    U : float array of shape (n_rows, d); updated in place.
    V : float array of shape (d, n_cols); updated in place.
    M : array of shape (n_rows, n_cols) in which NaN marks a blank entry.
    d : unused here (the latent dimension is taken from the shapes of U and
        V); kept so existing calls keep working.

    Returns
    -------
    (U, V) -- the same array objects that were passed in. Callers unpack the
    result as ``U, V = UV_dec(...)``, so the factors must be returned.

    Notes
    -----
    * A row/column of M with no observed entry is skipped.
    * An element whose update would divide by zero is skipped, so 0/0 can
      never write NaN into a factor.
    """
    #U_r_s
    for r in range(U.shape[0]):
        # The observed columns of row r do not depend on s: compute them once.
        idx = np.argwhere(~np.isnan(M[r,:])).reshape(-1)
        if len(idx) == 0:
            continue
        m = M[r, idx]
        # V is not modified during the U-phase, so this slice stays valid.
        V_obs = V[:, idx]
        for s in range(U.shape[1]):
            v = V_obs[s]
            den = np.sum(np.square(v))
            if den == 0:
                continue
            # Prediction for the observed entries without the k == s term.
            p = np.dot(U[r,:], V_obs) - (U[r,s] * v)
            U[r,s] = np.sum(v*(m-p))/den
    #V_r_s
    # Columns are visited in the outer loop so the observed-row lookup is done
    # once per column. An update of V[r,s] only reads U and column s of V, and
    # within a column r still runs 0..d-1, so the result is the same as with
    # the loops nested the other way round.
    for s in range(V.shape[1]):
        idx = np.argwhere(~np.isnan(M[:,s])).reshape(-1)
        if len(idx) == 0:
            continue
        m = M[idx, s]
        # U is not modified during the V-phase, so this slice stays valid.
        U_obs = U[idx, :]
        for r in range(V.shape[0]):
            u = U_obs[:, r]
            den = np.sum(np.square(u))
            if den == 0:
                continue
            # Prediction for the observed entries without the k == r term.
            p = np.dot(U_obs, V[:,s]) - (u * V[r, s])
            V[r,s] = np.sum(u*(m-p))/den
    return U,V
def UV_dec(U, V, M, d):
    """Run one coordinate-descent sweep of the UV decomposition of M.

    Explicit-loop formulation: for every element of U, then every element
    of V, the numerator and denominator of the least-squares update are
    accumulated one observed entry of M at a time.

    Parameters
    ----------
    U : float array of shape (n_rows, d); updated in place.
    V : float array of shape (d, n_cols); updated in place.
    M : array of shape (n_rows, n_cols) in which NaN marks a blank entry.
    d : number of latent factors summed over in each prediction.

    Returns
    -------
    (U, V) -- the same array objects that were passed in.

    Notes
    -----
    A row or column of M without any observed entry, or an update whose
    denominator is zero, is skipped and the factor value is left as it was.
    Without the guard such a row/column leaves ``nom, den = 0, 0`` and the
    division raises ZeroDivisionError.
    """
    #U_r_s
    for r in range(U.shape[0]):
        # flatten to a 1-d array holding the indices of the non-blank entries
        # of row r (independent of s, so it is computed once per row)
        idx = np.argwhere(~np.isnan(M[r,:])).reshape(-1)
        if len(idx) == 0:
            continue
        for s in range(U.shape[1]):
            nom, den = 0.0, 0.0
            # for each non-blank entry of the row:
            for j in idx:
                # terms of the prediction for M[r,j], with the k == s term zeroed
                p = [0 if k==s else U[r,k] * V[k,j] for k in range(d)]
                nom += V[s,j] * (M[r,j] - np.sum(p))
                den += np.square(V[s,j])
            if den == 0:
                continue
            U[r,s] = nom/den

    #V_r_s
    for r in range(V.shape[0]):
        for s in range(V.shape[1]):
            # indices of the non-blank entries of column s
            idx = np.argwhere(~np.isnan(M[:,s])).reshape(-1)
            if len(idx) == 0:
                continue
            nom, den = 0.0, 0.0
            for i in idx:
                # terms of the prediction for M[i,s], with the k == r term zeroed
                p = [0 if k==r else U[i,k] * V[k,s] for k in range(d)]
                nom += U[i,r] * (M[i,s] - np.sum(p))
                den += np.square(U[i,r])
            if den == 0:
                continue
            V[r,s] = nom/den

    return U,V

In [None]:
def UV_dec_2(U, V, M, d):
    """Run one coordinate-descent sweep of the UV decomposition of M.

    Hybrid formulation: the U-phase is vectorised over the observed entries
    of each row, while the V-phase accumulates numerator and denominator one
    observed entry at a time.

    Parameters
    ----------
    U : float array of shape (n_rows, d); updated in place.
    V : float array of shape (d, n_cols); updated in place.
    M : array of shape (n_rows, n_cols) in which NaN marks a blank entry.
    d : number of latent factors summed over in the V-phase predictions.

    Returns
    -------
    (U, V) -- the same array objects that were passed in.

    Notes
    -----
    A row or column of M without any observed entry, or an update whose
    denominator is zero, is skipped and the factor value is left as it was.
    Without the guards an empty row writes NaN into U and an empty column
    raises ZeroDivisionError in the V-phase.
    """
    #U_r_s
    for r in range(U.shape[0]):
        # The observed columns of row r do not depend on s: compute them once.
        idx = np.argwhere(~np.isnan(M[r,:])).reshape(-1)
        if len(idx) == 0:
            continue
        m = M[r, idx]
        for s in range(U.shape[1]):
            v = V[s, idx]
            den = np.sum(np.square(v))
            if den == 0:
                continue
            # Prediction for the observed entries without the k == s term.
            p = np.dot(U[r,:],V[:, idx]) - (U[r,s] * v)
            U[r,s] = np.sum(v*(m-p))/den

    #V_r_s
    for r in range(V.shape[0]):
        for s in range(V.shape[1]):
            # indices of the non-blank entries of column s
            idx = np.argwhere(~np.isnan(M[:,s])).reshape(-1)
            if len(idx) == 0:
                continue
            nom, den = 0.0, 0.0
            for i in idx:
                # terms of the prediction for M[i,s], with the k == r term zeroed
                p = [0 if k==r else U[i,k] * V[k,s] for k in range(d)]
                nom += U[i,r] * (M[i,s] - np.sum(p))
                den += np.square(U[i,r])
            if den == 0:
                continue
            V[r,s] = nom/den

    return U,V

In [None]:
# Fit a UV decomposition with d latent factors, starting from a random
# constant instead of sqrt(a/d).
d = 2

utility_matrix = ratings.pivot(index = "UserID", columns = "MovieID", values = "Rating")
# Derive M from the matrix built in this cell; previously M was whatever an
# earlier (or later) cell had left in the kernel.
M = utility_matrix.to_numpy()
a = utility_matrix.stack().mean(skipna = True)

# np.full receives a single scalar, so each factor matrix is filled with one
# random constant (one draw for U, one for V), not with per-entry random values.
# NOTE(review): the draws are unseeded, so the printed RMSE trace differs from
# run to run -- seed the generator if reproducible output is wanted.
U = np.full([utility_matrix.shape[0],d], np.random.uniform(0,5))#np.sqrt(a/d))
V = np.full([d,utility_matrix.shape[1]], np.random.uniform(0,5))#np.sqrt(a/d))
compute_RMSE(U,V, M)
for i in range(100):
    U, V = UV_dec(U,V,M,d)
    compute_RMSE(U,V,M)
    
    
    

In [None]:

        
    
# Scratch inspection of the first row of the utility matrix and of the
# current fit; relies on U, V and utility_matrix from earlier cells.
U_V = np.dot(U,V)
#convert pandas to numpy 
M = utility_matrix.to_numpy()
# Non-blank ratings in the first row, as one array ...
print(M[0,~np.isnan(M[0,:])])
#print(len(M[0,~np.isnan(M[0,:])]))
#print(np.unique(M[0,~np.isnan(M[0,:])],return_counts = True))
#print(np.unique(ratings.loc[ratings["UserID"]==1, "Rating"], return_counts = True))
# ... and again one value per line, addressed through their column positions.
idx = np.argwhere(~np.isnan(M[0,:])).reshape(-1)
for i in idx:
    print(M[0,i])

# compute_RMSE takes the two factors and the target matrix (U, V, M) and forms
# the product itself; passing the precomputed product as (U_V, M) raised a
# TypeError for the missing third argument.
RMSE = compute_RMSE(U, V, M)



In [None]:
incremental_update(U, V):
    
    