In [66]:
import math
import numpy
import random

# Regular Matrix Factorization
# Loss Function: (R-U.Vt)**2 + |U|**2 + |V|** 2
class MatrixFactorization():
    def __init__(self,K=10,alpha = 0.0002,beta = 0.02,num_iterations = 200000):
        # K : latent factors 
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.num_iterations = num_iterations
      
    def fit(self,R):
        # R : user-item matrix in sparse-matrix format
        
        # create training data from user-item matrix
        self.samples = [(i,j,v) for i,j,v in zip(R.row, R.col, R.data)]
        print("number of samples = ",len(self.samples))
        
        # decomposed matrix P (user-feature) and Q (item-feature)
        # initialize a random array
        self.P = numpy.random.normal(scale=1.0/self.K, size=(R.shape[0],self.K))
        self.Q = numpy.random.normal(scale=1.0/self.K, size=(R.shape[1],self.K))
    
        # run SGD to find P and Q
        for iteration in range(self.num_iterations):
            # for every non-zero value (in samples)
            # calculate the predicted value
            sample_num = 0
            
            # shuffle the samples
            random.shuffle(self.samples)
            
            for i,j,v in self.samples:
                v_predicted = numpy.dot(self.P[i,:],self.Q[j,:].T)
            
                # calculate error of the predicted value
                e_predicted = v - v_predicted
                
                P_i = self.P[i,:][:]
                # update P and Q
                self.P[i,:] += self.alpha*(2*e_predicted*self.Q[j,:] - self.beta*self.P[i,:])
                self.Q[j,:] += self.alpha*(2*e_predicted*P_i - self.beta*self.Q[j,:])
            
                # print progress
                if sample_num % 500000 == 0 and sample_num != 0:
                    print ("sample_num = ",sample_num)
                sample_num += 1
                
            # calculate M.S.E. for termination condition
            #print("mse = ",self.mse())
        
            # show progress
            if iteration % 200 == 0:
                print("mse = ",self.mse())
                print("Iteration = ",iteration)
                  
    def mse(self):
        e_sum = 0.0
        for i,j,v in self.samples:
            v_predicted = numpy.dot(self.P[i,:],self.Q[j,:].T)
            
            # calculate error of the predicted value
            e_predicted = v - v_predicted
            
            e_sum += e_predicted**2
        
        return (math.sqrt(e_sum))
    
    def predict(self,usern):
        predictions = numpy.dot(self.P[usern,:],self.Q.T)
        return predictions


In [1]:
# Common code to generate user-item matrix
import pandas
import numpy
from scipy import sparse

# user-item matrix generated in earlier file
user_item_sparse = sparse.load_npz('generated_data/user_item_matrix.npz')

user_loc_f = numpy.load('generated_data/user_item_matrix_columns.npz')
user_list = list(user_loc_f['arr_0'])
loc_list = list(user_loc_f['arr_1'])


In [2]:

from sklearn.decomposition import NMF
model = NMF(n_components=10, init='random', random_state=0,verbose=1)
P = model.fit_transform(user_item_sparse)
Q = model.components_.T
print("P,Q decomposition is done")

#samples = [(v,(i,j)) for i,j,v in zip(user_item_sparse.row, user_item_sparse.col, user_item_sparse.data)]
#uis = sparse.coo_matrix((user_item_sparse.data[0:100],(user_item_sparse.row[0:100],user_item_sparse.col[0:100])))
'''
MF = MatrixFactorization()
MF.fit(user_item_sparse)
P = MF.P
Q = MF.Q
print("P,Q decomposition is done")
'''


violation: 1.0
violation: 3.7435846438213174
violation: 0.9477558429994196
violation: 0.3834783459675236
violation: 0.21636693222740783
violation: 0.1785582398919382
violation: 0.17849875102617438
violation: 0.1356712991742559
violation: 0.09562199093405682
violation: 0.06765060536628578
violation: 0.04644042912689369
violation: 0.031081187273073484
violation: 0.02087422792623596
violation: 0.014383100956620067
violation: 0.010325770455752885
violation: 0.007825938302876973
violation: 0.006299843167201236
violation: 0.005391152764259383
violation: 0.004823797445452411
violation: 0.004492094497934926
violation: 0.004290575791196893
violation: 0.0041042131793395074
violation: 0.0038898765659640703
violation: 0.003614496802303819
violation: 0.0033018609641311753
violation: 0.002981728707221211
violation: 0.0026635729505791205
violation: 0.0023567915125541905
violation: 0.0020683813803573634
violation: 0.0018054082663631343
violation: 0.0015631784202805546
violation: 0.001351472469369519
v

'\nMF = MatrixFactorization()\nMF.fit(user_item_sparse)\nP = MF.P\nQ = MF.Q\nprint("P,Q decomposition is done")\n'

In [3]:
#store the decomposed P and Q to csv 
#pandas.DataFrame(P,index=user_set).to_csv("generated_data/MF_decomposed_P.csv")
#pandas.DataFrame(Q,columns=loc_set).to_csv("generated_data/MF_decomposed_Q.csv")
#test_P = pandas.read_csv("decomposed_P.csv",index_col=0,float_precision='round_trip')
#test_Q = pandas.read_csv("decomposed_Q.csv",index_col=0,float_precision='round_trip')
#numpy.array_equal(P,test_P.to_numpy())
#numpy.array_equal(Q,test_Q.to_numpy())

numpy.savez('generated_data/user_item_decomposed.npz',P,Q)
numpy.savez('generated_data/user_item_decomposed_P.npz',P)
numpy.savez('generated_data/user_item_decomposed_Q.npz',Q)