In [278]:
import numpy as np
import argparse
import matplotlib.pyplot as plt
from sklearn.metrics import auc, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils.estimator_checks import check_estimator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.model_selection import GridSearchCV


K=2
lamU=0.1
lamV=0.1

lr = 0.001 #step size i.e. learning rate
MAX_ITER = 10 #200

path = "../dataset/data.npz"

npzfile = np.load(path)

In [248]:
npzfile.files

['item_id', 'user_id', 'rating']

In [249]:
#data combination and split. implement based on numpy API
#
dataAll = np.asarray([npzfile['user_id'].reshape(-1), npzfile['item_id'].reshape(-1), npzfile['rating'].reshape(-1)] )
dataAll=dataAll.T
[USER_ID_MAX, ITEM_ID_MAX, _] = np.max(dataAll, axis=0) #TODO: n unique
trainData, testData = train_test_split(dataAll, test_size=0.2, random_state=0)


In [None]:
R_train = dataToMatrix(trainData)
indicator = (R_train > 0).astype(int)
np.random.seed(seed=1) # for reproducible
#initialize params
U= np.random.random_sample((K, USER_ID_MAX))
V= np.random.random_sample((K, ITEM_ID_MAX))

R_test = dataToMatrix(testData)
indicatoR_test = (R_test>0).astype(int)


R_pred = U.T @ V # matmul, TODO: logistic function to overcome rating out range
diff = indicator * (R_train - R_pred)

rmseTrain=[]
rmseTest=[]


for epoch in range(MAX_ITER):
     #TODO: Linear with # of rating. take advantage of the sparsity of the matrix.
    dEdU = -V @ diff.T  + lamU * U
    dEdV = -U @ diff  + lamV * V
    # sync update 
    Ut = U - lr * dEdU #TODO:  - or +
    Vt = V - lr * dEdV
    U = Ut
    V = Vt
    
    R_pred = U.T @ V #matmul, TODO: logistic function to overcome rating out range
    diff = indicator * (R_train - R_pred)
    
    # evaluate RMSE loss
    rmseTrain.append(np.sqrt( (diff**2).sum()/indicator.sum()))
    diffTest = indicatoR_test * (R_test - R_pred)
    rmseTest.append(np.sqrt( (diffTest**2).sum()/indicatoR_test.sum()))
    print("Epoch {}: Train RMSE: {:.4f}\t Test RMSE: {:.4f}".format(epoch, rmseTrain[epoch], rmseTest[epoch]))



In [293]:
#training and infer without cross validation
# note that idx starts from 0. user_id=1 ->  row 0 in the matrix
def dataToMatrix(data, dim=[USER_ID_MAX, ITEM_ID_MAX]):
    matrix = np.zeros(dim)
    for ins in data:
        matrix[ins[0]-1,ins[1]-1] = ins[2]
    return matrix

#rawData -input data with format [[userId, itemId, rating]xN]
def PMF_test(rawData, U, V):
    #TODO: use sparsity.
    R_test = dataToMatrix(rawData)
    indicatoR_test = (R_test>0).astype(int)
    R_pred = U.T @ V 
    diff = indicatoR_test * (R_test - R_pred)
    rmse = np.sqrt( (diff**2).sum()/indicatoR_test.sum())
    
    return rmse

#rawData -input data with format [[userId, itemId, rating]xN]
#K - the number of latent features
def PMF_train(rawData, K=2, maxIter=10, lamU=0.1, lamV=0.1, verbose=False):
    R_train = dataToMatrix(rawData)
    indicator = (R_train > 0).astype(int)
    np.random.seed(seed=1)
    #initialize params
    U= np.random.random_sample((K, USER_ID_MAX))
    V= np.random.random_sample((K, ITEM_ID_MAX))
    
    R_pred = U.T @ V # matmul, TODO: logistic function to overcome rating out range
    diff = indicator * (R_train - R_pred)
    
    rmseTrain=[]
    n_iter = 0
    for epoch in range(maxIter):
         #TODO: Linear with # of rating. take advantage of the sparsity of the matrix.
        dEdU = -V @ diff.T  + lamU * U
        dEdV = -U @ diff  + lamV * V
        # sync update 
        Ut = U - lr * dEdU #TODO:  - or +
        Vt = V - lr * dEdV
        U = Ut
        V = Vt

        R_pred = U.T @ V #matmul, TODO: logistic function to overcome rating out range
        diff = indicator * (R_train - R_pred)
        
        n_iter +=1
        
        rmseTrain.append(np.sqrt( (diff**2).sum()/indicator.sum()))
        # evaluate RMSE loss
        if(verbose):
            print("Epoch {}: Train RMSE: {:.4f}".format(epoch, rmseTrain[epoch]))

    return U,V, n_iter, rmseTrain[-1]
    
U,V,n_iter, rmse = PMF_train(trainData,K=3, verbose=True)
PMF_test(testData, U, V)
    

Epoch 0: Train RMSE: 2.3795
Epoch 1: Train RMSE: 1.7711
Epoch 2: Train RMSE: 1.4818
Epoch 3: Train RMSE: 1.3286
Epoch 4: Train RMSE: 1.2330
Epoch 5: Train RMSE: 1.1682
Epoch 6: Train RMSE: 1.1217
Epoch 7: Train RMSE: 1.0870
Epoch 8: Train RMSE: 1.0604
Epoch 9: Train RMSE: 1.0394


1.056744314856704

In [None]:
class PMF(BaseEstimator,TransformerMixin): #TODO: Transformer needed?
    def __init__(self, maxIter=200, K=2, lamU=0.1, lamV=0.1 ):
        self.maxIter = maxIter
        self.K = K
        self.lamU = lamU
        self.lamV = lamV
        
        self.U = np.random.random_sample((K, USER_ID_MAX))
        self.V = np.random.random_sample((K, ITEM_ID_MAX))
        
    
    # interface for estimator
    def fit(self, X, y=None, **params):
        U, V, n_iter_, train_rmse_ = PMF_train(X, K=self.K, lamU=self.lamU, lamV=self.lamV)
        
        #parameters with trailing _ is used to check if the estimator has been fitted
        #TODO: add validation rmse
        self.rmse_=  train_rmse_
        self.n_iter_ = n_iter_
        self.U = U
        self.V = V
        
        return self
    
    # interface for transformer 
    def transform(self, X):
        #check_is_fitted(self, 'rmse_') #TOBE checked
        U, V, n_iter_, train_rmse_ = PMF_train(X)
        
        return U
    
    # interface for Grid Search
    def score(self, X, y=None):
        rmse = PMF_test(X, self.U, self.V)
        print("Score: K={}, rmse={}".format(self.K, rmse) )
        #since build-in gridsearch pick params by "the bigger the better"
        return -rmse
        
    #def predict(self, X):

tuned_params = {'K':[2,3]}
pmfEst = PMF()
#model.fit(trainData)
#check_estimator(PMF)
gs=GridSearchCV(pmfEst, tuned_params, cv=5)
gs.fit(trainData)
print(gs.best_params_)
print(gs.cv_results_)


Score: K=2, rmse=1.1840571665053443
Score: K=2, rmse=1.150187272884002
Score: K=2, rmse=1.1794908533565276
Score: K=2, rmse=1.151004720246238
Score: K=2, rmse=1.180255370377696
Score: K=2, rmse=1.1506989282373195
Score: K=2, rmse=1.1921504184964513
Score: K=2, rmse=1.1496392109291786
Score: K=2, rmse=1.187056323796341
Score: K=2, rmse=1.148932383615149
Score: K=3, rmse=1.1144947817230701
Score: K=3, rmse=1.085357686281359
Score: K=3, rmse=1.1147910040975892
Score: K=3, rmse=1.0860304352430905
Score: K=3, rmse=1.1150147910813726
Score: K=3, rmse=1.0857650569388393
Score: K=3, rmse=1.127379658992456
Score: K=3, rmse=1.0841440330685876


In [312]:
sorted(gs.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_K',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'split3_test_score',
 'split3_train_score',
 'split4_test_score',
 'split4_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [251]:
#linear running time in observed ratings. using the sparsity
np.random.seed(seed=1)
U= np.random.random_sample((K, USER_ID_MAX))
V= np.random.random_sample((K, ITEM_ID_MAX))
MAX_ITER2=2
for epoch in range(MAX_ITER2):
    # linear algo. Batch Gradient Descent
    Ut = U
    Vt = V
    for (userID,itemID,rij) in trainData:
        i = userID - 1
        j = itemID - 1
        rij_pred = np.dot((U.T)[i,:],V[:,j])
        diff_ij = rij - rij_pred
        #partially update the latent features of user i and item j
        Ut[:,i] += - lr * (-V[:,j] * diff_ij) 
        Vt[:,j] += - lr * (-U[:,i] * diff_ij)
        
        #rmseTrain[epoch] += diff_ij**2
    #update U and V
    Ut = Ut - lr * ( lamU * U)
    Vt = Vt - lr * (lamV * V)
    U = Ut
    V = Vt
    
    #evaluate
    #rmseTrain[epoch] = np.sqrt(rmseTrain[epoch]/len(trainData))
    R_pred = U.T @ V
    diffTrain = indicator * (R_train - R_pred)
    rmseTrain[epoch] = np.sqrt( (diffTrain**2).sum()/indicator.sum())
    diffTest = indicatoR_test * (R_test - R_pred)
    rmseTest[epoch] = np.sqrt( (diffTest**2).sum()/indicatoR_test.sum())
    print("Epoch {}: Train RMSE: {:.4f}\t Test RMSE: {:.4f}".format(epoch, rmseTrain[epoch], rmseTest[epoch]))
    
###===> The running time is longer than that of using the complete matrix    
        

Epoch 0: Train RMSE: 2.7059	 Test RMSE: 2.7170
Epoch 1: Train RMSE: 2.1115	 Test RMSE: 2.1261
Epoch 2: Train RMSE: 1.7169	 Test RMSE: 1.7300
Epoch 3: Train RMSE: 1.4985	 Test RMSE: 1.5105
Epoch 4: Train RMSE: 1.3645	 Test RMSE: 1.3766
Epoch 5: Train RMSE: 1.2731	 Test RMSE: 1.2862
Epoch 6: Train RMSE: 1.2072	 Test RMSE: 1.2214
Epoch 7: Train RMSE: 1.1577	 Test RMSE: 1.1732
Epoch 8: Train RMSE: 1.1197	 Test RMSE: 1.1364
Epoch 9: Train RMSE: 1.0897	 Test RMSE: 1.1077


In [252]:
#tuning params with cross-validation and grid-search, based on sklearn BaseEstimator
len(trainData)

80000

In [253]:
# for python test
np.random.seed(seed=1)
a = np.random.random_sample((2,2))
b = (a > 0).astype(int)
b[0,0]=0
print(RMatrix.shape)
print(V.shape)
print(U.shape)
dt=np.asarray([1,1])
indicator.sum()

(943, 1682)
(2, 1682)
(2, 943)


80000