In [170]:
import numpy as np
import argparse
import matplotlib.pyplot as plt
from sklearn.metrics import auc, mean_squared_error
from sklearn.model_selection import train_test_split

path = "../dataset/data.npz"

npzfile = np.load(path)

In [3]:
npzfile.files

['item_id', 'user_id', 'rating']

In [72]:
#reshape the npzfile and split the dataset
item_id = npzfile['item_id'][0:10]
user_id = npzfile['user_id'][0:10]
item_id_train, item_id_test,user_id_train, user_id_test = train_test_split(item_id,user_id, test_size=0.2, random_state = 1)
print(item_id)
print(user_id)

[[242]
 [302]
 [377]
 [ 51]
 [346]
 [474]
 [265]
 [465]
 [451]
 [ 86]]
[[196]
 [186]
 [ 22]
 [244]
 [166]
 [298]
 [115]
 [253]
 [305]
 [  6]]


In [77]:
#data combination and split. implement from first principle
data = []
for i in range(10):
    data.append([npzfile["user_id"][i][0],npzfile['item_id'][i][0], npzfile['rating'][i][0]])
data = np.asarray(data, dtype='int32')

In [121]:
#data combination and split. implement based on numpy API
#
dataAll = np.asarray([npzfile['user_id'].reshape(-1), npzfile['item_id'].reshape(-1), npzfile['rating'].reshape(-1)] )
dataAll=dataAll.T
[USER_ID_MAX, ITEM_ID_MAX, _] = np.max(dataAll, axis=0) #TODO: n unique
trainData, testData = train_test_split(dataAll, test_size=0.2, random_state=0)


In [246]:
#training and infer without cross validation

K=2
lamU=0.1
lamV=0.1

lr = 0.001 #step size i.e. learning rate
MAX_ITER = 10 #200

# note that idx starts from 0. user_id=1 ->  row 0 in the matrix
def dataToMatrix(data, dim=[USER_ID_MAX, ITEM_ID_MAX]):
    matrix = np.zeros(dim)
    for ins in data:
        matrix[ins[0]-1,ins[1]-1] = ins[2]
    return matrix

R_train = dataToMatrix(trainData)
indicator = (R_train > 0).astype(int)
np.random.seed(seed=1) # for reproducible
#initialize params
U= np.random.random_sample((K, USER_ID_MAX))
V= np.random.random_sample((K, ITEM_ID_MAX))

R_test = dataToMatrix(testData)
indicatoR_test = (R_test>0).astype(int)


R_pred = U.T @ V # matmul, TODO: logistic function to overcome rating out range
diff = indicator * (R_train - R_pred)

rmseTrain=[]
rmseTest=[]


for epoch in range(MAX_ITER):
     #TODO: Linear with # of rating. take advantage of the sparsity of the matrix.
    dEdU = -V @ diff.T  + lamU * U
    dEdV = -U @ diff  + lamV * V
    # sync update 
    Ut = U - lr * dEdU #TODO:  - or +
    Vt = V - lr * dEdV
    U = Ut
    V = Vt
    
    R_pred = U.T @ V #matmul, TODO: logistic function to overcome rating out range
    diff = indicator * (R_train - R_pred)
    
    # evaluate RMSE loss
    rmseTrain.append(np.sqrt( (diff**2).sum()/indicator.sum()))
    diffTest = indicatoR_test * (R_test - R_pred)
    rmseTest.append(np.sqrt( (diffTest**2).sum()/indicatoR_test.sum()))
    print("Epoch {}: Train RMSE: {:.4f}\t Test RMSE: {:.4f}".format(epoch, rmseTrain[epoch], rmseTest[epoch]))




Epoch 0: Train RMSE: 2.7541	 Test RMSE: 2.7649
Epoch 1: Train RMSE: 2.1329	 Test RMSE: 2.1473
Epoch 2: Train RMSE: 1.7141	 Test RMSE: 1.7247
Epoch 3: Train RMSE: 1.4961	 Test RMSE: 1.5070
Epoch 4: Train RMSE: 1.3596	 Test RMSE: 1.3713
Epoch 5: Train RMSE: 1.2672	 Test RMSE: 1.2800
Epoch 6: Train RMSE: 1.2012	 Test RMSE: 1.2152
Epoch 7: Train RMSE: 1.1521	 Test RMSE: 1.1674
Epoch 8: Train RMSE: 1.1145	 Test RMSE: 1.1311
Epoch 9: Train RMSE: 1.0850	 Test RMSE: 1.1029


In [245]:
#linear running time in observed ratings. using the sparsity
np.random.seed(seed=1)
U= np.random.random_sample((K, USER_ID_MAX))
V= np.random.random_sample((K, ITEM_ID_MAX))

for epoch in range(MAX_ITER):
    # linear algo. Batch Gradient Descent
    Ut = U
    Vt = V
    for (userID,itemID,rij) in trainData:
        i = userID - 1
        j = itemID - 1
        rij_pred = np.dot((U.T)[i,:],V[:,j])
        diff_ij = rij - rij_pred
        #partially update the latent features of user i and item j
        Ut[:,i] += - lr * (-V[:,j] * diff_ij) 
        Vt[:,j] += - lr * (-U[:,i] * diff_ij)
        
        #rmseTrain[epoch] += diff_ij**2
    #update U and V
    Ut = Ut - lr * ( lamU * U)
    Vt = Vt - lr * (lamV * V)
    U = Ut
    V = Vt
    
    #evaluate
    #rmseTrain[epoch] = np.sqrt(rmseTrain[epoch]/len(trainData))
    R_pred = U.T @ V
    diffTrain = indicator * (R_train - R_pred)
    rmseTrain[epoch] = np.sqrt( (diffTrain**2).sum()/indicator.sum())
    diffTest = indicatoR_test * (R_test - R_pred)
    rmseTest[epoch] = np.sqrt( (diffTest**2).sum()/indicatoR_test.sum())
    print("Epoch {}: Train RMSE: {:.4f}\t Test RMSE: {:.4f}".format(epoch, rmseTrain[epoch], rmseTest[epoch]))
    
###===> The running time is longer than that of using the complete matrix    
        

Epoch 0: Train RMSE: 2.7059	 Test RMSE: 2.7170
Epoch 1: Train RMSE: 2.1115	 Test RMSE: 2.1261
Epoch 2: Train RMSE: 1.7169	 Test RMSE: 1.7300
Epoch 3: Train RMSE: 1.4985	 Test RMSE: 1.5105
Epoch 4: Train RMSE: 1.3645	 Test RMSE: 1.3766
Epoch 5: Train RMSE: 1.2731	 Test RMSE: 1.2862
Epoch 6: Train RMSE: 1.2072	 Test RMSE: 1.2214
Epoch 7: Train RMSE: 1.1577	 Test RMSE: 1.1732
Epoch 8: Train RMSE: 1.1197	 Test RMSE: 1.1364
Epoch 9: Train RMSE: 1.0897	 Test RMSE: 1.1077


In [203]:
#tuning params with cross-validation and grid-search, based on sklearn BaseEstimator
len(trainData)

80000

In [183]:
# for python test
np.random.seed(seed=1)
a = np.random.random_sample((2,2))
b = (a > 0).astype(int)
b[0,0]=0
print(RMatrix.shape)
print(V.shape)
print(U.shape)
dt=np.asarray([1,1])
indicator.sum()

(943, 1682)
(2, 1682)
(2, 943)


80000