In [2]:

import pandas as pd
import numpy as np
from math import isnan
import joblib
import numpy.ma as ma
import threading

In [9]:
class Threads(threading.Thread):
    def __init__(self, threadID, k_fold_dataset, lr=0.005):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.k_fold_dataset = k_fold_dataset
        self.lr = lr
    
    def run(self):
        train_set = self.k_fold_dataset[self.threadID-1][0]
        test_set = self.k_fold_dataset[self.threadID-1][1]
        model = SVD(
            self.threadID, 
            train_set, 
            test_set, 
            lr=self.lr)
        model.training()

# cv for User/Movie matrix only
class CrossValidation():
    def __init__(self, file_path, shuffle=True, normolize=False):
        self.file_path = file_path
        self.normolize = normolize
        self.shuffle = shuffle

    def process_dataset(self):
        ratings = pd.read_csv(
            self.file_path,
            sep='::',
            engine='python',
            header=None,
            names=['UserID','MovieID','Rating','Timestamp'])

        ratings = ratings.pivot(
            index = 'UserID', 
            columns ='MovieID', 
            values = 'Rating')
        return ratings
    
    def split(self, fold=5, all=False, save=False):

        # process raw data
        ratings = self.process_dataset()

        # not folding but regarding whole dataset as train set
        if all:
            return ratings.to_numpy()
        
        dict_ratings_valid = dict()
        lst_ratings_valid = []

        
        cols = ratings.columns.values.tolist()
        for i, row in ratings.iterrows():
            temp = []
            for col in cols:
                if isnan(row[col]) == False:
                    temp.append([col, row[col]])
            dict_ratings_valid[i] = temp
        
        print(dict_ratings_valid[2])
        return

In [2]:
import joblib
import numpy as np
def load_dataset(train_set_path, test_set_path):
        return joblib.load(train_set_path), joblib.load(test_set_path)

In [6]:
train, test = load_dataset('train_set_2', 'test_set_2')

In [7]:
count = 0
for row in train:
    for col in row:
        if np.isnan(col) == False:
            count += 1

print(count)

800193


In [8]:
count_test = 0
for row in test:
    for col in row:
        if np.isnan(col) == False:
            count_test += 1
print(count_test)

200016


In [11]:
valid_el = []
for index, el in np.ndenumerate(train):
    if np.isnan(el) == False:
        valid_el.append([index, el])

In [12]:
valid_el

[[(0, 47), 5.0],
 [(0, 144), 5.0],
 [(0, 253), 4.0],
 [(0, 513), 5.0],
 [(0, 517), 4.0],
 [(0, 574), 4.0],
 [(0, 580), 4.0],
 [(0, 581), 5.0],
 [(0, 639), 3.0],
 [(0, 689), 3.0],
 [(0, 708), 3.0],
 [(0, 740), 4.0],
 [(0, 853), 3.0],
 [(0, 858), 4.0],
 [(0, 877), 4.0],
 [(0, 957), 5.0],
 [(0, 964), 5.0],
 [(0, 970), 5.0],
 [(0, 1025), 4.0],
 [(0, 1104), 5.0],
 [(0, 1107), 3.0],
 [(0, 1117), 4.0],
 [(0, 1178), 5.0],
 [(0, 1195), 5.0],
 [(0, 1421), 4.0],
 [(0, 1574), 4.0],
 [(0, 1658), 5.0],
 [(0, 1727), 4.0],
 [(0, 1781), 5.0],
 [(0, 2147), 3.0],
 [(0, 2162), 5.0],
 [(0, 2205), 4.0],
 [(0, 2483), 3.0],
 [(0, 2488), 4.0],
 [(0, 2557), 4.0],
 [(0, 2586), 4.0],
 [(0, 2592), 4.0],
 [(0, 2599), 5.0],
 [(0, 2710), 4.0],
 [(0, 2898), 4.0],
 [(0, 2969), 4.0],
 [(0, 3177), 4.0],
 [(1, 20), 1.0],
 [(1, 106), 5.0],
 [(1, 159), 3.0],
 [(1, 228), 3.0],
 [(1, 258), 4.0],
 [(1, 283), 3.0],
 [(1, 309), 5.0],
 [(1, 339), 4.0],
 [(1, 346), 5.0],
 [(1, 358), 4.0],
 [(1, 370), 5.0],
 [(1, 420), 2.0],
 [(1, 

In [2]:
class SVD():
    def __init__(self, threadID, train_set, test_set, K_features=10, lr=0.005, reg=0.05, epoch=75):
        self.threadID =  threadID
        self.train_set = train_set
        self.test_set = test_set
        self.K_features = K_features
        self.lr = lr
        self.reg = reg
        self.epoch = epoch
    
    # initialize params
    def initialize_latent_vectors(self):
        # To be reproducible
        np.random.seed(11)
        self.U = np.random.rand(self.train_set.shape[0], self.K_features)
        np.random.seed(44)
        self.M = np.random.rand(self.K_features, self.train_set.shape[1])
    
    # 1) loss function
    def MAE(self, dataset):
        mask = np.isnan(dataset)
        masked_array = ma.array(dataset, mask=mask)
        error = masked_array - np.matmul(self.U, self.M)
        MAE = np.mean(np.absolute(error))
        return MAE

    # 2) loss function
    def RMSE(self, dataset):
        mask = np.isnan(dataset)
        masked_array = ma.array(dataset, mask=mask)
        error = masked_array - np.matmul(self.U, self.M)
        RMSE = np.mean(error**2) ** 0.5
        return RMSE
    
    def update_params(self, i, j, error):
        self.U[i,:] += self.lr * (2 * error * self.M[:,j] - self.reg * self.U[i,:])
        self.M[:,j] += self.lr * (2 * error * self.U[i,:] - self.reg * self.M[:,j])
    
    def save_params(self):
        joblib.dump(self.U, 'U_vec'+str(self.threadID))
        joblib.dump(self.M, 'M_vec'+str(self.threadID))
    
    def load_params(self):
        self.U = joblib.load('U_vec'+str(self.threadID))
        self.M = joblib.load('M_vec'+str(self.threadID))

    def testing(self):
        MAE = self.MAE(self.test_set)
        RMSE = self.RMSE(self.test_set)
        print('Thread-->',self.threadID,' Test-->', 'RMSE: ', RMSE, 'MAE: ', MAE)
        
    
    # training
    def training(self, retrain=False):

        if retrain:
            self.load_params()
            print('Thread-->',self.threadID,' Params loaded')
        else:
        # initialize params
            self.initialize_latent_vectors()

        # start training
        '''
        1) Iterate over each known element 
        2) update the ith row of U and the jth column of M.
        '''
        valid_el = []
        for index, el in np.ndenumerate(self.train_set):
                if isnan(el) == False:
                    valid_el.append([index, el])
        # start training
        for epoch in range(self.epoch):
            # valid_el->foramt: [[(i,j),rating],...]
            for index, rating_idx in enumerate(valid_el):
                i = rating_idx[0][0]
                j = rating_idx[0][1]
                rating = rating_idx[1]
                pred = np.dot(self.U[i,:], self.M[:, j]) # row vec * col vec
                error = rating - pred # get error
                # update each param item
                # print(i,'',j, '', rating)
                self.update_params(i, j, error)
                if index % 10 == 0:
                    MAE = self.MAE(self.train_set)
                    RMSE = self.RMSE(self.train_set)
                    print('Thread-->',self.threadID,' Train-->', ' Epoch: ', epoch, 'RMSE: ', RMSE, 'MAE: ', MAE)
                if index != 0 and index % 500 == 0:
                    self.save_params()
                    print('Thread-->',self.threadID,' Params saved')

In [10]:
cv = CrossValidation('ratings.dat')
dataset = cv.split(fold=5) # split and get 5 dataset including train&test

[[21, 1.0], [95, 2.0], [110, 5.0], [163, 4.0], [165, 3.0], [235, 3.0], [265, 4.0], [292, 3.0], [318, 5.0], [349, 4.0], [356, 5.0], [368, 4.0], [380, 5.0], [434, 2.0], [442, 3.0], [457, 4.0], [459, 3.0], [480, 5.0], [498, 3.0], [515, 5.0], [589, 4.0], [590, 5.0], [593, 5.0], [647, 3.0], [648, 4.0], [736, 4.0], [780, 3.0], [902, 2.0], [920, 5.0], [982, 4.0], [1084, 3.0], [1090, 2.0], [1096, 4.0], [1103, 3.0], [1124, 5.0], [1188, 4.0], [1193, 5.0], [1196, 5.0], [1198, 4.0], [1207, 4.0], [1210, 4.0], [1213, 2.0], [1217, 3.0], [1225, 5.0], [1244, 3.0], [1245, 2.0], [1246, 5.0], [1247, 5.0], [1253, 3.0], [1259, 5.0], [1265, 3.0], [1293, 5.0], [1357, 5.0], [1370, 5.0], [1372, 3.0], [1385, 3.0], [1408, 3.0], [1442, 4.0], [1527, 4.0], [1537, 4.0], [1544, 4.0], [1552, 3.0], [1597, 3.0], [1610, 5.0], [1687, 3.0], [1690, 3.0], [1784, 5.0], [1792, 3.0], [1801, 3.0], [1834, 4.0], [1873, 4.0], [1917, 3.0], [1945, 5.0], [1953, 4.0], [1954, 5.0], [1955, 4.0], [1957, 5.0], [1962, 5.0], [1968, 2.0], [200

In [None]:
def process_dataset(self):
        ratings = pd.read_csv(
            self.file_path,
            sep='::',
            engine='python',
            header=None,
            names=['UserID','MovieID','Rating','Timestamp'])

        ratings = ratings.pivot(
            index = 'UserID', 
            columns ='MovieID', 
            values = 'Rating')
        return ratings
    
    def split(self, fold=5, all=False, save=False):

        # alpha = round((fold - 1) / fold, 1)

        # process raw data
        ratings = self.process_dataset()

        # not folding but regarding whole dataset as train set
        if all:
            return ratings.to_numpy()
        
        dict_ratings_valid = dict()
        for i, row in ratings.iterrows():
            u_rating = row[np.isnan(row) == False]
            u_rating_j = list(u_rating.index)
            u_rating = list(u_rating)
            value = list(zip(u_rating_j,u_rating))
            random.seed(88)
            random.shuffle(value)
            dict_ratings_valid[i] = value
        
        results = []
        for num in range(fold):
            dict_ratings_valid_train = dict()
            dict_ratings_valid_test = dict()
            fold_idx = [i for i in range(fold)]
            fold_idx.remove(num)
            for user_id in dict_ratings_valid.keys():
                tmp = [dict_ratings_valid[user_id][i::fold] for i in range(fold)]
                dict_ratings_valid_test[user_id] = tmp[num]
                tr_tmp = []
                for idx in fold_idx:
                    tr_tmp.extend(tmp[idx]) 
                dict_ratings_valid_train[user_id] = tr_tmp
            results.append([dict_ratings_valid_train, dict_ratings_valid_test])
        
        # results    [x]    [y]    [z]
        # format  |k-fold|tr/tst|user_id|
        return results, ratings.to_numpy()

In [1]:
import pandas as pd
import numpy as np
from math import isnan
import joblib
import numpy.ma as ma
import threading
import random

In [4]:
# Multi-threads
class Threads(threading.Thread):
    def __init__(self, threadID, train_set, test_set, lr=0.005, k_factors=10):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.train_set = train_set
        self.test_set = test_set
        self.lr = lr
        self.k_factors = k_factors
    
    def run(self):
        model = SVD(
            self.threadID, 
            self.train_set, 
            self.test_set, 
            lr=self.lr,
            K_features=self.k_factors)
        model.training(retrain=False)

# cv for User/Movie matrix only
class CrossValidation():
    def __init__(self, file_path, shuffle=True, normolize=False):
        self.file_path = file_path
        self.normolize = normolize
        self.shuffle = shuffle

    def load_dataset(self, train_set_path, test_set_path):
        return joblib.load(train_set_path), joblib.load(test_set_path)

    def process_dataset(self):
        ratings = pd.read_csv(
            self.file_path,
            sep='::',
            engine='python',
            header=None,
            names=['UserID','MovieID','Rating','Timestamp'])

        ratings = ratings.pivot(
            index = 'UserID', 
            columns ='MovieID', 
            values = 'Rating')
        return ratings
    
    def split(self, fold=5, all=False, save=False):

        # process raw data
        ratings = self.process_dataset()

        # not folding but regarding whole dataset as train set
        if all:
            return ratings.to_numpy()
        
        dict_ratings_valid = dict()
        for i, row in ratings.iterrows():
            u_rating = row[np.isnan(row) == False]
            u_rating_j = list(u_rating.index)
            random.seed(88)
            random.shuffle(u_rating_j)
            dict_ratings_valid[i] = u_rating_j
        
        results = []
        for num in range(fold):
            dict_ratings_valid_train = dict()
            dict_ratings_valid_test = dict()
            train_set = ratings.copy()
            test_set = ratings.copy()
            fold_idx = [i for i in range(fold)]
            fold_idx.remove(num)
            # use dictionary to store each user with ratings
            for user_id in dict_ratings_valid.keys():
                tmp = [dict_ratings_valid[user_id][i::fold] for i in range(fold)]
                dict_ratings_valid_test[user_id] = tmp[num] # test_dict
                tr_tmp = []
                for idx in fold_idx:
                    tr_tmp.extend(tmp[idx]) 
                dict_ratings_valid_train[user_id] = tr_tmp # train_dict

            # get train set
            for user_id in dict_ratings_valid_test:
                cols = dict_ratings_valid_test[user_id]
                train_set.loc[user_id, cols] = np.nan
            
            # get test set
            for user_id in dict_ratings_valid_train:
                cols = dict_ratings_valid_train[user_id]
                test_set.loc[user_id, cols] = np.nan
            
            # normolize
            if self.normolize:
                ratings_mean_tr = np.nanmean(train_set.to_numpy(), axis=1)
                ratings_mean_tt = np.nanmean(test_set.to_numpy(), axis=1)
                train_set = train_set - ratings_mean_tr.reshape(-1, 1)
                test_set = test_set - ratings_mean_tt.reshape(-1, 1)

            if save:
                joblib.dump(test_set.to_numpy(), 'test_set_' + str(num))
                joblib.dump(train_set.to_numpy(), 'train_set_' + str(num))
                print('Fold_' + str(num) + ' Saved')
            
            results.append([train_set.to_numpy(), test_set.to_numpy()])
        # results    [x]    [y]    [z]
        # format  |k-fold|tr/tst|user_id|
        return results

class SVD():
    def __init__(self, threadID, train_set, test_set, K_features=10, lr=0.005, reg=0.05, epoch=75):
        self.threadID =  threadID
        self.train_set = train_set
        self.test_set = test_set
        self.K_features = K_features
        self.lr = lr
        self.reg = reg
        self.epoch = epoch
    
    # initialize params
    def initialize_latent_vectors(self):
        # To be reproducible
        np.random.seed(33)
        self.U = np.random.rand(self.train_set.shape[0], self.K_features)
        np.random.seed(33)
        self.M = np.random.rand(self.K_features, self.train_set.shape[1])
    
    # 1) loss function
    def MAE(self, dataset):
        mask = np.isnan(dataset)
        masked_array = ma.array(dataset, mask=mask)
        error = masked_array - np.matmul(self.U, self.M)
        MAE = np.mean(np.absolute(error))
        # MAE = mae(masked_array, np.dot(self.U, self.M))
        return MAE

    # 2) loss function
    def RMSE(self, dataset):
        mask = np.isnan(dataset)
        masked_array = ma.array(dataset, mask=mask)
        error = masked_array - np.matmul(self.U, self.M)
        RMSE = np.sqrt((error**2).mean())
        # RMSE = mse(masked_array, np.dot(self.U, self.M)) ** 0.5
        return RMSE
    
    def update_params(self, i, j, error):
        self.U[i,:] += self.lr * (2 * error * self.M[:,j] - self.reg * self.U[i,:])
        self.M[:,j] += self.lr * (2 * error * self.U[i,:] - self.reg * self.M[:,j])
    
    def save_params(self):
        joblib.dump(self.U, 'U_vec'+str(self.threadID))
        joblib.dump(self.M, 'M_vec'+str(self.threadID))
    
    def load_params(self):
        self.U = joblib.load('U_vec'+str(self.threadID))
        self.M = joblib.load('M_vec'+str(self.threadID))

    def testing(self):
        MAE = self.MAE(self.test_set)
        RMSE = self.RMSE(self.test_set)
        print('Thread-->',self.threadID,' Test-->', 'RMSE: ', RMSE, 'MAE: ', MAE)
        
    # training
    def training(self, retrain=False):

        if retrain:
            self.load_params()
            print('Thread-->',self.threadID,' Params loaded')
        else:
        # initialize params
            self.initialize_latent_vectors()

        # start training
        '''
        1) Iterate over each known element 
        2) update the ith row of U and the jth column of M.
        '''
        # valid_el = dict()
        # for index, el in np.ndenumerate(self.train_set):
        #     if isnan(el) == False:
        #         valid_el.update({index:el})
            
        # start training
        for epoch in range(self.epoch):
            # for coord, rating in valid_el.items():
            for index, el in np.ndenumerate(self.train_set):
                if isnan(el) == False:
                    i = index[0]
                    j = index[1]
                    pred = np.dot(self.U[i,:], self.M[:, j]) # row vec * col vec  
                    error = el - pred # get error
                    # update each param item
                    self.update_params(i,j, error)
                    # self.U[i,:] += self.lr * (2 * error * self.M[:,j] - self.reg * self.U[i,:])
                    # self.M[:,j] += self.lr * (2 * error * self.U[i,:] - self.reg * self.M[:,j])
                
            MAE = self.MAE(self.train_set)
            RMSE = self.RMSE(self.train_set)
            print(' Epoch',epoch, ' Thread-->',self.threadID,' Train-->', 'RMSE: ', RMSE, 'MAE: ', MAE)
            self.testing() # get performance of model on test set

                    

In [5]:
cv = CrossValidation('ratings.dat')
train_path = 'train_set_1'
test_path = 'test_set_1'
train_set, test_set = cv.load_dataset(train_path, test_path)
model = SVD(
        1,
        train_set, 
        test_set
        )
model.training(retrain=False)

 Epoch 0  Thread--> 1  Train--> RMSE:  1.0067985715305465 MAE:  0.7854379203386799
Thread--> 1  Test--> RMSE:  1.0223616983287411 MAE:  0.7988847002298936
 Epoch 1  Thread--> 1  Train--> RMSE:  0.9178967396006897 MAE:  0.7250272590175104
Thread--> 1  Test--> RMSE:  0.9395231718065548 MAE:  0.7422750680987753
 Epoch 2  Thread--> 1  Train--> RMSE:  0.912222530488904 MAE:  0.723302811406637
Thread--> 1  Test--> RMSE:  0.9367481294782953 MAE:  0.7427041308193976
 Epoch 3  Thread--> 1  Train--> RMSE:  0.9069467649650452 MAE:  0.7192935344110981
Thread--> 1  Test--> RMSE:  0.9344694025931031 MAE:  0.7408991407058291
 Epoch 4  Thread--> 1  Train--> RMSE:  0.8995148279300396 MAE:  0.7132822488775614
Thread--> 1  Test--> RMSE:  0.9304304378155611 MAE:  0.7373464691249414
 Epoch 5  Thread--> 1  Train--> RMSE:  0.8897734634303736 MAE:  0.7053032365111488
Thread--> 1  Test--> RMSE:  0.92451377159244 MAE:  0.7321687100127698
 Epoch 6  Thread--> 1  Train--> RMSE:  0.8794330365770139 MAE:  0.69681651

KeyboardInterrupt: 

In [None]:
from uv_ch9 import iniUV, normalize
import pandas as pd
import csv
import numpy as np
import random

def errors(m_true, m_pred, index):
  sum1 = 0
  sum2 = 0
  num = len(index)
  for i in index:
    e = m_true[i[0]][i[1]] - m_pred[i[0]][i[1]]
    sum1 += e**2
    sum2 += abs(e)

  return np.sqrt(sum1/num), sum2/num


# update method in Page 24 of gravity-Tikk.pdf
def updateUV(m, u, v, irate, l, train):
    # generate e
    pred = np.dot(u, v)
    for i in train:
        error = m[i[0]][i[1]] - pred[i[0]][i[1]]
        u[i[0]] = u[i[0]] + irate * (2 * error * v[:, i[1]] - l * u[i[0]])
        v[:, i[1]] = v[:, i[1]] + irate * (2 * error * u[i[0]] - l * v[:, i[1]])

    return u, v


def MF(M, threshold, max_time, k, train, test):
    # print(M)
    M, means = normalize(M)
    # M = M.values
    u, v = iniUV(M, k)
    for i in range(max_time):
        u, v = updateUV(M, u, v, 0.005, 0.05, train)
        pred = np.dot(u, v)
        r,m = errors(M, pred, train)
        print("RMSE:", r, "MAE:", m)
        if r < threshold:
            break

    return errors(M, np.dot(u, v), test), r, m


def testTrainSplit(M):
    index = []
    random.seed(0)
    M = M.values
    for i in range(M.shape[0]):
        for j in range(M.shape[1]):
            if M[i, j] == M[i, j]:
                index.append((i, j))

    i1 = random.sample(index, round(len(index) / 5))
    index = list(set(index) - set(i1))
    i2 = random.sample(index, round(len(index) / 4))
    index = list(set(index) - set(i2))
    i3 = random.sample(index, round(len(index) / 3))
    index = list(set(index) - set(i3))
    i4 = random.sample(index, round(len(index) / 2))
    i5 = list(set(index) - set(i4))
    train1 = i2 + i3 + i4 + i5
    train2 = i1 + i3 + i4 + i5
    train3 = i1 + i2 + i4 + i5
    train4 = i1 + i2 + i3 + i5
    train5 = i1 + i2 + i3 + i4

    return [train1, train2, train3, train4, train5], \
           [i1, i2, i3, i4, i5]


def main():
    # load fire
    M = {}
    with open('ratings.dat', 'r') as f:
        reader = csv.reader(f)
        for line in reader:
            data = line[0].split('::')
            if data[0] in M:
                M[data[0]][data[1]] = data[2]
            else:
                M[data[0]] = {data[1]: data[2]}

    M = pd.DataFrame(M)
    M = M.iloc[:5, :10]
    # print(uvDecomposition(M, threshold, max_time,10))

    # 5 fold
    M = M.astype('float')
    M = M.T
    train_r = 0
    train_m = 0
    test_r = 0
    test_m = 0
    train, test = testTrainSplit(M)
    for i in range(5):
        e = MF(M, threshold, max_time, 10, train[i], test[i])
        print(e)
        test_r += e[0][0] / 5
        test_m += e[0][1] / 5
        train_r += e[1] / 5
        train_m += e[2] / 5
    print("testRMSE:", test_r,
          "\ntestMAE:", test_m,
          "\ntrainRMSE:", train_r,
          "\ntrainMAE:", train_m)


if __name__ == "__main__":
    max_time = 75
    threshold = 0.1
    main()