# Recommender systems

## 1.3 Matrix Factorization
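- Update the weights one rating at a time: each observed rating $r_{ij}$ updates row $i$ of $U$ and column $j$ of $M$, where $e_{ij} = r_{ij} - U^{(t)}[i, :]\,M^{(t)}[:, j]$ is the prediction error, $\eta$ is the learning rate, and $\lambda$ is the regularization factor: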
$$
\begin{align*}
    U^{(t+1)}[i, :] &= U^{(t)}[i, :] + \eta\left( 2e_{ij}M^{(t)}[:, j] - \lambda U^{(t)}[i, :] \right) \\
    M^{(t+1)}[:, j] &= M^{(t)}[:, j] + \eta\left( 2e_{ij}U^{(t)}[i, :] - \lambda M^{(t)}[:, j] \right) 
\end{align*}
$$
- Set the random seed to 1 for both the weight initialization and the five-fold split.
- Train the five folds in parallel with `multiprocessing`, one process per fold.

In [1]:
# -*- coding: utf-8 -*-

import sklearn
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import multiprocessing

# Ratings file; it is parsed with "::" as the field separator into
# UserID / MovieID / Rating / Timestamp columns by GravityTikk.__init__.
path = "./ml-1m/ratings.dat"
# Random state shared by the weight initialization and the five-fold split.
seeds = 1


class GravityTikk:
    """
    Implement 1.3 Matrix Factorization.

    Ratings are approximated by the product of a user-feature matrix ``U``
    (``I x num_factors``) and a feature-movie matrix ``M``
    (``num_factors x J``), trained with per-rating gradient updates and
    evaluated with five-fold cross-validation (one process per fold).

    Args:
        path: File path.
        num_factors: The number of features.
        num_iter: Max iterations.
        regularization: factor lambda of regularization term.
        learn_rate: Learning rate.
        seeds: Set the random state.
        save_info: Path to save feature matrices.
    """

    def __init__(self, path, num_factors, num_iter, regularization, learn_rate, seeds, save_info):
        self.path = path
        self.num_factors = num_factors
        self.num_iter = num_iter
        self.regularization = regularization
        self.learn_rate = learn_rate
        self.seeds = seeds
        self.save_info = save_info
        # Row count of U and column count of M. IDs are used 1-based, so
        # every UserID must be <= I and every MovieID must be <= J.
        self.I = 6040
        self.J = 3952
        # The multi-character separator "::" requires the python parser engine.
        self.table = pd.read_table(
            self.path,
            sep="::",
            names=["UserID", "MovieID", "Rating", "Timestamp"],
            engine="python",
        )

    def _train_iteration(self, U, M, table_train):
        """
        One iteration of training model.

        Visits every row of ``table_train`` once, in order, and applies the
        update rule to the matching row of ``U`` and column of ``M``.

        Args:
            U: User-feature matrix, updated in place.
            M: Feature-movie matrix, updated in place.
            table_train: 2-D array whose columns 0, 1, 2 hold the 1-based
                user id, the 1-based movie id and the rating.

        Returns:
            The (mutated) ``U`` and ``M``.
        """
        # Convert the 1-based ids to 0-based indices once, outside the loop.
        users = table_train[:, 0].astype(int) - 1
        items = table_train[:, 1].astype(int) - 1
        ratings = table_train[:, 2]

        for u, m, rating in zip(users, items, ratings):
            # Calculate the error
            error = rating - np.matmul(U[u, :], M[:, m])

            # Calculate both gradients before touching U or M, so that the
            # update of M uses the same (old) U row as the update of U.
            gradient_U = 2 * error * M[:, m] - self.regularization * U[u, :]
            gradient_M = 2 * error * U[u, :] - self.regularization * M[:, m]

            # Update U, M
            U[u, :] += self.learn_rate * gradient_U
            M[:, m] += self.learn_rate * gradient_M

        return U, M

    def _evaluation(self, U, M, table):
        """
        Evaluate model performance.

        Args:
            U: User-feature matrix.
            M: Feature-movie matrix.
            table: 2-D array whose columns 0, 1, 2 hold the 1-based user id,
                the 1-based movie id and the true rating.

        Returns:
            Tuple ``(rmse, mae)`` of the predictions clipped to ``[1, 5]``.
        """
        users = table[:, 0].astype(int) - 1
        items = table[:, 1].astype(int) - 1
        y_true = table[:, 2]

        # Row-wise dot product of U[user, :] with M[:, item] for all rows at once.
        y_pred = np.sum(U[users, :] * M[:, items].T, axis=1)

        # If y_pred < 1, y_pred = 1. If y_pred > 5, y_pred = 5.
        y_pred = np.clip(y_pred, 1, 5)

        # Calculate RMSE and MAE
        residual = y_true - y_pred
        rmse = np.sqrt(np.mean(residual ** 2))
        mae = np.mean(np.abs(residual))
        return rmse, mae

    def run_model(self, train_index, test_index, num_work=0):
        """
        Run the experiment on one fold.

        Trains on ``self.table.iloc[train_index]``, saves ``U`` and ``M``
        under ``./UM/``, evaluates on both splits and appends the metrics to
        the ``*_rmse_lst`` / ``*_mae_lst`` attributes (created by ``main``).

        Args:
            train_index: Positional row indices of the training split.
            test_index: Positional row indices of the test split.
            num_work: Worker number used in progress messages and file names.
        """
        import os  # local import: only needed to create the output directory

        table_train = self.table.iloc[train_index].to_numpy()
        table_test = self.table.iloc[test_index].to_numpy()

        # Initialization. The generator is re-seeded before each draw, so U
        # and M both start from the beginning of the same random stream.
        np.random.seed(self.seeds)
        U = np.random.normal(0, 0.1, (self.I, self.num_factors))
        np.random.seed(self.seeds)
        M = np.random.normal(0, 0.1, (self.num_factors, self.J))

        # Training model. Use the instance's own iteration count rather than
        # a module-level ``num_iter`` that may be undefined or stale.
        for epoch in range(self.num_iter):
            info = f"\rWorker {num_work}: {epoch} epoch.\r"
            print(info, end="")
            U, M = self._train_iteration(U, M, table_train)
            # Overwrite the progress text with blanks.
            print(" " * len(info), end="")

        # Save U, M (np.save does not create missing directories).
        os.makedirs("./UM", exist_ok=True)
        np.save(f"./UM/{self.save_info}_U_{num_work}.npy", U)
        np.save(f"./UM/{self.save_info}_M_{num_work}.npy", M)

        # Evaluating model
        train_rmse, train_mae = self._evaluation(U, M, table_train)
        self.train_rmse_lst.append(train_rmse)
        self.train_mae_lst.append(train_mae)

        test_rmse, test_mae = self._evaluation(U, M, table_test)
        self.test_rmse_lst.append(test_rmse)
        self.test_mae_lst.append(test_mae)

        print(f"\rWorker {num_work}: Done.\nTrain RMSE: {train_rmse}, MAE: {train_mae}.\nTest RMSE: {test_rmse}, MAE: {test_mae}.")

    def main(self):
        """
        Run five-fold cross-validation with one process per fold and print
        the mean and standard deviation of RMSE / MAE over the folds.
        """
        kf = KFold(n_splits=5, shuffle=True, random_state=self.seeds)  # five folds
        # Workers are numbered from 1.
        arg_lst = [
            (train_index, test_index, num_work)
            for num_work, (train_index, test_index) in enumerate(kf.split(self.table), start=1)
        ]

        # Multiprocessing: manager lists let the child processes report
        # their metrics back to the parent.
        manager = multiprocessing.Manager()
        self.train_rmse_lst = manager.list()
        self.train_mae_lst = manager.list()
        self.test_rmse_lst = manager.list()
        self.test_mae_lst = manager.list()

        workers = []
        for args in arg_lst:
            p = multiprocessing.Process(target=self.run_model, args=args)
            workers.append(p)
            p.start()

        for p in workers:
            p.join()

        # Entries arrive in completion order, which is fine because only
        # aggregate statistics are reported below.
        self.train_rmse_lst = np.array(self.train_rmse_lst)
        self.train_mae_lst = np.array(self.train_mae_lst)
        self.test_rmse_lst = np.array(self.test_rmse_lst)
        self.test_mae_lst = np.array(self.test_mae_lst)

        print(f"\nTrain RMSE Mean: {np.mean(self.train_rmse_lst)}")
        print(f"Train RMSE std: {np.std(self.train_rmse_lst)}")
        print(f"\nTrain MAE Mean: {np.mean(self.train_mae_lst)}")
        print(f"Train MAE std: {np.std(self.train_mae_lst)}")

        print(f"\nTest RMSE Mean: {np.mean(self.test_rmse_lst)}")
        print(f"Test RMSE std: {np.std(self.test_rmse_lst)}")
        print(f"\nTest MAE Mean: {np.mean(self.test_mae_lst)}")
        print(f"Test MAE std: {np.std(self.test_mae_lst)}\n")

In [2]:
%%time
# Hyperparameters: Setting 1, Suggested Setting
# Baseline configuration; Settings 2-5 below each change one of these values.
num_factors = 10  # number of features per user / movie
num_iter = 75  # max training iterations
regularization = 0.05  # lambda of the regularization term
learn_rate = 0.005  # learning rate (eta)

# The last argument tags the saved feature-matrix files for this setting.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_1")
# Run the five-fold experiment and print per-fold and aggregate RMSE / MAE.
model.main()

Worker 5: Done.                           
Train RMSE: 0.7683601075592711, MAE: 0.6031151686238267.
Test RMSE: 0.869252118445245, MAE: 0.6788092372142813.
Worker 3: Done.
Train RMSE: 0.7692080879100817, MAE: 0.6039271897899572.
Test RMSE: 0.8667277030457634, MAE: 0.6771413764266705.
Worker 2: Done.                           
Train RMSE: 0.7688675734040371, MAE: 0.6030635926962993.
Test RMSE: 0.869961123237032, MAE: 0.6795618867494324.
Worker 1: Done.
Train RMSE: 0.7687806027625633, MAE: 0.6036467031032469.
Test RMSE: 0.8688135792354811, MAE: 0.6793417110229322.
Worker 4: Done.      
Train RMSE: 0.7695233923797611, MAE: 0.604229214831441.
Test RMSE: 0.8681647402923015, MAE: 0.6778429775492133.

Train RMSE Mean: 0.7689479528031429
Train RMSE std: 0.00039467198908102393

Train MAE Mean: 0.6035963738089543
Train MAE std: 0.00045340360719527795

Test RMSE Mean: 0.8685838528511647
Test RMSE std: 0.0010970117837797114

Test MAE Mean: 0.6785394377925058
Test MAE std: 0.0009166753391357066

CPU

In [3]:
%%time
# Hyperparameters: Setting 2
# Same as Setting 1 except num_factors is raised from 10 to 20.
num_factors = 20  # number of features per user / movie
num_iter = 75  # max training iterations
regularization = 0.05  # lambda of the regularization term
learn_rate = 0.005  # learning rate (eta)

# The last argument tags the saved feature-matrix files for this setting.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_2")
# Run the five-fold experiment and print per-fold and aggregate RMSE / MAE.
model.main()

Worker 5: Done.                           
Train RMSE: 0.7000436331962429, MAE: 0.5472587163336917.
Test RMSE: 0.8853501529849257, MAE: 0.6877561033227745.
Worker 1: Done.
Train RMSE: 0.7003354623549547, MAE: 0.5474956277467825.
Test RMSE: 0.8850883813443313, MAE: 0.6885226869761961.
Worker 4: Done.och.                       
Train RMSE: 0.7005654171342083, MAE: 0.5478999179616113.
Test RMSE: 0.8849430954431121, MAE: 0.6878591775524062.
Worker 2: Done.      
Train RMSE: 0.7001989296843371, MAE: 0.5468004775412718.
Test RMSE: 0.8858082288824141, MAE: 0.6883424110941695.
Worker 3: Done.      
Train RMSE: 0.7003560334230837, MAE: 0.5478898065930472.
Test RMSE: 0.8826690432936825, MAE: 0.6864840763568284.

Train RMSE Mean: 0.7002998951585653
Train RMSE std: 0.00017365636761776292

Train MAE Mean: 0.5474689092352809
Train MAE std: 0.00041342931009758155

Test RMSE Mean: 0.8847717803896931
Test RMSE std: 0.001091771830001555

Test MAE Mean: 0.6877928910604749
Test MAE std: 0.0007146072363437

In [4]:
%%time
# Hyperparameters: Setting 3
# Same as Setting 1 except num_iter is raised from 75 to 100.
num_factors = 10  # number of features per user / movie
num_iter = 100  # max training iterations
regularization = 0.05  # lambda of the regularization term
learn_rate = 0.005  # learning rate (eta)

# The last argument tags the saved feature-matrix files for this setting.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_3")
# Run the five-fold experiment and print per-fold and aggregate RMSE / MAE.
model.main()

Worker 3: Done.och.                       
Train RMSE: 0.7676115130465193, MAE: 0.6023186435014537.
Test RMSE: 0.8680923980044838, MAE: 0.6778206777685686.
Worker 1: Done.      
Train RMSE: 0.7671447086099045, MAE: 0.602093143030455.
Test RMSE: 0.8701534264618374, MAE: 0.6800962248980237.
Worker 5: Done.och.                       
Train RMSE: 0.7667639200427913, MAE: 0.6015428217625625.
Test RMSE: 0.8707837737807261, MAE: 0.6796100573091163.
Worker 2: Done.      
Train RMSE: 0.7672440591992136, MAE: 0.6014728735722354.
Test RMSE: 0.8711468042809881, MAE: 0.6802037689134355.
Worker 4: Done.      
Train RMSE: 0.7678718338092348, MAE: 0.602625982773912.
Test RMSE: 0.8697171843897353, MAE: 0.6787139198311353.

Train RMSE Mean: 0.7673272069415327
Train RMSE std: 0.0003833952802915864

Train MAE Mean: 0.6020106929281237
Train MAE std: 0.0004446046010671937

Test RMSE Mean: 0.869978717383554
Test RMSE std: 0.0010648630307538373

Test MAE Mean: 0.6792889297440559
Test MAE std: 0.00090312561119

In [5]:
%%time
# Hyperparameters: Setting 4
# Same as Setting 1 except regularization is lowered from 0.05 to 0.01.
num_factors = 10  # number of features per user / movie
num_iter = 75  # max training iterations
regularization = 0.01  # lambda of the regularization term
learn_rate = 0.005  # learning rate (eta)

# The last argument tags the saved feature-matrix files for this setting.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_4")
# Run the five-fold experiment and print per-fold and aggregate RMSE / MAE.
model.main()

Worker 4: Done.                           
Train RMSE: 0.7632412432495219, MAE: 0.5955417399619365.
Test RMSE: 0.8800734003309089, MAE: 0.6821658987625727.
Worker 3: Done.
Train RMSE: 0.7629294300940438, MAE: 0.595132831987448.
Test RMSE: 0.8792546811915221, MAE: 0.6814628644315769.
Worker 5: Done.                           
Train RMSE: 0.7618778535004692, MAE: 0.5942762130463269.
Test RMSE: 0.881042540260366, MAE: 0.6829680176699284.
Worker 2: Done.
Train RMSE: 0.7627406897827123, MAE: 0.5944101358043267.
Test RMSE: 0.8818924101549159, MAE: 0.6839446153786101.
Worker 1: Done.      
Train RMSE: 0.7625131134655544, MAE: 0.5949206366619278.
Test RMSE: 0.8813513623505448, MAE: 0.6839251525032416.

Train RMSE Mean: 0.7626604660184604
Train RMSE std: 0.0004583175420140644

Train MAE Mean: 0.5948563114923932
Train MAE std: 0.0004660473045156737

Test RMSE Mean: 0.8807228788576517
Test RMSE std: 0.0009425927128970909

Test MAE Mean: 0.6828933097491859
Test MAE std: 0.0009747640554609066

CPU 

In [6]:
%%time
# Hyperparameters: Setting 5
# Same as Setting 1 except learn_rate is lowered from 0.005 to 0.001.
num_factors = 10  # number of features per user / movie
num_iter = 75  # max training iterations
regularization = 0.05  # lambda of the regularization term
learn_rate = 0.001  # learning rate (eta)

# The last argument tags the saved feature-matrix files for this setting.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_5")
# Run the five-fold experiment and print per-fold and aggregate RMSE / MAE.
model.main()

Worker 4: Done.                           
Train RMSE: 0.7995131609109801, MAE: 0.6306805228799153.
Test RMSE: 0.861120907669563, MAE: 0.6759132640553595.
Worker 3: Done.
Train RMSE: 0.797677201124004, MAE: 0.6290281186698652.
Test RMSE: 0.8583405702971703, MAE: 0.6746248330543247.
Worker 1: Done.                           
Train RMSE: 0.7980031717521593, MAE: 0.6290379954199994.
Test RMSE: 0.8606193243204672, MAE: 0.6763413993038675.
Worker 5: Done.
Train RMSE: 0.7976918426982756, MAE: 0.6289112804059047.
Test RMSE: 0.8616604275949277, MAE: 0.6764743933286965.
Worker 2: Done.      
Train RMSE: 0.7969787147044373, MAE: 0.6281863982603141.
Test RMSE: 0.8635206324338419, MAE: 0.6780072350774219.

Train RMSE Mean: 0.7979728182379712
Train RMSE std: 0.0008400185061625756

Train MAE Mean: 0.6291688631271997
Train MAE std: 0.000818979442966997

Test RMSE Mean: 0.8610523724631939
Test RMSE std: 0.0016738062660901203

Test MAE Mean: 0.676272224963934
Test MAE std: 0.0010861556305292468

CPU ti