# Recommender systems

## 1.3 Matrix Factorization

In [1]:
# -*- coding: utf-8 -*-

import sklearn
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import multiprocessing

# Ratings file handed to GravityTikk, which parses it with "::" as the field separator.
path = "./ml-1m/ratings.dat"
# Random state handed to GravityTikk; it seeds both the KFold shuffle and the
# initial draw of the factor matrices.
seeds = 1


class GravityTikk():
    """
    Implement 1.3 Matrix Factorization, trained with per-rating gradient steps.

    A rating is predicted as ``U[user - 1, :] @ M[:, movie - 1]`` where ``U``
    has shape ``(I, num_factors)`` and ``M`` has shape ``(num_factors, J)``.
    ``main`` evaluates the model with 5-fold cross-validation, one process
    per fold.

    Args:
        path: File path of the ratings file
            (``UserID::MovieID::Rating::Timestamp`` per line).
        num_factors: The number of features.
        num_iter: Max iterations (full passes over the training ratings).
        regularization: factor lambda of regularization term.
        learn_rate: Learning rate.
        seeds: Set the random state.
        save_info: Prefix of the file names used to save feature matrices.
    """
    def __init__(self, path, num_factors, num_iter, regularization, learn_rate, seeds, save_info):
        self.path = path
        self.num_factors = num_factors
        self.num_iter = num_iter
        self.regularization = regularization
        self.learn_rate = learn_rate
        self.seeds = seeds
        self.save_info = save_info
        # Fixed matrix sizes. Ids are used as 1-based row / column indices, so
        # no UserID may exceed I and no MovieID may exceed J.
        # NOTE(review): these look like the MovieLens-1M id ranges - confirm
        # before pointing `path` at another dataset.
        self.I = 6040
        self.J = 3952
        self.table = pd.read_table(self.path, sep="::", names=["UserID", "MovieID", "Rating", "Timestamp"],
                                   engine="python")

    def _train_iteration(self, U, M, table_train):
        """
        One iteration of training model: a single pass over all training rows.

        Args:
            U: User feature matrix, shape ``(I, num_factors)``; updated in place.
            M: Movie feature matrix, shape ``(num_factors, J)``; updated in place.
            table_train: 2-D array whose first three columns are
                UserID, MovieID and Rating.

        Returns:
            The same ``U`` and ``M`` objects after the pass.
        """
        # Ids are 1-based, matrix indices are 0-based.
        users = (table_train[:, 0].astype(int) - 1).tolist()
        movies = (table_train[:, 1].astype(int) - 1).tolist()
        ratings = table_train[:, 2].tolist()

        for u, m, rating in zip(users, movies, ratings):
            user_vec = U[u, :]   # view: in-place updates below write into U
            movie_vec = M[:, m]  # view: in-place updates below write into M

            # Calculate the error
            error = rating - np.matmul(user_vec, movie_vec)

            # Calculate both gradients before touching either vector so the
            # movie gradient uses the pre-update user vector.
            gradient_U = 2 * error * movie_vec - self.regularization * user_vec
            gradient_M = 2 * error * user_vec - self.regularization * movie_vec

            # Update U, M
            user_vec += self.learn_rate * gradient_U
            movie_vec += self.learn_rate * gradient_M

        return U, M

    def _evaluation(self, U, M, table_test):
        """
        Evaluate model performance on held-out ratings.

        Args:
            U: User feature matrix, shape ``(I, num_factors)``.
            M: Movie feature matrix, shape ``(num_factors, J)``.
            table_test: 2-D array whose first three columns are
                UserID, MovieID and Rating.

        Returns:
            Tuple ``(rmse, mae)`` computed on predictions clamped to [1, 5].
        """
        users = table_test[:, 0].astype(int) - 1
        movies = table_test[:, 1].astype(int) - 1
        y_true = table_test[:, 2]

        # Row-wise dot product of each (user vector, movie vector) pair.
        y_pred = np.einsum("ij,ij->i", U[users, :], M[:, movies].T)

        # If y_pred < 1, y_pred = 1. If y_pred > 5, y_pred = 5.
        y_pred = np.clip(y_pred, 1, 5)

        # Calculate RMSE and MAE
        rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
        mae = np.mean(np.abs(y_true - y_pred))
        return rmse, mae

    def run_model(self, train_index, test_index, num_work=0):
        """
        Run the experiment on one fold.

        Trains for ``self.num_iter`` passes, saves the learned matrices under
        ``./UM/`` and evaluates on the test rows.

        Args:
            train_index: Positional row indices of ``self.table`` used for training.
            test_index: Positional row indices of ``self.table`` used for evaluation.
            num_work: Worker number used in progress output and file names.

        Returns:
            Tuple ``(rmse, mae)`` for this fold. The values are also appended
            to ``self.rmse_lst`` / ``self.mae_lst`` when those are lists
            (``main`` sets them up as shared lists before starting workers).
        """
        import os  # local import so the class does not depend on a file-level import

        table_train, table_test = self.table.iloc[train_index], self.table.iloc[test_index]
        table_train, table_test = table_train.to_numpy(), table_test.to_numpy()

        # Initialization. The generator is re-seeded before each draw, so U and
        # M are both drawn from the start of the same random stream; kept as-is
        # so previously saved results stay reproducible.
        np.random.seed(self.seeds)
        U = np.random.normal(0, 0.1, (self.I, self.num_factors))
        np.random.seed(self.seeds)
        M = np.random.normal(0, 0.1, (self.num_factors, self.J))

        # Training model. Uses the instance's own iteration count rather than
        # whatever `num_iter` happens to exist at module level.
        for epoch in range(self.num_iter):
            info = f"\rWorker {num_work}: {epoch} epoch.\r"
            print(info, end="")
            U, M = self._train_iteration(U, M, table_train)
            print(" " * len(info), end="")

        # Save U, M (create the output directory first; exist_ok makes this
        # safe when several workers reach this point together)
        os.makedirs("./UM", exist_ok=True)
        np.save(f"./UM/{self.save_info}_U_{num_work}.npy", U)
        np.save(f"./UM/{self.save_info}_M_{num_work}.npy", M)

        # Evaluating model
        rmse, mae = self._evaluation(U, M, table_test)
        # Record the scores only when a list-like collector is present, so the
        # method can also be called on its own without main().
        for attr, value in (("rmse_lst", rmse), ("mae_lst", mae)):
            collector = getattr(self, attr, None)
            if hasattr(collector, "append"):
                collector.append(value)
        print(f"\rWorker {num_work}: Done, RMSE: {rmse}, MAE: {mae}.")
        return rmse, mae

    def main(self):
        """
        Run 5-fold cross-validation with one process per fold and print the
        mean and standard deviation of RMSE and MAE over the folds.

        After the call ``self.rmse_lst`` and ``self.mae_lst`` are numpy arrays
        holding the per-fold scores in worker completion order.
        """
        kf = KFold(n_splits=5, shuffle=True, random_state=self.seeds) # five folds
        # Workers are numbered from 1 in split order.
        arg_lst = [
            (train_index, test_index, num_work)
            for num_work, (train_index, test_index) in enumerate(kf.split(self.table), start=1)
        ]

        # Multiprocessing: manager lists are shared with the child processes
        manager = multiprocessing.Manager()
        self.rmse_lst = manager.list()
        self.mae_lst = manager.list()
        workers = []
        for args in arg_lst:
            p = multiprocessing.Process(target=self.run_model, args=args)
            workers.append(p)
            p.start()

        for p in workers:
            p.join()

        self.rmse_lst = np.array(self.rmse_lst)
        self.mae_lst = np.array(self.mae_lst)
        print(f"\nRMSE Mean: {np.mean(self.rmse_lst)}")
        print(f"RMSE std: {np.std(self.rmse_lst)}")
        print(f"\nMAE Mean: {np.mean(self.mae_lst)}")
        print(f"MAE std: {np.std(self.mae_lst)}")

In [2]:
%%time
# Hyperparameters: Setting 1, Suggested Setting
# Baseline configuration; the later settings each change one of these values.
num_factors = 10        # number of features per user / movie
num_iter = 75           # max iterations
regularization = 0.05   # factor lambda of the regularization term
learn_rate = 0.005      # learning rate

# "Setting_1" is the save_info prefix used in the names of the saved U / M files.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_1")
# Runs the 5-fold experiment and prints per-worker and aggregate RMSE / MAE.
model.main()

Worker 4: Done, RMSE: 0.8681647402923015, MAE: 0.6778429775492133.
Worker 3: Done, RMSE: 0.8667277030457634, MAE: 0.6771413764266705.
Worker 1: Done, RMSE: 0.8688135792354811, MAE: 0.6793417110229322.
Worker 2: Done, RMSE: 0.869961123237032, MAE: 0.6795618867494324.
Worker 5: Done, RMSE: 0.869252118445245, MAE: 0.6788092372142813.

RMSE Mean: 0.8685838528511646
RMSE std: 0.0010970117837797114

MAE Mean: 0.6785394377925058
MAE std: 0.0009166753391357066
CPU times: user 3.31 s, sys: 243 ms, total: 3.56 s
Wall time: 13min 38s


In [3]:
%%time
# Hyperparameters: Setting 2
# Differs from Setting 1 only in num_factors (10 -> 20).
num_factors = 20        # number of features per user / movie
num_iter = 75           # max iterations
regularization = 0.05   # factor lambda of the regularization term
learn_rate = 0.005      # learning rate

# "Setting_2" is the save_info prefix used in the names of the saved U / M files.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_2")
# Runs the 5-fold experiment and prints per-worker and aggregate RMSE / MAE.
model.main()

Worker 5: Done, RMSE: 0.8853501529849257, MAE: 0.6877561033227745.
Worker 3: Done, RMSE: 0.8826690432936825, MAE: 0.6864840763568284.
Worker 4: Done, RMSE: 0.8849430954431121, MAE: 0.6878591775524062.
Worker 1: Done, RMSE: 0.8850883813443313, MAE: 0.6885226869761961.
Worker 2: Done, RMSE: 0.8858082288824141, MAE: 0.6883424110941695.

RMSE Mean: 0.8847717803896931
RMSE std: 0.001091771830001555

MAE Mean: 0.6877928910604749
MAE std: 0.0007146072363437182
CPU times: user 3.15 s, sys: 269 ms, total: 3.42 s
Wall time: 18min 38s


In [4]:
%%time
# Hyperparameters: Setting 3
# Differs from Setting 1 only in num_iter (75 -> 100).
num_factors = 10        # number of features per user / movie
num_iter = 100          # max iterations
regularization = 0.05   # factor lambda of the regularization term
learn_rate = 0.005      # learning rate

# "Setting_3" is the save_info prefix used in the names of the saved U / M files.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_3")
# Runs the 5-fold experiment and prints per-worker and aggregate RMSE / MAE.
model.main()

Worker 3: Done, RMSE: 0.8680923980044838, MAE: 0.6778206777685686.
Worker 1: Done, RMSE: 0.8701534264618374, MAE: 0.6800962248980237.
Worker 2: Done, RMSE: 0.8711468042809881, MAE: 0.6802037689134355.
Worker 4: Done, RMSE: 0.8697171843897353, MAE: 0.6787139198311353.
Worker 5: Done, RMSE: 0.8707837737807261, MAE: 0.6796100573091163.

RMSE Mean: 0.869978717383554
RMSE std: 0.0010648630307538373

MAE Mean: 0.6792889297440559
MAE std: 0.0009031256111939246
CPU times: user 3.28 s, sys: 299 ms, total: 3.57 s
Wall time: 24min 52s


In [5]:
%%time
# Hyperparameters: Setting 4
# Differs from Setting 1 only in regularization (0.05 -> 0.01).
num_factors = 10        # number of features per user / movie
num_iter = 75           # max iterations
regularization = 0.01   # factor lambda of the regularization term
learn_rate = 0.005      # learning rate

# "Setting_4" is the save_info prefix used in the names of the saved U / M files.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_4")
# Runs the 5-fold experiment and prints per-worker and aggregate RMSE / MAE.
model.main()

Worker 2: Done, RMSE: 0.8818924101549159, MAE: 0.6839446153786101.
Worker 3: Done, RMSE: 0.8792546811915221, MAE: 0.6814628644315769.
Worker 5: Done, RMSE: 0.881042540260366, MAE: 0.6829680176699284.
Worker 4: Done, RMSE: 0.8800734003309089, MAE: 0.6821658987625727.
Worker 1: Done, RMSE: 0.8813513623505448, MAE: 0.6839251525032416.

RMSE Mean: 0.8807228788576514
RMSE std: 0.0009425927128970909

MAE Mean: 0.6828933097491859
MAE std: 0.0009747640554609066
CPU times: user 3.06 s, sys: 266 ms, total: 3.33 s
Wall time: 18min 49s


In [6]:
%%time
# Hyperparameters: Setting 5
# Differs from Setting 1 only in learn_rate (0.005 -> 0.001).
num_factors = 10        # number of features per user / movie
num_iter = 75           # max iterations
regularization = 0.05   # factor lambda of the regularization term
learn_rate = 0.001      # learning rate

# "Setting_5" is the save_info prefix used in the names of the saved U / M files.
model = GravityTikk(path, num_factors, num_iter, regularization, learn_rate, seeds, "Setting_5")
# Runs the 5-fold experiment and prints per-worker and aggregate RMSE / MAE.
model.main()

Worker 2: Done, RMSE: 0.8635206324338419, MAE: 0.6780072350774219.
Worker 4: Done, RMSE: 0.861120907669563, MAE: 0.6759132640553595.
Worker 3: Done, RMSE: 0.8583405702971703, MAE: 0.6746248330543247.
Worker 1: Done, RMSE: 0.8606193243204672, MAE: 0.6763413993038675.
Worker 5: Done, RMSE: 0.8616604275949277, MAE: 0.6764743933286965.

RMSE Mean: 0.8610523724631941
RMSE std: 0.0016738062660901203

MAE Mean: 0.676272224963934
MAE std: 0.0010861556305292468
CPU times: user 3.02 s, sys: 294 ms, total: 3.32 s
Wall time: 18min 42s
