# Recommender systems

### A Parent Class for Task 1.2 and 1.3

In [1]:
# -*- coding: utf-8 -*-

import sklearn
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import multiprocessing
import random

# Ratings file read by MatrixModels; each record is parsed there as
# "UserID::MovieID::Rating::Timestamp".
path = "./ml-1m/ratings.dat"
# Random state passed to every model in this notebook; the models use it for
# the five-fold split and for the random initialization of the factor matrices.
seeds = 1

class MatrixModels():
    """
    The template for task 1.2 and 1.3.

    Reads the ratings table once and provides the shared experiment driver:
    `main` splits the table into five folds and runs `run_model` on each fold
    in its own process. Subclasses supply the learning rule by overriding
    `_train_iteration`.

    Args:
        path: File path of the ratings file ("UserID::MovieID::Rating::Timestamp").
        num_factors: The number of features (columns of U, rows of M).
        num_iter: Max iterations (calls to `_train_iteration` per fold).
        seeds: Set the random state (fold split and random initialization).
        save_UM: Save the feature matrices after training.
        save_info: Prefix of the file names written under ./UM/ when saving.
        random_normal: The matrix initialization method. True draws entries
            from a normal distribution (mean 0, std 0.1); False uses all ones.
    """
    def __init__(self, path, num_factors, num_iter, seeds, save_UM, save_info="", random_normal=True):
        self.path = path
        self.num_factors = num_factors
        self.num_iter = num_iter
        self.seeds = seeds
        self.save_UM = save_UM
        self.save_info = save_info
        self.random_normal = random_normal
        # Number of rows of U and columns of M. IDs in the table are 1-based,
        # so these are presumably the largest UserID / MovieID in the data set.
        self.I = 6040
        self.J = 3952
        self.table = pd.read_table(self.path, sep="::", names=["UserID", "MovieID", "Rating", "Timestamp"],
                                   engine="python")

    def _evaluation(self, U, M, table):
        """
        Evaluate model performance.

        Args:
            U: User feature matrix, indexed by (UserID - 1, factor).
            M: Movie feature matrix, indexed by (factor, MovieID - 1).
            table: Integer array whose columns 0, 1, 2 hold UserID, MovieID
                and Rating.

        Returns:
            (rmse, mae) of the predictions, after clipping them to [1, 5].
        """
        y_true = table[:, 2]
        user_idx = table[:, 0] - 1
        movie_idx = table[:, 1] - 1

        # Dot product of U[user, :] and M[:, movie] for every rating at once.
        y_pred = np.einsum("ij,ji->i", U[user_idx, :], M[:, movie_idx])

        # If y_pred < 1, y_pred = 1. If y_pred > 5, y_pred = 5.
        y_pred = np.clip(y_pred, 1, 5)

        # Calculate RMSE and MAE
        residual = y_true - y_pred
        rmse = np.sqrt(np.mean(residual ** 2))
        mae = np.mean(np.abs(residual))
        return rmse, mae

    def _train_iteration(self, U, M, table_train):
        """
        One training pass; subclasses override this.

        The base implementation only prints a reminder and returns U and M
        unchanged.
        """
        print("Implement this method.")
        return U, M

    def run_model(self, train_index, test_index, num_work=0):
        """
        Run the experiment on one fold.

        Args:
            train_index: Positional row indices of `self.table` used for training.
            test_index: Positional row indices of `self.table` used for testing.
            num_work: Worker label used in progress messages and file names.

        The fold's metrics are appended to `self.train_rmse_lst`,
        `self.train_mae_lst`, `self.test_rmse_lst` and `self.test_mae_lst`,
        which must exist before this is called (`main` creates them).
        """
        table_train, table_test = self.table.iloc[train_index], self.table.iloc[test_index]
        table_train, table_test = table_train.to_numpy(), table_test.to_numpy()

        # Initialization
        if self.random_normal:
            # The generator is re-seeded before each draw, so U and M are both
            # drawn from the start of the same random stream.
            np.random.seed(self.seeds)
            U = np.random.normal(0, 0.1, (self.I, self.num_factors))
            np.random.seed(self.seeds)
            M = np.random.normal(0, 0.1, (self.num_factors, self.J))
        else:
            U = np.ones((self.I, self.num_factors))
            M = np.ones((self.num_factors, self.J))

        # Training model. Use the instance's iteration count rather than a
        # module-level variable that happens to share the name.
        for epoch in range(self.num_iter):
            info = f"\rWorker {num_work}: {epoch} epoch.\r"
            print(info, end="")
            U, M = self._train_iteration(U, M, table_train)
            # Blank out the progress message so shorter output does not leave residue.
            print(" " * len(info), end="")

        # Save results
        if self.save_UM:
            import os

            # Create the output directory up front so a finished training run
            # is not lost to a missing folder.
            os.makedirs("./UM", exist_ok=True)
            np.save(f"./UM/{self.save_info}_U_{num_work}.npy", U)
            np.save(f"./UM/{self.save_info}_M_{num_work}.npy", M)

        # Evaluating model
        train_rmse, train_mae = self._evaluation(U, M, table_train)
        self.train_rmse_lst.append(train_rmse)
        self.train_mae_lst.append(train_mae)

        test_rmse, test_mae = self._evaluation(U, M, table_test)
        self.test_rmse_lst.append(test_rmse)
        self.test_mae_lst.append(test_mae)

        print(f"\rWorker {num_work}: Done.\nTrain RMSE: {train_rmse}, MAE: {train_mae}.\nTest RMSE: {test_rmse}, MAE: {test_mae}.")

    def main(self):
        """
        Run five-fold cross-validation, one process per fold, and print the
        mean and standard deviation of the train/test RMSE and MAE.

        After the call, the four `*_lst` attributes hold NumPy arrays with one
        entry per finished fold, in completion order.
        """
        kf = KFold(n_splits=5, shuffle=True, random_state=self.seeds) # five folds
        arg_lst = [
            (train_index, test_index, num_work)
            for num_work, (train_index, test_index) in enumerate(kf.split(self.table), start=1)
        ]

        # Multiprocessing: workers report their metrics through manager lists.
        # The context manager shuts the manager process down when we are done.
        with multiprocessing.Manager() as manager:
            self.train_rmse_lst = manager.list()
            self.train_mae_lst = manager.list()
            self.test_rmse_lst = manager.list()
            self.test_mae_lst = manager.list()
            workers = []
            for args in arg_lst:
                p = multiprocessing.Process(target=self.run_model, args=args)
                workers.append(p)
                p.start()

            for p in workers:
                p.join()

            # Copy the results out of the proxies before the manager exits.
            self.train_rmse_lst = np.array(list(self.train_rmse_lst))
            self.train_mae_lst = np.array(list(self.train_mae_lst))
            self.test_rmse_lst = np.array(list(self.test_rmse_lst))
            self.test_mae_lst = np.array(list(self.test_mae_lst))

        print(f"\nTrain RMSE Mean: {np.mean(self.train_rmse_lst)}")
        print(f"Train RMSE std: {np.std(self.train_rmse_lst)}")
        print(f"\nTrain MAE Mean: {np.mean(self.train_mae_lst)}")
        print(f"Train MAE std: {np.std(self.train_mae_lst)}")

        print(f"\nTest RMSE Mean: {np.mean(self.test_rmse_lst)}")
        print(f"Test RMSE std: {np.std(self.test_rmse_lst)}")
        print(f"\nTest MAE Mean: {np.mean(self.test_mae_lst)}")
        print(f"Test MAE std: {np.std(self.test_mae_lst)}\n")

## 1.2 UV Matrix Decomposition

In [2]:
class UVDecomposition(MatrixModels):
    """
    Implement 1.2 UV Matrix Decomposition.

    Each training iteration visits every entry of U and of V once, in a
    shuffled order, and replaces that single entry by
    sum_1 / sum_2 as computed in `_update_U` / `_update_V` while all other
    entries are held fixed.

    Args:
        path: File path.
        num_factors: The number of features.
        num_iter: Max iterations.
        seeds: Set the random state.
        save_UM: Save the feature matrices.
        save_info: Path to save feature matrices.
        random_normal: The matrix initialization method.
    """
    def __init__(self, path, num_factors, num_iter, seeds, save_UM, save_info="", random_normal=False):
        super().__init__(path, num_factors, num_iter, seeds, save_UM, save_info, random_normal)

    def _update_U(self, index, table_train, U, V):
        """
        Update the single entry U[r, s] in place from the ratings of user r + 1.

        With j ranging over the movies rated by that user and m_rj the rating:
            U[r, s] = sum_j V[s, j] * (m_rj - sum_{k != s} U[r, k] V[k, j])
                      / sum_j V[s, j] ** 2

        Args:
            index: (r, s) position in U.
            table_train: Integer array with UserID, MovieID, Rating in columns 0-2.
            U, V: Current factor matrices; U is modified in place.

        Returns:
            U, with the entry at `index` updated.
        """
        r, s = index
        rated = table_train[table_train[:, 0] == (r + 1), :]
        movie_idx = rated[:, 1] - 1
        v_s = V[s, movie_idx]

        # Predictions for all rated movies with the contribution of U[r, s] removed.
        partial = np.matmul(U[r, :], V[:, movie_idx]) - U[r, s] * v_s
        sum_1 = np.dot(v_s, rated[:, 2] - partial)
        sum_2 = np.dot(v_s, v_s)

        # Avoid dividing by zero (e.g. the user has no training ratings).
        if sum_2 == 0:
            sum_2 = 0.001
        U[r, s] = sum_1 / sum_2
        return U

    def _update_V(self, index, table_train, U, V):
        """
        Update the single entry V[r, s] in place from the ratings of movie s + 1.

        With i ranging over the users who rated that movie and m_is the rating:
            V[r, s] = sum_i U[i, r] * (m_is - sum_{k != r} U[i, k] V[k, s])
                      / sum_i U[i, r] ** 2

        Args:
            index: (r, s) position in V.
            table_train: Integer array with UserID, MovieID, Rating in columns 0-2.
            U, V: Current factor matrices; V is modified in place.

        Returns:
            V, with the entry at `index` updated.
        """
        r, s = index
        rated = table_train[table_train[:, 1] == (s + 1), :]
        user_idx = rated[:, 0] - 1
        u_r = U[user_idx, r]

        # Predictions for all raters with the contribution of V[r, s] removed.
        partial = np.matmul(U[user_idx, :], V[:, s]) - u_r * V[r, s]
        sum_1 = np.dot(u_r, rated[:, 2] - partial)
        sum_2 = np.dot(u_r, u_r)

        # Avoid dividing by zero (e.g. the movie has no training ratings).
        if sum_2 == 0:
            sum_2 = 0.001
        V[r, s] = sum_1 / sum_2
        return V

    def _train_iteration(self, U, V, table_train):
        """
        One iteration of training model.

        Every entry of U and V is updated exactly once. The visiting order is
        shuffled with the global `random` module, and updates alternate between
        one entry of U and one entry of V until both index lists are exhausted.
        """
        u_index = list(np.ndindex(U.shape))
        v_index = list(np.ndindex(V.shape))
        random.shuffle(u_index)
        random.shuffle(v_index)
        while u_index or v_index:
            if u_index:
                U = self._update_U(u_index.pop(), table_train, U, V)
            if v_index:
                V = self._update_V(v_index.pop(), table_train, U, V)
        return U, V

In [3]:
%%time
# UV decomposition run: 10 latent factors, 50 training iterations.
num_factors = 10
num_iter = 50

# save_UM=False, so the learned factor matrices are not written to disk.
model = UVDecomposition(path, num_factors, num_iter, seeds, False)
# Train and evaluate on five folds; prints per-fold and aggregate RMSE/MAE.
model.main()

Worker 4: Done.      
Train RMSE: 0.7759741550052814, MAE: 0.6042379607138241.
Test RMSE: 0.9194980525335328, MAE: 0.7093309673525477.
Worker 2: Done.      
Train RMSE: 0.7794876980234263, MAE: 0.6070292356916613.
Test RMSE: 0.9258889270376295, MAE: 0.7134553354164748.
Worker 3: Done.      
Train RMSE: 0.7827827866936587, MAE: 0.6098721520616756.
Test RMSE: 0.9254678759818764, MAE: 0.714363396890343.
Worker 1: Done.      
Train RMSE: 0.7784982938398914, MAE: 0.606155124270036.
Test RMSE: 0.9249084172855424, MAE: 0.7143216585693684.
Worker 5: Done.      
Train RMSE: 0.7844804095561114, MAE: 0.6109711793082921.
Test RMSE: 0.9252062074775714, MAE: 0.7134409112139641.

Train RMSE Mean: 0.7802446686236739
Train RMSE std: 0.003041172817314274

Train MAE Mean: 0.6076531304090977
Train MAE std: 0.002458867896031895

Test RMSE Mean: 0.9241938960632305
Test RMSE std: 0.0023699120772479277

Test MAE Mean: 0.7129824538885396
Test MAE std: 0.001869097468438227

CPU times: user 8.04 s, sys: 1.35 s, 

## 1.3 Matrix Factorization
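- For each observed rating $r_{ij}$, compute the prediction error $e_{ij} = r_{ij} - U^{(t)}[i, :]\,M^{(t)}[:, j]$, then update the corresponding row of $U$ and column of $M$ with learning rate $\eta$ and regularization factor $\lambda$: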
$$
\begin{align*}
    U^{(t+1)}[i, :] &= U^{(t)}[i, :] + \eta\left( 2e_{ij}M^{(t)}[:, j] - \lambda U^{(t)}[i, :] \right) \\
    M^{(t+1)}[:, j] &= M^{(t)}[:, j] + \eta\left( 2e_{ij}U^{(t)}[i, :] - \lambda M^{(t)}[:, j] \right) 
\end{align*}
$$
- The random seed is set to 1 for both the weight initialization and the five-fold split.
- The five folds are trained in parallel, one process per fold, using `multiprocessing`.

In [4]:
class GravityTikk(MatrixModels):
    """
    Implement 1.3 Matrix Factorization.

    Trains U and M by visiting the training ratings one at a time and taking
    a regularized gradient step on the affected row of U and column of M.

    Args:
        path: File path.
        num_factors: The number of features.
        num_iter: Max iterations.
        seeds: Set the random state.
        save_UM: Save the feature matrices.
        save_info: Path to save feature matrices.
        regularization: factor lambda of regularization term.
        learn_rate: Learning rate.
    """
    def __init__(self, path, num_factors, num_iter, seeds, save_UM, save_info, regularization, learn_rate):
        super().__init__(path, num_factors, num_iter, seeds, save_UM, save_info)
        self.learn_rate = learn_rate
        self.regularization = regularization

    def _train_iteration(self, U, M, table_train):
        """
        One iteration of training model.

        Walks through `table_train` row by row (UserID, MovieID, Rating in
        columns 0-2) and updates U and M in place. Returns (U, M).
        """
        lam = self.regularization
        eta = self.learn_rate

        for rating_row in table_train:
            # IDs in the table are 1-based; matrix indices are 0-based.
            user = rating_row[0] - 1
            movie = rating_row[1] - 1

            # Prediction error for this rating
            residual = rating_row[2] - np.matmul(U[user, :], M[:, movie])

            # Both steps are computed from the current values before either
            # matrix is modified.
            step_U = 2 * residual * M[:, movie] - lam * U[user, :]
            step_M = 2 * residual * U[user, :] - lam * M[:, movie]

            U[user, :] += eta * step_U
            M[:, movie] += eta * step_M

        return U, M

In [5]:
%%time
# Hyperparameters: Setting 1, Suggested Setting
# 10 latent factors, 75 iterations, lambda = 0.05, learning rate = 0.005.
num_factors = 10
num_iter = 75
regularization = 0.05
learn_rate = 0.005

# save_UM=True: each fold's matrices are saved as ./UM/Setting_1_U_<fold>.npy
# and ./UM/Setting_1_M_<fold>.npy.
model = GravityTikk(path, num_factors, num_iter, seeds, True, "Setting_1", regularization, learn_rate)
# Train and evaluate on five folds; prints per-fold and aggregate RMSE/MAE.
model.main()

Worker 5: Done.                                                                                                            
Train RMSE: 0.7683601075592711, MAE: 0.6031151686238267.
Test RMSE: 0.869252118445245, MAE: 0.6788092372142813.
Worker 3: Done.
Train RMSE: 0.7692080879100817, MAE: 0.6039271897899571.
Test RMSE: 0.8667277030457634, MAE: 0.6771413764266703.
Worker 1: Done.                           
Train RMSE: 0.7687806027625633, MAE: 0.6036467031032468.
Test RMSE: 0.8688135792354811, MAE: 0.6793417110229322.
Worker 4: Done.
Train RMSE: 0.7695233923797611, MAE: 0.604229214831441.
Test RMSE: 0.8681647402923015, MAE: 0.6778429775492132.
Worker 2: Done.      
Train RMSE: 0.7688675734040371, MAE: 0.6030635926962993.
Test RMSE: 0.869961123237032, MAE: 0.6795618867494325.

Train RMSE Mean: 0.7689479528031429
Train RMSE std: 0.00039467198908102393

Train MAE Mean: 0.6035963738089541
Train MAE std: 0.00045340360719525925

Test RMSE Mean: 0.8685838528511646
Test RMSE std: 0.00109701178377

In [6]:
%%time
# Hyperparameters: Setting 2
# Same as Setting 1 except num_factors is raised from 10 to 20.
num_factors = 20
num_iter = 75
regularization = 0.05
learn_rate = 0.005

# save_UM=True: each fold's matrices are saved under ./UM/ with the prefix "Setting_2".
model = GravityTikk(path, num_factors, num_iter, seeds, True, "Setting_2", regularization, learn_rate)
# Train and evaluate on five folds; prints per-fold and aggregate RMSE/MAE.
model.main()

Worker 1: Done.                                                                                                                                  
Train RMSE: 0.7003354623549547, MAE: 0.5474956277467824.
Test RMSE: 0.8850883813443313, MAE: 0.6885226869761961.
Worker 2: Done.      
Train RMSE: 0.7001989296843371, MAE: 0.5468004775412718.
Test RMSE: 0.8858082288824141, MAE: 0.6883424110941696.
Worker 3: Done.
Train RMSE: 0.7003560334230837, MAE: 0.5478898065930472.
Test RMSE: 0.8826690432936825, MAE: 0.6864840763568284.
Worker 4: Done.                           
Train RMSE: 0.7005654171342083, MAE: 0.5478999179616113.
Test RMSE: 0.884943095443112, MAE: 0.6878591775524062.
Worker 5: Done.
Train RMSE: 0.700043633196243, MAE: 0.5472587163336917.
Test RMSE: 0.8853501529849256, MAE: 0.6877561033227745.

Train RMSE Mean: 0.7002998951585653
Train RMSE std: 0.00017365636761773015

Train MAE Mean: 0.5474689092352809
Train MAE std: 0.00041342931009758014

Test RMSE Mean: 0.8847717803896931
Test RMS

In [7]:
%%time
# Hyperparameters: Setting 3
# Same as Setting 1 except num_iter is raised from 75 to 100.
num_factors = 10
num_iter = 100
regularization = 0.05
learn_rate = 0.005

# save_UM=True: each fold's matrices are saved under ./UM/ with the prefix "Setting_3".
model = GravityTikk(path, num_factors, num_iter, seeds, True, "Setting_3", regularization, learn_rate)
# Train and evaluate on five folds; prints per-fold and aggregate RMSE/MAE.
model.main()

Worker 5: Done.                                                                                                                                                                          
Train RMSE: 0.7667639200427913, MAE: 0.6015428217625622.
Test RMSE: 0.8707837737807261, MAE: 0.6796100573091163.
Worker 4: Done.                           
Train RMSE: 0.7678718338092347, MAE: 0.602625982773912.
Test RMSE: 0.8697171843897353, MAE: 0.6787139198311353.
Worker 2: Done.
Train RMSE: 0.7672440591992136, MAE: 0.6014728735722354.
Test RMSE: 0.8711468042809881, MAE: 0.6802037689134355.
Worker 3: Done.                           
Train RMSE: 0.7676115130465193, MAE: 0.6023186435014537.
Test RMSE: 0.8680923980044838, MAE: 0.6778206777685687.
Worker 1: Done.
Train RMSE: 0.7671447086099045, MAE: 0.602093143030455.
Test RMSE: 0.8701534264618374, MAE: 0.6800962248980237.

Train RMSE Mean: 0.7673272069415327
Train RMSE std: 0.0003833952802915549

Train MAE Mean: 0.6020106929281237
Train MAE std: 0.00044

In [8]:
%%time
# Hyperparameters: Setting 4
# Same as Setting 1 except regularization is lowered from 0.05 to 0.01.
num_factors = 10
num_iter = 75
regularization = 0.01
learn_rate = 0.005

# save_UM=True: each fold's matrices are saved under ./UM/ with the prefix "Setting_4".
model = GravityTikk(path, num_factors, num_iter, seeds, True, "Setting_4", regularization, learn_rate)
# Train and evaluate on five folds; prints per-fold and aggregate RMSE/MAE.
model.main()

Worker 3: Done.och.                                                                                                                           
Train RMSE: 0.7629294300940439, MAE: 0.595132831987448.
Test RMSE: 0.8792546811915221, MAE: 0.6814628644315769.
Worker 2: Done.                                                                     
Train RMSE: 0.7627406897827123, MAE: 0.5944101358043267.
Test RMSE: 0.8818924101549159, MAE: 0.6839446153786101.
Worker 4: Done.
Train RMSE: 0.7632412432495219, MAE: 0.5955417399619365.
Test RMSE: 0.8800734003309089, MAE: 0.6821658987625727.
Worker 5: Done.
Train RMSE: 0.7618778535004692, MAE: 0.5942762130463269.
Test RMSE: 0.881042540260366, MAE: 0.6829680176699284.
Worker 1: Done.
Train RMSE: 0.7625131134655544, MAE: 0.5949206366619278.
Test RMSE: 0.8813513623505449, MAE: 0.6839251525032414.

Train RMSE Mean: 0.7626604660184604
Train RMSE std: 0.0004583175420140774

Train MAE Mean: 0.5948563114923932
Train MAE std: 0.00046604730451567367

Test RMSE M

In [9]:
%%time
# Hyperparameters: Setting 5
# Same as Setting 1 except learn_rate is lowered from 0.005 to 0.001.
num_factors = 10
num_iter = 75
regularization = 0.05
learn_rate = 0.001

# save_UM=True: each fold's matrices are saved under ./UM/ with the prefix "Setting_5".
model = GravityTikk(path, num_factors, num_iter, seeds, True, "Setting_5", regularization, learn_rate)
# Train and evaluate on five folds; prints per-fold and aggregate RMSE/MAE.
model.main()

Worker 5: Done.                           
Train RMSE: 0.7976918426982755, MAE: 0.6289112804059047.
Test RMSE: 0.8616604275949277, MAE: 0.6764743933286965.
Worker 4: Done.      
Train RMSE: 0.7995131609109802, MAE: 0.6306805228799153.
Test RMSE: 0.861120907669563, MAE: 0.6759132640553595.
Worker 3: Done.                           
Train RMSE: 0.797677201124004, MAE: 0.6290281186698651.
Test RMSE: 0.8583405702971703, MAE: 0.6746248330543247.
Worker 2: Done.
Train RMSE: 0.7969787147044373, MAE: 0.6281863982603141.
Test RMSE: 0.8635206324338418, MAE: 0.6780072350774219.
Worker 1: Done.      
Train RMSE: 0.7980031717521593, MAE: 0.6290379954199995.
Test RMSE: 0.8606193243204672, MAE: 0.6763413993038675.

Train RMSE Mean: 0.7979728182379712
Train RMSE std: 0.0008400185061626239

Train MAE Mean: 0.6291688631271997
Train MAE std: 0.0008189794429669973

Test RMSE Mean: 0.8610523724631939
Test RMSE std: 0.0016738062660900876

Test MAE Mean: 0.676272224963934
Test MAE std: 0.0010861556305292468
