# Recommender systems

## 1.3 Matrix Factorization

In [1]:
# -*- coding: utf-8 -*-

import multiprocessing
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import KFold

# Ratings file; it is parsed with "::" as the column separator into
# UserID / MovieID / Rating / Timestamp columns.
path = "./ml-1m/ratings.dat"

# Hyperparameters
num_factors = 10        # number of latent factors (columns of U, rows of M)
num_iter = 75           # number of training passes over the training ratings
regularization = 0.05   # weight of the regularization term in each gradient step
learn_rate = 0.005      # step size applied to each gradient

# Seed shared by the K-fold shuffling and the NumPy initialization of U and M.
seeds = 1


class GravityTikk():
    """
    Implement 1.3 Matrix Factorization.

    Ratings are approximated by the product of a user-factor matrix ``U``
    (users x factors) and an item-factor matrix ``M`` (factors x items).
    The factors are trained with one gradient step per rating and evaluated
    by RMSE under K-fold cross-validation, one process per fold.

    Args:
        path: File path.
        num_factors: The number of features.
        num_iter: Max iterations.
        regularization: factor lambda of regularization term.
        learn_rate: Learning rate.
        seeds: Seed for the fold shuffling and for the initialization of
            ``U`` and ``M``.
    """
    def __init__(self, path, num_factors, num_iter, regularization, learn_rate, seeds):
        self.path = path
        self.num_factors = num_factors
        self.num_iter = num_iter
        self.regularization = regularization
        self.learn_rate = learn_rate
        self.seeds = seeds
        # Sizes of the factor matrices. Ids are used as 1-based indices
        # (``id - 1``), so every id in the file must be <= these bounds.
        # NOTE(review): hard-coded, presumably for the ml-1m data set -
        # confirm before pointing ``path`` at different data.
        self.I = 6040
        self.J = 3952
        self.table = pd.read_table(self.path, sep="::", names=["UserID", "MovieID", "Rating", "Timestamp"],
                                   engine="python")

    def _train_iteration(self, U, M, table_train):
        """
        One iteration of training model.

        Visits the rows of ``table_train`` in order and applies one gradient
        step per rating. ``U`` and ``M`` are modified in place.

        Args:
            U: User-factor matrix, addressed as ``U[user_id - 1, :]``.
            M: Item-factor matrix, addressed as ``M[:, movie_id - 1]``.
            table_train: 2-D array whose columns 0, 1 and 2 hold the user id,
                the movie id and the rating.

        Returns:
            The tuple ``(U, M)`` (the same, updated, arrays).
        """
        reg = self.regularization
        lr = self.learn_rate

        # Extract the index columns once instead of re-deriving them per use.
        users = table_train[:, 0].astype(np.intp) - 1
        items = table_train[:, 1].astype(np.intp) - 1
        ratings = table_train[:, 2]

        for u, j, rating in zip(users, items, ratings):
            user_vec = U[u, :]
            item_vec = M[:, j]

            # Calculate the error
            error = rating - np.matmul(user_vec, item_vec)

            # Calculate both gradients from the not-yet-updated factors
            gradient_U = 2 * error * item_vec - reg * user_vec
            gradient_M = 2 * error * user_vec - reg * item_vec

            # Update U, M
            U[u, :] += lr * gradient_U
            M[:, j] += lr * gradient_M

        return U, M

    def _evaluation(self, U, M, table_test):
        """
        Evaluate model performance.

        Args:
            U: User-factor matrix, addressed as ``U[user_id - 1, :]``.
            M: Item-factor matrix, addressed as ``M[:, movie_id - 1]``.
            table_test: 2-D array whose columns 0, 1 and 2 hold the user id,
                the movie id and the rating.

        Returns:
            Root-mean-square error between the ratings in ``table_test`` and
            the predictions ``U[user] . M[:, item]``.
        """
        users = table_test[:, 0].astype(np.intp) - 1
        items = table_test[:, 1].astype(np.intp) - 1
        y_true = table_test[:, 2]

        # Row-wise dot products of the selected user and item vectors.
        y_pred = np.sum(U[users, :] * M[:, items].T, axis=1)

        return np.sqrt(np.mean((y_true - y_pred) ** 2))

    def run_model(self, train_index, test_index, num_work=0):
        """
        Run the experiment on one fold.

        Trains for ``self.num_iter`` epochs on the training rows, saves the
        factors to ``./UM/U_<num_work>.npy`` and ``./UM/M_<num_work>.npy``
        (the directory is created if needed), evaluates on the test rows
        and prints the result.

        Args:
            train_index: Positional row indices of ``self.table`` to train on.
            test_index: Positional row indices of ``self.table`` to test on.
            num_work: Worker number, used in progress output and file names.

        Returns:
            The RMSE on the test rows. It is also appended to
            ``self.rmse_lst`` when that collector exists (set up by ``main``).
        """
        table_train = self.table.iloc[train_index].to_numpy()
        table_test = self.table.iloc[test_index].to_numpy()

        # Initialization
        # NOTE: the generator is re-seeded with the same seed before each
        # draw, so U and M start from the same random stream. Kept as-is so
        # that results stay reproducible.
        np.random.seed(self.seeds)
        U = np.random.normal(0, 0.1, (self.I, self.num_factors))
        np.random.seed(self.seeds)
        M = np.random.normal(0, 0.1, (self.num_factors, self.J))

        # Training model (use the instance setting, not a module global)
        for epoch in range(self.num_iter):
            info = f"\rWorker {num_work}: {epoch} epoch.\r"
            print(info, end="")
            U, M = self._train_iteration(U, M, table_train)
            print(" " * len(info), end="")

        # Save U, M (make sure the target directory exists, so a finished
        # training run is not lost to a missing folder)
        os.makedirs("./UM", exist_ok=True)
        np.save(f"./UM/U_{num_work}.npy", U)
        np.save(f"./UM/M_{num_work}.npy", M)

        # Evaluating model
        rmse = self._evaluation(U, M, table_test)
        collector = getattr(self, "rmse_lst", None)
        if hasattr(collector, "append"):
            collector.append(rmse)
        print(f"\rWorker {num_work}: Done, RMSE: {rmse}.")
        return rmse

    def main(self, n_splits=5):
        """
        Cross-validate the model, one worker process per fold.

        Args:
            n_splits: Number of folds (and worker processes). Defaults to 5.

        After all workers have finished, ``self.rmse_lst`` holds the per-fold
        RMSE values as a NumPy array and their mean and standard deviation
        are printed.
        """
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=self.seeds)
        arg_lst = [
            (train_index, test_index, num_work)
            for num_work, (train_index, test_index) in enumerate(kf.split(self.table), start=1)
        ]

        # Multiprocessing: a managed list lets the workers report their RMSE
        manager = multiprocessing.Manager()
        self.rmse_lst = manager.list()
        workers = []
        for args in arg_lst:
            p = multiprocessing.Process(target=self.run_model, args=args)
            workers.append(p)
            p.start()

        for p in workers:
            p.join()

        # A crashed worker reports nothing; make that visible instead of
        # silently averaging over fewer folds.
        failed = [args[2] for args, p in zip(arg_lst, workers) if p.exitcode != 0]
        if failed:
            print(f"\nWarning: worker(s) {failed} exited abnormally; "
                  "statistics cover the remaining folds only.")

        self.rmse_lst = np.array(self.rmse_lst)
        print(f"\nMean: {np.mean(self.rmse_lst)}")
        print(f"std: {np.std(self.rmse_lst)}")
        
        
if __name__ == "__main__":
    # Build the model from the module-level settings, then run the
    # cross-validation experiment.
    model = GravityTikk(
        path=path,
        num_factors=num_factors,
        num_iter=num_iter,
        regularization=regularization,
        learn_rate=learn_rate,
        seeds=seeds,
    )
    model.main()

Worker 2: Done, RMSE: 0.8723353073109018.
Worker 5: Done, RMSE: 0.871580729932603.  
Worker 1: Done, RMSE: 0.8708779166325852.
Worker 3: Done, RMSE: 0.8690114582284748. 
Worker 4: Done, RMSE: 0.8703583826014601.

Mean: 0.8708327589412048
std: 0.0011279468624224753
