# STA326 Assignment 3

In [5]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

import scipy.sparse as sp
import math
import random
from copy import deepcopy
import time
import csv

In [6]:
print(torch.__version__)

2.3.0+cu121


In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Data

In [8]:
def load_all(data_name):
    """ We load all the three file here to save time in each epoch. """
    train_rating = './Data/{}.train.rating'.format(data_name)
    train_data = pd.read_csv(
        train_rating, 
        sep='\t', header=None, names=['user', 'item'], 
        usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

    user_num = train_data['user'].max() + 1
    item_num = train_data['item'].max() + 1

    train_data = train_data.values.tolist()

    # load ratings as a dok matrix
    train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
    for x in train_data:
        train_mat[x[0], x[1]] = 1.0

    test_negative = './Data/{}.test.negative'.format(data_name)
    test_data = []
    with open(test_negative, 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            test_data.append([u, eval(arr[0])[1]])
            for i in arr[1:]:
                test_data.append([u, int(i)])
            line = fd.readline()
    return train_data, test_data, user_num, item_num, train_mat

In [9]:
class NCFData(Dataset):
    def __init__(self, features, num_item, train_mat=None, num_ng=0, is_training=None):
        super(NCFData, self).__init__()
        """ Note that the labels are only useful when training, we thus 
            add them in the ng_sample() function.
        """
        self.features_ps = features
        self.num_item = num_item
        self.train_mat = train_mat
        self.num_ng = num_ng
        self.is_training = is_training
        self.labels = [0 for _ in range(len(features))]
        
        if self.is_training:
            self.ng_sample()

    def ng_sample(self):
#         assert self.is_training, 'no need to sampling when testing'

        self.features_ng = []
        for x in self.features_ps:
            u = x[0]
            for t in range(self.num_ng):
                j = np.random.randint(self.num_item)
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1 for _ in range(len(self.features_ps))]
        labels_ng = [0 for _ in range(len(self.features_ng))]

        self.features_fill = self.features_ps + self.features_ng
        self.labels_fill = labels_ps + labels_ng

    def __len__(self):
        return (self.num_ng + 1) * len(self.labels)

    def __getitem__(self, idx):
        features = self.features_fill if self.is_training \
                    else self.features_ps
        labels = self.labels_fill if self.is_training \
                    else self.labels

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        return user, item ,label

In [10]:
def loader(data_name, batch_size, test_num_ng):
#     data_name = 'ml-1m'
#     batch_size = 256
#     test_num_ng = 99

    train_data, test_data, user_num ,item_num, train_mat = load_all(data_name)

    # construct the train and test datasets
    train_dataset = NCFData(train_data, item_num, train_mat, 4, True)
    test_dataset = NCFData(test_data, item_num, train_mat, 0, False)

    train_loader = DataLoader(train_dataset,batch_size=batch_size, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset,batch_size=test_num_ng+1, shuffle=False, num_workers=0)
    return user_num,item_num,train_loader, test_loader

## Evaluate

In [11]:
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0

def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index+2))
    return 0

def metrics(model, test_loader, top_k):
    HR, NDCG = [], []

    for user, item, label in test_loader:
        user = user.to(device)
        item = item.to(device)

        predictions = model(user, item)
        _, indices = torch.topk(predictions, top_k)
        recommends = torch.take(item, indices).cpu().numpy().tolist()

        gt_item = item[0].item()
        HR.append(hit(gt_item, recommends))
        NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

## Model

GMF, MLP and NeuMF model are written in one Module class `NCF`.
* GMF can be implemented using `NCF(user_num, item_num, factor_num, 0, 0.0, 'GMF', None, None)`.
* MLP-i can be implemented using `NCF(user_num, item_num, factor_num, i, 0.0, 'MLP', None, None)`, where i represents the number of hidden layers.
* NeuMF with pre-training can be implemented using `NCF(user_num, item_num, factor_num, i, 0.0, 'NeuMF-pre', GMF_model, MLP_model)`, where  GMF_model and MLP_model represent pre trained GMF and MLP, respectively, and i represents the number of hidden layers of MLP_model.
* NeuMF that are trained from scratch can be implemented using `NCF(user_num, item_num, factor_num, i, 'NeuMF', None, None, None)`.

In [12]:
class NCF(nn.Module):
    def __init__(self, user_num, item_num, factor_num, num_layers,dropout, model, GMF_model=None, MLP_model=None):
        super(NCF, self).__init__()
        """
        user_num: number of users;
        item_num: number of items;
        factor_num: number of predictive factors;
        num_layers: the number of layers in MLP model;
        dropout: dropout rate between fully connected layers;
        model: 'MLP', 'GMF', 'NeuMF-end', and 'NeuMF-pre';
        GMF_model: pre-trained GMF weights;
        MLP_model: pre-trained MLP weights.
        """        
        self.dropout = dropout
        self.model = model
        self.GMF_model = GMF_model
        self.MLP_model = MLP_model

        # Embedding layers for GMF and MLP
        self.embed_user_GMF = nn.Embedding(user_num, factor_num)
        self.embed_item_GMF = nn.Embedding(item_num, factor_num)
        self.embed_user_MLP = nn.Embedding(user_num, int(factor_num * (2 ** (num_layers - 1))))
        self.embed_item_MLP = nn.Embedding(item_num, int(factor_num * (2 ** (num_layers - 1))))

        # MLP layers
        MLP_modules = []
        for i in range(num_layers):
            input_size = factor_num * (2 ** (num_layers - i))
            MLP_modules.append(nn.Dropout(p=self.dropout))
            MLP_modules.append(nn.Linear(input_size, input_size // 2))
            MLP_modules.append(nn.ReLU())
        self.MLP_layers = nn.Sequential(*MLP_modules)

        # Prediction layer
        if self.model in ['MLP', 'GMF']:
            predict_size = factor_num 
        else:
            predict_size = factor_num * 2
        self.predict_layer = nn.Linear(predict_size, 1)

        self._init_weight_()
        
    def _init_weight_(self):
        """ We leave the weights initialization here. """
        if not self.model == 'NeuMF-pre':
            nn.init.normal_(self.embed_user_GMF.weight, std=0.01)
            nn.init.normal_(self.embed_user_MLP.weight, std=0.01)
            nn.init.normal_(self.embed_item_GMF.weight, std=0.01)
            nn.init.normal_(self.embed_item_MLP.weight, std=0.01)

            for m in self.MLP_layers:
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
            nn.init.kaiming_uniform_(self.predict_layer.weight, a=1, nonlinearity='sigmoid')

            for m in self.modules():
                if isinstance(m, nn.Linear) and m.bias is not None:
                    m.bias.data.zero_()
        else:
            # embedding layers
            self.embed_user_GMF.weight.data.copy_(
                            self.GMF_model.embed_user_GMF.weight)
            self.embed_item_GMF.weight.data.copy_(
                            self.GMF_model.embed_item_GMF.weight)
            self.embed_user_MLP.weight.data.copy_(
                            self.MLP_model.embed_user_MLP.weight)
            self.embed_item_MLP.weight.data.copy_(
                            self.MLP_model.embed_item_MLP.weight)

            # mlp layers
            for (m1, m2) in zip(
                self.MLP_layers, self.MLP_model.MLP_layers):
                if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
                    m1.weight.data.copy_(m2.weight)
                    m1.bias.data.copy_(m2.bias)

            # predict layers
            predict_weight = torch.cat([
                self.GMF_model.predict_layer.weight, 
                self.MLP_model.predict_layer.weight], dim=1)
            precit_bias = self.GMF_model.predict_layer.bias + \
                        self.MLP_model.predict_layer.bias

            self.predict_layer.weight.data.copy_(0.5 * predict_weight)
            self.predict_layer.bias.data.copy_(0.5 * precit_bias)

    def forward(self, user, item):
        if not self.model == 'MLP':
            embed_user_GMF = self.embed_user_GMF(user)
            embed_item_GMF = self.embed_item_GMF(item)
            output_GMF = embed_user_GMF * embed_item_GMF
        if not self.model == 'GMF':
            embed_user_MLP = self.embed_user_MLP(user)
            embed_item_MLP = self.embed_item_MLP(item)
            interaction = torch.cat((embed_user_MLP, embed_item_MLP), -1)
            output_MLP = self.MLP_layers(interaction)

        if self.model == 'GMF':
            concat = output_GMF
        elif self.model == 'MLP':
            concat = output_MLP
        else:
            concat = torch.cat((output_GMF, output_MLP), -1)
            
        prediction = self.predict_layer(concat)
        prediction = torch.sigmoid(prediction)
        return prediction.view(-1)


## Training model

In [32]:
def training(model, train_loader, test_loader, NUM_EPOCHS, optimizer_name, lr, data_name, model_name, csv_name): 
    if optimizer_name == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=lr)
        
    loss_function = nn.BCEWithLogitsLoss()
    top_k = 10
    
    # Create a CSV file if it does not exist.
    csv_file_path = f'./Output/{csv_name}.csv'
    file_exists = os.path.exists(csv_file_path)
    if not file_exists:
        with open(csv_file_path, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile)
            header = ['epoch', 'HR', 'NDCG', 'data_name', 'model_name']
            csvwriter.writerow(header)

    for epoch in range(NUM_EPOCHS):
        model.train()
        start_time = time.time()

        for user, item, label in train_loader:
            user = user.to(device)
            item = item.to(device)
            label = label.float().to(device)

            prediction = model(user, item)
            loss = loss_function(prediction, label)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        
        HR, NDCG = metrics(model, test_loader, top_k)
        
        # Write data to csv file.
        with open(csv_file_path, 'a', newline='') as csvfile:
            csvwriter = csv.writer(csvfile)
            row = [epoch+1, HR, NDCG, data_name, model_name]
            csvwriter.writerow(row)
        
        # Save model data every five epochs.
        if (epoch+1) % 5 == 0:
            state = {
                    'state_dict': model.state_dict(),
                    'HR': HR,
                    'NDCG': NDCG,
                    'top_k': top_k,
                    'epoch': (epoch+1),
            }
            save_dir = './Model'
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            model_path = osp.join(save_dir, "{}_{}_{}.pth".format(model_name ,data_name , epoch+1))

            torch.save(state, model_path)
            
            # Output information every five epochs.
            elapsed_time = time.time() - start_time
            print("The time elapse of epoch {:d} for model {} and data {}".format(epoch+1,model_name, data_name) + " is: " + 
                    time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
            print("HR: {:.3f}\tNDCG: {:.3f}".format(np.mean(HR), np.mean(NDCG)))

In [34]:
NUM_EPOCHS = 10
factor_num = 8
batch_size = 256

Performance Comparison

In [36]:

for data_name in ['ml-1m', 'pinterest-20']:
    user_num, item_num, train_loader, test_loader = loader(data_name, batch_size=batch_size, test_num_ng=99)
    
    for model_name in ['gmf','neumf_pre']:
        optimizer_name = 'adam'
        lr = 0.001
        if model_name=='gmf':
            model = NCF(user_num, item_num, factor_num, 0, 0.0, 'GMF', None, None)
        else:
            model_GMF = NCF(user_num, item_num, factor_num, 0, 0.0, 'GMF', None, None)
            model_GMF.to(device)
            checkpoint_GMF = torch.load('Model/gmf_pinterest-20_10.pth')
            model_GMF.load_state_dict(checkpoint_GMF['state_dict'])

            model_MLP = NCF(user_num, item_num, factor_num, 3, 0.0, 'MLP', None, None)
            model_MLP.to(device)
            checkpoint_MLP = torch.load('Model/mlp_3_pinterest-20_10.pth')
            model_MLP.load_state_dict(checkpoint_MLP['state_dict'])
            
            model = NCF(user_num, item_num, factor_num, 3, 0.0, 'NeuMF-pre',model_GMF,model_MLP)
            
            optimizer_name = 'sgd'
        model.to(device)
        
        training(model, train_loader, test_loader, NUM_EPOCHS, optimizer_name, lr, data_name, model_name, 'ncf')


The time elapse of epoch 5 for model gmf and data ml-1m is: 00: 00: 35
HR: 0.620	NDCG: 0.354
The time elapse of epoch 10 for model gmf and data ml-1m is: 00: 00: 34
HR: 0.636	NDCG: 0.364
The time elapse of epoch 5 for model neumf_pre and data ml-1m is: 00: 00: 35
HR: 0.662	NDCG: 0.386
The time elapse of epoch 10 for model neumf_pre and data ml-1m is: 00: 00: 34
HR: 0.662	NDCG: 0.386
The time elapse of epoch 5 for model gmf and data pinterest-20 is: 00: 01: 06
HR: 0.845	NDCG: 0.516
The time elapse of epoch 10 for model gmf and data pinterest-20 is: 00: 01: 08
HR: 0.848	NDCG: 0.521
The time elapse of epoch 5 for model neumf_pre and data pinterest-20 is: 00: 01: 35
HR: 0.862	NDCG: 0.535
The time elapse of epoch 10 for model neumf_pre and data pinterest-20 is: 00: 01: 37
HR: 0.861	NDCG: 0.535



MLP with hidden layers ranging from 0 to 4

In [35]:
# Train MLP with hidden layers ranging from 0 to 4 on data_name

def MLP_layers(data_name, NUM_EPOCHS, optimizer_name, lr):
    user_num, item_num, train_loader, test_loader = loader(data_name, batch_size=batch_size, test_num_ng=99)
    for i in range(5):       
        model = NCF(user_num, item_num, 8, i, 0.0, 'MLP', None, None)
        model.to(device)
        training(model, train_loader, test_loader, NUM_EPOCHS, optimizer_name, lr, data_name, f'mlp_{i}', 'mlp', layers_num=i)

In [33]:
for data_name in ['ml-1m','pinterest-20']:
    MLP_layers(data_name, NUM_EPOCHS, 'adam', lr=1e-3)


The time elapse of epoch 5 for model mlp_0 and data ml-1m is: 00: 00: 38
HR: 0.453	NDCG: 0.252
The time elapse of epoch 10 for model mlp_0 and data ml-1m is: 00: 00: 37
HR: 0.453	NDCG: 0.253
The time elapse of epoch 5 for model mlp_1 and data ml-1m is: 00: 00: 38
HR: 0.558	NDCG: 0.313
The time elapse of epoch 10 for model mlp_1 and data ml-1m is: 00: 00: 38
HR: 0.592	NDCG: 0.330
The time elapse of epoch 5 for model mlp_2 and data ml-1m is: 00: 00: 44
HR: 0.605	NDCG: 0.344
The time elapse of epoch 10 for model mlp_2 and data ml-1m is: 00: 00: 44
HR: 0.627	NDCG: 0.363
The time elapse of epoch 5 for model mlp_3 and data ml-1m is: 00: 00: 46
HR: 0.633	NDCG: 0.365
The time elapse of epoch 10 for model mlp_3 and data ml-1m is: 00: 00: 49
HR: 0.652	NDCG: 0.378
The time elapse of epoch 5 for model mlp_4 and data ml-1m is: 00: 00: 52
HR: 0.650	NDCG: 0.378
The time elapse of epoch 10 for model mlp_4 and data ml-1m is: 00: 00: 48
HR: 0.665	NDCG: 0.387
The time elapse of epoch 5 for model mlp_0 a

## Result

In [15]:
df_mlp = pd.read_csv('./Output/mlp.csv')
df_mlp[df_mlp['epoch']==10]

Unnamed: 0,epoch,HR,NDCG,data_name,model_name
9,10,0.453477,0.252633,ml-1m,mlp_0
19,10,0.591722,0.330141,ml-1m,mlp_1
29,10,0.627483,0.362892,ml-1m,mlp_2
39,10,0.652483,0.377942,ml-1m,mlp_3
49,10,0.664735,0.386886,ml-1m,mlp_4
59,10,0.274412,0.140705,pinterest-20,mlp_0
69,10,0.82813,0.502906,pinterest-20,mlp_1
79,10,0.833928,0.50839,pinterest-20,mlp_2
89,10,0.841829,0.512778,pinterest-20,mlp_3
99,10,0.84558,0.51695,pinterest-20,mlp_4


In [37]:
df_ncf = pd.read_csv('./Output/ncf.csv')
df_ncf[df_ncf['epoch']==10]

Unnamed: 0,epoch,HR,NDCG,data_name,model_name
9,10,0.636093,0.363548,ml-1m,gmf
19,10,0.661921,0.385522,ml-1m,neumf_pre
29,10,0.847537,0.520955,pinterest-20,gmf
39,10,0.860927,0.534515,pinterest-20,neumf_pre
