In [None]:
import numpy as np
import math
import argparse
import time
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import save_npz, load_npz
from scipy.sparse import vstack, hstack
import scipy.sparse as sparse
from sklearn.utils.extmath import randomized_svd

In [None]:
#load npz data files
def load_numpy(path, name):
    """Read a SciPy ``.npz`` sparse matrix from disk and return it as CSR.

    The file location is the plain string concatenation ``path + name``;
    no path separator is inserted, so ``path`` must already end with one.
    """
    file_location = path + name
    stored_matrix = load_npz(file_location)
    return stored_matrix.tocsr()

In [None]:
#predictions
def predict(matrix_U, matrix_V, topK, matrix_Train, measure="Cosine"):
    """Rank the top-K not-yet-seen items for every user.

    matrix_U: dense user factors, one row per user.
    matrix_V: dense item factors, one row per item; a user's item scores are
        ``matrix_V.dot(user_row)``.
    topK: number of recommendations to return per user.
    matrix_Train: sparse user-item matrix. Row ``i`` must support
        ``.nonzero()`` returning ``(rows, cols)``; the nonzero columns are the
        items user ``i`` already interacted with and are never recommended.
    measure: unused; kept so existing callers keep working.

    Returns an array of shape (n_users, topK) of item indices ordered by
    decreasing score. A user with no training interactions gets a row of
    zeros (float32). If fewer than topK unseen items exist, the row is padded
    with -1, which is not a valid item index.
    """
    n_items = matrix_V.shape[0]
    prediction = []
    for i in range(matrix_U.shape[0]):
        train_index = matrix_Train[i].nonzero()[1]
        if len(train_index) > 0:
            scores = matrix_V.dot(matrix_U[i])

            # Keep enough candidates that topK survive after the training
            # items are removed. argpartition requires kth < n_items, so when
            # the request covers every item just rank them all.
            n_candidates = min(topK + len(train_index), n_items)
            if n_candidates < n_items:
                candidates = np.argpartition(-scores, n_candidates)[:n_candidates]
            else:
                candidates = np.arange(n_items)

            # Order the candidates by decreasing score, then drop seen items.
            ranked = candidates[scores[candidates].argsort()[::-1]]
            ranked = ranked[~np.isin(ranked, train_index)][:topK]

            # Pad short rows so np.vstack below always gets equal lengths.
            if len(ranked) < topK:
                padding = np.full(topK - len(ranked), -1, dtype=ranked.dtype)
                ranked = np.concatenate([ranked, padding])
            vector_predictions = ranked
        else:
            vector_predictions = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predictions)

    return np.vstack(prediction)

In [None]:
# metrics 
# recall@k
def recallk(vector_true_dense, hits, **unused):
    """Recall: count of nonzero entries in ``hits`` over the ground-truth size.

    vector_true_dense: sequence of the user's relevant item indices; only its
        length is used.
    hits: array flagging which predictions are relevant.
    unused: extra keyword arguments are accepted and ignored.
    """
    n_hits = np.count_nonzero(hits)
    n_relevant = len(vector_true_dense)
    return float(n_hits) / n_relevant

#support function for ndcg@k
def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    r: relevance scores in rank order (any array-like; converted to float).
    k: number of leading scores to use.
    method: unused; kept so existing callers keep working.

    Returns ``r[0] + sum(r[i] / log2(i + 1) for i >= 1)`` over the first k
    scores, or 0.0 when there are no scores.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is the
    # equivalent conversion and works on old and new NumPy alike.
    gains = np.asarray(r, dtype=np.float64)[:k]
    if gains.size:
        discounts = np.log2(np.arange(2, gains.size + 1))
        return gains[0] + np.sum(gains[1:] / discounts)
    return 0.

#ndcg@k
def ndcg_at_k(hits, k=100, **unused):
    """DCG of ``hits`` divided by the DCG of the same hits sorted descending.

    hits: relevance scores in rank order.
    k: cutoff passed to ``dcg_at_k`` for both the actual and the ideal order.
    unused: extra keyword arguments are accepted and ignored.

    Returns 0.0 when the ideal DCG is zero (no relevant entries).
    """
    best_order = sorted(hits, reverse=True)
    ideal_dcg = dcg_at_k(best_order, k)
    if ideal_dcg:
        return dcg_at_k(hits, k) / ideal_dcg
    return 0.

#evaluate the two metrics
def evaluate(matrix_Predict, matrix_Test, atK, metric_names=('Recall', 'NDCG@K')):
    """Score ranked predictions against held-out interactions at each cutoff.

    matrix_Predict: 2-D array with one row of ranked item indices per user.
        A row whose first k entries are all zero is treated as "no
        prediction" and skipped.
    matrix_Test: sparse user-item matrix; the column indices of a row's
        nonzero entries are that user's relevant items.
    atK: iterable of cutoffs k.
    metric_names: metrics to compute; each must be "Recall" or "NDCG@K".

    Returns a dict mapping '<metric>@<k>' to ``(mean, half_width)`` where
    half_width is ``1.96 * std / sqrt(n)`` and n is the number of users that
    were actually scored. Users without predictions or without test items are
    skipped. When no user is scored the entry is ``(nan, nan)``.
    """
    #mapping of metrics to functions
    local_metrics = {
        "Recall": recallk,
        "NDCG@K": ndcg_at_k
    }

    output = {}

    for k in atK:
        results = {name: [] for name in metric_names}
        #retrieve top k predictions
        topK_Predict = matrix_Predict[:, :k]

        for user_index in range(topK_Predict.shape[0]):
            predicted = topK_Predict[user_index]
            if np.count_nonzero(predicted) == 0:
                continue

            vector_dense = matrix_Test[user_index].nonzero()[1]
            if vector_dense.size == 0:
                continue

            hits = np.isin(predicted, vector_dense)
            for name in metric_names:
                # k is forwarded so NDCG is cut at the requested k instead of
                # its own default; metrics that do not need it ignore it.
                results[name].append(local_metrics[name](vector_true_dense=vector_dense,
                                                         vector_predict=predicted,
                                                         hits=hits,
                                                         k=k))

        #create a summary of results
        summary = {}
        for name in metric_names:
            scores = results[name]
            n_scored = len(scores)
            if n_scored:
                mean = np.average(scores)
                # The interval is over the users that contributed a score,
                # not over every row of matrix_Predict.
                half_width = 1.96*np.std(scores)/np.sqrt(n_scored)
            else:
                mean = half_width = float('nan')
            summary['{0}@{1}'.format(name, k)] = (mean, half_width)
        output.update(summary)

    return output

In [None]:
def svd(matrix_train, embeded_matrix=np.empty((0)), iteration=10, rank=200):
    """Factorize the rating matrix with a truncated randomized SVD.

    matrix_train: rating matrix (sparse user-item matrix).
    embeded_matrix: optional item or user embedding matrix (side info); when
        non-empty its transpose is stacked below ``matrix_train`` before the
        decomposition.
    iteration: passed to ``randomized_svd`` as ``n_iter``.
    rank: passed to ``randomized_svd`` as ``n_components``.

    Returns ``(RQ, Qt)``: ``Qt`` is the third array returned by
    ``randomized_svd`` and ``RQ`` is the dense product
    ``matrix_train . Qt^T`` (one row per row of the stacked matrix).
    """
    if embeded_matrix.shape[0] > 0:
        matrix_train = vstack((matrix_train, embeded_matrix.T))

    _, _, Qt = randomized_svd(matrix_train,
                              n_components=rank,
                              n_iter=iteration,
                              power_iteration_normalizer='QR',
                              random_state=1)

    # Qt is already dense, so multiply by it directly instead of converting
    # it to a sparse matrix and densifying the product afterwards.
    RQ = matrix_train.dot(Qt.T)
    return np.asarray(RQ), Qt

In [None]:
# Mount Google Drive at /content/drive; data_path in this notebook points
# below that mount point, so this must run before any data is loaded.
# NOTE(review): google.colab is presumably only available inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory holding the .npz rating matrices. load_numpy() builds the file
# location as path + name, so the trailing '/' is required.
# Two alternative locations are kept here; enable exactly one.

# Tina's location:
#data_path = '/content/drive/MyDrive/MASc_First_Year/CSC2515/CSC2515_Project_Part2/data/npz_files_threshold_4/'

# Sophie's location:
data_path = '/content/drive/MyDrive/CSC2515_Project_Part2/data/npz_files_threshold_4/'

In [None]:
def main(args):
    """Train the SVD baseline on train+validation data and print test metrics.

    args: namespace providing ``rank`` (number of SVD components) and
        ``topk`` (recommendation cutoff). No other attribute of ``args`` is
        read here, so ``svd`` runs with its default iteration count.

    Reads 'Rtrain.npz', 'Rvalid.npz' and 'Rtest.npz' from the module-level
    ``data_path`` and prints the training shape followed by one line per
    metric.
    """
    # Load the three splits in train / valid / test order.
    train_split, valid_split, test_split = (
        load_numpy(path=data_path, name=file_name)
        for file_name in ('Rtrain.npz', 'Rvalid.npz', 'Rtest.npz')
    )

    # Baseline models are fitted on the training and validation sets combined.
    fit_matrix = train_split + valid_split
    print("Train U-I Dimensions: {0}".format(fit_matrix.shape))

    user_factors, item_factors_t = svd(fit_matrix,
                                       embeded_matrix=np.empty((0)),
                                       rank=args.rank)

    top_items = predict(matrix_U=user_factors,
                        matrix_V=item_factors_t.T,
                        topK=args.topk,
                        matrix_Train=fit_matrix)
    scores = evaluate(top_items, test_split, [args.topk])

    print('\nresults:')
    for metric_name, value in scores.items():
        print("{0}:{1}".format(metric_name, value))

In [None]:
# Entry point: parse the command-line options and run the baseline once.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # -i is stored as args.iter; main() in this notebook never reads it, so it
    # currently has no effect on the run.
    parser.add_argument('-i', dest='iter', type=int, default=200)
    # Use k = 50 and k = 100 to compute Recall@50 and NDCG@100.
    parser.add_argument('-r', dest='rank', type=int, default=50)  # latent dimension (alternative value: 100)
    parser.add_argument('-k', dest='topk', type=int, default=50) # recommendation cutoff (alternative value: 100)
    # -f accepts one value that is never read afterwards. Presumably it is
    # here to absorb the "-f <connection file>" argument a Jupyter kernel is
    # started with — TODO confirm.
    parser.add_argument('-f')
    args = parser.parse_args()

    main(args)

Train U-I Dimensions: (943, 1682)

results:
Recall@50:(0.3175042539899959, 0.013589050408734264)
NDCG@K@50:(0.4932274485692947, 0.013350887038479111)


### results


for rank = 100:

Recall@100:(0.3988413406125281, 0.01354893192221299)
NDCG@K@100:(0.4685252795216232, 0.011379502821609482)


Recall@50:(0.28045221106181933, 0.012783177534785323)
NDCG@K@50:(0.4678655743556272, 0.012864579453204105)




---

for rank = 50:

Recall@50:(0.3175042539899959, 0.013589050408734264)
NDCG@K@50:(0.4932274485692947, 0.013350887038479111)

Recall@100:(0.4560779052296026, 0.014269623708064297)
NDCG@K@100:(0.49700161260162556, 0.011935493245809416)




---


Rank = 50 gives better results than rank = 100 on both Recall and NDCG at both cutoffs, so rank = 50 is used as the baseline.
