In [None]:
# Mount Google Drive at /content/drive so the project folder stored on Drive
# can be reached through the normal filesystem (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports and Dirs

In [None]:
import numpy as np
import os
import pandas as pd
from scipy import sparse
import pickle

In [None]:
!pip install surprise
import surprise
from surprise.trainset import Trainset
from  surprise.dataset import Dataset, DatasetAutoFolds
from surprise.reader import Reader
from surprise.prediction_algorithms.matrix_factorization import SVDpp

In [None]:
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--root', type=str)
parser.add_argument('--input_data', type=str)
parser.add_argument('--training_results', type=str, default="svd_ml10m_training_results")
parser.add_argument('--eval_results', type=str, default= "eval_results")

args = parser.parse_args()
"""

# Keep this class when running as an .ipynb on Google Colab.
# Comment it out (and enable the argparse block above) when running as a .py
# script on a cluster such as HIPPO.
class argclass:
  """Notebook stand-in for the argparse namespace: same four attributes."""

  def __init__(self):
    # Project root; the remaining three values are paths relative to it
    # (eval_results is relative to the training-results folder).
    self.root = "/content/drive/MyDrive/COMP700_Honours Project"
    self.input_data = "Data/movielens_10m/"
    self.training_results = "svd_ml10m_training_results"
    self.eval_results = "eval_results"


args = argclass()

In [None]:
# Resolve every working directory from the configured project root.
root = args.root
data = os.path.join(root, args.input_data)
training_results = os.path.join(root, args.training_results)
eval_results = os.path.join(training_results, args.eval_results)

# makedirs(exist_ok=True) replaces the try/mkdir/except pairs: it is safe to
# re-run and also creates any missing parent folder.
os.makedirs(training_results, exist_ok=True)
os.makedirs(eval_results, exist_ok=True)

# TRAINING

In [None]:
# Surprise reader declaring that the ratings lie on a 0-to-1 scale.
reader = Reader(rating_scale=(0,1))

In [None]:
# PRIMARY TRAINING DATA
train_data = pd.read_csv(os.path.join(data, "split/train_rec.csv"))

# OPTIONAL WHETHER TO ADD OR NOT. WHICH IS MORE FAIR TO THE EXPERIMENT?
# (LEAVE OUT OR INCLUDE VAD FOR TRAINING FOR SVD++?)
vad_data = pd.read_csv(os.path.join(data, "split/vad_rec.csv"))
# HAS TO BE ADDED IN SO SVD++ HAS SEEN THE USERS INVOLVED
test_train_data = pd.read_csv(os.path.join(data, "split/test_training_rec.csv"))

print(train_data.shape)

# ignore_index gives the combined frame a clean 0..n-1 index instead of three
# overlapping ranges of row labels.
train_data = pd.concat([train_data, vad_data, test_train_data], ignore_index=True)

# NOTE: SVDpp using explicit form since it has its own internal mechanism to
# get implicit data

# Vectorized replacement for the former per-row loop
# (`test_train_data['rate'][i] = 1`), which was chained assignment executed
# once per row.
# NOTE(review): as in the original cell, this runs AFTER the concat, so it
# only changes test_train_data; train_data keeps the original rates. Move it
# above the concat if the binarised rates were meant to be trained on.
test_train_data.loc[:, 'rate'] = 1

print(train_data.shape)

(7133394, 3)
(9718293, 3)


In [None]:

# Hand Surprise the (user, item, rating) columns and turn the whole dataset
# into a single trainset.
rating_columns = ['uid', 'bid', 'rate']
train_datasetautofolds = DatasetAutoFolds(
    reader=reader,
    df=train_data[rating_columns],
)
trainset = train_datasetautofolds.build_full_trainset()


In [None]:
# SVD++ starts from randomly initialised factors; a fixed random_state makes
# the trained model (and everything evaluated from it) reproducible.
model = SVDpp(verbose=True, random_state=42)

In [None]:
# Train SVD++ on the full trainset (the model was created with verbose=True).
model.fit(trainset)

In [None]:
# Persist the trained model. The context manager closes (and flushes) the
# file even if pickling raises.
with open(os.path.join(training_results, "svdpp_model.pkl"), "wb") as model_file:
  pickle.dump(model, model_file)

# Predictions

In [None]:
# Reload the trained model so prediction can start from a fresh kernel.
# pickle.load can execute arbitrary code: only load files written by this
# notebook.
with open(os.path.join(training_results, "svdpp_model.pkl"), "rb") as model_file:
  model = pickle.load(model_file)

In [None]:
def predict_with_svd(svd_model, input_recs, u_lookup, b_lookup, prev_cutoff=0):
  """Predict a rating for every (uid, bid) pair and return a sparse matrix.

  Predictions are appended to ``temp_preds_svd.csv`` in the current working
  directory every 1000 rows, so an interrupted run can be resumed instead of
  restarted.

  Args:
    svd_model: object whose ``predict(uid, bid)`` returns a value with an
      ``est`` attribute.
    input_recs: DataFrame with ``uid`` and ``bid`` columns. Rows are taken by
      position, so any index is accepted.
    u_lookup: user ids in matrix-row order.
    b_lookup: item ids in matrix-column order.
    prev_cutoff: position of the first row still to predict. 0 starts a fresh
      temp file. A value > 0 appends to the existing file, which must already
      hold rows ``0 .. prev_cutoff - 1``. After the progress line ``i`` is
      printed, rows up to and including ``i`` are on disk, so resume with
      ``i + 1``.

  Returns:
    ``scipy.sparse.csr_matrix`` with one row per entry of ``u_lookup`` and one
    column per entry of ``b_lookup``, holding the predicted ratings.

  Raises:
    KeyError: if a predicted uid or bid does not appear in its lookup.
  """
  temp_dir_pred = "temp_preds_svd.csv"
  columns = ['uid', 'bid', 'rate']
  flush_every = 1000

  def _append(rows):
    # Append a chunk of prediction rows (no header) to the temp file.
    pd.DataFrame(rows, columns=columns).to_csv(
        temp_dir_pred, index=False, header=False, mode="a")

  if prev_cutoff == 0:
    # Fresh run: always (re)create the file with just the header, so an empty
    # input or a stale file from an earlier run cannot corrupt the result.
    pd.DataFrame(columns=columns).to_csv(temp_dir_pred, index=False, mode="w")

  # Positional slicing instead of label lookups: works for any index.
  uids = input_recs['uid'].iloc[prev_cutoff:]
  bids = input_recs['bid'].iloc[prev_cutoff:]

  pending = []
  for i, (uid, bid) in enumerate(zip(uids, bids), start=prev_cutoff):
    pending.append([uid, bid, svd_model.predict(uid, bid).est])

    if i % flush_every == 0:
      _append(pending)
      pending = []
      print(i)

  _append(pending)

  # Done with the partial writes: re-read the full file and form the matrix.
  # Ids are read back as text and the lookups are keyed by str(), so ids that
  # are ints in the DataFrame still match lookups loaded from text files.
  results = pd.read_csv(temp_dir_pred,
                        dtype={'uid': str, 'bid': str, 'rate': float})
  reverse_u_lookup = {str(uid): i for (i, uid) in enumerate(u_lookup)}
  reverse_b_lookup = {str(bid): i for (i, bid) in enumerate(b_lookup)}

  rows = np.fromiter((reverse_u_lookup[str(uid)] for uid in results['uid']),
                     dtype=np.int64, count=len(results))
  cols = np.fromiter((reverse_b_lookup[str(bid)] for bid in results['bid']),
                     dtype=np.int64, count=len(results))
  values = results['rate'].to_numpy(dtype=float)

  shape = (len(reverse_u_lookup), len(reverse_b_lookup))
  return sparse.csr_matrix((values, (rows, cols)), shape=shape)

In [None]:
# Item ids, one per line; their order in the file is the lookup order.
with open(os.path.join(data,"unique_b.txt"), mode='r', encoding='utf-8') as bfile:
  unique_b = list(bfile)
b_lookup = list(map(str.strip, unique_b))

# Test-user ids, one per line; their order in the file is the lookup order.
with open(os.path.join(data,"split","unique_u_test.txt"), mode='r', encoding='utf-8') as ufile:
  unique_u = list(ufile)
test_u_lookup = list(map(str.strip, unique_u))

# Predictions are made only for the test-testing records.
test_test_recs = pd.read_csv(os.path.join(data,"split/test_testing_rec.csv"))

In [None]:
# Score every test-testing (user, item) pair with the trained model.
svd_pred = predict_with_svd(
    svd_model=model,
    input_recs=test_test_recs,
    u_lookup=test_u_lookup,
    b_lookup=b_lookup,
)

In [None]:
# Persist the sparse prediction matrix. The context manager closes (and
# flushes) the file even if pickling raises.
with open(os.path.join(eval_results, "svdpp_preds.pkl"), "wb") as preds_file:
  pickle.dump(svd_pred, preds_file)

# Evaluation

In [None]:
# Reload the trained model so evaluation can start from a fresh kernel.
# pickle.load can execute arbitrary code: only load files written by this
# notebook.
with open(os.path.join(training_results, "svdpp_model.pkl"), "rb") as model_file:
  model = pickle.load(model_file)

In [None]:
from tensorflow.keras.utils import Sequence 
from tensorflow.keras import Model, layers
!pip3 install tensorflow-ranking
import tensorflow_ranking as tfr
import math

In [None]:
class datagen(Sequence):
  """Keras Sequence serving matching row batches of two matrices as dense arrays.

  ``x_set`` and ``y_set`` must support row selection with an index array and
  ``.toarray()`` (e.g. scipy sparse matrices). Row order is reshuffled on
  construction and after every epoch; the same shuffled indices are applied
  to both matrices, so rows stay aligned.

  Args:
    x_set: matrix of model inputs, one sample per row.
    y_set: matrix of labels, aligned row-for-row with ``x_set``.
    batch_size: number of rows per batch.
    max_samples_per_epoch: if given, only this many (randomly chosen) rows
      are served per epoch.
  """

  def __init__(self, x_set, y_set, batch_size=500, max_samples_per_epoch=None):
    # Keras 3's PyDataset expects subclasses to call the base initialiser;
    # on older tf.keras this is a harmless no-op.
    super().__init__()
    self.x = x_set
    self.y = y_set
    self.batch_size = batch_size
    self.max_samples_per_epoch = max_samples_per_epoch
    self._reshuffle()

  def _reshuffle(self):
    """Draw a fresh random row order, truncated to the per-epoch sample cap."""
    order = np.random.permutation(np.shape(self.x)[0])
    if self.max_samples_per_epoch is not None:
      order = order[:self.max_samples_per_epoch]
    self.shuffled_idx = order

  def on_epoch_end(self):
    """Reshuffle (and re-sample) the rows for the next epoch."""
    self._reshuffle()

  def __len__(self):
    """Number of batches per epoch; the last batch may be smaller."""
    return math.ceil(self.shuffled_idx.shape[0] / self.batch_size)

  def __getitem__(self, idx):
    """Return batch number ``idx`` as a ``(batch_x, batch_y)`` pair of dense arrays."""
    start = idx * self.batch_size
    stop = start + self.batch_size

    rows = self.shuffled_idx[start:stop]  # shuffled row indices of this batch
    batch_x = self.x[rows].toarray()
    batch_y = self.y[rows].toarray()

    return batch_x, batch_y


In [None]:
#test_true = pickle.load(open(os.path.join(data,"split/matrices/implicit","test_impl.pkl"), "rb"))

# Ground-truth matrix for the held-out test-testing interactions
# (presumably implicit feedback, going by the path -- confirm).
# pickle.load can execute arbitrary code: only load trusted files.
with open(os.path.join(data, "split/matrices/implicit", "test_testing_impl.pkl"), "rb") as truth_file:
  test_test = pickle.load(truth_file)

# Prediction matrix written by the prediction step.
with open(os.path.join(eval_results, "svdpp_preds.pkl"), "rb") as preds_file:
  test_pred = pickle.load(preds_file)

# Predictions are served as x and the test-testing matrix as y, so the
# evaluation covers the test-testing interactions only.
eval_datagen = datagen(x_set=test_pred, y_set=test_test)

In [None]:
# Number of columns in one dense batch of predictions.
first_batch_x, _first_batch_y = eval_datagen[0]
first_batch_x.shape[1]

127351

In [None]:
# Width of one dense prediction batch, i.e. the number of columns the model
# has to accept.
num_items = eval_datagen[0][0].shape[1]

# Keras expects `shape` as a tuple (Keras 3 rejects a bare int). The tensor is
# also no longer bound to the name `input`, which shadowed the Python builtin.
input_layer = layers.Input(shape=(num_items,), name='input')

# Pass-through model: its output is its input, so the ranking metrics are
# computed directly on the pre-computed predictions.
placeholder = Model(inputs=input_layer, outputs=input_layer, name='dummy')

In [None]:
# Ranking metrics: recall over the top 20 and top 50 items, NDCG over the top 100.
recall_at_20 = tfr.keras.metrics.RecallMetric(name="recall_20", topn=20)
recall_at_50 = tfr.keras.metrics.RecallMetric(name="recall_50", topn=50)
ndcg_at_100 = tfr.keras.metrics.NDCGMetric(name="ndcg_100", topn=100)

metrics = [recall_at_20, recall_at_50, ndcg_at_100]

# No optimizer or loss is passed: the model is compiled with metrics only.
placeholder.compile(metrics=metrics)


In [None]:
# Run the placeholder model over the evaluation generator and keep what
# evaluate() returns in `evals`.
evals = placeholder.evaluate(eval_datagen, verbose=1)

