<a href="https://colab.research.google.com/github/ShaelinN/VAE_RECOMMENDER_IMPLICIT_FEEDBACK/blob/main/Run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and directories

In [1]:
import os
import pandas as pd
import numpy as np

from scipy import sparse
import pickle

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras import layers, activations, Model, losses, metrics
import math
from keras.callbacks import Callback,ModelCheckpoint
import tensorflow.keras.backend as tfback

import argparse

In [45]:
'''
parser = argparse.ArgumentParser()
parser.add_argument('--root', type=str, default=rt)
parser.add_argument('--input_data', type=str, default=inp)
parser.add_argument('--output_results', type=str, default=outp)
parser.add_argument('--vae_weight_file', type=str)
parser.add_argument('--svdpp_file', type=str)

parser.add_argument('--u_lookup', type=str)
parser.add_argument('--b_lookup', type=str)

parser.add_argument('--prev_cutoff_prediction', type=int)

parser.add_argument('--prediction_temp_file_vae', type=str)
parser.add_argument('--prediction_temp_file_svd', type=str)



args = parser.parse_args()
'''
class argclass(object):
  """Notebook stand-in for the parsed command-line arguments.

  Each option is stored as an instance attribute holding its hard-coded
  default value.
  """

  def __init__(self):
    defaults = {
      "root": "/content/drive/MyDrive/COMP700_Honours Project",
      "input_data": "Data/remove_low_interaction_users/matrices/implicit",
      "output_results": "results",
      "vae_weight_file": "models/vae_epoch_01_loss_210.97.hdf5",
      "svdpp_file": "svdpp_model.pkl",
      "u_lookup": "Data/remove_low_interaction_users/unique_u.txt",
      "b_lookup": "Data/remove_low_interaction_users/unique_b.txt",
      # first user row to process when predictions are generated
      "prev_cutoff_prediction": 0,
      "prediction_temp_file_vae": "vae_pred_temp.csv",
      "prediction_temp_file_svd": "svd_pred_temp.csv",
    }
    for option, value in defaults.items():
      setattr(self, option, value)


# Single shared configuration object read by the rest of the notebook.
args = argclass()

In [4]:
# Base directory of the project; the relative paths configured on `args`
# are joined onto it in the cells below.
root = args.root #"/content/drive/MyDrive/COMP700_Honours Project"

In [22]:
# Resolve the input data directory and the two trained-model files
# against the project root.
input_data, vae_weights, svdpp_model = (
  os.path.join(root, relative_path)
  for relative_path in (args.input_data, args.vae_weight_file, args.svdpp_file)
)

In [6]:
# Directory that result files (e.g. the pickled prediction matrix) are written to.
output_results = os.path.join(root, args.output_results)

In [7]:
# Create the results directory, including any missing parent directories.
# A directory that already exists is left untouched.
os.makedirs(output_results, exist_ok=True)

# Make Predictions

In [57]:
def results_mat_from_df(results, u_lookup, b_lookup):
  """Convert a DataFrame of predictions into a sparse matrix.

  Having the predictions in the same sparse layout as the input data lets the
  evaluation code be reused for them.

  Args:
    results: DataFrame with columns 'uid', 'bid' and 'rate'. 'uid' and 'bid'
      are used directly as row and column positions, so they must already be
      integer indices.
    u_lookup: sequence whose length gives the number of rows.
    b_lookup: sequence whose length gives the number of columns.

  Returns:
    A scipy.sparse.csr_matrix of shape (len(u_lookup), len(b_lookup)).
  """
  matrix_shape = (len(u_lookup), len(b_lookup))
  coordinates = (results['uid'], results['bid'])
  return sparse.csr_matrix((results['rate'], coordinates), shape=matrix_shape)

In [61]:
def predictions_to_csv(test_mat, u_lookup, b_lookup, model, model_type = "SVDpp"):
  """Score every user row of `test_mat` and stream the non-zero scores to a CSV.

  Rows are written as (uid, bid, rate), where uid and bid are the integer row
  and column positions in `test_mat`. Output is buffered in memory and flushed
  whenever the user index is a multiple of 1000, plus once at the end.

  Processing starts at `args.prev_cutoff_prediction` (0 when it is None) so an
  interrupted run can be resumed. A resumed run appends to the existing file;
  rows already written for users at or after the cutoff are NOT removed, so
  choose the cutoff to avoid duplicates.

  Args:
    test_mat: sparse matrix with one row per user.
    u_lookup: index -> user id list (only used by the SVDpp path).
    b_lookup: index -> business id list (only used by the SVDpp path).
    model: trained model, passed to `svdpp_predictions` or `vae_predictions`.
    model_type: "SVDpp" or "VAE".

  Raises:
    ValueError: if `model_type` is neither "SVDpp" nor "VAE".
  """
  # Resolve the output file up front so that the final flush can never see an
  # unbound name, whichever user indices the loop happens to visit.
  if model_type == "VAE":
    temp_dir = args.prediction_temp_file_vae
  elif model_type == "SVDpp":
    temp_dir = args.prediction_temp_file_svd
  else:
    raise ValueError("model_type must be 'SVDpp' or 'VAE', got %r" % (model_type,))

  columns=['uid','bid', 'rate']
  prev_cutoff = args.prev_cutoff_prediction if args.prev_cutoff_prediction is not None else 0

  def write_rows(rows, start_file):
    # A file that does not exist yet also needs the header line, otherwise
    # reading it back would treat the first data row as the column names.
    start_file = start_file or not os.path.exists(temp_dir)
    pd.DataFrame(rows, columns=columns).to_csv(temp_dir,
                                               index=False,
                                               header=start_file,
                                               mode="w" if start_file else "a")

  results = []
  for user in range(prev_cutoff, test_mat.shape[0]):
    test_vec = np.array(test_mat[user].todense())

    if model_type == "SVDpp":
      test_pred = svdpp_predictions(model, user, u_lookup, b_lookup, test_vec)
    else:
      test_pred = vae_predictions(model, test_vec)

    # Keep only the non-zero scores; found in one vectorised pass instead of a
    # Python-level loop over every column.
    rates = np.asarray(test_pred).ravel()
    nonzero_items = np.flatnonzero(rates)
    results.extend(zip([user] * len(nonzero_items),
                       nonzero_items.tolist(),
                       rates[nonzero_items].tolist()))

    if not user%100:
      print(user)
    if not user%1000:
      # user 0 marks a fresh run: truncate the file and write the header
      write_rows(results, user == 0)
      results = []

  # flush whatever is left after the last periodic write
  write_rows(results, False)

def csv_preds_to_sparse(file, u_lookup, b_lookup):
  """Load a predictions CSV and return it as a sparse matrix.

  Args:
    file: path of a CSV with a header row and 'uid', 'bid', 'rate' columns.
    u_lookup: sequence whose length gives the number of matrix rows.
    b_lookup: sequence whose length gives the number of matrix columns.

  Returns:
    The matrix built by `results_mat_from_df` from the CSV contents.
  """
  results = pd.read_csv(file)
  return results_mat_from_df(results, u_lookup, b_lookup)

In [10]:
def svdpp_predictions(svdpp_machine, user, u_lookup, b_lookup, test_vec):
  """Predict one user's score for every business with the SVD++ model.

  Args:
    svdpp_machine: trained model exposing `predict(uid, bid)`.
    user: integer row index of the user.
    u_lookup: index -> raw user id.
    b_lookup: index -> raw business id.
    test_vec: 2-D array for this user; only `shape[1]` (the number of
      businesses) is used.

  Returns:
    List of estimated scores, one per business, in column order.
  """
  uid = u_lookup[user]
  test_pred = []
  for business in range(test_vec.shape[1]):
    prediction = svdpp_machine.predict(uid, b_lookup[business])
    # Accept both result styles: an object/namedtuple with an `est` attribute,
    # or a mapping with an "est" key.
    if hasattr(prediction, "est"):
      estimate = prediction.est
    else:
      estimate = prediction["est"]
    test_pred.append(estimate)
  return test_pred

In [11]:
def vae_predictions(vae_machine, test_vec):
  """Run `test_vec` through the model and return the first row of its output."""
  return vae_machine.predict(test_vec)[0]


# Evaluate Results

In [12]:
def sub_value_recall_at_k(vec_true , vec_pred, k):
  """Recall@k for a single user.

  Args:
    vec_true: ground-truth interaction vector; non-zero entries are the
      relevant items. Any shape is accepted and flattened (e.g. a (1, n) row).
    vec_pred: predicted scores, flattened the same way.
    k: number of top-scored items to inspect.

  Returns:
    hits / min(k, number of relevant items) as a float, or None when the user
    has no relevant items (the metric is undefined in that case).
  """
  truth = np.asarray(vec_true).ravel()
  scores = np.asarray(vec_pred).ravel()

  # check if there are really items this user enjoyed
  relevant = set(np.flatnonzero(truth).tolist())
  if not relevant:
    return None

  # Stable ascending sort: the last k entries are the k best-scored items.
  # Slicing copes with vectors shorter than k.
  top_predicted = np.argsort(scores, kind="stable")[-k:].tolist()

  hits = sum(1 for item in top_predicted if item in relevant)
  return hits / float(min(k, len(relevant)))

In [13]:
def sub_value_ndcg(vec_true, vec_pred, k):
  """Discounted cumulative gain over the top-k predictions for a single user.

  The value is NOT normalised here; the caller is expected to normalise.
  The discount uses the natural logarithm: an item at 0-based rank r that is
  relevant contributes 1 / ln(r + 2).

  Args:
    vec_true: ground-truth interaction vector; non-zero entries are the
      relevant items. Any shape is accepted and flattened (e.g. a (1, n) row).
    vec_pred: predicted scores, flattened the same way.
    k: number of top-scored items to inspect.

  Returns:
    The summed gain, or None when the user has no relevant items.
  """
  truth = np.asarray(vec_true).ravel()
  scores = np.asarray(vec_pred).ravel()

  relevant = set(np.flatnonzero(truth).tolist())
  if not relevant:
    return None

  # Take the k best-scored items and put the best one first, so that rank 0
  # (the largest gain) goes to the highest prediction.
  ranked = np.argsort(scores, kind="stable")[-k:][::-1].tolist()

  sum_ndcg = 0
  for rank, item in enumerate(ranked):
    if item in relevant:
      sum_ndcg += 1/(math.log(rank+2))
  return sum_ndcg

In [14]:
def evaluate(full_true_mat, test_preds_mat, u_lookup, b_lookup):
  """Average recall@20, recall@50 and normalised DCG@20 over all users.

  Users for whom the per-user metrics are undefined (the metric helpers return
  None, which they do when the user has no true interactions) are left out of
  the averages.

  NOTE(review): the DCG values are normalised by the largest DCG seen across
  users, not by each user's ideal DCG, so this is not the textbook NDCG.
  That normalisation is kept as in the original code; confirm it is intended.

  Args:
    full_true_mat: sparse ground-truth matrix, one row per user.
    test_preds_mat: sparse prediction matrix with the same layout.
    u_lookup: unused; kept for interface compatibility.
    b_lookup: unused; kept for interface compatibility.

  Returns:
    Tuple (recall20_avg, recall50_avg, ndcg20_avg). All three are NaN when no
    user could be evaluated.
  """
  recall20 = []
  recall50 = []
  ndcg20 = []

  for user in range(full_true_mat.shape[0]):
    # todense() yields a (1, n) row; flatten so the metric helpers get 1-D vectors
    test_true = np.asarray(full_true_mat[user].todense()).ravel()
    test_pred = np.asarray(test_preds_mat[user].todense()).ravel()

    user_metrics = (
      sub_value_recall_at_k(test_true, test_pred, 20),
      sub_value_recall_at_k(test_true, test_pred, 50),
      sub_value_ndcg(test_true, test_pred, 20),
    )
    if any(value is None for value in user_metrics):
      continue

    recall20.append(user_metrics[0])
    recall50.append(user_metrics[1])
    ndcg20.append(user_metrics[2])

  if not recall20:
    undefined = float("nan")
    return undefined, undefined, undefined

  recall20_avg = np.sum(np.array(recall20, dtype=float)) / float(len(recall20))
  recall50_avg = np.sum(np.array(recall50, dtype=float)) / float(len(recall50))

  ndcg20 = np.array(ndcg20, dtype=float)
  largest = ndcg20.max()
  if largest > 0:
    ndcg20 = ndcg20 / largest  #NORMALISATION (skipped when every value is 0)
  ndcg20_avg = np.sum(ndcg20) / float(len(ndcg20))

  return recall20_avg , recall50_avg , ndcg20_avg

# ACTUAL RUN

## Set up

In [15]:
# Load the full and test interaction matrices.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that come from a trusted source.
with open(os.path.join(input_data,"impl_all.pkl"), "rb") as all_file:
  full_true = pickle.load(all_file)
with open(os.path.join(input_data,"impl_test.pkl"), "rb") as test_file:
  test = pickle.load(test_file)

# Index -> id lookups: line i of each file holds the id for matrix index i.
with open(os.path.join(root, args.b_lookup) , 'r', encoding='utf-8') as bfile:
  unique_b = bfile.readlines()
idx_to_b = [b.strip() for b in unique_b]

with open(os.path.join(root, args.u_lookup) , 'r', encoding='utf-8') as ufile:
  unique_u = ufile.readlines()
idx_to_u = [u.strip() for u in unique_u]


## Get predictions

### SVD++

In [None]:
#svdpp = pickle.load(svdpp_model) # get the trained svdpp from where it is stored

In [None]:
#predictions_to_csv(test, idx_to_u, idx_to_b, svdpp, "SVDpp")
#svdpp_results = csv_preds_to_sparse(args.prediction_temp_file_svd , idx_to_u, idx_to_b)

#pickle.dump(svdpp_results, open(os.path.join(output_results,"svd_preds.pkl"), "wb"))

### VAE

In [16]:
class Sampling(layers.Layer):
  """Layer that draws a latent sample from a mean and a log-variance tensor."""

  def __init__(self, name="Sampling", **kwargs):
    super().__init__(name=name, **kwargs)

  def call(self, inputs):
    """Return z_mean + exp(0.5 * z_log_var) * noise, with standard-normal noise.

    `inputs` is the pair (z_mean, z_log_var).
    """
    z_mean, z_log_var = inputs
    mean_shape = tf.shape(z_mean)

    # Reparametrization trick: draw standard-normal noise of shape (batch, dim),
    # then scale and shift it to the desired distribution.
    noise = tfback.random_normal(shape=(mean_shape[0], mean_shape[1]))
    std_dev = tf.exp(0.5 * z_log_var)

    return z_mean + std_dev * noise

In [17]:
# NOTE: no KL annealing for testing - the KL weight stays at a fixed value.
KLBeta = 1

def VAE_loss(y_true, y_pred):
    """Loss used when compiling the VAE.

    Sum of the binary cross-entropy scaled by `original_dim` and a
    KL-divergence term weighted by `KLBeta`.

    NOTE(review): the KL term is computed between y_true and y_pred, not
    between the latent distribution and a prior - confirm this is intended.
    """
    reconstruction = losses.binary_crossentropy(y_true, y_pred) * original_dim
    divergence = losses.kl_divergence(y_true, y_pred) * KLBeta
    return reconstruction + divergence

In [18]:
# Width of the model's input and output layers. Hard-coded; the original note
# suggests it corresponds to test.shape[1] - TODO confirm they match.
original_dim = 127350 #test.shape[1]
# Width of the hidden Dense layers.
intermediate_dim =  512
# Width of the latent space: half of the hidden width.
latent_dim = intermediate_dim//2 

In [19]:
class vae_builder(object):
  """Builds the Keras functional model for the VAE.

  The intermediate tensors are kept as attributes on the builder so they can
  be inspected after `build()` has been called.
  """

  def __init__(self, original_dim, intermediate_dim, latent_dim, name='VAE'):
    """Store the layer sizes and the model name; no layers are created here.

    Args:
      original_dim: width of the input and of the final Dense layer.
      intermediate_dim: width of the hidden Dense layers.
      latent_dim: width of the mean / log-variance layers.
      name: name given to the resulting Model.
    """
    self.name = name
    self.original_dim = original_dim
    self.intermediate_dim = intermediate_dim
    self.latent_dim = latent_dim

  
  def build(self):
    """Create the layers and return the assembled (uncompiled) Model.

    NOTE(review): layer names and creation order presumably have to match the
    saved weight file that is loaded afterwards - verify before changing them.
    """
    self.input = layers.Input(self.original_dim, name = 'input')
    # dropout is applied directly to the input, with rate 0.5
    self.dropout = layers.Dropout(rate=0.5)(self.input)

    #encoder: five stacked Dense layers with hard-sigmoid activation
    self.e1 = layers.Dense(self.intermediate_dim, activation=activations.hard_sigmoid, name = 'e1')(self.dropout)
    self.e2 = layers.Dense(self.intermediate_dim, activation=activations.hard_sigmoid, name = 'e2')(self.e1)
    self.e3 = layers.Dense(self.intermediate_dim, activation=activations.hard_sigmoid, name = 'e3')(self.e2)
    self.e4 = layers.Dense(self.intermediate_dim, activation=activations.hard_sigmoid, name = 'e4')(self.e3)
    self.e5 = layers.Dense(self.intermediate_dim, activation=activations.hard_sigmoid, name = 'e5')(self.e4)

    #sampling: two linear heads on the last encoder layer feed the Sampling layer
    self.mean = layers.Dense(self.latent_dim, name = 'mean')(self.e5)  
    self.log_var = layers.Dense(self.latent_dim, name = 'log_var')(self.e5)  
    self.sampling = Sampling()([self.mean, self.log_var])

    #decoder: two ReLU Dense layers, the second one back at the input width
    self.d1 = layers.Dense(self.intermediate_dim, activation='relu',name = 'd1')(self.sampling)
    self.d2 = layers.Dense(self.original_dim, activation='relu', name = 'd2')(self.d1)  
    
    # final swish activation applied on top of the ReLU output of d2
    self.output = layers.Activation(activations.swish, name = 'output')(self.d2)

    #to model
    vae = Model(inputs = self.input, outputs = self.output, name = self.name)
    return vae

In [23]:
#BUILD STRUCTURE
v = vae_builder(original_dim, intermediate_dim, latent_dim).build()

#configurable optimiser
slow_adam = keras.optimizers.Adam(learning_rate=1e-4)

v.compile(optimizer=slow_adam, loss=VAE_loss)

v.load_weights(vae_weights)


In [62]:
# Score every user with the VAE, streaming the results to the temporary CSV,
# then read the CSV back as a sparse matrix.
predictions_to_csv(test, idx_to_u, idx_to_b, v, "VAE")
vae_results = csv_preds_to_sparse(args.prediction_temp_file_vae , idx_to_u, idx_to_b)

# Persist the prediction matrix; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open(os.path.join(output_results,"vae_preds.pkl"), "wb") as preds_file:
  pickle.dump(vae_results, preds_file)

0
100
200
300


KeyboardInterrupt: ignored

## Evaluate predictions

In [None]:
#svdpp_results = pickle.load( open(os.path.join(output_results,"svd_preds.pkl"), "rb"))

In [None]:
#vae_results = pickle.load(open(os.path.join(output_results,"vae_preds.pkl"), "rb"))

In [None]:
#vae_metrics = evaluate(full_true, vae_results, idx_to_u, idx_to_b)
#vae_metrics

In [None]:
#svdpp_metrics = evaluate(full_true, svdpp_results, idx_to_u, idx_to_b)
#svdpp_metrics