<a href="https://colab.research.google.com/github/SavvinaDaniil/BiasInRecommendation/blob/main/Book%20recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

This notebook should be run on Google Colab.

# Process
In this notebook, I will train the book recommendation algorithms using two different packages: <a href="http://surpriselib.com/">Surprise</a> & <a href="https://cornac.readthedocs.io/en/latest/">Cornac</a>. 

## A. Import libraries

In [1]:
!pip install cornac
!pip install surprise



In [2]:
%tensorflow_version 1.x
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import matplotlib.pyplot as plt
import random as rd
import pandas as pd
import numpy as np
#from run_algorithms import train_algorithms, train_algorithms_kf, prepare_dataset, prepare_dataset_kf
from tqdm import tqdm
import time
pd.set_option("display.precision", 6)

# Cornac imports
import cornac
from cornac.eval_methods import RatioSplit
from cornac.data import Reader as CornacReader #Reader exists in both packages
from cornac.models import MostPop, MF, PMF, BPR, NeuMF, WMF, HPF, VAECF, NMF
from cornac.models import NMF as CornacNMF #NMF exists in both packages
from cornac.metrics import MAE, MSE, RMSE, Precision, Recall, NDCG, AUC, MAP, FMeasure, MRR

from surprise import BaselineOnly, KNNBasic, KNNWithMeans, SVDpp, SVD
from surprise import NMF as SurpriseNMF #NMF exists in both packages
from surprise import Dataset
from surprise import Reader as SurpriseReader #Reader exists in both packages
from surprise.model_selection import train_test_split, KFold, GridSearchCV
from surprise import accuracy
from sklearn.preprocessing import MinMaxScaler

from collections import defaultdict
from scipy import stats
from numpy.linalg import norm
import seaborn as sns
# set plot style: grey grid in the background:
sns.set(style="darkgrid")

TensorFlow 1.x selected.


## B. Set hyperparameters
There are certain hyperparameters that need to be tuned before the run. 

In [3]:
item_threshold = 5 # remove users with less than item_threshold items
user_threshold = 5 # remove items with less than user_threshold users
top_threshold = 200 # remove users who have rated more than top_threshold items
recommendation_type = "books" # books, music or movies
item_col = "book" # the item column
my_seed = 0 # random_seed
top_fraction_items = 0.2 # the limit for an item to be considered popular
top_fraction_users = 0.2# the limit for a user to be considered High Mainstriminess
split_by = "pop_fraq" # sort users by fraction of popular items (pop_fraq) or by average popularity in profile (pop_item_fraq)
test_size = 0.2 # the percentage of "hold out" data that are used for testing
rating_threshold = 1.0 # needed for the cornac library
predict_col = "rating" # the column we are predicting
train_way = "simple_split"
n_splits = 5 # the amount of splits

if train_way == "simple_split": n_splits = 1
rd.seed(my_seed)
np.random.seed(my_seed)

These additions will be useful so we can load and save the different files (plots and processed data) with clarity on the hyperparameters.

In [4]:
addition_1 = "_u"+str(item_threshold)+"_i"+str(user_threshold)+"_t"+str(top_threshold)
addition_2 = addition_1 + "_tfi"+str(int(100*top_fraction_items))
addition_3 = addition_2 + "_tfu"+str(int(100*top_fraction_users))
addition_4 = addition_3 + ("_sbpf" if (split_by=="pop_fraq") else "_sbpif")

## C. Read files

In [5]:
user_events_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/"+recommendation_type+"/ratings"+addition_1+".csv"
high_user_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/"+recommendation_type+"/high_users"+addition_4+".csv"
low_user_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/"+recommendation_type+"/low_users"+addition_4+".csv"
medium_user_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/"+recommendation_type+"/med_users"+addition_4+".csv"
df_item_dist_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/"+recommendation_type+"/item_pop_dist"+addition_1+".csv"

In [6]:
os.system("wget "+user_events_file)
os.system("wget "+low_user_file)
os.system("wget "+high_user_file)
os.system("wget "+medium_user_file)
os.system("wget "+df_item_dist_file)

0

In [7]:
low = pd.read_csv(low_user_file, index_col=0)
med = pd.read_csv(medium_user_file, index_col=0)
high = pd.read_csv(high_user_file, index_col=0)

In [8]:
num_users = len(low) + len(med) + len(high)
print(num_users)

6358


In [9]:
df_item_dist = pd.read_csv(df_item_dist_file, index_col = 0)

## D. Recommendation 

In [10]:
# we need two df_item_dist: one for Surprise, one for Cornac
df_item_dist_Surprise = df_item_dist.copy()

### Surprise

In [11]:
df_events = pd.read_csv(user_events_file, low_memory = False, header=0) # create dataframe

In [12]:
# load dataset in Surprise
reader = SurpriseReader(rating_scale=(df_events[predict_col].min(), df_events[predict_col].max()))
data = Dataset.load_from_df(df_events, reader)

In [13]:
# split in train and test set
trainset, testset = train_test_split(data, test_size = test_size, random_state = 0)

In [14]:
# select and initialize algorithms
algo_names = ["Random",
              "MostPopular",
              #'UserItemAvg',
                  'UserKNN',
                  'UserKNNAvg',
                  'NMF', 
                  'SVD']

# the default parameters for all algorithms
algos = [] 
algos.append(None)#Random())
algos.append(None)#MostPopular())
#algos.append(BaselineOnly()) 
algos.append(KNNBasic(sim_options = {'name': 'cosine', 'user_based': True})) 
algos.append(KNNWithMeans(sim_options = {'name': 'cosine', 'user_based': True})) 
algos.append(SurpriseNMF())
algos.append(SVD())

In [15]:
def get_top_n_Surprise(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [16]:
def get_top_n_random_Surprise(testset, df_item_dist, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r in testset:
      if len(top_n[uid]) == 0:
        for i in range(0, 10):
          top_n[uid].append((rd.choice(df_item_dist.index), i))
    
    return top_n

In [17]:
def get_top_n_mp_Surprise(testset, item_dist, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r in testset:
        if len(top_n[uid]) == 0:
            for iid, count in df_item_dist[:n]["count"].items():
                top_n[uid].append((iid, count))
    return top_n

In [18]:
trainset_for_testing = trainset.build_anti_testset() + trainset.build_testset()

In [21]:
len(trainset_for_testing)

44003718

In [30]:
trainset2 = trainset.build_anti_testset()

In [None]:
i = 0
low_rec_gap_list = [] # one entry per algorithm
medium_rec_gap_list = []
high_rec_gap_list = []
start = time.time()

for i in tqdm(range(0, len(algo_names))): # for every algorithm
    print("~~~~~~~~~~~~~~~~NEW~~~~~~~~~~~~~~~~~")
    df_item_dist_Surprise[algo_names[i]] = 0 # I am adding a column to Surprise
    low_rec_gap = 0
    medium_rec_gap = 0
    high_rec_gap = 0
    
    # get accuracy for personalized approaches
    if algo_names[i] != 'Random' and algo_names[i] != 'MostPopular': # for proper algorithms
        algos[i].fit(trainset) # fit
        predictions = algos[i].test(trainset2) # predict
        print(algo_names[i]) # end of fitting

        #get_mae_of_groups(predictions, low, med, high) TO BE ADDED
    
    # get top-n items and calculate gaps for all algorithms
    if algo_names[i] == 'Random':
        top_n = get_top_n_random_Surprise(trainset2, df_item_dist, n=10)
        print(algo_names[i])
    elif algo_names[i] == 'MostPopular':
        top_n = get_top_n_mp_Surprise(trainset2, df_item_dist, n=10)
        print(algo_names[i])
    else:
        top_n = get_top_n_Surprise(predictions, n=10)

    # calculate GAPs
    low_count = 0
    med_count = 0
    high_count = 0
    for uid, user_ratings in top_n.items():
        iid_list = []
        for (iid, _) in user_ratings:
            df_item_dist_Surprise.loc[iid, algo_names[i]] += 1
            iid_list.append(iid)
        gap = sum(df_item_dist_Surprise["count"].loc[iid_list]) / len(iid_list)
        if uid in low.index:
            low_rec_gap += gap
            low_count += 1
        elif uid in med.index:
            medium_rec_gap += gap
            med_count += 1
        elif uid in high.index:
            high_rec_gap += gap
            high_count += 1
    low_rec_gap_list.append(low_rec_gap / low_count)
    medium_rec_gap_list.append(medium_rec_gap / med_count)
    high_rec_gap_list.append(high_rec_gap / high_count)
    i += 1 # next algorithm
    end = time.time()
    print("It took " + str(np.round(end-start)) + " seconds.")
    start = time.time()

  0%|          | 0/6 [00:00<?, ?it/s]

~~~~~~~~~~~~~~~~NEW~~~~~~~~~~~~~~~~~
Random


 17%|█▋        | 1/6 [00:36<03:03, 36.79s/it]

It took 37.0 seconds.
~~~~~~~~~~~~~~~~NEW~~~~~~~~~~~~~~~~~
MostPopular


 33%|███▎      | 2/6 [01:14<02:28, 37.24s/it]

It took 38.0 seconds.
~~~~~~~~~~~~~~~~NEW~~~~~~~~~~~~~~~~~
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [80]:
# Not implemented here yet.
# if train_way == "kfold":
#     for alg in algo_names:
#         df_item_dist[alg] = df_item_dist[alg]/n_splits

In [46]:
#len(np.unique([x[0] for x in trainset_for_testing])),
len(np.unique([x[0] for x in testset]))  # unique users


5357

In [55]:
#len(np.unique([x[1] for x in trainset_for_testing])),
len(np.unique([x[1] for x in testset])) # unique items

5691

## Save files
To save:
1. df_item_dist
2. low_rec_gap_list etc

In [88]:
save = True

In [89]:
if save:
  from google.colab import drive
  import pickle as pkl
  drive.mount('/content/drive')

  df_item_dist_Surprise.to_csv("/content/drive/My Drive/item_pop_dist"+addition_1+"_results_Surprise.csv")
  with open("/content/drive/My Drive/low_rec_gap_list_Surprise"+addition_4+".pkl","wb") as f:
    pkl.dump(low_rec_gap_list,f)
  with open("/content/drive/My Drive/med_rec_gap_list_Surprise"+addition_4+".pkl","wb") as f:
    pkl.dump(medium_rec_gap_list,f)
  with open("/content/drive/My Drive/high_rec_gap_list_Surprise"+addition_4+".pkl","wb") as f:
    pkl.dump(high_rec_gap_list,f)

Mounted at /content/drive


### Cornac

In [19]:
# load dataset in Cornac
reader = CornacReader()
data = reader.read(user_events_file.split("/")[-1],sep =",", skip_lines =1)

In [38]:
# Split the data based on ratio
rs = RatioSplit(data=data, test_size=test_size, rating_threshold=rating_threshold, seed=0)

In [56]:
len([x for x in rs.__dict__["train_set"].item_data])

6921

In [61]:
# initialize models, here we are comparing: simple, traditional, and neural networks based models
models = [
          # 1: Random
          # 2: MostPop
          MostPop(),
          # 3: UserKNN
          # 4: BPR
          BPR(k=10, max_iter=200, learning_rate=0.001, lambda_reg=0.01, seed=123),
          # 5: MF
          MF(k=30, max_iter=100, learning_rate=0.01, lambda_reg=0.001, seed=123),
          # 6: PMF
          PMF(k=10, max_iter=100, learning_rate=0.001, lambda_reg=0.001),
          # 7: NMF
          NMF(k=15, max_iter=50, learning_rate=0.005, lambda_u=0.06, lambda_v=0.06, lambda_bu=0.02, lambda_bi=0.02, use_bias=False, verbose=True, seed=123),
          # 8: WMF
          WMF(k=50, max_iter=50, learning_rate=0.001, lambda_u=0.01, lambda_v=0.01, verbose=True, seed=123),
          # 9: PF
          HPF(k=50, seed=123, hierarchical=False, name="PF"),
          # 10: NueMF
          NeuMF(num_factors=8, layers=[32, 16, 8], act_fn="tanh", num_epochs=1, num_neg=3, batch_size=256, lr=0.001, seed=42, verbose=True),
          # 11: VAECF
          VAECF(k=10, autoencoder_structure=[20], act_fn="tanh", likelihood="mult", n_epochs=100, batch_size=100, learning_rate=0.001, beta=1.0, seed=123, use_gpu=True, verbose=True)
          ]

In [62]:
# define metrics to evaluate the models
metrics = [MAE(), MSE(), RMSE(), AUC(), MAP(), MRR(), 
           Precision(k=5), Precision(k=10), Precision(k=20), Precision(k=50),
           Recall(k=5), Recall(k=10), Recall(k=20), Recall(k=50),
           NDCG(k=5), NDCG(k=10), NDCG(k=20), NDCG(k=50),
           FMeasure(k=5), FMeasure(k=10), FMeasure(k=20), FMeasure(k=50)]

# put it together in an experiment, voilà!
exp = cornac.Experiment(eval_method=rs, models=models, metrics=metrics, user_based=True)
exp.run()

  0%|          | 0/50 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/50 [00:00<?, ?it/s]

Learning completed!
Learning...
Learning completed!


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


TEST:
...
        |    MAE |     MSE |   RMSE |    AUC |  F1@10 |  F1@20 |   F1@5 |  F1@50 |    MAP |    MRR | NDCG@10 | NDCG@20 | NDCG@5 | NDCG@50 | Precision@10 | Precision@20 | Precision@5 | Precision@50 | Recall@10 | Recall@20 | Recall@5 | Recall@50 | Train (s) | Test (s)
------- + ------ + ------- + ------ + ------ + ------ + ------ + ------ + ------ + ------ + ------ + ------- + ------- + ------ + ------- + ------------ + ------------ + ----------- + ------------ + --------- + --------- + -------- + --------- + --------- + --------
MostPop | 2.5202 | 10.0391 | 2.7988 | 0.6544 | 0.0105 | 0.0100 | 0.0099 | 0.0091 | 0.0147 | 0.0350 |  0.0175 |  0.0223 | 0.0141 |  0.0334 |       0.0076 |       0.0063 |      0.0092 |       0.0051 |    0.0253 |    0.0397 |   0.0151 |    0.0820 |    0.0094 |  18.3432
BPR     | 6.5658 | 46.4381 | 6.6616 | 0.6775 | 0.0105 | 0.0102 | 0.0099 | 0.0092 | 0.0148 | 0.0350 |  0.0175 |  0.0226 | 0.0141 |  0.0334 |       0.0076 |       0.0064 |      0.0092 |     

In [49]:
def compute_user_knn(C):
  ctime = time.time()
  print("Training User-based Collaborative Filtering...", )

  sim = C.dot(C.T)
  norms = [norm(C[i]) for i in range(C.shape[0])]

  for i in tqdm(range(C.shape[0])):
    sim[i][i] = 0.0
    for j in range(i+1, C.shape[0]):
      sim[i][j] /= (norms[i] * norms[j])
      sim[j][i] /= (norms[i] * norms[j])

  print("Done. Elapsed time:", time.time() - ctime, "s")
  rec_score = sim.dot(C)
  return rec_score

In [50]:
def read_training_data():
  training_matrix = np.zeros((rs.train_set.matrix.shape[0], rs.train_set.matrix.shape[1]))
  for uid in tqdm(rs.train_set.uid_map.values()):
    for iid in rs.train_set.iid_map.values():
      training_matrix[uid, iid] = rs.train_set.matrix[uid, iid]
  return training_matrix

In [51]:
# creating users-books rating matrix (will be used for User-KNN algorithm)
training_matrix = read_training_data()

100%|██████████| 6358/6358 [19:54<00:00,  5.32it/s]


In [53]:
len(training_matrix), len(training_matrix[0])

(6358, 6921)

In [57]:
# running User-KNN algorithms and getting the user-book scores
user_knn_scores = compute_user_knn(training_matrix)

Training User-based Collaborative Filtering...


100%|██████████| 6358/6358 [00:23<00:00, 275.51it/s] 


Done. Elapsed time: 32.87551736831665 s


In [59]:
# UserKNN recommendation algorithm
def get_top_n_UserKNN(n=10):
    print("User-KNN model is selected:")
    top_n = defaultdict(list)
    # test_items = list(rs.test_set.iid_map.keys())
    for uid in rs.train_set.uid_map.values():
      user_id = list(rs.train_set.user_ids)[uid]
      top_n_items_idxs = list(reversed(user_knn_scores[uid].argsort()))[:n]
      for iid in top_n_items_idxs:
        item_id = list(rs.train_set.item_ids)[iid]
        top_n[int(user_id)].append((int(item_id), user_knn_scores[uid][iid]))
    return top_n

In [77]:
model = exp.models[4]
model.rank(0)

(array([5748, 6247, 5063, ..., 4191, 6194, 5466]),
 array([6.1997457, 5.3605504, 5.9973583, ..., 6.5793386, 5.7426567,
        3.3694768], dtype=float32))

In [76]:
for model in exp.models:
  print(model.name, len(model.rank(0)[0]))

MostPop 6921
BPR 6921
MF 6921
PMF 6921
NMF 6921
WMF 6921
PF 6921
NeuMF 6921
VAECF 6921


In [None]:
def get_top_n(algo_name, n=10):
  for model in exp.models:
    if model.name == algo_name:
      print(model.name + " model is selected:")
      top_n = defaultdict(list)
      for uid in model.train_set.uid_map.values():
        user_id = list(model.train_set.user_ids)[uid]
        try:
          item_rank = model.rank(user_idx=uid)[0] # model.rank: item rank, item_score
        except:
          item_rank = model.rank(user_idx=int(uid))[0]
        # collect top N items
        item_rank_top = item_rank[:n]
        for iid in item_rank_top:
          item_id = list(model.train_set.item_ids)[iid]
          top_n[int(user_id)].append((int(item_id), model.score(uid, iid)))
  return top_n

In [None]:
# random recommendation algorithm
def get_top_n_random(n=10):
    print("Random model is selected:")
    top_n = defaultdict(list)
    test_items = list(rs.test_set.iid_map.keys())
    for uid in rs.train_set.uid_map.values():
      if uid not in top_n.keys():
        user_id = list(rs.train_set.user_ids)[uid]
        for i in range(0, n):
          top_n[int(user_id)].append((int(rd.choice(test_items)), i))
    return top_n

In [None]:
algo_names = ['Random', 'MostPop', 'UserKNN', 'MF', 'PMF', 'BPR', 'NMF', 'WMF', 'PF', 'NeuMF', 'VAECF']

i = 0
low_rec_gap_list = [] # one entry per algorithmus
medium_rec_gap_list = []
high_rec_gap_list = []

for i in range(0, len(algo_names)):
    df_item_dist[algo_names[i]] = 0
    low_rec_gap = 0
    medium_rec_gap = 0
    high_rec_gap = 0
    
    if algo_names[i] == 'Random':
      top_n = get_top_n_random(n=10)
    elif algo_names[i] == 'UserKNN':
      top_n = get_top_n_UserKNN(n=10)
    else:
      top_n = get_top_n(algo_names[i], n=10)
    low_count = 0
    med_count = 0
    high_count = 0
    for uid, user_ratings in top_n.items():
        iid_list = []
        for (iid, _) in user_ratings:
            df_item_dist.loc[iid, algo_names[i]] += 1
            iid_list.append(iid)
        gap = sum(df_item_dist["count"].loc[iid_list]) / len(iid_list)
        if uid in low.index:
            low_rec_gap += gap
            low_count += 1
        elif uid in med.index:
            medium_rec_gap += gap
            med_count += 1
        elif uid in high.index:
            high_rec_gap += gap
            high_count += 1
    low_rec_gap_list.append(low_rec_gap / low_count)
    medium_rec_gap_list.append(medium_rec_gap / med_count)
    high_rec_gap_list.append(high_rec_gap / high_count)
    i += 1 # next algorithm

## Save files
To save:
1. df_item_dist
2. low_rec_gap_list etc
3. exp.result & exp.metric
4. training user ids

In [None]:
save = True

In [None]:
if save:
  from google.colab import drive
  import pickle as pkl
  drive.mount('/content/drive')

  df_item_dist.to_csv("/content/drive/My Drive/item_pop_dist"+addition_1+"_results_Cornac.csv")
  with open("/content/drive/My Drive/experiment_results_cornac"+addition_4+".pkl","wb") as f:
    pkl.dump(exp.result,f)
  with open("/content/drive/My Drive/experiment_metrics_cornac"+addition_4+".pkl","wb") as f:
    pkl.dump(exp.metrics,f)
  with open("/content/drive/My Drive/low_rec_gap_list_cornac"+addition_4+".pkl","wb") as f:
    pkl.dump(low_rec_gap_list,f)
  with open("/content/drive/My Drive/med_rec_gap_list_cornac"+addition_4+".pkl","wb") as f:
    pkl.dump(medium_rec_gap_list,f)
  with open("/content/drive/My Drive/high_rec_gap_list_cornac"+addition_4+".pkl","wb") as f:
    pkl.dump(high_rec_gap_list,f)
  with open("/content/drive/My Drive/training_user_ids"+addition_4+"_cornac.pkl","wb") as f:
    pkl.dump(list(rs.train_set.user_ids),f)