<a href="https://colab.research.google.com/github/SavvinaDaniil/BiasInRecommendation/blob/main/Book%20recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

This notebook should be run on Google Colab.

# Process
In this notebook, I will train the book recommendation algorithms using two different packages: <a href="http://surpriselib.com/">Surprise</a> & <a href="https://cornac.readthedocs.io/en/latest/">Cornac</a>. 

## A. Import libraries

In [None]:
!pip install cornac
!pip install surprise



In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import matplotlib.pyplot as plt
import random as rd
import pandas as pd
import numpy as np
from run_algorithms import train_algorithms, train_algorithms_kf, prepare_dataset, prepare_dataset_kf
from tqdm import tqdm
import time
pd.set_option("display.precision", 5)
import cornac
from cornac.eval_methods import RatioSplit
from cornac.data import Reader
from cornac.models import MostPop, MF, PMF, BPR, NeuMF, WMF, HPF, VAECF, NMF
from cornac.metrics import MAE, MSE, RMSE, Precision, Recall, NDCG, AUC, MAP, FMeasure, MRR
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from scipy import stats
from numpy.linalg import norm
import seaborn as sns
# set plot style: grey grid in the background:
sns.set(style="darkgrid")



## B. Set hyperparameters
The following hyperparameters must be set before running the rest of the notebook.

In [None]:
# --- Filtering thresholds ---
# Users with fewer than `item_threshold` items are removed.
item_threshold = 5
# Items with fewer than `user_threshold` users are removed.
user_threshold = 5
# Users who have rated more than `top_threshold` items are removed.
top_threshold = 200

# --- Dataset selection ---
# One of "books", "music" or "movies".
recommendation_type = "books"
# Name of the item column.
item_col = "book"

# --- Reproducibility ---
# Random seed.
my_seed = 0

# --- Popularity and user-group definitions ---
# The limit for an item to be considered popular.
top_fraction_items = 0.2
# The limit for a user to be considered high mainstreaminess.
top_fraction_users = 0.2
# How users are sorted: by fraction of popular items ("pop_fraq")
# or by average popularity in profile ("pop_item_fraq").
split_by = "pop_fraq"

In [None]:
# Evaluation settings; both values are passed to cornac's RatioSplit further down
# (as its `test_size` and `rating_threshold` arguments).
test_size, rating_threshold = 0.2, 1.0

These suffixes encode the hyperparameter values, so that the files we load and save (plots and processed data) can be clearly matched to the settings that produced them.

In [None]:
# File-name suffixes built from the hyperparameters; each one extends the previous.
# With the values set above they are, e.g., "_u5_i5_t200" (addition_1) and
# "_u5_i5_t200_tfi20_tfu20_sbpf" (addition_4).
addition_1 = f"_u{item_threshold}_i{user_threshold}_t{top_threshold}"
addition_2 = f"{addition_1}_tfi{int(100 * top_fraction_items)}"
addition_3 = f"{addition_2}_tfu{int(100 * top_fraction_users)}"
# Any split_by value other than "pop_fraq" is tagged "_sbpif".
_split_tag = "_sbpf" if split_by == "pop_fraq" else "_sbpif"
addition_4 = addition_3 + _split_tag

In [None]:
# Seed Python's `random` module first, then NumPy's global generator,
# both with the same notebook-wide seed.
for seed_fn in (rd.seed, np.random.seed):
    seed_fn(my_seed)

## C. Read files

In [None]:
# Tries to fetch the processed-data directory URL itself (not an individual file).
# NOTE(review): the recorded output of this cell is 2048, i.e. a non-zero status
# returned by os.system, so this call appears to have failed. The individual CSV
# files are downloaded separately further down — confirm whether this cell is needed.
os.system("wget https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/")

2048

In [None]:
# Remote locations of the processed ratings file and the three user-group files.
# All four live in the per-dataset folder selected by `recommendation_type`.
_processed_url = (
    "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/"
    + recommendation_type
)
# The ratings file only depends on the filtering thresholds (addition_1) ...
user_events_file = _processed_url + "/ratings" + addition_1 + ".csv"
# ... while the user-group files carry the full suffix (addition_4).
high_user_file = _processed_url + "/high_users" + addition_4 + ".csv"
low_user_file = _processed_url + "/low_users" + addition_4 + ".csv"
medium_user_file = _processed_url + "/med_users" + addition_4 + ".csv"

In [None]:
# Download the four CSV files with wget, in the same order as before:
# ratings, low-user group, high-user group, medium-user group.
wget_statuses = [
    os.system("wget " + remote_file)
    for remote_file in (user_events_file, low_user_file, high_user_file, medium_user_file)
]
# Display the status of the last download, as the cell's output.
wget_statuses[-1]

0

## D. Recommendation 

### Cornac

In [None]:
# Load the ratings into Cornac from the local copy of the ratings CSV.
# The local file name is the last path component of the download URL
# (presumably the name wget saved it under — TODO confirm).
ratings_csv = user_events_file.rsplit("/", 1)[-1]
reader = Reader()
# skip_lines=1 skips the first line of the file (presumably the CSV header — TODO confirm).
data = reader.read(ratings_csv, sep=",", skip_lines=1)

In [None]:
# Ratio-based train/test split, configured with the settings defined earlier
# (test fraction, rating threshold and the notebook-wide seed).
rs = RatioSplit(
    data=data,
    test_size=test_size,
    rating_threshold=rating_threshold,
    seed=my_seed,
)

In [None]:
# Models to compare: simple, traditional, and neural-network-based recommenders.
# The numbering follows the original list; entries 1 (Random) and 3 (UserKNN)
# are listed for reference only and are not instantiated here.
models = [
    # 1: Random (not instantiated in this list)
    # 2: MostPop
    MostPop(),
    # 3: UserKNN (not instantiated in this list)
    # 4: BPR
    BPR(
        k=10,
        max_iter=200,
        learning_rate=0.001,
        lambda_reg=0.01,
        seed=123,
    ),
    # 5: MF
    MF(
        k=30,
        max_iter=100,
        learning_rate=0.01,
        lambda_reg=0.001,
        seed=123,
    ),
    # 6: PMF
    # NOTE(review): unlike the other parameterised models, no `seed` is passed here.
    PMF(
        k=10,
        max_iter=100,
        learning_rate=0.001,
        lambda_reg=0.001,
    ),
    # 7: NMF
    NMF(
        k=15,
        max_iter=50,
        learning_rate=0.005,
        lambda_u=0.06,
        lambda_v=0.06,
        lambda_bu=0.02,
        lambda_bi=0.02,
        use_bias=False,
        verbose=True,
        seed=123,
    ),
    # 8: WMF
    WMF(
        k=50,
        max_iter=50,
        learning_rate=0.001,
        lambda_u=0.01,
        lambda_v=0.01,
        verbose=True,
        seed=123,
    ),
    # 9: PF (HPF with hierarchical=False, reported under the name "PF")
    HPF(
        k=50,
        seed=123,
        hierarchical=False,
        name="PF",
    ),
    # 10: NeuMF
    NeuMF(
        num_factors=8,
        layers=[32, 16, 8],
        act_fn="tanh",
        num_epochs=1,
        num_neg=3,
        batch_size=256,
        lr=0.001,
        seed=42,
        verbose=True,
    ),
    # 11: VAECF
    VAECF(
        k=10,
        autoencoder_structure=[20],
        act_fn="tanh",
        likelihood="mult",
        n_epochs=100,
        batch_size=100,
        learning_rate=0.001,
        beta=1.0,
        seed=123,
        use_gpu=True,
        verbose=True,
    ),
]

In [None]:
# Metrics used to evaluate the models: first the metrics that take no cut-off
# (MAE, MSE, RMSE, AUC, MAP, MRR), then Precision, Recall, NDCG and FMeasure,
# each at k = 5, 10, 20 and 50 (grouped by metric, in that order).
top_k_values = (5, 10, 20, 50)
metrics = [MAE(), MSE(), RMSE(), AUC(), MAP(), MRR()]
metrics += [
    metric_cls(k=k)
    for metric_cls in (Precision, Recall, NDCG, FMeasure)
    for k in top_k_values
]

# Combine the split, the models and the metrics into one experiment and run it.
exp = cornac.Experiment(
    eval_method=rs,
    models=models,
    metrics=metrics,
    user_based=True,
)
exp.run()