<a href="https://colab.research.google.com/github/SavvinaDaniil/BiasInRecommendation/blob/main/2.%20Book%20Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

This notebook should be run on Google Colab.

# Process
In this notebook, I will train the book recommendation algorithms using two different packages: <a href="http://surpriselib.com/">Surprise</a> & <a href="https://cornac.readthedocs.io/en/latest/">Cornac</a>. 

## A. Import libraries

In [2]:
!pip install cornac

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
!pip uninstall tensorflow -y
!pip install tensorflow-gpu==1.15
!apt install --allow-change-held-packages libcudnn7=7.4.1.5-1+cuda10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-gpu==1.15
  Downloading tensorflow_gpu-1.15.0-cp37-cp37m-manylinux2010_x86_64.whl (411.5 MB)
[K     |████████████████████████████████| 411.5 MB 7.2 kB/s 
Collecting tensorboard<1.16.0,>=1.15.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 47.8 MB/s 
Collecting keras-applications>=1.0.8
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 7.7 MB/s 
Collecting tensorflow-estimator==1.15.1
  Downloading tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 75.6 MB/s 
Collecting gast==0.2.2
  Downloading gast-0.2.2.tar.gz (10 kB)
Building wheels for collected packages: gast
  Building wheel for gast (setup.py) ... [?25l[?25hdone
  Created wheel for gast: filename=gast-0.2.2-py3-none-any.whl siz

In [6]:
#%tensorflow_version 1.x
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import matplotlib.pyplot as plt
import random as rd
import pandas as pd
import numpy as np
#from run_algorithms import train_algorithms, train_algorithms_kf, prepare_dataset, prepare_dataset_kf
from tqdm import tqdm
import time
pd.set_option("display.precision", 6)

# Cornac imports
import cornac
from cornac.eval_methods import RatioSplit
from cornac.data import Reader as CornacReader #Reader exists in both packages
from cornac.models import MostPop, MF, PMF, BPR, NeuMF, WMF, HPF, VAECF, NMF
from cornac.models import NMF as CornacNMF #NMF exists in both packages
from cornac.metrics import MAE, MSE, RMSE, Precision, Recall, NDCG, AUC, MAP, FMeasure, MRR


from collections import defaultdict
from scipy import stats
from numpy.linalg import norm
import seaborn as sns
# set plot style: grey grid in the background:
sns.set(style="darkgrid")
pd.set_option("display.precision", 8)

## B. Set hyperparameters
There are certain hyperparameters that need to be tuned before the run. 

In [7]:
item_col = "ISBN" # the item column
my_seed = 0 # random_seed
test_size = 0.2 # the percentage of "hold out" data that are used for testing
rating_threshold = 1.0 # needed for the cornac library
predict_col = "rating" # the column we are predicting
train_way = "simple_split"
n_splits = 5 # the amount of splits

if train_way == "simple_split": n_splits = 1
rd.seed(my_seed)
np.random.seed(my_seed)

## C. Read files

General.

In [8]:
user_events_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/final_fairbook_ratings.csv"
high_user_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/high_users.csv"
low_user_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/low_users.csv"
medium_user_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/med_users.csv"
df_item_dist_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/item_pop_dist.csv"

Country.

In [None]:
# USA_oriented_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/books/high_users_u5_i5_t200_tfu20_USAr_new.csv"
# midUSA_oriented_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/books/med_users_u5_i5_t200_tfu20_USAr_new.csv"
# lowUSA_oriented_file = "https://raw.githubusercontent.com/SavvinaDaniil/BiasInRecommendation/main/data/processed/books/low_users_u5_i5_t200_tfu20_USAr_new.csv"

Read all files.

In [10]:
low_popularity = pd.read_csv(low_user_file, index_col=0)
med_popularity = pd.read_csv(medium_user_file, index_col=0)
high_popularity = pd.read_csv(high_user_file, index_col=0)

In [11]:
# USA_oriented = pd.read_csv(USA_oriented_file, index_col=0)
# midUSA_oriented = pd.read_csv(midUSA_oriented_file, index_col=0)
# lowUSA_oriented = pd.read_csv(lowUSA_oriented_file, index_col=0)

Confirm number of users.

In [12]:
num_users1 = len(low_popularity) + len(med_popularity) + len(high_popularity)
#num_users3 = len(USA_oriented) + len(midUSA_oriented) + len(lowUSA_oriented)
print(num_users1)
#num_users = num_users1

6354


Distribution of items (for popularity).

In [13]:
df_item_dist = pd.read_csv(df_item_dist_file, index_col = 0)

In [14]:
df_item_dist.head()

Unnamed: 0,count
121,0.05901794
68,0.04359459
40,0.03981744
413,0.03903053
443,0.03729934


## D. Recommendation 

### Cornac

In [15]:
# load dataset in Cornac
os.system("wget "+user_events_file)
reader = CornacReader()
data = reader.read(user_events_file.split("/")[-1],sep =",", skip_lines =1)

In [16]:
# Split the data based on ratio
rs = RatioSplit(data=data, test_size=test_size, rating_threshold=rating_threshold, seed=0)

In [17]:
# initialize models, here we are comparing: simple, traditional, and neural networks based models
models = [
          # 1: Random
          # 2: MostPop
          MostPop(),
          # 3: UserKNN
          # 4: BPR
          BPR(k=10, max_iter=200, learning_rate=0.001, lambda_reg=0.01, seed=123),
          # 5: MF
          MF(k=30, max_iter=100, learning_rate=0.01, lambda_reg=0.001, seed=123),
          # 6: PMF
          PMF(k=10, max_iter=100, learning_rate=0.001, lambda_reg=0.001),
          # 7: NMF
          NMF(k=15, max_iter=50, learning_rate=0.005, lambda_u=0.06, lambda_v=0.06, lambda_bu=0.02, lambda_bi=0.02, use_bias=False, verbose=True, seed=123),
          # 8: WMF
          WMF(k=50, max_iter=50, learning_rate=0.001, lambda_u=0.01, lambda_v=0.01, verbose=True, seed=123),
          # 9: PF
          HPF(k=50, seed=123, hierarchical=False, name="PF"),
          # 10: NueMF
          NeuMF(num_factors=8, layers=[32, 16, 8], act_fn="tanh", num_epochs=1, num_neg=3, batch_size=256, lr=0.001, seed=42, verbose=True),
          # 11: VAECF
          VAECF(k=10, autoencoder_structure=[20], act_fn="tanh", likelihood="mult", n_epochs=100, batch_size=100, learning_rate=0.001, beta=1.0, seed=123, use_gpu=True, verbose=True)
          ]

In [18]:
# define metrics to evaluate the models
metrics = [MAE(), MSE(), RMSE(), AUC(), MAP(), MRR(), 
           Precision(k=5), Precision(k=10), Precision(k=20), Precision(k=50),
           Recall(k=5), Recall(k=10), Recall(k=20), Recall(k=50),
           NDCG(k=5), NDCG(k=10), NDCG(k=20), NDCG(k=50),
           FMeasure(k=5), FMeasure(k=10), FMeasure(k=20), FMeasure(k=50)]

# put it together in an experiment, voilà!
exp = cornac.Experiment(eval_method=rs, models=models, metrics=metrics, user_based=True)
exp.run()

  0%|          | 0/50 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/50 [00:00<?, ?it/s]

Learning completed!
Learning...
Learning completed!


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


TEST:
...
        |    MAE |     MSE |   RMSE |    AUC |  F1@10 |  F1@20 |   F1@5 |  F1@50 |    MAP |    MRR | NDCG@10 | NDCG@20 | NDCG@5 | NDCG@50 | Precision@10 | Precision@20 | Precision@5 | Precision@50 | Recall@10 | Recall@20 | Recall@5 | Recall@50 | Train (s) | Test (s)
------- + ------ + ------- + ------ + ------ + ------ + ------ + ------ + ------ + ------ + ------ + ------- + ------- + ------ + ------- + ------------ + ------------ + ----------- + ------------ + --------- + --------- + -------- + --------- + --------- + --------
MostPop | 2.4102 |  9.3176 | 2.6756 | 0.6995 | 0.0120 | 0.0118 | 0.0104 | 0.0104 | 0.0165 | 0.0389 |  0.0191 |  0.0252 | 0.0147 |  0.0370 |       0.0089 |       0.0074 |      0.0097 |       0.0058 |    0.0275 |    0.0461 |   0.0156 |    0.0901 |    0.0115 |  13.0529
BPR     | 6.5232 | 45.8364 | 6.6137 | 0.7170 | 0.0120 | 0.0119 | 0.0104 | 0.0105 | 0.0168 | 0.0384 |  0.0191 |  0.0253 | 0.0146 |  0.0371 |       0.0089 |       0.0075 |      0.0097 |     

In [19]:
def compute_user_knn(C):
  ctime = time.time()
  print("Training User-based Collaborative Filtering...", )

  sim = C.dot(C.T)
  norms = [norm(C[i]) for i in range(C.shape[0])]

  for i in tqdm(range(C.shape[0])):
    sim[i][i] = 0.0
    for j in range(i+1, C.shape[0]):
      sim[i][j] /= (norms[i] * norms[j])
      sim[j][i] /= (norms[i] * norms[j])

  print("Done. Elapsed time:", time.time() - ctime, "s")
  rec_score = sim.dot(C)
  return rec_score

In [20]:
def read_training_data():
  training_matrix = np.zeros((rs.train_set.matrix.shape[0], rs.train_set.matrix.shape[1]))
  for uid in tqdm(rs.train_set.uid_map.values()):
    for iid in rs.train_set.iid_map.values():
      training_matrix[uid, iid] = rs.train_set.matrix[uid, iid]
  return training_matrix

In [21]:
# creating users-books rating matrix (will be used for User-KNN algorithm)
training_matrix = read_training_data()

100%|██████████| 6350/6350 [15:49<00:00,  6.68it/s]


In [23]:
# running User-KNN algorithms and getting the user-book scores
user_knn_scores = compute_user_knn(training_matrix)

Training User-based Collaborative Filtering...


100%|██████████| 6350/6350 [00:18<00:00, 339.62it/s] 


Done. Elapsed time: 24.818676948547363 s


In [24]:
# UserKNN recommendation algorithm
def get_top_n_UserKNN(n=10):
    print("User-KNN model is selected:")
    top_n = defaultdict(list)
    # test_items = list(rs.test_set.iid_map.keys())
    for uid in rs.train_set.uid_map.values():
      user_id = list(rs.train_set.user_ids)[uid]
      top_n_items_idxs = list(reversed(user_knn_scores[uid].argsort()))[:n]
      for iid in top_n_items_idxs:
        item_id = list(rs.train_set.item_ids)[iid]
        top_n[int(user_id)].append((int(item_id), user_knn_scores[uid][iid]))
    return top_n

In [25]:
def get_top_n(algo_name, n=10):
  for model in exp.models:
    if model.name == algo_name:
      print(model.name + " model is selected:")
      top_n = defaultdict(list)
      for uid in model.train_set.uid_map.values():
        user_id = list(model.train_set.user_ids)[uid]
        try:
          item_rank = model.rank(user_idx=uid)[0] # model.rank: item rank, item_score
        except:
          item_rank = model.rank(user_idx=int(uid))[0]
        # collect top N items
        item_rank_top = item_rank[:n]
        for iid in item_rank_top:
          item_id = list(model.train_set.item_ids)[iid]
          top_n[int(user_id)].append((int(item_id), model.score(uid, iid)))
  return top_n

In [26]:
# random recommendation algorithm
def get_top_n_random(n=10):
    print("Random model is selected:")
    top_n = defaultdict(list)
    test_items = list(rs.test_set.iid_map.keys())
    for uid in rs.train_set.uid_map.values():
      if uid not in top_n.keys():
        user_id = list(rs.train_set.user_ids)[uid]
        for i in range(0, n):
          top_n[int(user_id)].append((int(rd.choice(test_items)), i))
    return top_n

# Properties analysis

In [27]:
ratings = pd.read_csv(user_events_file, index_col=0)

Create property dictionaries.

In [28]:
country_dict = {}
for isbn in ratings[item_col].unique():
  country_dict[isbn] = ratings.country[ratings[item_col] == isbn].iloc[0]

Create columns of the relevant metric in the recommendations.

In [30]:
algo_names = ['Random', 'MostPop', 'UserKNN', 'MF', 'PMF', 'BPR', 'NMF', 'WMF', 'PF', 'NeuMF', 'VAECF']

# for i in range(0, len(algo_names)):
#   USA_oriented["new_ratio_USA_"+algo_names[i]] = 0.0
#   midUSA_oriented["new_ratio_USA_"+algo_names[i]] = 0.0
#   lowUSA_oriented["new_ratio_USA_"+algo_names[i]] = 0.0

For every algorithm, calculated the metric (male-female difference and USA ratio) in the recommendations for every user.

In [31]:
# for i in range(0, len(algo_names)):
#     USA_or_ratio = 0
#     midUSA_or_ratio = 0
#     lowUSA_or_ratio = 0
    
#     if algo_names[i] == 'Random':
#       top_n = get_top_n_random(n=10)
#     elif algo_names[i] == 'UserKNN':
#       top_n = get_top_n_UserKNN(n=10)
#     else:
#       top_n = get_top_n(algo_names[i], n=10)
#     print("Top n calculated for ", algo_names[i])
#     users = top_n.keys()
#     user_recommendations = {}

#     for user in users:
#       user_recommendations[user] = [x[0] for x in top_n[user]]

#       USA_count = 0
      
#       for rec in user_recommendations[user]:
#         if country_dict[rec] == "USA":
#           USA_count += 1

#       ratio_USA = USA_count/len(user_recommendations[user])        

#       if user in USA_oriented.index:
#         USA_oriented.at[user, "new_ratio_USA_"+algo_names[i]] = ratio_USA
#       elif user in lowUSA_oriented.index:
#         lowUSA_oriented.at[user, "new_ratio_USA_"+algo_names[i]] = ratio_USA
#       else:
#         midUSA_oriented.at[user, "new_ratio_USA_"+algo_names[i]] = ratio_USA

# Save files for properties

In [32]:
save = True

In [None]:
# if save:
#   from google.colab import drive
#   import pickle as pkl
#   drive.mount('/content/drive')

#   USA_oriented.to_csv("/content/drive/My Drive/new_USA_oriented_10_new.csv")
#   midUSA_oriented.to_csv("/content/drive/My Drive/new_midUSA_10_new.csv")
#   lowUSA_oriented.to_csv("/content/drive/My Drive/new_lowUSA_oriented_10_new.csv")
  


Mounted at /content/drive


# Popularity analysis

In [33]:
algo_names = ['Random', 'MostPop', 'UserKNN', 'MF', 'PMF', 'BPR', 'NMF', 'WMF', 'PF', 'NeuMF', 'VAECF']

i = 0
low_rec_gap_list = [] # one entry per algorithmus
medium_rec_gap_list = []
high_rec_gap_list = []

for i in range(0, len(algo_names)):
    df_item_dist[algo_names[i]] = 0
    low_rec_gap = 0
    medium_rec_gap = 0
    high_rec_gap = 0
    
    if algo_names[i] == 'Random':
      top_n = get_top_n_random(n=10)
    elif algo_names[i] == 'UserKNN':
      top_n = get_top_n_UserKNN(n=10)
    else:
      top_n = get_top_n(algo_names[i], n=10)
    low_count = 0
    med_count = 0
    high_count = 0
    for uid, user_ratings in top_n.items():
        iid_list = []
        for (iid, _) in user_ratings:
            df_item_dist.loc[iid, algo_names[i]] += 1
            iid_list.append(iid)
        gap = sum(df_item_dist["count"].loc[iid_list]) / len(iid_list)
        if uid in low_popularity.index:
            low_rec_gap += gap
            low_count += 1
        elif uid in med_popularity.index:
            medium_rec_gap += gap
            med_count += 1
        elif uid in high_popularity.index:
            high_rec_gap += gap
            high_count += 1
    low_rec_gap_list.append(low_rec_gap / low_count)
    medium_rec_gap_list.append(medium_rec_gap / med_count)
    high_rec_gap_list.append(high_rec_gap / high_count)
    i += 1 # next algorithm

Random model is selected:
MostPop model is selected:
User-KNN model is selected:
MF model is selected:
PMF model is selected:
BPR model is selected:
NMF model is selected:
WMF model is selected:
PF model is selected:
NeuMF model is selected:
VAECF model is selected:


## Save files
To save:
1. df_item_dist
2. low_rec_gap_list etc
3. exp.result & exp.metric
4. training user ids

In [34]:
save = True

In [40]:
if save:
  from google.colab import drive
  import pickle as pkl
  drive.mount('/content/drive')

  df_item_dist.to_csv("/content/drive/My Drive/item_pop_dist.csv")
  with open("/content/drive/My Drive/experiment_results_cornac.pkl","wb") as f:
    pkl.dump(exp.result,f)
  with open("/content/drive/My Drive/experiment_metrics_cornac.pkl","wb") as f:
    pkl.dump(exp.metrics,f)
  with open("/content/drive/My Drive/low_rec_gap_list_cornac.pkl","wb") as f:
    pkl.dump(low_rec_gap_list,f)
  with open("/content/drive/My Drive/med_rec_gap_list_cornac.pkl","wb") as f:
    pkl.dump(medium_rec_gap_list,f)
  with open("/content/drive/My Drive/high_rec_gap_list_cornac.pkl","wb") as f:
    pkl.dump(high_rec_gap_list,f)
  with open("/content/drive/My Drive/training_user_ids.pkl","wb") as f:
    pkl.dump(list(rs.train_set.user_ids),f)

Mounted at /content/drive
