# Books Recommender System

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
######
from lenskit import batch, topn
import lenskit.crossfold as xf
import warnings
warnings.filterwarnings('ignore')
# !pip install lenskit_tf
from lenskit import topn, util
from lenskit.algorithms import Recommender, item_knn, user_knn as knn, als, tf
from lenskit.algorithms import basic


In [8]:
books = pd.read_csv('goodbook/books.csv')
ratings = pd.read_csv('goodbook/ratings.csv')
book_tags = pd.read_csv('goodbook/book_tags.csv')
tags = pd.read_csv('goodbook/tags.csv')

# Start with Book tags

In [9]:
genres = ["Art", "Biography", "Business", "Chick Lit", "Children's", "Christian", "Classics",
          "Comics", "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction",
          "Gay and Lesbian", "Graphic Novels", "Historical Fiction", "History", "Horror",
          "Humor and Comedy", "Manga", "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal",
          "Philosophy", "Poetry", "Psychology", "Religion", "Romance", "Science", "Science Fiction", 
          "Self Help", "Suspense", "Spirituality", "Sports", "Thriller", "Travel", "Young Adult"]

genres = list(map(str.lower, genres))
genres[:4]


['art', 'biography', 'business', 'chick lit']

In [10]:
available_genres = tags.loc[tags.tag_name.str.lower().isin(genres)]


In [11]:
available_genres_books = book_tags[book_tags.tag_id.isin(available_genres.tag_id)]
print('There are {} books that are tagged with above genres'.format(available_genres_books.shape[0]))


There are 60573 books that are tagged with above genres


In [59]:
# 250 in 35min
((950 * 35)/250) #/ 60

133.0

In [12]:
available_genres_books['genre'] = available_genres.tag_name.loc[available_genres_books.tag_id].values
available_genres_books.head()

Unnamed: 0,goodreads_book_id,tag_id,count,genre
1,1,11305,37174,fantasy
5,1,11743,9954,fiction
25,1,7457,958,classics
38,1,22973,673,paranormal
52,1,20939,465,mystery


In [14]:
available_genres_books.to_csv ("RecSys_News/goodbook/available_genres_books.csv", index= False)

In [442]:
np.sort(ratings.groupby('user_id')['rating'].count())[::-1]

array([200, 200, 199, ...,   2,   2,   2])

In [443]:
dup_ratings = ratings.drop_duplicates(keep='first')
dup_ratings

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
...,...,...,...
981751,10000,48386,5
981752,10000,49007,4
981753,10000,49383,5
981754,10000,50124,5


In [444]:
print (len (dup_ratings.user_id.unique()))
print(len (dup_ratings.book_id.unique()))

53424
10000


In [445]:
available_genres_books = book_tags[book_tags.tag_id.isin(available_genres.tag_id)]
available_genres_books ["book_id"] = available_genres_books ["goodreads_book_id"]
available_genres_books['genre'] = available_genres.tag_name.loc[available_genres_books.tag_id].values

# Merge the DataFrames based on the 'book_id' column
genres_ratings = dup_ratings.merge(available_genres_books, on='book_id', how='inner')
genres_ratings

Unnamed: 0,book_id,user_id,rating,goodreads_book_id,tag_id,count,genre
0,1,314,5,1,11305,37174,fantasy
1,1,314,5,1,11743,9954,fiction
2,1,314,5,1,7457,958,classics
3,1,314,5,1,22973,673,paranormal
4,1,314,5,1,20939,465,mystery
...,...,...,...,...,...,...,...
496543,9998,53249,5,9998,14821,21,horror
496544,9998,53249,5,9998,8055,18,contemporary
496545,9998,53249,5,9998,23471,17,philosophy
496546,9998,53249,5,9998,10210,7,ebooks


In [446]:
df_fil = genres_ratings[['user_id', 'book_id', 'rating', 'genre']]
df_fil

Unnamed: 0,user_id,book_id,rating,genre
0,314,1,5,fantasy
1,314,1,5,fiction
2,314,1,5,classics
3,314,1,5,paranormal
4,314,1,5,mystery
...,...,...,...,...
496543,53249,9998,5,horror
496544,53249,9998,5,contemporary
496545,53249,9998,5,philosophy
496546,53249,9998,5,ebooks


In [447]:
ratings_sorted = df_fil.sort_values(by='user_id')
ratings_sorted

Unnamed: 0,user_id,book_id,rating,genre
484516,2,9762,4,philosophy
484513,2,9762,4,psychology
484514,2,9762,4,spirituality
484517,2,9762,4,religion
484515,2,9762,4,nonfiction
...,...,...,...,...
201189,53424,4214,5,classics
201188,53424,4214,5,fantasy
201195,53424,4214,5,ebooks
201196,53424,4214,5,travel


In [449]:
grouped_df = ratings_sorted.groupby(['user_id', 'book_id']).agg({'genre': ', '.join, 'rating': 'mean'}).reset_index()
grouped_df

Unnamed: 0,user_id,book_id,genre,rating
0,2,9762,"philosophy, psychology, spirituality, religion...",4.0
1,3,9014,"thriller, fantasy, fiction, horror, ebooks, sc...",1.0
2,4,3273,"ebooks, travel, contemporary, fiction, history...",2.0
3,7,1519,"fantasy, philosophy, history, poetry, fiction,...",5.0
4,7,3711,"religion, classics, contemporary, fiction",5.0
...,...,...,...,...
79526,53420,4625,"ebooks, classics, fiction",3.0
79527,53420,6538,"nonfiction, history, suspense, ebooks, science...",4.0
79528,53422,7667,"suspense, mystery, thriller, fiction, crime, s...",4.0
79529,53423,4984,"classics, fiction, biography, ebooks, science,...",5.0


In [450]:
# Step 1: Filter book_ids with less than 10 ratings
book_counts = grouped_df['book_id'].value_counts()
popular_books = book_counts[book_counts >= 3].index
df_filtered_books = grouped_df[grouped_df['book_id'].isin(popular_books)]

# Step 2: Filter users with less than 20 interactions
user_counts = df_filtered_books['user_id'].value_counts()
active_users = user_counts[user_counts >= 10].index
df_filtered = df_filtered_books[df_filtered_books['user_id'].isin(active_users)]

# Step 3: Reset the indices of the filtered DataFrame
df_filtered.reset_index(drop=True, inplace=True)

# Now, df_filtered contains the data where book_ids have at least 10 ratings, users have at least 20 interactions, and the indices are reset.

# Step 4: Create mapping dictionaries for book_id and user_id to integer indices
book_id_to_index = {book_id: index+1 for index, book_id in enumerate(df_filtered['book_id'].unique())}
user_id_to_index = {user_id: index+1 for index, user_id in enumerate(df_filtered['user_id'].unique())}

# Step 5: Map book_id and user_id to integer indices in the DataFrame
df_filtered['book_index'] = df_filtered['book_id'].map(book_id_to_index)
df_filtered['user_index'] = df_filtered['user_id'].map(user_id_to_index)


In [453]:
grouped_df = df_filtered[['user_index', 'book_index', 'rating', 'genre']]
grouped_df.head (50)

Unnamed: 0,user_index,book_index,rating,genre
0,1,1,5.0,"classics, fiction, fantasy, contemporary, myst..."
1,1,2,1.0,"fiction, classics, fantasy, ebooks"
2,1,3,2.0,"classics, science, fiction, fantasy, philosoph..."
3,1,4,5.0,"science, ebooks, religion, philosophy, classic..."
4,1,5,4.0,"ebooks, fiction, classics, contemporary, roman..."
5,1,6,4.0,"ebooks, romance, biography, fiction, contempor..."
6,1,7,4.0,"religion, ebooks, spirituality, romance, philo..."
7,1,8,5.0,"romance, contemporary, fiction, classics"
8,1,9,3.0,"fiction, contemporary, classics, crime, scienc..."
9,1,10,2.0,"religion, fantasy, history, art, romance, eboo..."


In [455]:
# grouped_df= df_filtered.copy()
# Number of unique user_ids and book_ids
num_unique_users = grouped_df['user_index'].nunique()
num_unique_books = grouped_df['book_index'].nunique()

# Total possible interactions (assuming all combinations exist)
total_possible_interactions = num_unique_users * num_unique_books

# Actual number of interactions (non-zero ratings)
num_interactions = grouped_df.shape[0]

# Sparsity calculation
sparsity = 1.0 - (num_interactions / total_possible_interactions)

# Print the results
print(f"Number of unique user_ids: {num_unique_users}")
print(f"Number of unique book_ids: {num_unique_books}")
print(f"Sparsity of the data: {sparsity:.4f}")

Number of unique user_ids: 943
Number of unique book_ids: 761
Sparsity of the data: 0.9816


# Train / Test Split

In [26]:
for i, tp in enumerate(xf.partition_users(grouped_df, 1, xf.SampleN(5))):
  tp.train.to_csv('train-book%d.csv' % (i,), index= False)
  tp.test.to_csv('val-book%d.csv' % (i,), index= False)

In [30]:
grouped_df.to_csv ("goodbook/ratings_filtered_goodbook.csv", index= False)

In [28]:
train = pd.read_csv("goodbook/trainVal-book0.csv")
for i, tp in enumerate(xf.partition_users(grouped_df, 1, xf.SampleN(5))):
  tp.train.to_csv('train-book%d.csv' % (i,), index= False)
  tp.test.to_csv('val-book%d.csv' % (i,), index= False)

In [None]:
grouped_df.to_csv("goodbook/processed_GB_Data.csv", index=False)


# Recommendation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
######
from lenskit import batch, topn
import lenskit.crossfold as xf
import warnings
warnings.filterwarnings('ignore')
# !pip install lenskit_tf
from lenskit import topn, util
from lenskit.algorithms import Recommender, als, tf
from lenskit.algorithms import basic


In [153]:
p_obfuscation = 0.01
preprocess = ["1Step", "2Step", "original"][1]
mode = ["strategic", "random"][0]
topk = [5, 10][0] #
round = [1, 2, 3][1]
# train = pd.read_csv ("goodbook/trainVal-book0.csv", sep=",", names= ["user", "item", "rating", "genre"]) # 
# train = pd.read_csv (f"goodbook/Random/Adding_user_item_matrix_{p_obfuscation}_top50Inditems_top100IndiUsers_Categories.csv", sep=",", names= ["user", "item"])#, "rating", "genre"]) # 
train = pd.read_csv (f"goodbook/Random/obfuscated_user_item_matrix_{p_obfuscation}_{mode}_top50Inditems_top100IndiUsers_Categories.csv", sep=",", names= ["user", "item"])#, "rating", "genre"]) # 
# val = pd.read_csv ("goodbook/val-book0.csv", sep=",", names= ["user", "item", "rating", "genre"])
test = pd.read_csv ("goodbook/test-book0.csv", sep=",", names= ["user", "item", "rating", "genre"])
topk

5

In [154]:
train

Unnamed: 0,user,item
0,user_id,item_id
1,1,3
2,1,4
3,1,5
4,1,7
...,...,...
7890,943,88
7891,943,90
7892,943,109
7893,943,127


In [155]:
# grouped_df= df_filtered.copy()
# Number of unique user_ids and book_ids
num_unique_users = train['user'].nunique()
num_unique_books = train['item'].nunique()

# Total possible interactions (assuming all combinations exist)
total_possible_interactions = num_unique_users * num_unique_books

# Actual number of interactions (non-zero ratings)
num_interactions = train.shape[0]

# Sparsity calculation
sparsity = 1.0 - (num_interactions / total_possible_interactions)

# Print the results
print(f"Number of unique user_ids: {num_unique_users}")
print(f"Number of unique book_ids: {num_unique_books}")
print(f"Sparsity of the data: {sparsity:.4f}")
print(f"lentgh of the data: ", len (train))

Number of unique user_ids: 944
Number of unique book_ids: 705
Sparsity of the data: 0.9881
lentgh of the data:  7895


In [156]:
train ["rating"] = 1
train = train [["user", "item", "rating"]] #.copy ()
# trainVal_small.to_csv ("goodbook/trainVal_small.csv", index= False)\

# val_small = train [["user", "item", "rating"]] #.copy ()
# train_small.to_csv ("goodbook/train_small.csv", index= False)
test ["rating"] = 1
test = test [["user", "item", "rating"]] #.copy ()
# test_small.to_csv ("goodbook/test_small.csv", index= False)


In [178]:

warnings.filterwarnings('ignore')

# Constants
p_obfuscation = 0.1
preprocess = "2Step"
mode = "random"

# Data Loading
train = pd.read_csv(f"goodbook/Random/Adding_user_item_matrix_{p_obfuscation}_top50Inditems_top100IndiUsers_Categories_{mode}.csv", sep=",", names=["user", "item"])
test = pd.read_csv("goodbook/test-book0.csv", sep=",", names=["user", "item", "rating", "genre"])
# {mode}_ : only for removal outputs
# Sparsity Calculation
num_unique_users = train['user'].nunique()
num_unique_books = train['item'].nunique()
total_possible_interactions = num_unique_users * num_unique_books
num_interactions = train.shape[0]
sparsity = 1.0 - (num_interactions / total_possible_interactions)

print(f"Number of unique user_ids: {num_unique_users}")
print(f"Number of unique book_ids: {num_unique_books}")
print(f"Sparsity of the data: {sparsity:.4f}")
print(f"Length of the data: {len(train)}")

# Preparing Data
train["rating"] = 1
train = train[["user", "item", "rating"]]
test["rating"] = 1
test = test[["user", "item", "rating"]]

# Algorithms
BPR = tf.BPR(features=200, epochs=200)
algo_ii = item_knn.ItemItem(20, feedback='implicit', use_ratings=False)
algo_uu = user_knn.UserUser(50, feedback='implicit', use_ratings=False)
algo_als = als.ImplicitMF(features=50)
pop = basic.Popular()

def evaluation(aname, algo, train, test, topk):
    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)
    users = test.user.unique()
    recs = batch.recommend(fittable, users, topk)
    recs['Algorithm'] = aname
    return recs

def run_evaluation(round, topk, train, test):
    all_recs = []
    all_recs.append(evaluation('ItemKNN', algo_ii, train, test, topk))
    all_recs.append(evaluation('UserKNN', algo_uu, train, test, topk))
    all_recs.append(evaluation('BPR', BPR, train, test, topk))
    all_recs.append(evaluation('implicitMF', algo_als, train, test, topk))
    all_recs.append(evaluation('pop', pop, train, test, topk))

    all_recs_df = pd.concat(all_recs, ignore_index=True)
    file_path = f"RecSys_News/goodbook/results/Random/RS-{topk}_{p_obfuscation}_{preprocess}_{round}.csv"
    all_recs_df.to_csv(file_path, index=False)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    rla.add_metric(topn.hit)
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)
    rla.add_metric(topn.recip_rank)

    results = rla.compute(all_recs_df, test)
    results_file_path = f"RecSys_News/goodbook/results/Random/results_RecSys-{topk}_{p_obfuscation}_{preprocess}_{round}.csv"
    results.to_csv(results_file_path, index=False)

    aggres = results.groupby('Algorithm').mean()
    aggres_file_path = f"RecSys_News/goodbook/results/Random/aggres_RecSys-{topk}_{p_obfuscation}_{preprocess}_{round}.csv"
    aggres.to_csv(aggres_file_path, index=False)

for round in range(1, 4):
    for topk in [5, 10]:
        run_evaluation(round, topk, train, test)


Number of unique user_ids: 943
Number of unique book_ids: 721
Sparsity of the data: 0.9861
Length of the data: 9446
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/

## Average the recommendation performance

In [1]:

p_obfuscations = [0.01, 0.02, 0.05, 0.1]
preprocess = "2Step"
mode = "random"

# Define the rounds and top-k values you want to average over
rounds = [1, 2, 3]
topks = [5, 10]

# Initialize an empty list to hold DataFrames
all_dfs = []

# Read the CSV files for each p_obfuscation, round, and top-k
for p_obfuscation in p_obfuscations:
    for round in rounds:
        for topk in topks:
            file_path = f"RecSys_News/goodbook/results/Greedy/2Step/aggres_RecSys-{topk}_{p_obfuscation}_{preprocess}_{round}.csv"
            df = pd.read_csv(file_path)
            df['p_obfuscation'] = p_obfuscation
            df['Round'] = round
            df['TopK'] = topk
            all_dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(all_dfs, ignore_index=True)

# Group by Algorithm, TopK, and p_obfuscation to calculate mean and std for each metric
metrics = ['ndcg', 'hit', 'precision', 'recall', 'recip_rank']
agg_results = combined_df.groupby(['Algorithm', 'TopK', 'p_obfuscation'])[metrics].agg(['mean', 'std']).reset_index()
agg_results
