In [1]:
from IPython.display import display
import gzip
import json
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
from math import sqrt
import sys, os
from contextlib import contextmanager
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tcristea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# 1. Read and parse: goodread_books, goodread_interactions, book_id_map, my_rated_books

In [3]:
def read_authors_dict():
    authors = {}
    with open("inputs/goodreads_book_authors.json", 'r') as f:
        while line := f.readline():
            json_line = json.loads(line)
            authors[json_line["author_id"]] = json_line["name"]
    return authors
    

In [4]:
authors_dict = read_authors_dict()

In [5]:
def get_needed_fields(json_line, authors_dict):
    return {
        "book_id": json_line["book_id"],
        "title": json_line["title"],
        "description": json_line["description"],
        "ratings_count": json_line["ratings_count"],
        "num_pages": json_line["num_pages"],
        "publication_year": json_line["publication_year"],
        "url": json_line["url"],
        # "image_url": json_line["image_url"],
        "average_rating": json_line["average_rating"],
        "authors": " ".join([authors_dict[x["author_id"]] for x in json_line["authors"]]),
        "publisher": json_line["publisher"]
    }

In [6]:
def get_books_df(authors_dict, min_rating_count=1000):
    parsed_books = []
    with gzip.open("inputs/goodreads_books.json.gz", 'r') as f:
        while line := f.readline():
            needed_fields = get_needed_fields(json.loads(line), authors_dict)
            try:
                ratings_count = int(needed_fields["ratings_count"])
            except ValueError:
                continue
            if ratings_count > min_rating_count:
                parsed_books.append(needed_fields)
    books = pd.DataFrame.from_dict(parsed_books)
    books["ratings_count"] = pd.to_numeric(books["ratings_count"])
    books["title"] = books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
    books["title"] = books["title"].str.lower()
    books["title"] = books["title"].str.replace("\s+", " ", regex=True)
    books = books[books["title"].str.len() > 0]
    books["book_id"] = pd.to_numeric( books["book_id"])
    return books

In [7]:
books_df = get_books_df(authors_dict, 1000)

In [8]:
books_df

Unnamed: 0,book_id,title,description,ratings_count,num_pages,publication_year,url,average_rating,authors,publisher
0,6066819,best friends forever,Addie Downs and Valerie Adler were eight when ...,51184,368,2009,https://www.goodreads.com/book/show/6066819-be...,3.49,Jennifer Weiner,Atria Books
1,89375,90 minutes in heaven a true story of death and...,As he is driving home from a minister's confer...,68157,,,https://www.goodreads.com/book/show/89375.90_M...,3.91,Don Piper Cecil Murphey,
2,89376,heaven,What is Heaven really going to be like? What w...,7345,533,,https://www.goodreads.com/book/show/89376.Heaven,4.26,Randy Alcorn,
3,89377,penny from heaven,It's 1953 and 11-year-old Penny dreams of a su...,6949,288,2006,https://www.goodreads.com/book/show/89377.Penn...,3.98,Jennifer L. Holm,Random House Books for Young Readers
4,89378,dog heaven,In Newbery Medalist Cynthia Rylant's classic b...,1331,40,1995,https://www.goodreads.com/book/show/89378.Dog_...,4.43,Cynthia Rylant,Blue Sky Press
...,...,...,...,...,...,...,...,...,...,...
90935,335370,rose madder,A grimmer than Grimm fairy tale for our times-...,1201,420,1995,https://www.goodreads.com/book/show/335370.Ros...,3.67,Stephen King,Viking Adult
90936,57064,hammerfall the gene wars 1,One of the most renowned figures in science fi...,1143,457,2002,https://www.goodreads.com/book/show/57064.Hamm...,3.62,C.J. Cherryh,Eos / Harper Voyager
90937,7715664,sin undone demonica 5,HER TOUCH IS DEADLY\nAs the only female Seminu...,23091,400,2010,https://www.goodreads.com/book/show/7715664-si...,4.35,Larissa Ione,Grand Central Publishing
90938,3106983,persepolis the story of a childhood and the st...,The Story of a Childhood and The Story of a Re...,1966,343,2008,https://www.goodreads.com/book/show/3106983-pe...,4.36,Marjane Satrapi Anjali Singh,Vintage


In [9]:
def get_books_csv_map():
    csv_book_mapping = {}
    first_line = True
    with open("inputs/book_id_map.csv", "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            if first_line:
                first_line = False
                continue
            csv_id, book_id = line.strip().split(",")
            csv_book_mapping[int(csv_id)] = int(book_id)
    return csv_book_mapping

In [10]:
books_csv_ids_map = get_books_csv_map()

In [11]:
def get_my_rated_books_df(books_df):
    rated_books = []
    first_line = True
    with open("inputs/my_rated_books.csv") as f:
        while True:
            line = f.readline()
            if not line:
                break
            if first_line:
                first_line = False
                continue
            book_id, rating = line.strip().split(",")
            rated_books.append({"book_id": int(book_id), "rating": int(rating)})
    books = pd.DataFrame.from_dict(rated_books, dtype = int)
    books_augmented = books_df.merge(books, how="inner", on="book_id")
    return books_augmented

In [12]:
my_rate_books_df = get_my_rated_books_df(books_df)

In [13]:
def get_filtered_users(my_rate_books_df, same_books_ratio):
    users = {}
    first_line = True
    book_set = set(my_rate_books_df["book_id"])
    with open("inputs/goodreads_interactions.csv") as f:
        while True:
            line = f.readline()
            if not line:
                break
            if first_line:
                first_line = False
                continue
            user_id, csv_id, _, rating, _ = line.strip().split(",")
            book_id = books_csv_ids_map.get(int(csv_id))
            if int(book_id) in book_set:
                if int(user_id) not in users:
                    users[int(user_id)] = 1
                else:
                    users[int(user_id)] +=1
    return set([k for k in users if users[k] > len(my_rate_books_df)/same_books_ratio])

In [14]:
filtered_users = get_filtered_users(my_rate_books_df, 1.5)

In [15]:
def get_interactions_df(filtered_users):
    interactions_lists = []
    first_line = True       
    with open("inputs/goodreads_interactions.csv") as f:
        while True:
            line = f.readline()
            if not line:
                break
            if first_line:
                first_line = False
                continue
            user_id, csv_id, _, rating, _ = line.strip().split(",")
            book_id = books_csv_ids_map.get(int(csv_id))
            if int(user_id) in filtered_users:
                interactions_lists.append({"user_id": int(user_id), "book_id": int(book_id), "rating": int(rating)})

    interactions_df = pd.DataFrame.from_dict(interactions_lists, dtype=int)
    # interactions_df = interactions_df[interactions_df["book_id"].isin(filtered_users)]
    return interactions_df

In [16]:
interactions_df_initial = get_interactions_df(filtered_users)

In [17]:
interactions_df = interactions_df_initial
for i, row in my_rate_books_df[["book_id", "rating"]].iterrows():
    interactions_df = pd.concat([interactions_df, pd.DataFrame([{"user_id": 1, "book_id": row["book_id"], "rating": row["rating"]}])], ignore_index=True)
interactions_df = interactions_df.sort_values(by="user_id").reset_index(drop=True)   

In [18]:
# 2. Preprocessing of DataFrames

In [19]:
def preprocess_books_df(books_df):
    def removeNonAscii(s):
        return "".join(i for i in s if  ord(i)<128)
    def make_lower_case(text):
        return text.lower()
    def remove_stop_words(text):
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
        return text
    def remove_punctuation(text):
        tokenizer = RegexpTokenizer(r'\w+')
        text = tokenizer.tokenize(text)
        text = " ".join(text)
        return text
    def full_normalize(text):
        text = re.sub("[^a-zA-Z0-9 ]", "", text)
        text = re.sub("\s+", " ", text)
        text = removeNonAscii(text)
        text = make_lower_case(text)
        text = remove_stop_words(text)
        return remove_punctuation(text)
        
    aux_df = books_df.copy()
    aux_df["title"] = aux_df["title"].apply(full_normalize)
    aux_df = aux_df[aux_df["title"].str.len() > 0]
    aux_df["description"] = aux_df["description"].apply(full_normalize)
    aux_df = aux_df[aux_df["description"].str.len() > 0]
    aux_df["authors"] = aux_df["authors"].apply(full_normalize)
    aux_df = aux_df[aux_df["authors"].str.len() > 0]
    aux_df["publisher"] = aux_df["publisher"].apply(full_normalize)
    aux_df = aux_df[aux_df["publisher"].str.len() > 0]
    # TODO migh need/not need this for average_rating
    # aux_df["average_rating"] = aux_df["average_rating"].apply(full_normalize)
    # aux_df = aux_df[aux_df["average_rating"].str.len() > 0]
    
    
    return aux_df

In [20]:
books_df_initial = books_df
books_df = preprocess_books_df(books_df)

In [21]:
# 3. Recommender systems have a problem known as user cold-start, in which is hard do provide personalized recommendations for users with none or a very few number of consumed items, due to the lack of information to model their preferences.
#     For this reason, we are keeping in the dataset only users with at least 5 interactions.

In [22]:
users_interactions_count_df = interactions_df.groupby(['user_id', 'book_id']).size().groupby('user_id').size()

In [23]:
print('# users: %d' % len(users_interactions_count_df))

# users: 625


In [24]:
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['user_id']]

In [25]:
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users with at least 5 interactions: 625


In [26]:
print('# of interactions: %d' % len(interactions_df))

# of interactions: 2878532


In [27]:
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'user_id',
               right_on = 'user_id')

In [28]:
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions from users with at least 5 interactions: 2878532


In [29]:
# Our User Interactions dataset is complete and all users have at least 5 interactions.

In [30]:
# 4. We aggregate all the interactions a user had with a book (review) by a weighted sum of interaction-type strenght.
    # The weight in our case is the rating (0-5)
    # We then apply a log transformation to smooth the distribution
# TODO: We can also skip this step, as all our interactions are unique (user_id+book_id grouping). This step will only
    # apply the log transformation in our case
    # to skip this step just use
    # interactions_full_df = interactions_from_selected_users_df

In [31]:
def smooth_user_preference(x):
    return math.log(1+x, 2)

In [32]:
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['user_id', 'book_id'])['rating'].sum() \
                    .apply(smooth_user_preference).reset_index()

In [33]:
print('# of unique user/item interactions: %d' % len(interactions_full_df))

# of unique user/item interactions: 2878532


In [34]:
interactions_full_df.head()

Unnamed: 0,user_id,book_id,rating
0,1,2373,2.584963
1,1,142540,2.584963
2,1,227729,2.0
3,1,395851,2.321928
4,1,395875,2.584963


In [35]:
# 5. Create our TF-IDF vector
    # Ignoring stopwords (words with no semantics) from English
    # Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus
# TODO:
    # - We can adjust the 5000 vector size and check for better/worse results (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [36]:
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords.words('english'))

In [37]:
book_ids = books_df['book_id'].tolist()

In [38]:
# In our TF-IDF we use: title, description, ratings_count, num_pages, publication_year, average_rating, authors and publisher
# TODO:
    # - We can only use title, description as well

In [39]:
tfidf_matrix = vectorizer.fit_transform(books_df['title'] + "" + books_df['description'] + books_df['ratings_count'].astype(str)+ \
                                       books_df['num_pages'].astype(str) +  books_df['publication_year'].astype(str) + \
                                       books_df['average_rating'].astype(str) + books_df['authors'] +  books_df['publisher'])

In [40]:
tfidf_matrix

<66087x4189 sparse matrix of type '<class 'numpy.float64'>'
	with 3389769 stored elements in Compressed Sparse Row format>

In [41]:
# 6.0. To create user profiles, we take all the books the user interacted with and average them. The average is weighted
    # by the rating. The books the user has rated the highest will have a higher strangth in the final user profile.

In [51]:
def get_book_profile(book_id):
    idx = book_ids.index(book_id)
    book_profile = tfidf_matrix[idx:idx+1]
    return book_profile

In [52]:
def get_book_profiles(book_ids):
    book_profiles_list = [get_book_profile(x) for x in book_ids]
    book_profiles = scipy.sparse.vstack(book_profiles_list)
    return book_profiles

In [70]:
def build_users_profile(user_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[user_id]
    user_book_profiles = get_book_profiles(interactions_person_df['book_id'])
    
    user_book_strengths = np.array(interactions_person_df['rating']).reshape(-1,1)
    # print(user_book_strengths)
    # print(user_id)
        
    #Weighted average of item profiles by the interactions strength
    aux_copy = user_book_strengths.copy()
    aux_copy[aux_copy == 0] = 1
    user_book_strengths_weighted_avg = np.sum(user_book_profiles.multiply(aux_copy), axis=0) / np.sum(user_book_strengths)
    # print(user_book_strengths_weighted_avg)
    # print(np.asarray(user_book_strengths_weighted_avg))
    user_profile_norm = sklearn.preprocessing.normalize(np.asarray(user_book_strengths_weighted_avg))
    return user_profile_norm

In [71]:
def build_users_profiles(): 
    interactions_indexed_df = interactions_train_df[interactions_train_df['book_id'] \
                                                   .isin(books_df['book_id'])].set_index('user_id')
    user_profiles = {}
    for user_id in interactions_indexed_df.index.unique():
        user_profiles[user_id] = build_users_profile(user_id, interactions_indexed_df)
    return user_profiles

In [72]:
# 6.1. Split user interactions in 2 datasets: train (80%) and test (20%).

In [87]:
# interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
#                                    stratify=interactions_full_df['user_id'], 
#                                    test_size=0.20,
#                                    random_state=42)
interactions_train_df = interactions_full_df

In [88]:
print('# interactions on Train set: %d' % len(interactions_train_df))
# print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 2878532


In [89]:
# Warning: Running this takes considerable time: X mins

In [90]:
user_profiles = build_users_profiles()

In [91]:
len(user_profiles)

625

In [92]:
# 7. Let's take a look at the user profile for user_id = 1. This is a vector of 5000 tokens together with
    # how relevant is that token for user_id=1 (tokens are both unigrams and bigrams)
    # We can sort the tokens by their relevance

In [93]:
myprofile = user_profiles[1]

In [94]:
print(myprofile.shape)

(1, 4189)


In [95]:
tfidf_feature_names = vectorizer.get_feature_names_out()

In [96]:
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles[1].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

Unnamed: 0,token,relevance
0,glass,0.308601
1,throne,0.308308
2,assassin,0.213427
3,shatter,0.196741
4,vampire,0.191831
5,childrens,0.173639
6,unspeakable,0.152813
7,court,0.152549
8,usa,0.130375
9,diaries,0.128588


In [147]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, books_df=None):
        self.book_ids = book_ids
        self.books_df = books_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_books_to_user_profile(self, user_id, topn=1000):
        #Computes the cosine similarity between the user profile and all book profiles
        cosine_similarities = cosine_similarity(user_profiles[user_id], tfidf_matrix)
        #Gets the top similar books
        similar_indices = cosine_similarities.argsort().flatten()
        #Sort the similar books by similarity
        similar_items = sorted([(book_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_books(self, user_id, books_to_ignore=[], topn=10, verbose=False):
        similar_books = self._get_similar_books_to_user_profile(user_id)
        #Ignores books the user has already rated
        similar_books_filtered = list(filter(lambda x: x[0] not in books_to_ignore, similar_books))
        
        recommendations_df = pd.DataFrame(similar_books_filtered, columns=['book_id', 'recStrength'])

        if verbose:
            if self.books_df is None:
                raise Exception('"books_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.books_df, how = 'left', 
                                                          left_on = 'book_id', 
                                                          right_on = 'book_id')[['recStrength', 'book_id', 'title', 'ratings_count', \
                                                                                 'url', 'average_rating', 'authors', 'publisher', \
                                                                                'publication_year', 'num_pages']]


        return recommendations_df
    
content_based_recommender_model = ContentBasedRecommender(books_df)

In [148]:
# 8. Before evaluating the model, lets see what books are recommended for user_id = 1

In [149]:
def get_books_interacted(user_id, interactions_df):
    interacted_books = interactions_df.loc[user_id]['book_id']
    return set(interacted_books if type(interacted_books) == pd.Series else [interacted_books])

In [150]:
# user_1_recommendations = content_based_recommender_model.recommend_books(1, get_books_interacted(1, interactions_df.set_index('user_id')), 20, True)
user_1_recommendations = content_based_recommender_model.recommend_books(1, {}, 20, True)

In [151]:
# 9. TODO: Evaluate Model

In [180]:
# user_1_recommendations[user_1_recommendations["book_id"] == user_1_recommendations["recStrength"].min()]
# user_1_recommendations[user_1_recommendations["ratings_count"] > 10000].head(30)
user_1_recommendations[user_1_recommendations["book_id"] == 2373].empty

True

In [208]:
aux = user_1_recommendations[["recStrength", "book_id"]].copy()
aux["recStrength"] = aux["recStrength"] * 10

In [231]:
def evaluate_predictions(user_recommendations, my_rate_books_df, metric="Content-Based"):
    predictions = []
    target = []
    predictions_to_ret = {}
    for i, row in my_rate_books_df.iterrows():
        target.append(row["rating"])
        # predictions.append(svd.predict(1, row["book_id"]).est)
        predictions.append(float(user_recommendations[user_recommendations["book_id"] == row["book_id"]]["recStrength"].iloc[0]) if \
                          not user_recommendations[user_recommendations["book_id"] == row["book_id"]].empty else 4.02)
        predictions_to_ret[row["book_id"]] = (float(user_recommendations[user_recommendations["book_id"] == row["book_id"]]["recStrength"].iloc[0]) if \
                          not user_recommendations[user_recommendations["book_id"] == row["book_id"]].empty else 4.02, row["rating"])
        # print(row)
    prediction= pd.DataFrame(predictions).T
    target_prediction = pd.DataFrame(target).T
    

    TPs = 0
    FNs = 0
    # The number of relevant items are the items with actual rating greater or equal to 3.5.
    threshold = 3.5
     # Find the relevant items using the threshold
    relevant_items = []
    for i in range(0, target_prediction.shape[0]):
        for j in range(0, target_prediction.shape[1]):
            if target_prediction.values[i, j] > threshold:
                relevant_items.append((i, j))

    # Compute K
    k = len(relevant_items)

    # Recommended items @ k
    recommended_items_at_k = []
    for i in range(0, target_prediction.shape[0]):
        for j in range(0, target_prediction.shape[1]):
            if prediction.values[i, j] > threshold:
                recommended_items_at_k.append((i, j))

     # Recommended and Relevant items @ k (Intersection)
    recomm_and_relevant_items_at_k = list(set(relevant_items) & set(recommended_items_at_k))

    # Compute Precision @ K
    precision_at_k = len(recomm_and_relevant_items_at_k) / len(recommended_items_at_k)

    print("Precision at k={}, for prediction using metric {}, is: {}".format(k, metric, precision_at_k))

    # Compute Recall @ K
    recall_at_k = len(recomm_and_relevant_items_at_k) / len(relevant_items)

    print("Recall at k={} for prediction using metric {}, is: {}".format(k, metric, recall_at_k))

    # Compute F1 score @ K
    f1_score_at_k = 2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)

    print("F1 Score at k={}, for prediction using metric {}, is: {}".format(k, metric, f1_score_at_k))
    return predictions_to_ret
    

In [232]:
content_based_predictions = evaluate_predictions(aux, my_rate_books_df)

Precision at k=22, for prediction using metric Content-Based, is: 0.8666666666666667
Recall at k=22 for prediction using metric Content-Based, is: 0.5909090909090909
F1 Score at k=22, for prediction using metric Content-Based, is: 0.7027027027027029


In [1]:
len(content_based_predictions)

NameError: name 'content_based_predictions' is not defined

In [234]:
# Hybrid Model