In [1]:
import gzip
import json
from tqdm.auto import tqdm
import pandas as pd

In [2]:
# Read only the first line of the gzipped books dump into `line`, e.g. to
# inspect what a single record looks like. Mode 'r' opens the archive in
# binary mode, so `line` is a bytes object.
with gzip.open('goodreads_books.json.gz', 'r') as f:
    line = f.readline()

In [3]:
# Helper function to get the needed fields from one goodreads_books.json record
def parse_fields(line):
    """Parse one JSON record and keep only the fields the notebook needs.

    Args:
        line: A single JSON document (str or bytes) describing one book.

    Returns:
        A dict with the keys 'book_id', 'title', 'ratings', 'url' and
        'cover_image', copied unchanged from the record.

    Raises:
        KeyError: If the record lacks one of the source fields.
    """
    record = json.loads(line)
    # (key in the returned dict, key in the raw record)
    wanted = (
        ('book_id', 'book_id'),
        ('title', 'title_without_series'),
        ('ratings', 'ratings_count'),
        ('url', 'url'),
        ('cover_image', 'image_url'),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

In [4]:
# Collect the parsed records of all books that have been rated more than 15 times
book_titles = []
with gzip.open('goodreads_books.json.gz', 'r') as f:
    # Iterating the file object yields one record per line until EOF
    for line in f:
        fields = parse_fields(line)
        try:
            ratings = int(fields['ratings'])
        except ValueError:
            # ratings count is not a valid integer -> skip this record
            continue
        if ratings > 15:
            book_titles.append(fields)

In [5]:
# Preprocessing: turn ratings into a numeric column and create the 'mod_title' column by
# 1) removing non-alphanumeric characters from the original title column
# 2) making it lowercase
# 3) replacing any run of consecutive spaces with a single space and trimming both ends
# 4) removing every row whose modified title ends up empty
# Finally the formatted table is saved as a JSON file.

titles = pd.DataFrame.from_dict(book_titles)
titles['ratings'] = pd.to_numeric(titles['ratings'])
titles['mod_title'] = titles['title'].str.replace('[^a-zA-Z0-9 ]', '', regex=True).str.lower()
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
# strip(): a title made only of removed characters and spaces would otherwise
# collapse to ' ' (length 1) and slip through the emptiness filter below.
titles['mod_title'] = titles['mod_title'].str.replace(r'\s+', ' ', regex=True).str.strip()
titles = titles[titles['mod_title'].str.len() > 0]

titles.to_json('books_titles.json')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the normalised titles and build the TF-IDF matrix,
# which has one row per row of `titles` (in the same order)
tfidf = vectorizer.fit_transform(titles['mod_title'])

In [7]:
# Creating the main search function, plus two helper functions for nicer visuals

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Helper function to make a url clickable in dataframes.

    Args:
        val: The URL to link to; it is converted with ``str()``.

    Returns:
        An HTML ``<a>`` element labelled "Goodreads" that opens ``val`` in a
        new tab.
    """
    import html  # local import keeps this helper self-contained

    # The value comes from the dataset and ends up inside an HTML attribute,
    # so escape quotes and markup characters to keep the attribute well-formed.
    return '<a target="_blank" href="{}">Goodreads</a>'.format(html.escape(str(val), quote=True))

def show_image(val):
    """Helper function to format image column to show actual cover picture.

    Args:
        val: The image URL; it is converted with ``str()``.

    Returns:
        An HTML ``<img>`` element of width 70 whose source is ``val``.
    """
    import html  # local import keeps this helper self-contained

    # The value comes from the dataset and ends up inside an HTML attribute,
    # so escape quotes and markup characters to keep the attribute well-formed.
    return '<img src="{}" width=70></img>'.format(html.escape(str(val), quote=True))

def search(query, vectorizer, n_candidates=10, n_results=5):
    """Search the Goodreads titles for books matching ``query``.

    The query is lowercased, stripped of non-alphanumeric characters,
    vectorized with ``vectorizer`` and compared by cosine similarity against
    the module-level ``tfidf`` matrix. The most similar rows of the
    module-level ``titles`` dataframe are then ordered by number of ratings,
    so that among duplicates the most rated book comes first.

    Args:
        query: Name of the book to look for.
        vectorizer: Vectorizer used to transform the query; it has to be the
            one that produced ``tfidf``.
        n_candidates: Number of most similar titles to consider (default 10).
        n_results: Number of rows to return (default 5).

    Returns:
        A pandas Styler over at most ``n_results`` rows of ``titles``, with the
        'url' column rendered as a clickable link and 'cover_image' as an image.
    """
    # lowercase + remove non-alphanumeric
    processed = re.sub('[^a-zA-Z0-9 ]', '', query.lower())
    # vectorizes the processed query
    query_vec = vectorizer.transform([processed])
    # similarity scores between the vectorized query and every row of tfidf
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # argpartition raises if asked for more elements than exist, so never
    # request more candidates than there are titles
    k = min(n_candidates, similarity.size)
    if k > 0:
        # positions of the k most similar titles (in no particular order)
        indices = np.argpartition(similarity, -k)[-k:]
    else:
        indices = np.array([], dtype=int)
    # sort the candidates by number of ratings, so in case of duplicates
    # the most rated (best documented) book comes first
    results = titles.iloc[indices].sort_values('ratings', ascending=False)
    # top matches, with a clickable link and an image of the cover
    return results.head(n_results).style.format({'url': make_clickable, 'cover_image': show_image})

In [8]:
# Example query: look up "anna karenina" in the titles table
search('anna karenina', vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1233453,15823480,Anna Karenina,307493,Goodreads,,anna karenina
1104468,15841795,Anna Karenina,144,Goodreads,,anna karenina
45707,15750415,Anna Karenina,85,Goodreads,,anna karenina
533496,29779251,Anna Karenina,67,Goodreads,,anna karenina
960268,17451347,Anna Karenina,45,Goodreads,,anna karenina


In [9]:
# Example list of books a user might like (Goodreads book ids, as strings)
personal_liked_books = ['5485', '9299235', '894862', '672948', '325661', '12346651', '3335332',
                        '132749', '678974', '415342', '13248', '15823480', '8349198', '840587',
                        '186521', '1167532', '820461']
# Build the frame with its final column names directly; our own user gets the id -1.
# (Renaming through `columns.values[...] = ...` mutates the index's internal array,
# which is not a supported way to rename a column.)
pers_liked_books_df = pd.DataFrame({'user_id': -1, 'book_id': personal_liked_books})
# Synthetic ratings of 4 or 5 for every liked book; the generator is seeded so the
# ratings are the same on every run
pers_liked_books_df['rating'] = np.random.default_rng(42).integers(4, 6, size=len(pers_liked_books_df))

In [30]:
# Display the ratings frame built for our own user (user_id -1)
pers_liked_books_df

Unnamed: 0,user_id,book_id,rating
0,-1,5485,5
1,-1,9299235,5
2,-1,894862,5
3,-1,672948,4
4,-1,325661,5
5,-1,12346651,5
6,-1,3335332,5
7,-1,132749,4
8,-1,678974,5
9,-1,415342,4


In [22]:
# Display the preprocessed titles table
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,ondine ondine quartet 05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,jacqueline kennedy onassis friend of the arts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,the spaniards blackmailed bride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection


In [11]:
# Build a lookup whose keys are the book ids used in goodreads_interactions.csv
# (which are just numbers from 1 to 2,360,651) and whose values are the actual
# book ids from goodreads_books.json
csv_book_mapping = {}
with open('book_id_map.csv', 'r') as f:
    # One "csv_id,book_id" pair per line; both ids are kept as strings
    for line in f:
        csv_id, book_id = line.strip().split(',')
        csv_book_mapping[csv_id] = book_id

In [28]:
# Build a dict whose keys are other users and whose values are the number of books
# that both they and our user liked (rated 4 or higher).
# Then keep only the users who have at least n liked books in common with us.
overlap_users = {}
n = 3 # number of shared likes
# Set instead of list: the membership test below runs once per line of the
# interactions file, and a set lookup does not scan the whole liked list
liked_book_ids = set(personal_liked_books)
with open('goodreads_interactions.csv', 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(',')
        try:
            rating = int(rating)
        except ValueError:
            # non-numeric rating field (e.g. a header row, if the file has one)
            continue

        # translate the interactions-file id into the Goodreads book id
        # (None when the id is not in the mapping)
        book_id = csv_book_mapping.get(csv_id)
        if book_id in liked_book_ids and rating >= 4:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

filtered_overlap_users = {user for user, shared in overlap_users.items() if shared >= n}

In [29]:
# For every user in filtered_overlap_users (meaning they liked at least some of the
# books that we also liked), collect their whole history to see what else they rated
interactions_list = []
with open('goodreads_interactions.csv', 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(',')
        if user_id not in filtered_overlap_users:
            continue
        # translate the interactions-file id into the Goodreads book id
        book_id = csv_book_mapping.get(csv_id)
        interactions_list.append([user_id, book_id, rating])

In [31]:
# Base dataframe: our own liked books followed by the books rated by the users
# who share enough liked books with us

interactions = pd.concat([
    pers_liked_books_df[['user_id', 'book_id', 'rating']],
    pd.DataFrame(interactions_list, columns=['user_id', 'book_id', 'rating']),
])
# ids as strings, ratings as numbers
interactions = interactions.astype({'book_id': str, 'user_id': str})
interactions['rating'] = pd.to_numeric(interactions['rating'])
# Convert the ids to integer codes (-1, our id, becomes 0; same idea for the book ids)
# so they can serve as row / column positions of a sparse matrix
interactions['user_index'] = interactions['user_id'].astype('category').cat.codes
interactions['book_index'] = interactions['book_id'].astype('category').cat.codes

In [32]:
# Sparse user x book rating matrix

from scipy.sparse import coo_matrix

ratings_mat = coo_matrix(
    (
        interactions['rating'],  # the data (array or list)
        (interactions['user_index'], interactions['book_index']),  # row and column positions
    )
)
# CSR form of the same matrix, used for row slicing and similarity computation
ratings_sparse = ratings_mat.tocsr()

In [33]:
# Similarity scores between our own row of the rating matrix and every user's row.
# Our row index is looked up from the interactions table (our user id is the string
# '-1') instead of being hard-coded, so it does not depend on how the ids happen to sort.
my_index = int(interactions.loc[interactions['user_id'] == '-1', 'user_index'].iloc[0])
similarity = cosine_similarity(ratings_sparse[my_index, :], ratings_sparse).flatten()

In [34]:
# Dataframe with the interactions of the users most similar to us.
# argpartition raises when asked for more elements than exist, so cap the
# requested count at the number of users available.
top_k = min(10, similarity.size)
indices = np.argpartition(similarity, -top_k)[-top_k:]
similar_users = interactions[interactions['user_index'].isin(indices)].copy()
# drop our own rows (user id '-1') in case we are among the selected users
similar_users = similar_users[similar_users['user_id'] != '-1']

In [35]:
# Per book: number of ratings given by the similar users and their mean rating
book_recs = similar_users.groupby('book_id')['rating'].agg(['count', 'mean'])

In [36]:
# Add more info to the recommendations dataframe from the saved titles table.
# NOTE(review): this rebinds `book_titles` to a DataFrame, so the name no longer
# refers to the list of parsed records it held before.
book_titles = pd.read_json('books_titles.json').astype({'book_id': str})
# inner merge: only recommendations that have an entry in the titles table remain
book_recs = book_recs.merge(book_titles, on='book_id', how='inner')

In [37]:
# Create a metric that penalizes overly popular books and favors those which
# are not as popular overall, but popular among users who are like us.
# Also remove the books that are already in our own liked list.
# adjusted_count = count * (count / ratings): the number of similar users who rated the
# book, scaled by the share of the book's overall ratings count that they account for
book_recs['adjusted_count'] = book_recs['count'] * (book_recs['count'] / book_recs['ratings'])
# score = mean rating among the similar users, weighted by the adjusted count
book_recs['score'] = book_recs['mean'] * book_recs['adjusted_count']
book_recs = book_recs[~book_recs['book_id'].isin(pers_liked_books_df['book_id'])]

In [44]:
# Keep only the books rated by more than 2 of the similar users
# whose mean rating among them is above 3
book_recs = book_recs[(book_recs['count'] > 2) & (book_recs['mean'] > 3)]

In [45]:
# Rank the remaining recommendations by score, highest first
top_recs = book_recs.sort_values('score', ascending=False)

In [46]:
# Render the ranked recommendations with clickable links and cover images
top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
75,1297123,5,4.0,The Black Obelisk,6370,Goodreads,,the black obelisk,0.003925,0.015699
508,91203,5,4.8,Three Comrades,14716,Goodreads,,three comrades,0.001699,0.008154
62,12505,6,3.166667,The Idiot,76392,Goodreads,,the idiot,0.000471,0.001492
41,117833,7,4.0,The Master and Margarita,136321,Goodreads,,the master and margarita,0.000359,0.001438
69,12749,3,3.333333,"Swann's Way (In Search of Lost Time, #1)",25329,Goodreads,,swanns way in search of lost time 1,0.000355,0.001184
430,656,4,4.75,War and Peace,172803,Goodreads,,war and peace,9.3e-05,0.00044
303,355697,4,5.0,All Quiet on the Western Front,255610,Goodreads,,all quiet on the western front,6.3e-05,0.000313
49,11989,3,3.333333,The Plague,97546,Goodreads,,the plague,9.2e-05,0.000308
454,7144,6,3.333333,Crime and Punishment,390293,Goodreads,,crime and punishment,9.2e-05,0.000307
287,332613,5,3.8,One Flew Over the Cuckoo's Nest,498475,Goodreads,,one flew over the cuckoos nest,5e-05,0.000191
