In [None]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing


import pandas as pd
# Load the liked-books CSV, using its first column as the row index.
# book_id is cast to str in the same step so it compares equal to the string
# ids parsed from the other CSV files later on.
my_books = pd.read_csv("liked_books.csv", index_col=0).astype({"book_id": str})

In [None]:
my_books  # Display the liked-books table loaded from liked_books.csv.

In [None]:
# Build a lookup from the ids used in goodreads_interactions.csv (csv_id) to the
# book ids used by the other files.  Each line of book_id_map.csv is expected to
# hold exactly two comma-separated fields: csv_id,book_id.
# NOTE(review): if the file starts with a header row it is stored as an
# ordinary entry; that looks harmless but confirm.
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    # Iterating the file object yields one line at a time until EOF.
    for line in f:
        line = line.strip()
        if not line:
            # A blank line cannot be unpacked into two fields; skip it rather
            # than abort the whole load with ValueError.
            continue
        csv_id, book_id = line.split(",")
        csv_book_mapping[csv_id] = book_id

In [None]:
# Unique ids of the books we liked; used for membership checks while scanning
# the interactions file.
book_set = set(my_books["book_id"].tolist())

In [None]:
# Count, for every user, how many of their interaction rows refer to a book we
# liked.  Maps user_id -> number of overlapping rows.
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    # Stream the file one line at a time instead of loading it whole.
    for line in f:
        if not line.strip():
            # A blank line cannot be unpacked into five fields; skip it rather
            # than abort the scan with ValueError.
            continue
        # Only the first two fields are needed in this pass.
        user_id, csv_id, _, _, _ = line.split(",")

        # Translate the interactions-file id.  .get returns None for ids missing
        # from the map, and None is never in book_set (it only holds strings).
        book_id = csv_book_mapping.get(csv_id)

        if book_id in book_set:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [None]:
len(overlap_users)  # Number of users who share at least one liked book with us.

In [None]:
# Keep only the users whose overlap count is strictly greater than a quarter of
# our liked-books list (len(my_books) / 4); everyone else is dropped.
filtered_overlap_users = {
    user_id
    for user_id, shared_count in overlap_users.items()
    if shared_count > len(my_books) / 4
}


In [None]:
len(filtered_overlap_users)  # Number of users that passed the overlap threshold.

In [None]:
# Collect [user_id, book_id, rating] for every interaction row that belongs to
# one of the filtered overlap users.  Their full reading history is the raw
# material for the collaborative-filtering recommendations.
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    # Stream the file one line at a time instead of loading it whole.
    for line in f:
        if not line.strip():
            # A blank line cannot be unpacked into five fields; skip it rather
            # than abort the scan with ValueError.
            continue
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in filtered_overlap_users:
            # Direct indexing on purpose: an id missing from the map raises
            # KeyError here instead of producing a row without a book id.
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [None]:
len(interactions_list)  # Number of interaction rows collected for the filtered users.

In [None]:
interactions_list[0]  # First entry: [user_id, book_id, rating], all still strings as read from the CSV (the original note says ratings use a 1-5 scale).

In [None]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])  # One row per [user_id, book_id, rating] entry of interactions_list.

In [None]:
# Put our own ratings in front of the other users' interactions.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of my_books and of the interactions frame are kept as-is and
# can repeat.
# NOTE: `interactions` is overwritten, so re-running this cell appends
# my_books a second time.
interactions = pd.concat(
    [my_books[["user_id", "book_id", "rating"]], interactions],
    ignore_index=True,
)

In [None]:
interactions  # Display the combined interactions table (our rows plus the other users' rows).

In [None]:
# Normalise dtypes in one pass: both id columns become strings and the rating
# column becomes numeric (pd.to_numeric raises on values it cannot parse).
interactions = interactions.assign(
    book_id=interactions["book_id"].astype(str),
    user_id=interactions["user_id"].astype(str),
    rating=pd.to_numeric(interactions["rating"]),
)

In [None]:
# Give every distinct user_id an integer category code; the codes are used as
# row positions when the ratings matrix is built.
interactions["user_index"] = pd.Categorical(interactions["user_id"]).codes

In [None]:
# Give every distinct book_id an integer category code; the codes are used as
# column positions when the ratings matrix is built.
interactions["book_index"] = pd.Categorical(interactions["book_id"]).codes

In [None]:
# Build the user x book ratings matrix as a SciPy COO (coordinate) sparse matrix.
# The constructor takes the cell values plus two parallel position sequences:
# row positions (user_index) and column positions (book_index).
# COO is used only for construction; the matrix is converted to CSR afterwards.
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix(
    (
        interactions["rating"].to_numpy(),
        (
            interactions["user_index"].to_numpy(),
            interactions["book_index"].to_numpy(),
        ),
    )
)

In [None]:
ratings_mat_coo.shape  # (rows, columns) of the sparse matrix: rows are user_index values, columns are book_index values.

In [None]:
ratings_mat = ratings_mat_coo.tocsr()  # Convert COO to CSR; the CSR matrix is the one sliced by row (ratings_mat[my_index, :]) for the similarity step.

In [None]:
interactions[interactions["user_id"] == "-1"]  # Show our own rows (our user_id is "-1") to check which user_index they were given.

In [None]:
# Row of the ratings matrix that belongs to us (user_id "-1").  It is looked up
# from the data instead of being hard-coded to 0, so it stays correct however
# the category codes come out.  Raises IndexError if no "-1" rows exist.
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

In [None]:
# Cosine similarity measures how alike two rows of the ratings matrix are.
# Comparing our row against the whole matrix yields one score per user, i.e.
# how closely each user's ratings line up with ours.
from sklearn.metrics.pairwise import cosine_similarity

# The call returns a 1 x n_users array; ravel() turns it into a 1-D NumPy array.
similarity = cosine_similarity(ratings_mat[my_index, :], ratings_mat).ravel()

In [None]:
similarity[0]  # Similarity between our row (my_index) and matrix row 0, as a float. 1 means the two rows have the same tastes and preferences; the lower the value, the less similar the tastes and books are.

In [None]:
# Find the row indices of the users most similar to us.
import numpy as np

# Ask for at most 15 users, capped at the number of rows available:
# np.argpartition raises ValueError when kth is out of bounds, which a fixed 15
# would trigger on a matrix with fewer than 15 users.
top_k = min(15, similarity.size)
indices = np.argpartition(similarity, -top_k)[-top_k:]

In [None]:
indices  # Matrix row positions (user_index values) of the most similar users; argpartition does not sort them by similarity.

In [None]:
# All interaction rows that belong to the selected most-similar users, taken as
# an independent copy of the slice.
similar_users = interactions.loc[interactions["user_index"].isin(indices)].copy()

In [None]:
# Drop our own rows (our user_id is "-1") so only other users remain.
similar_users = similar_users.loc[similar_users["user_id"].ne("-1")]

In [None]:
similar_users  # Interaction rows of the most similar users; each row is a candidate book (the original author's run showed about 38,000 rows).

In [None]:
# Aggregate the similar users' ratings per book: `count` is how many rating rows
# the book has among them and `mean` is the average of those ratings.
book_recs = similar_users.groupby("book_id")["rating"].agg(["count", "mean"])

In [None]:
book_recs  # One row per book_id: count = number of rating rows from similar users, mean = their average rating.

In [None]:
# Load the book title metadata and cast book_id to str so it can be joined
# against the string ids used in the recommendations.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

In [None]:
# Inner-join the recommendations with the title metadata on book_id, so only
# books present in both tables are kept.
book_recs = pd.merge(book_recs, books_titles, how="inner", on="book_id")

In [None]:
book_recs  # Inspect the recommendations after the title metadata has been joined in.

In [None]:
# adjusted_count = count * (count / ratings).
# `count` is how often the book shows up among users like us; `ratings` is, per
# the original author's note, how often the book was rated across all of
# Goodreads.  Books popular with similar users but not with everyone else get a
# higher adjusted count.
book_recs["adjusted_count"] = book_recs["count"].mul(
    book_recs["count"].div(book_recs["ratings"])
)

In [None]:
# Estimated appeal of each book: its average rating among similar users
# multiplied by its adjusted count.
book_recs["score"] = book_recs["mean"].mul(book_recs["adjusted_count"])

In [None]:
# Exclude every recommendation whose book_id is already in our liked-books list.
book_recs = book_recs.loc[~book_recs["book_id"].isin(my_books["book_id"])]

In [None]:
# Normalised title for matching: strip every character that is not a letter,
# digit or space, then lower-case the result.
my_books["mod_title"] = (
    my_books["title"]
    .str.replace("[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
)

In [None]:
# Collapse every run of whitespace in the normalised title into a single space.
# The pattern is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and makes Python emit a warning.
my_books["mod_title"] = my_books["mod_title"].str.replace(r"\s+", " ", regex=True)

In [None]:
# Exclude recommendations whose normalised title matches one of our liked books;
# this catches the same book listed under a different book_id.
book_recs = book_recs.loc[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [None]:
# Keep only books whose average rating among the similar users is at least 4.
book_recs = book_recs.loc[book_recs["mean"].ge(4)]

In [None]:
# Keep only books that appear more than twice among the similar users' ratings.
book_recs = book_recs.loc[book_recs["count"].gt(2)]

In [None]:
# Rank the recommendations by score, highest first.  The score column
# (mean * adjusted_count) is computed for this ranking; sorting by "mean"
# instead would ignore it.
top_recs = book_recs.sort_values("score", ascending=False)

In [None]:
#Improves display of the pandas dataframe
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab.

    ``val`` is inserted into the href as-is (no escaping is applied).
    """
    link_template = '<a target="_blank" href="{}">Goodreads</a>'
    return link_template.format(val)

def show_image(val):
    """Return HTML for a 50px-wide thumbnail of ``val`` that links to the image itself.

    ``val`` is inserted into both the href and the src as-is (no escaping is applied).
    """
    image_template = '<a href="{}"><img src="{}" width=50></img></a>'
    return image_template.format(val, val)

# Render the table with the url column formatted by make_clickable (link) and
# the cover_image column formatted by show_image (thumbnail).
top_recs.style.format({'url': make_clickable, 'cover_image': show_image})