<a href="https://www.kaggle.com/code/franciscangeno/book-recommendation-system?scriptVersionId=114484862" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [65]:
from pandas.io import gbq

In [66]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import gdown
import re
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [67]:
url = "https://drive.google.com/uc?id=1LXpK1UfqtP89H1tYy0pBGHjYk8IhigUK"
output = 'goodreads_books.json.gz'

In [68]:
# # get the google drive file
# gdown.download(url, output, quiet=False)

In [69]:
import gzip
# we are going to read the file line by line
# streams the file without unzipping it
with gzip.open("goodreads_books.json.gz","r") as f:
    line = f.readline()

In [70]:
line
# this is metadata about a single book

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [71]:
# load json line
import json
json.loads(line)
# this returns a dictionary

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

# **PARSING BOOK METADATA**

In [72]:
# go through a single line in metadata and pick fields we want
def parse_fields(line):
    data=json.loads(line)
    return {
        "book_id": data["book_id"],
        "title":data["title_without_series"],
        "ratings":data["ratings_count"],
        "url":data["url"],
        "cover_image":data["image_url"]       
    }

In [73]:
# go line by line, parsing each line
books_titles = []
with gzip.open("goodreads_books.json.gz","r") as f:
    #read every single line
    while True: 
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        try:
# turn ratings into an integer 
# we want to take books that have more than a certain no. of ratings
# we can filter them out as they are not likely to be recommended
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        if ratings > 15:
            books_titles.append(fields) 

# **PROCESSING METADATA WITH PANDAS**


In [74]:
# turn into a dataframe
titles = pd.DataFrame.from_dict(books_titles)

In [75]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [76]:
titles.head(15)

Unnamed: 0,book_id,title,ratings,url,cover_image
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...


In [77]:
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308957 entries, 0 to 1308956
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   book_id      1308957 non-null  object
 1   title        1308957 non-null  object
 2   ratings      1308957 non-null  object
 3   url          1308957 non-null  object
 4   cover_image  1308957 non-null  object
dtypes: object(5)
memory usage: 49.9+ MB


In [78]:
# ratings column is an object yet should be numeric
titles["ratings"]=pd.to_numeric(titles["ratings"])

In [79]:
titles.describe()

Unnamed: 0,ratings
count,1308957.0
mean,727.9124
std,14932.48
min,16.0
25%,30.0
50%,64.0
75%,187.0
max,4899965.0


In [80]:
titles.isnull().mean()

book_id        0.0
title          0.0
ratings        0.0
url            0.0
cover_image    0.0
dtype: float64

In [81]:
# modify title to minimize search space for the search engine
# replace any characters that don't fall within a certain set lowercase,uppercase,numerals or space
titles["mod_title"]=titles["title"].str.replace("[^a-zA-Z0-9 ]","",regex=True)

In [82]:
titles.head(15)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,The Unschooled Wizard Sun Wolf and Starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,Best Friends Forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,The Aeneid for Boys and Girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,Alls Fairy in Love and War Avalon Web of Magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,The Devils Notebook
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,Crowner Royal Crowner John Mystery 13
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,The House of Memory Plutos Snitch 2
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,The Bonfire of the Vanities
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 Minutes in Heaven A True Story of Death and...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,Heaven


In [83]:
# make lowercase title
titles["mod_title"] = titles["mod_title"].str.lower()

In [84]:
# get rid of spaces in a row
# any spaces in a row, replacece with a single space
titles["mod_title"] = titles["mod_title"].str.replace("\s+"," ",regex=True)

In [85]:
titles.head(20)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,crowner royal crowner john mystery 13
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,the house of memory plutos snitch 2
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the bonfire of the vanities
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 minutes in heaven a true story of death and...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,heaven


In [86]:
# only include titles that have more than 0 length
titles = titles[titles["mod_title"].str.len()>0]

In [87]:
# # to json
# titles.to_json("books_titles.json")

In [88]:
titles.head(20)


Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,crowner royal crowner john mystery 13
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,the house of memory plutos snitch 2
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the bonfire of the vanities
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 minutes in heaven a true story of death and...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,heaven


In [89]:
# create an instance of tfidfvectorizer
# takes a list of strings and turns it into a tfidf matrix. 
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [90]:
# turn search query into a vector then match it up against the matrix
#  then compare
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)
def show_image(val):
    return '<img src = "{}" width=50></img>'.format(val)
def search(query, vectorizer):
    # do the same processing that we did with mod_title
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    # turn query into a vector
    query_vec = vectorizer.transform([processed])
    # find the similarity
    similarity =cosine_similarity(query_vec,tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = titles.iloc[indices]
    # the data has a lot of duplicate books so we need to get the row with the highest rating
    results.sort_values("ratings",ascending =False)
    return results.head(5).style.format({'url':make_clickable, 'cover_image':show_image})

In [91]:
# similarity

In [92]:
# # get to the actual book
# results

In [93]:
# search for another book
search("Romeo and Juliet",vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
42131,25853025,Romeo and Juliet,160,Goodreads,,romeo and juliet
332642,44530,Romeo and Juliet,428,Goodreads,,romeo and juliet
235190,18509114,Romeo and Juliet,32,Goodreads,,romeo and juliet
1081808,17382109,Romeo and Juliet,103,Goodreads,,romeo and juliet
532717,879258,Romeo and Juliet,60,Goodreads,,romeo and juliet


# **CREATE A LIST OF LIKED BOOKS**

In [94]:
# create a list of liked books and store the ids as a string
liked_books = ["8132407","30849411","32737635","5996629"]

# **BOOK RECOMMENDATION DATA**

In [95]:
# url2 = "https://drive.google.com/file/d/1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon/view"
# output2 = 'goodreads_interactions.csv '
# # goodreads_interactions=gdown.download(url2, output2, quiet=False)

In [96]:
# read file in a streaming fashion
# read the file in a line, splitting it on the comma 
# assign the first part to csv_id before the comma and the second part after the comma to book_id
csv_book_mapping = {}
with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
#         in the dictionary, all the keys will be the csv_ids(in the interaction file) and the values are ids in the file we just worked with(metadata)
        csv_book_mapping[csv_id] =book_id      

In [97]:
len(csv_book_mapping)

2360651

In [103]:
interactions = 'goodreads_interactions.csv'

# **FIND USERS WITH THE SAME BOOKS AS US**

In [99]:
# ! head "C:\Users\user\I am learning ML\BOOK-RECOMMENDATION-SYSTEM\goodreads_interactions.csv"

In [119]:
overlap_users = set()
with open("goodreads_interactions.csv",'r') as f:
    while True:
        lines = f.readline()
        if not lines:
            break
        user_id, csv_id,_,rating, _ , = lines.split(',')
        if user_id in overlap_users:
            continue
        try:
            rating = int(rating)
        except ValueError:
            continue
        # turn the csv_id into book_id
        book_id = csv_book_mapping[csv_id]
        if book_id in liked_books and rating >= 4:
            overlap_users.add(user_id)

ValueError: too many values to unpack (expected 5)

In [None]:
# find all the books the users read
# loop through the good reads interaction dataset
rec_lines = []
# reclines will only contain books that users who liked the same books as us have read
with open("/kaggle/working/goodreads_interactions.csv",'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        # split the data again
        user_id, csv_id,_,rating, _, =line.split(",")
        if user_id in overlap_users:
            book_id = csv_book_mapping[csv_id]
            rec_lines.append([user_id, book_id, rating])


In [None]:
len(overlap_users)

In [None]:
len(rec_lines)

In [None]:
# rank recommmendations in rec_lines
recs=pd.DataFrame(rec_lines, columns = ["user_id","book_id","rating"] )
recs["book_id"]=recs["book_id"].astype(str)

In [None]:
top_recs = recs["book_id"].value_counts().head(10)
# get the values of the index
top_recs=top_recs.index.values

In [None]:
# get from book_id to book title
books_titles = pd.read_json("/kaggle/working/books_titles.json")
books_titles["book_id"]=books_titles["book_id"].astype(str)

In [None]:
books_titles.head()

In [None]:
# find the book titles where the book id is in these recommendations
books_titles[books_titles["book_id"].isin(top_recs)]

In [None]:
top_recs

In [None]:
# value counts on all of the recommendations
all_recs = recs["book_id"].value_counts()

In [None]:
all_recs = all_recs.to_frame().reset_index()

In [None]:
all_recs.head(20)

In [None]:
all_recs.columns = ["book_id","book_count"]

In [None]:
all_recs.head(15)

In [None]:
# merge with book titles 
all_recs = all_recs.merge(books_titles,how="inner",on="book_id")

In [None]:
all_recs.head(15)

In [None]:
# create a score to sort the recommendations
all_recs["score"]=all_recs["book_count"] * (all_recs["book_count"]/all_recs["ratings"]) 

In [None]:
all_recs.sort_values("score",ascending=False).head(10)

In [None]:
# remove some of the books with few ratings
popular_recs = all_recs[all_recs["book_count"]>75].sort_values("score",ascending=False)

In [None]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)
def show_image(val):
    return '<img src = "{}" width=50></img>'.format(val)
popular_recs[~popular_recs["book_id"].isin(liked_books)].head(15).style.format({'url':make_clickable, 'cover_image':show_image})