In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import re
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install gdown
!pip install pandas-gbq

Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
Successfully installed gdown-4.6.0
[0mCollecting pandas-gbq
  Downloading pandas_gbq-0.18.1-py2.py3-none-any.whl (25 kB)
Collecting db-dtypes<2.0.0,>=1.0.4
  Downloading db_dtypes-1.0.5-py2.py3-none-any.whl (14 kB)
Collecting google-api-core<3.0.0dev,>=2.10.2
  Downloading google_api_core-2.11.0-py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.3/120.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting google-auth-oauthlib>=0.7.0
  Downloading google_auth_oauthlib-0.8.0-py2.py3-none-any.whl (19 kB)
Collecting pydata-google-auth>=1.4.0
  Downloading pydata_google_auth-1.4.0-py2.py3-none-any.whl (14 kB)
Collecting google-cloud-bigquery-storage<3.0.0dev,>=2.16.2
  Downloading google_cloud_bigquery_storage-2.17.0-py2.py3-none-any.whl (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.7/187.

In [3]:
from pandas.io import gbq

In [4]:
# get the google drive file
bookdata = !gdown --id 1LXpK1UfqtP89H1tYy0pBGHjYk8IhigUK

In [5]:
import gzip
# we are going to read the file line by line
# streams the file without unzipping it
with gzip.open("/kaggle/working/goodreads_books.json.gz","r") as f:
    line = f.readline()

In [6]:
line
# this is metadata about a single book

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [7]:
# load json line
import json
json.loads(line)
# this returns a dictionary

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

# **PARSING BOOK METADATA**

In [8]:
# go through a single line in metadata and pick fields we want
def parse_fields(line):
    data=json.loads(line)
    return {
        "book_id": data["book_id"],
        "title":data["title_without_series"],
        "ratings":data["ratings_count"],
        "url":data["url"],
        "cover_image":data["image_url"]       
    }

In [9]:
# go line by line, parsing each line
books_titles = []
with gzip.open("/kaggle/working/goodreads_books.json.gz","r") as f:
    #read every single line
    while True: 
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        try:
# turn ratings into an integer 
# we want to take books that have more than a certain no. of ratings
# we can filter them out as they are not likely to be recommended
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        if ratings > 15:
            books_titles.append(fields) 

# **PROCESSING METADATA WITH PANDAS**


In [10]:
# turn into a dataframe
titles = pd.DataFrame.from_dict(books_titles)

In [11]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [12]:
titles.head(15)

Unnamed: 0,book_id,title,ratings,url,cover_image
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...


In [13]:
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308957 entries, 0 to 1308956
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   book_id      1308957 non-null  object
 1   title        1308957 non-null  object
 2   ratings      1308957 non-null  object
 3   url          1308957 non-null  object
 4   cover_image  1308957 non-null  object
dtypes: object(5)
memory usage: 49.9+ MB


In [14]:
# ratings column is an object yet should be numeric
titles["ratings"]=pd.to_numeric(titles["ratings"])

In [15]:
titles.describe()

Unnamed: 0,ratings
count,1308957.0
mean,727.9124
std,14932.48
min,16.0
25%,30.0
50%,64.0
75%,187.0
max,4899965.0


In [16]:
titles.isnull().mean()

book_id        0.0
title          0.0
ratings        0.0
url            0.0
cover_image    0.0
dtype: float64

In [17]:
# modify title to minimize search space for the search engine
# replace any characters that don't fall within a certain set lowercase,uppercase,numerals or space
titles["mod_title"]=titles["title"].str.replace("[^a-zA-Z0-9 ]","",regex=True)

In [18]:
titles.head(15)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,The Unschooled Wizard Sun Wolf and Starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,Best Friends Forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,The Aeneid for Boys and Girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,Alls Fairy in Love and War Avalon Web of Magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,The Devils Notebook
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,Crowner Royal Crowner John Mystery 13
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,The House of Memory Plutos Snitch 2
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,The Bonfire of the Vanities
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 Minutes in Heaven A True Story of Death and...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,Heaven


In [19]:
# make lowercase title
titles["mod_title"] = titles["mod_title"].str.lower()

In [20]:
# get rid of spaces in a row
# any spaces in a row, replacece with a single space
titles["mod_title"] = titles["mod_title"].str.replace("\s+"," ",regex=True)

In [21]:
titles.head(20)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,crowner royal crowner john mystery 13
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,the house of memory plutos snitch 2
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the bonfire of the vanities
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 minutes in heaven a true story of death and...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,heaven


In [22]:
# only include titles that have more than 0 length
titles = titles[titles["mod_title"].str.len()>0]

In [23]:
# to json
titles.to_json("books_titles.json")

In [24]:
titles.head(20)


Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
5,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,crowner royal crowner john mystery 13
6,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,the house of memory plutos snitch 2
7,89373,The Bonfire of the Vanities,77,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the bonfire of the vanities
8,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 minutes in heaven a true story of death and...
9,89376,Heaven,7345,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,heaven


In [25]:
# create an instance of tfidfvectorizer
# takes a list of strings and turns it into a tfidf matrix. 
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [26]:
# turn search query into a vector then match it up against the matrix
#  then compare
def search(query, vectorizer):
    # do the same processing that we did with mod_title
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    # turn query into a vector
    query_vec = vectorizer.transform([processed])
    # find the similarity
    similarity =cosine_similarity(query_vec,tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = titles.iloc[indices]
    # the data has a lot of duplicate books so we need to get the row with the highest rating
    results.sort_values("ratings",ascending =False)
    return results.head(5)

In [27]:
# similarity

In [28]:
# # get to the actual book
# results

In [29]:
# search for another book
search("East of Eden",vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1043705,21462738,East of Eden,39,https://www.goodreads.com/book/show/21462738-e...,https://images.gr-assets.com/books/1394837518m...,east of eden
131374,385245,East of Eden,255,https://www.goodreads.com/book/show/385245.Eas...,https://s.gr-assets.com/assets/nophoto/book/11...,east of eden
1253859,1282010,East of Eden,432,https://www.goodreads.com/book/show/1282010.Ea...,https://s.gr-assets.com/assets/nophoto/book/11...,east of eden
887880,8132407,East of Eden,3006,https://www.goodreads.com/book/show/8132407-ea...,https://images.gr-assets.com/books/1328303315m...,east of eden
708809,919458,East of Eden,311,https://www.goodreads.com/book/show/919458.Eas...,https://images.gr-assets.com/books/1309288593m...,east of eden
