In [7]:
#Where search engine is made
#Exploring the data
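#`wc -l` is a command-line utility that counts the newline characters (lines) in a file
#NOTE(review): this file is gzip-compressed, so the number below (~7.5 million) counts newline bytes in the compressed data and is probably not the number of books -- TODO confirm with `zcat goodreads_books.json.gz | wc -l`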
!wc -l goodreads_books.json.gz

7588375 goodreads_books.json.gz


In [8]:
#This command lists the file's directory entry: permissions, owner, size (about 2.0 GB) and last-modified time
!ls -lh | grep goodreads_books.json.gz

-rw-r--r-- 1 tochi tochi 2.0G Sep 25 04:26 goodreads_books.json.gz


In [9]:
# Streaming approach: gzip.open decompresses on the fly, so the file never has
# to be unzipped or loaded in full -- we only ask for one line at a time.

import gzip

# Open the archive in binary read mode (the default) and pull out the first record only.
with gzip.open("goodreads_books.json.gz", "rb") as gz_file:
    line = gz_file.readline()  # raw bytes of the first line in the file

In [10]:
line  # Display the raw bytes of the first record read above

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [11]:
# Use the json module to parse the single line read above
import json

data = json.loads(line)  # Decode the JSON text into a Python dictionary so each property can be accessed by key
data

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [12]:
def parse_fields(line):
    """Parse one JSON record and keep only the fields the search engine needs.

    Args:
        line: A single JSON document (str or bytes) describing one book.

    Returns:
        A dict with the keys "book_id", "title", "ratings", "url" and
        "cover_image", copied unchanged from the record.

    Raises:
        KeyError: If the record lacks one of the source fields.
    """
    record = json.loads(line)
    # (output key, key in the raw record) -- order defines the output key order.
    wanted = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover_image", "image_url"),  # lets results show the cover next to the link
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

In [29]:
# Stream the whole file and collect the parsed fields of every book with enough ratings.
books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    # Iterating over the file object yields one line at a time and stops at end of file.
    for line in f:
        fields = parse_fields(line)  # dictionary holding only the fields we keep
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # The ratings count is not an integer string: skip this record.
            continue
        # Books with 5 or fewer ratings are unlikely to have been read by many users.
        if ratings <= 5:
            continue
        books_titles.append(fields)

In [30]:
# Build the DataFrame.
# books_titles is a list of dictionaries; each dictionary becomes one row.
import pandas as pd
titles = pd.DataFrame(books_titles)

In [31]:
titles["ratings"] = pd.to_numeric(titles["ratings"]) # Convert the ratings column from strings to numbers so it can be compared and sorted numerically

In [16]:
# mod_title is a normalised copy of the title: it shrinks the search space and the
# number of distinct characters per title, for efficiency.
# First step: delete every character outside a fixed set.
# The regular expression matches anything that is not a letter, a digit or a space and replaces it with "" (removes it).
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

In [17]:
# Lowercase every normalised title
titles["mod_title"] = titles["mod_title"].str.lower()

In [18]:
# Collapse every run of whitespace into a single space.
# The pattern is a raw string: "\s" inside a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [19]:
# Drop every row whose normalised title is empty (zero characters left after cleaning)
titles = titles[titles["mod_title"].str.len() > 0]

In [20]:
# Save the cleaned titles table to disk as JSON
titles.to_json("books_titles.json")

In [21]:
# Display the titles DataFrame
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
...,...,...,...,...,...,...
1782574,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...",12,https://www.goodreads.com/book/show/3084038-th...,https://images.gr-assets.com/books/1494763458m...,this sceptred isle vol 10 the age of victoria ...
1782575,26168430,Sherlock Holmes and the July Crisis,6,https://www.goodreads.com/book/show/26168430-s...,https://images.gr-assets.com/books/1440592011m...,sherlock holmes and the july crisis
1782576,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection
1782577,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,https://www.goodreads.com/book/show/22017381-1...,https://images.gr-assets.com/books/1398621236m...,101 nights volume one 101 nights 13


In [22]:
# Use scikit-learn to build the TF-IDF matrix.
# The vectorizer takes a collection of strings and turns it into a TF-IDF matrix.

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the normalised titles and transform them in one step
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [26]:
#Turn a search query into a vector, and then match it up against the matrix and compare them
#To do our comparison of the vectors we'll use a metric called cosine-similarity

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re #Python regular expression library

def make_clickable(val):
    """Render a URL as an HTML link labelled "Goodreads" that opens in a new tab.

    Used as a pandas Styler formatter for the url column.

    Args:
        val: The URL to link to; it is converted with str() before use.

    Returns:
        An HTML anchor element as a string, with the URL escaped so that
        characters such as quotes or angle brackets cannot break out of the
        href attribute.
    """
    from html import escape  # local import keeps this cell self-contained

    # The template has a single placeholder, so the value is passed once.
    return '<a target="_blank" href="{}">Goodreads</a>'.format(escape(str(val), quote=True))

def show_image(val):
    """Render an image URL as a 50-pixel-wide thumbnail that links to the image.

    Used as a pandas Styler formatter for the cover_image column.

    Args:
        val: The image URL; it is converted with str() before use.

    Returns:
        An HTML string containing an anchor wrapping an img element, with the
        URL escaped so that quotes or angle brackets cannot break out of the
        href/src attributes.
    """
    from html import escape  # local import keeps this cell self-contained

    safe_url = escape(str(val), quote=True)
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(safe_url, safe_url)

def search(query, vectorizer):
    """Find the books whose normalised title best matches a free-text query.

    The query is normalised the same way as the mod_title column, vectorised
    with the fitted vectorizer and compared against the notebook-level
    `tfidf` matrix using cosine similarity. The (up to) 10 most similar rows
    of the notebook-level `titles` frame are sorted by number of ratings and
    the top 5 are returned.

    Args:
        query: The search text typed by the user.
        vectorizer: The fitted vectorizer that produced `tfidf`.

    Returns:
        A pandas Styler over the top 5 rows, with the url column rendered as a
        clickable link and the cover_image column rendered as a thumbnail.
    """
    # Apply the same normalisation as mod_title: lowercase, drop everything
    # except letters, digits and spaces, then collapse repeated whitespace.
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", processed)

    # Vectorise the normalised query (the original code passed the raw query
    # here, leaving `processed` unused).
    query_vec = vectorizer.transform([processed])

    # One similarity score per title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Positions of the most similar titles. Capped at the number of titles so
    # argpartition does not raise when fewer than 10 rows exist.
    top_k = min(10, similarity.shape[0])
    indices = np.argpartition(similarity, -top_k)[-top_k:]

    results = titles.iloc[indices]
    # Sort by ratings so that, among near-duplicate titles, the most-rated one comes first.
    results = results.sort_values("ratings", ascending=False)

    # Return the top 5, styled with HTML so the link is clickable and the cover is visible.
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [27]:
search("harry potter and the goblet of fire", vectorizer) # Run an example query and display the styled results

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
421390,28754622,Harry Potter and the Goblet of Fire,3314,Goodreads,,harry potter and the goblet of fire
1367402,7292005,Harry Potter and the Goblet of Fire,202,Goodreads,,harry potter and the goblet of fire
1345131,17861465,Harry Potter and the Goblet of Fire,174,Goodreads,,harry potter and the goblet of fire
1139381,1071182,Harry Potter and the Goblet of Fire,168,Goodreads,,harry potter and the goblet of fire
1176439,23784313,Harry Potter and the Goblet of Fire,46,Goodreads,,harry potter and the goblet of fire


In [32]:
# Take the query results and use their book IDs to build the liked-books list.
# IDs are stored as strings.
# NOTE(review): "3314" matches the ratings count of the first search result shown above rather than its book_id ("28754622") -- confirm this entry is really a book ID.
liked_books=["3314", "31147619", "29983711", "9401317","9317691", "8153988", "20494944"]
# Recommendation: keep the liked-books list to more than 3 and fewer than 10 books, for efficiency's sake.