In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import re
import pandas as pd
import time
import datetime as dt

# Set up Splinter
# ChromeDriverManager().install() supplies the chromedriver path handed to Splinter.
# headless=False opens a visible Chrome window (a later cell asks the user to sign in by hand).
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# Visit Goodreads
url = 'https://www.goodreads.com/list/show/1.Best_Books_Ever'
browser.visit(url)

# Parse the page that was just loaded.
# NOTE(review): html / html_soup are not read by any later cell shown here;
# GoodScrapes re-reads browser.html itself on every page.
html = browser.html
html_soup = soup(html, 'html.parser')

In [2]:
# function that scrapes goodreads
def GoodScrapes(pages, count=1, best_books=None):
    """Scrape consecutive pages of the Goodreads list open in ``browser``.

    Starting from whatever list page the module-level ``browser`` is showing,
    parse every book row on the page, click the "next page" link, and repeat
    while ``count < pages``. With the default ``count=1`` that is
    ``pages - 1`` pages, e.g. ``GoodScrapes(20)`` reads 19 pages.

    Args:
        pages: Exclusive upper bound for the page counter.
        count: Starting value of the page counter.
        best_books: Optional dict to add results to (mutated in place).
            A fresh dict is created when omitted.

    Returns:
        The dict of results, keyed by the integer found in the first cell of
        each row. Each value is the list
        ``[name, author, stars, ratings, score, goodreads_link, get_copy_link]``.
    """
    # A literal ``{}`` default is created once and shared by every call, so
    # results from an earlier run would silently carry over into the next.
    if best_books is None:
        best_books = {}

    # Both selectors are CSS, so they go through find_by_css.
    # Targets the button inside the modal's close container.
    close_popup = 'div[class="modal__close"] button'
    next_page = 'a[class="next_page"]'

    while count < pages:
        html = browser.html
        html_soup = soup(html, 'html.parser')

        # Best effort: dismiss the popup if present. Only ordinary errors are
        # ignored, so a kernel interrupt still stops the scrape.
        try:
            browser.find_by_css(close_popup).first.click()
        except Exception:
            pass

        table = html_soup.find('table', class_='tableList')
        rows = table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            # Skip rows that do not have the cells indexed below.
            if len(cols) < 3:
                continue
            num = int(cols[0].text)
            # info is the piece of html that contains all the book information
            info = cols[2]
            # parse through info to get book information
            name = info.a.text.strip()
            goodreads_link = 'https://www.goodreads.com' + info.findNext(class_='bookTitle')['href']
            goodreads_get_copy = goodreads_link.replace('show/', '') + '/get_a_copy'
            auth = info.findNext(class_='authorName').text
            stars = info.findNext(class_='minirating').text
            score = info.findNext(onclick="Lightbox.showBoxByID('score_explanation', 300); return false;").text.strip()
            score = int(score.replace('score: ', '').replace(',', ''))
            # The minirating text is split on the em dash into the star part
            # and the ratings-count part.
            numstars, numvotes = stars.split('—')
            # The regex picks the first decimal number (e.g. 4.33) out of the star part.
            rating = re.findall(r'\d+\.\d+', numstars)[0]
            numvotes = int(numvotes.replace('ratings', '').strip().replace(',', ''))
            best_books[num] = [name, auth, float(rating), numvotes, score, goodreads_link, goodreads_get_copy]

        # Move on to the next page. If the click fails, close the popup and
        # retry once; a second failure is allowed to propagate.
        try:
            browser.find_by_css(next_page).click()
        except Exception:
            browser.find_by_css(close_popup).first.click()
            browser.find_by_css(next_page).click()
        count += 1
    return best_books

# function that gets amazon links from goodreads pages
def getAmazonLinks(df, delay=1):
    """Visit each row's ``get_copy`` page and record its Amazon link.

    For every row, the module-level ``browser`` opens the URL stored in the
    ``get_copy`` column and reads the ``href`` of the element whose text is
    ``'Amazon'``. The value is written to ``df['amazon_link']`` in place;
    rows where the lookup fails get ``None``.

    Args:
        df: DataFrame with a ``get_copy`` column of URLs. Mutated in place.
        delay: Seconds to sleep after each row. Defaults to 1.

    Returns:
        None.
    """
    for index, row in df.iterrows():
        browser.visit(row['get_copy'])
        try:
            amazon_link = browser.find_by_text('Amazon')['href']
        except Exception:
            # Lookup failed: record a gap. A bare ``except`` here would also
            # swallow KeyboardInterrupt and make the long loop hard to stop.
            amazon_link = None
        df.loc[index, 'amazon_link'] = amazon_link
        time.sleep(delay)

def getGoodreadsInfo(df, delay=1):
    """Visit each row's Goodreads page and record page count and publication date.

    For every row, the module-level ``browser`` opens the URL stored in the
    ``goodreads_link`` column. Two values are then written to ``df`` in place:

    * ``page_count``: the integer before the word ``pages`` in the
      ``pagesFormat`` element, or ``0`` when it cannot be read.
    * ``published_date``: the ``publicationInfo`` text with the
      ``'First published '`` prefix removed, or ``None`` when it cannot be read.

    Args:
        df: DataFrame with a ``goodreads_link`` column of URLs. Mutated in place.
        delay: Seconds to sleep after each row. Defaults to 1.

    Returns:
        None.
    """
    for index, row in df.iterrows():
        browser.visit(row['goodreads_link'])

        # ``except Exception`` (not a bare ``except``) keeps the best-effort
        # fallback but lets KeyboardInterrupt stop the loop.
        try:
            pages_format = browser.find_by_css('p[data-testid="pagesFormat"]').text
            # Text without the word "pages" fails the unpack and falls back to 0.
            page_text, _book_format = pages_format.split('pages', 1)
            pages = int(page_text.strip())
        except Exception:
            pages = 0

        try:
            published = browser.find_by_css('p[data-testid="publicationInfo"]').text.replace('First published ', '')
        except Exception:
            published = None

        df.loc[index, 'page_count'] = pages
        df.loc[index, 'published_date'] = published
        time.sleep(delay)

In [49]:
# Before running the next cells, sign in to Goodreads in the browser window so its popup doesn't interrupt the scraping.

In [3]:
# get back to the page after signing in
# GoodScrapes parses whatever page the browser is currently showing, so reload the list URL first
url = 'https://www.goodreads.com/list/show/1.Best_Books_Ever'
browser.visit(url)

In [4]:
# run GoodScrapes with the number of pages to scrape (each page has 100 books)
# NOTE: the loop runs while count < pages starting from count=1, so passing 20 scrapes 19 pages
best_books = GoodScrapes(20)

In [5]:
# Preview the first few scraped entries; echoing the whole dict would dump
# every scraped book into the cell output.
dict(list(best_books.items())[:5])

{1: ['The Hunger Games (The Hunger Games, #1)',
  'Suzanne Collins',
  4.33,
  7825430,
  3421393,
  'https://www.goodreads.com/book/show/2767052-the-hunger-games',
  'https://www.goodreads.com/book/2767052-the-hunger-games/get_a_copy'],
 2: ['Harry Potter and the Order of the Phoenix (Harry Potter, #5)',
  'J.K. Rowling',
  4.5,
  3096285,
  2928942,
  'https://www.goodreads.com/book/show/2.Harry_Potter_and_the_Order_of_the_Phoenix',
  'https://www.goodreads.com/book/2.Harry_Potter_and_the_Order_of_the_Phoenix/get_a_copy'],
 3: ['Pride and Prejudice',
  'Jane Austen',
  4.28,
  3867634,
  2436223,
  'https://www.goodreads.com/book/show/1885.Pride_and_Prejudice',
  'https://www.goodreads.com/book/1885.Pride_and_Prejudice/get_a_copy'],
 4: ['To Kill a Mockingbird',
  'Harper Lee',
  4.27,
  5599984,
  2288163,
  'https://www.goodreads.com/book/show/2657.To_Kill_a_Mockingbird',
  'https://www.goodreads.com/book/2657.To_Kill_a_Mockingbird/get_a_copy'],
 5: ['The Book Thief',
  'Markus Zus

In [6]:
# Turn the scraped dict into a DataFrame: the dict keys become the index and
# each value list is spread across the columns named below.
cols = [
    'name',
    'author',
    'stars',
    'ratings',
    'goodreads_score',
    'goodreads_link',
    'get_copy',
]
books_df = pd.DataFrame.from_dict(data=best_books, orient='index', columns=cols)
books_df.head(n=10)

Unnamed: 0,name,author,stars,ratings,goodreads_score,goodreads_link,get_copy
1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.33,7825430,3421393,https://www.goodreads.com/book/show/2767052-th...,https://www.goodreads.com/book/2767052-the-hun...
2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.5,3096285,2928942,https://www.goodreads.com/book/show/2.Harry_Po...,https://www.goodreads.com/book/2.Harry_Potter_...
3,Pride and Prejudice,Jane Austen,4.28,3867634,2436223,https://www.goodreads.com/book/show/1885.Pride...,https://www.goodreads.com/book/1885.Pride_and_...
4,To Kill a Mockingbird,Harper Lee,4.27,5599984,2288163,https://www.goodreads.com/book/show/2657.To_Ki...,https://www.goodreads.com/book/2657.To_Kill_a_...
5,The Book Thief,Markus Zusak,4.39,2321515,1649310,https://www.goodreads.com/book/show/19063.The_...,https://www.goodreads.com/book/19063.The_Book_...
6,"Twilight (The Twilight Saga, #1)",Stephenie Meyer,3.64,6072820,1591243,https://www.goodreads.com/book/show/41865.Twil...,https://www.goodreads.com/book/41865.Twilight/...
7,Animal Farm,George Orwell,3.98,3502504,1454612,https://www.goodreads.com/book/show/170448.Ani...,https://www.goodreads.com/book/170448.Animal_F...
8,The Chronicles of Narnia (Chronicles of Narnia...,C. S. Lewis,4.27,609313,1361719,https://www.goodreads.com/book/show/11127.The_...,https://www.goodreads.com/book/11127.The_Chron...
9,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.61,126203,1354869,https://www.goodreads.com/book/show/30.J_R_R_T...,https://www.goodreads.com/book/30.J_R_R_Tolkie...
10,The Fault in Our Stars,John Green,4.15,4677124,1242562,https://www.goodreads.com/book/show/11870085-t...,https://www.goodreads.com/book/11870085-the-fa...


In [7]:
# adds page_count and published_date columns to books_df in place;
# visits every book page with a 1-second pause per row, so this cell is slow
getGoodreadsInfo(books_df)

In [8]:
# spot-check the last rows, including the new page_count and published_date columns
books_df.tail(20)

Unnamed: 0,name,author,stars,ratings,goodreads_score,goodreads_link,get_copy,page_count,published_date
1874,"The Novice (Black Magician Trilogy, #2)",Trudi Canavan,4.08,57051,2505,https://www.goodreads.com/book/show/28250.The_...,https://www.goodreads.com/book/28250.The_Novic...,577.0,"January 1, 2002"
1876,"The Naming (The Books of Pellinor, #1)",Alison Croggon,4.02,25721,2499,https://www.goodreads.com/book/show/393146.The...,https://www.goodreads.com/book/393146.The_Nami...,492.0,"January 1, 2002"
1877,"Dance Dance Dance (The Rat, #4)",Haruki Murakami,4.05,81006,2496,https://www.goodreads.com/book/show/17800.Danc...,https://www.goodreads.com/book/17800.Dance_Dan...,393.0,"October 13, 1988"
1878,The Hobbit,Chuck Dixon,4.5,214323,2495,https://www.goodreads.com/book/show/659469.The...,https://www.goodreads.com/book/659469.The_Hobb...,133.0,"January 1, 1989"
1879,"Hometown Girl at Heart (Hometown, #1)",Kirsten Fullmer,4.29,4100,2493,https://www.goodreads.com/book/show/38230320-h...,https://www.goodreads.com/book/38230320-hometo...,321.0,"July 15, 2014"
1881,The Executioner's Song,Norman Mailer,4.06,20548,2491,https://www.goodreads.com/book/show/12468.The_...,https://www.goodreads.com/book/12468.The_Execu...,1056.0,"October 30, 1979"
1882,When Nietzsche Wept,Irvin D. Yalom,4.36,63686,2490,https://www.goodreads.com/book/show/21031.When...,https://www.goodreads.com/book/21031.When_Niet...,310.0,"January 1, 1992"
1884,Little Fires Everywhere,Celeste Ng,4.09,1046498,2489,https://www.goodreads.com/book/show/51704136-l...,https://www.goodreads.com/book/51704136-little...,368.0,"September 12, 2017"
1885,Rage of Angels,Sidney Sheldon,3.96,37582,2488,https://www.goodreads.com/book/show/43328.Rage...,https://www.goodreads.com/book/43328.Rage_of_A...,512.0,"August 1, 1980"
1886,Maus II: A Survivor's Tale: And Here My Troubl...,Art Spiegelman,4.4,139426,2486,https://www.goodreads.com/book/show/15197.Maus_II,https://www.goodreads.com/book/15197.Maus_II/g...,144.0,"January 1, 1991"


In [9]:
# run the function that gets amazon links and adds them to the dataframe
# (modifies books_df in place: one page visit plus a 1-second pause per row; failed lookups are stored as None)
getAmazonLinks(books_df)

In [10]:
# save the scraped table to a CSV file in the current working directory
books_df.to_csv('goodreads_best_books.csv')

In [52]:
# shut down the Splinter browser once scraping is finished
browser.quit()