In [1]:
import pandas as pd
import os
import random
import re
from bs4 import BeautifulSoup
import requests
import time

## Helpers

In [2]:
def scrape_genres(urls, timeout=30):
    """Scrape genre names and book counts from Goodreads genre-list pages.

    Every URL is fetched with ``requests`` and parsed with BeautifulSoup;
    each ``div.shelfStat`` element found on a page yields one record.

    Parameters
    ----------
    urls : iterable of str
        Genre-list page URLs to fetch, in order.
    timeout : float, optional
        Seconds to wait for each HTTP response (default 30). Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per scraped genre with columns ``name`` (link text, or
        ``None`` when the genre link is not found) and ``number_of_books``
        (integer). When nothing is scraped, an empty frame with the same
        two columns is returned.

    Raises
    ------
    requests.RequestException
        If a request times out, fails, or returns an HTTP error status.
    ValueError
        If a genre entry has no book-count element.
    """
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    records = []
    for url in urls:
        r = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page as data.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        genres = soup.select("div.shelfStat")
        for g in genres:

            # name of genre
            n = g.select_one("a.mediumText.actionLinkLite")
            name_of_genre = n.get_text(strip=True) if n else None

            # number of books
            num = g.select_one("div.smallText.greyText")
            if num is None:
                # A missing count cannot be converted to an integer later on;
                # report it here with enough context to debug the page markup.
                raise ValueError(
                    f"No book count found for genre {name_of_genre!r} at {url}"
                )
            number_of_books = num.get_text(strip=True)

            records.append({
                "name": name_of_genre,
                "number_of_books": number_of_books
            })

    if not records:
        # pd.DataFrame([]) has no columns at all, so return a correctly
        # shaped empty frame rather than failing on the column lookup below.
        return pd.DataFrame({
            "name": pd.Series(dtype="object"),
            "number_of_books": pd.Series(dtype="int"),
        })

    df = pd.DataFrame(records)
    # Keep only the digits: this removes the thousands separators and the
    # trailing label in one pass, whether it reads " books" or " book".
    df["number_of_books"] = (
        df["number_of_books"]
        .str.replace(r"\D", "", regex=True)
        .astype("int")
    )
    return df

## Extract data

### Extract genres

In [3]:
# The three top-level genre listing pages differ only in their page number.
urls = [
    f"https://www.goodreads.com/genres/list?filter=top-level&page={page}"
    for page in range(1, 4)
]

In [4]:
# Scrape every genre-list page into a single DataFrame (performs network requests).
df_orig = scrape_genres(urls)

In [5]:
# Work on a copy: plain assignment would only bind a second name to the same
# object, so any in-place change to df would also alter df_orig.
df = df_orig.copy()

In [6]:
# Order genres from the most to the fewest books.
df = df.sort_values("number_of_books", ascending=False)

In [7]:
# Show the genres sorted by book count, largest first.
df

Unnamed: 0,name,number_of_books
71,fiction,18256553
138,non-fiction,10129112
14,audiobook,4980733
151,poetry,2350618
4,adult,2287246
...,...,...
220,tsars,29
147,percy-bysshe-shelley,26
178,scientific-historical-fiction,14
118,live-action-roleplaying,5


In [8]:
# Remove the three largest shelves (previously dropped via index labels
# 71, 138 and 14). Selecting by name does not depend on the order in which
# the pages were scraped, and the cell can be re-run without a KeyError.
df = df[~df["name"].isin(["fiction", "non-fiction", "audiobook"])]


In [9]:
# Inspect the table after removing the three largest shelves.
df

Unnamed: 0,name,number_of_books
151,poetry,2350618
4,adult,2287246
115,lgbt,1034513
48,dark-romance,802813
31,christian,717591
...,...,...
220,tsars,29
147,percy-bysshe-shelley,26
178,scientific-historical-fiction,14
118,live-action-roleplaying,5


In [10]:
# Keep only the genres that list at least 50,000 books.
df = df.loc[df["number_of_books"].ge(50000)]
df

Unnamed: 0,name,number_of_books
151,poetry,2350618
4,adult,2287246
115,lgbt,1034513
48,dark-romance,802813
31,christian,717591
...,...,...
186,sexuality,58063
234,workplace-romance,57060
141,occult,53301
196,social,52320


In [11]:
# Sum of the book counts over the genres still in df. Series.sum() is the
# vectorised pandas reduction; the built-in sum() iterates element by element.
total_books = df["number_of_books"].sum()

In [12]:
# Show the combined book count of the remaining genres.
total_books

19750942

In [17]:
# Each genre's share of total_books, expressed as a percentage (0-100).
# assign() returns a new frame instead of writing into df, which at this point
# is the result of boolean filtering; the in-place .loc write can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.assign(ratio=(df["number_of_books"] / total_books) * 100)

In [18]:
# Inspect the table with the added ratio column.
df

Unnamed: 0,name,number_of_books,ratio
151,poetry,2350618,11.901296
4,adult,2287246,11.580440
115,lgbt,1034513,5.237791
48,dark-romance,802813,4.064682
31,christian,717591,3.633199
...,...,...,...
186,sexuality,58063,0.293976
234,workplace-romance,57060,0.288898
141,occult,53301,0.269866
196,social,52320,0.264899


In [25]:
# Distinct genre names, in the order they appear in df.
ls = df["name"].unique().tolist()

### Extract books