In [3]:
import pandas as pd
import os
import random
import re
from bs4 import BeautifulSoup
import requests
import time

## Helpers

In [34]:
def _parse_book_count(label):
    """Return the integer in a label such as "1,234 books" or "1 book"."""
    match = re.fullmatch(r"([\d,]+)\s*books?", label.strip())
    if match is None:
        raise ValueError(f"unrecognised book count label: {label!r}")
    return int(match.group(1).replace(",", ""))


def scrape_genres(urls, timeout=30):
    """Scrape genre names and book counts from genre listing pages.

    Parameters
    ----------
    urls : iterable of str
        Pages to fetch; each one is requested once, in the order given.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` (text of the genre link, or None when the link is
        missing) and ``number_of_books`` (int). One row per ``div.shelfStat``
        element that carries a count. The frame is empty, with the same two
        columns, when nothing was scraped.

    Raises
    ------
    requests.HTTPError
        If a page answers with an error status.
    ValueError
        If a count label is present but is not of the form "<number> book(s)".
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    records = []
    for url in urls:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly instead of parsing an error page as if it were the listing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        for shelf in soup.select("div.shelfStat"):
            name_tag = shelf.select_one("a.mediumText.actionLinkLite")
            count_tag = shelf.select_one("div.smallText.greyText")
            if count_tag is None:
                # Without a count the row cannot live in an integer column.
                continue

            records.append({
                "name": name_tag.get_text(strip=True) if name_tag is not None else None,
                "number_of_books": _parse_book_count(count_tag.get_text(strip=True)),
            })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    df = pd.DataFrame(records, columns=["name", "number_of_books"])
    df["number_of_books"] = df["number_of_books"].astype("int")
    return df

## Extract data

In [47]:
# The top-level genre listing is paginated; collect pages 1 through 3.
urls = [
    f"https://www.goodreads.com/genres/list?filter=top-level&page={page}"
    for page in range(1, 4)
]

In [48]:
# Fetch and parse every page in `urls` into one DataFrame (one HTTP request per URL).
df_orig = scrape_genres(urls)

In [75]:
# Work on an independent copy: plain assignment would only bind a second name
# to the same object, so an in-place edit of `df` would also alter `df_orig`.
df = df_orig.copy()

In [76]:
# Order genres from most to fewest books.
df = df.sort_values("number_of_books", ascending=False)

In [77]:
# Keep the first 100 rows, i.e. the 100 largest genres after the descending sort.
df = df.iloc[:100]

In [78]:
# Print the genre names as a plain Python list.
print(df["name"].tolist())

['fiction', 'non-fiction', 'audiobook', 'poetry', 'adult', 'lgbt', 'dark-romance', 'christian', 'dark', 'enemies-to-lovers', 'feminism', 'animals', 'novella', 'war', 'friendship', 'unfinished', 'spirituality', 'love', 'biography-memoir', 'smut', 'second-chance', 'forced-proximity', 'holiday', 'relationships', 'race', 'slow-burn', 'cozy', 'graphic-novels-comics', 'roman', 'death', 'summer', 'science-fiction-fantasy', 'agatha-christie', 'inspirational', 'dark-academia', 'anthologies', 'space', 'harry-potter', 'found-family', 'romantic', 'why-choose', 'summer-reads', 'emotional', 'grief', 'medical', 'horror-thriller', 'disability', 'mafia-romance', 'comics-manga', 'fake-dating', 'small-town-romance', 'family-drama', 'middle-east', 'shakespeare', 'new-york', 'united-states', 'second-chance-romance', 'productivity', 'teaching', 'gender', 'textbooks', 'indigenous', 'graphic-novels-manga', 'futuristic', 'hockey-romance', 'london', 'sequential-art', 'urban', 'satire', 'college-romance', 'nativ

In [79]:
# Inspect the sorted slice; the index labels are still the row positions from the scrape.
df

Unnamed: 0,name,number_of_books
71,fiction,18256553
138,non-fiction,10129112
14,audiobook,4980733
151,poetry,2350618
4,adult,2287246
...,...,...
200,spicy-romance,30096
235,world-literature,30055
50,devotional,30041
114,legal,29588


In [80]:
# Remove the three catch-all shelves by name rather than by index label: the
# labels (71, 138, 14) depend on the scrape order of one particular run, and
# dropping by label raises KeyError when this cell is executed a second time.
df = df[~df["name"].isin(["fiction", "non-fiction", "audiobook"])]


In [81]:
# Re-display to check that the three dropped rows are gone.
df

Unnamed: 0,name,number_of_books
151,poetry,2350618
4,adult,2287246
115,lgbt,1034513
48,dark-romance,802813
31,christian,717591
...,...,...
200,spicy-romance,30096
235,world-literature,30055
50,devotional,30041
114,legal,29588


In [82]:
# Keep only genres with at least 50,000 books, then show what remains.
df = df.loc[df["number_of_books"].ge(50000)]
df

Unnamed: 0,name,number_of_books
151,poetry,2350618
4,adult,2287246
115,lgbt,1034513
48,dark-romance,802813
31,christian,717591
...,...,...
186,sexuality,58063
234,workplace-romance,57060
141,occult,53301
196,social,52320
