In [None]:
pip install wikipedia-api pandas tqdm



In [None]:
import wikipediaapi
import pandas as pd
from tqdm import tqdm

# Shared Wikipedia API client used by every function below; it is configured
# for the English-language wiki and sends an identifying user agent string.
wiki = wikipediaapi.Wikipedia(user_agent='BookSummaryDatasetGenerator/1.0 (ziyiwan@umich.edu)', language='en')

# Define categories to scrape
# Root category page titles; each one is handed to get_books() as the starting
# point of a category walk.
fiction_category = "Category:Novels_by_genre"
nonfiction_category = "Category:Non-fiction_books"

# Function to recursively get book titles from categories
def get_books(category, limit=5000, max_depth=None):
    """Collect article titles found under a Wikipedia category tree.

    The walk is depth-first, starting at ``category``. Every member in the
    main (article) namespace is recorded by title; every member in the
    category namespace is descended into. The code does not check that a
    recorded page is actually about a book.

    Args:
        category: Full category page title, e.g. "Category:Novels_by_genre".
        limit: Stop collecting once this many distinct titles are held.
        max_depth: Optional cap on how many subcategory levels below the
            root are expanded (0 = only the root's direct members). ``None``
            leaves the depth unbounded.

    Returns:
        A sorted list of at most ``limit`` distinct page titles.

    Note:
        A category is expanded only the first time it is reached.
    """
    titles = set()
    # Category graphs on Wikipedia are not trees: the same subcategory can be
    # reachable through several parents and cycles exist. Remembering what
    # has been expanded prevents endless recursion and duplicate API calls.
    visited = set()

    def recursive_scrape(cat, depth):
        visited.add(cat.title)
        for member in cat.categorymembers.values():
            if len(titles) >= limit:
                return
            if member.ns == wikipediaapi.Namespace.MAIN:
                titles.add(member.title)
            elif member.ns == wikipediaapi.Namespace.CATEGORY:
                if member.title in visited:
                    continue
                if max_depth is not None and depth >= max_depth:
                    continue
                recursive_scrape(member, depth + 1)

    recursive_scrape(wiki.page(category), 0)

    # Sort so repeated runs yield the same ordering (set order is arbitrary).
    return sorted(titles)

# Fetch books from each category
# Each get_books() call walks the category tree through the module-level
# `wiki` client and stops once 5000 distinct titles have been collected.
print("Fetching Fiction books...")
fiction_books = get_books(fiction_category, limit=5000)
print(f"Total fiction books fetched: {len(fiction_books)}")

print("Fetching Non-fiction books...")
nonfiction_books = get_books(nonfiction_category, limit=5000)
print(f"Total non-fiction books fetched: {len(nonfiction_books)}")

# Function to get summary
def get_summary(title):
    """Return the leading paragraph of a page's summary.

    Looks the page up through the module-level ``wiki`` client. If the page
    does not exist, ``None`` is returned; otherwise the result is the summary
    text up to (not including) its first newline, which may be empty.
    """
    article = wiki.page(title)
    if not article.exists():
        return None
    # Everything before the first line break is treated as the first paragraph.
    lead_paragraph, _, _ = article.summary.partition("\n")
    return lead_paragraph

# Build the dataset: one row per title that yields a non-empty summary.
data = []

# (progress banner, titles to process, label stored in the "Category" column)
labelled_sources = (
    ("Extracting summaries for fiction books...", fiction_books, "Fiction"),
    ("Extracting summaries for non-fiction books...", nonfiction_books, "Non-fiction"),
)

for banner, book_titles, label in labelled_sources:
    print(banner)
    for title in tqdm(book_titles):
        summary = get_summary(title)
        # Skip missing pages (None) and pages whose first paragraph is empty.
        if not summary:
            continue
        data.append({"Title": title, "Summary": summary, "Category": label})

# Persist the rows as CSV, then show a preview and the final row count.
df = pd.DataFrame(data)
df.to_csv("wikipedia_books_dataset.csv", index=False)
print(df.head())
print(f"Dataset size: {len(df)} entries.")


Fetching Fiction books...
Total fiction books fetched: 5000
Fetching Non-fiction books...
Total non-fiction books fetched: 5000
Extracting summaries for fiction books...


100%|██████████| 5000/5000 [14:09<00:00,  5.88it/s]


Extracting summaries for non-fiction books...


100%|██████████| 5000/5000 [14:28<00:00,  5.76it/s]

                               Title  \
0  Master of the Universe (Twilight)   
1                             Nazgûl   
2             Process of Elimination   
3                     Septima Vector   
4                 Barthanes Damodred   

                                             Summary Category  
0  Fifty Shades of Grey is a 2011 erotic romance ...  Fiction  
1  The Nazgûl (from Black Speech nazg 'ring', and...  Fiction  
2  Super Mystery is a 36-volume series of crossov...  Fiction  
3  The following is a list of characters from the...  Fiction  
4  The Wheel of Time is a series of high fantasy ...  Fiction  
Dataset size: 10000 entries.



