In [1]:
import os
import re
import csv
import requests
from bs4 import BeautifulSoup
import time
import unicodedata

In [2]:
# Directory that receives one "<book_id>.txt" file per accepted book.
# It is created as soon as this cell runs.
SAVE_DIR = "story_download"
os.makedirs(SAVE_DIR, exist_ok=True)

# Output CSV (single "text" column) and the Gutendex listing endpoint,
# pre-filtered to English-language books.
CSV_PATH = "./dataset/gutenberg.csv"
API_URL = "https://gutendex.com/books/?languages=en"

# Patterns for the "*** START/END OF (THE) PROJECT GUTENBERG EBOOK" marker lines.
# NOTE(review): neither pattern is referenced by the code visible in this file.
START_REGEX = r"\*+\s*START\s+OF\s+(?:THE\s+)?PROJECT\s+GUTENBERG\s+EBOOK"
END_REGEX   = r"\*+\s*END\s+OF\s+(?:THE\s+)?PROJECT\s+GUTENBERG\s+EBOOK"
# A book is kept only if at least one of these substrings occurs in its
# lower-cased subjects/bookshelves (see is_fiction).
FICTION_TAGS = [
    "fiction", "novel", "story", "stories", "horror",
    "fantasy", "gothic", "adventure", "mystery",
    "detective", "romance", "science fiction", "sci-fi"
]

# A book is rejected if any of these substrings occurs in its lower-cased
# subjects/bookshelves. Matching is by substring, so e.g. "history" also
# rejects a subject such as "... -- History -- Fiction".
NONFICTION_EXCLUDE = [
    "poetry", "poem", "drama", "play", "philosophy",
    "religion", "theology", "sermon", "essay", "essays",
    "history", "biography", "autobiography",
    "reference", "dictionary", "textbook", "manual"
]
def is_fiction(book):
    """
    Decide from Gutendex metadata whether a book should be kept as a storybook.

    Args:
        book: Mapping for one Gutendex book record. The keys "media_type",
            "subjects" and "bookshelves" are read; each may be missing or
            None, in which case it is treated as empty.

    Returns:
        True when the record is not a sound recording, at least one
        FICTION_TAGS entry occurs in its labels, and no NONFICTION_EXCLUDE
        entry does. False otherwise.
    """
    if (book.get("media_type") or "").lower() == "sound":
        return False

    subjects = book.get("subjects") or []
    shelves = book.get("bookshelves") or []

    # Build the searchable string once instead of once per tag. Matching is
    # plain substring search over all lower-cased labels joined by spaces.
    haystack = " ".join(label.lower() for label in (*subjects, *shelves))

    if not any(tag in haystack for tag in FICTION_TAGS):
        return False

    return not any(term in haystack for term in NONFICTION_EXCLUDE)

def extract_gutenberg_html(html):
    """
    Extract the body paragraphs of a Project Gutenberg HTML edition.

    The text is taken from the elements that follow the "pg-header" element
    as siblings, up to (not including) the "pg-footer" element. Only <p>
    tags that carry no attributes are kept; each paragraph has runs of
    whitespace collapsed to a single space.

    Args:
        html: The HTML document to parse.

    Returns:
        The paragraphs joined by blank lines, or None when the header or
        footer element is missing or the extracted text is 200 characters
        or shorter.
    """
    # Local import: the file's top-level imports only bring in BeautifulSoup.
    from bs4 import Tag

    soup = BeautifulSoup(html, "html.parser")

    header = soup.find(id="pg-header")
    footer = soup.find(id="pg-footer")

    if header is None or footer is None:
        print("[WARN] Header or footer not found")
        return None

    paragraphs = []
    for node in header.next_siblings:
        # Identity check: "==" on bs4 tags compares structure, which is
        # slower and could stop at a different element that merely looks
        # the same as the footer.
        if node is footer:
            break

        # Bare strings, comments and doctypes contain no <p> elements.
        if not isinstance(node, Tag):
            continue

        # Search the existing tree instead of serializing and re-parsing
        # each sibling. find_all() only looks at descendants, so a sibling
        # that is itself a <p> has to be included explicitly.
        candidates = node.find_all("p")
        if node.name == "p":
            candidates = [node, *candidates]

        for p in candidates:
            # Skip paragraphs that carry any attribute (class, id, style, ...).
            if p.attrs:
                continue

            text = re.sub(r"\s{2,}", " ", p.get_text(" ", strip=True))
            if text:
                paragraphs.append(text)

    final_text = "\n\n".join(paragraphs).strip()

    return final_text if len(final_text) > 200 else None


def normalize_gutenberg_text(text):
    """
    Re-flow hard-wrapped text into one line per paragraph.

    Every line is stripped; consecutive non-empty lines are joined with a
    single space, and empty lines act as paragraph separators. The resulting
    paragraphs are joined with one blank line between them.
    """
    blocks = []
    pending = []

    for raw in text.splitlines():
        piece = raw.strip()
        if piece:
            pending.append(piece)
            continue
        # Blank line: close the paragraph collected so far, if any.
        if pending:
            blocks.append(" ".join(pending))
            pending = []

    # Flush the trailing paragraph when the text does not end on a blank line.
    if pending:
        blocks.append(" ".join(pending))

    return "\n\n".join(blocks)


def clean_book_text(text: str):
    """
    Flatten book text to a single line with space-separated punctuation.

    Steps, in order:
      1. NFKC-normalize, so compatibility characters (e.g. the ellipsis
         character or full-width punctuation) become their plain
         equivalents before the later steps look at them.
      2. Surround each of . , ! ? ; : with spaces.
      3. Collapse every run of whitespace (including newlines and tabs)
         to a single space and strip the ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned single-line string.
    """
    # Normalizing first matters: done last, characters that NFKC expands
    # into ASCII punctuation would escape the spacing step below.
    text = unicodedata.normalize("NFKC", text)

    text = re.sub(r"([.,!?;:])", r" \1 ", text)

    # "\s+" rather than "\s{2,}" so a lone tab or line break also becomes
    # a plain space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
def convert_gutenberg_html_url(api_url, book_id):
    """
    Map a Gutendex HTML format URL to the matching gutenberg.org cache URL.

    URLs ending in ".html.images" or ".html.noimages" are rewritten to the
    "-images.html" or plain ".html" cache file for book_id respectively.
    Any other URL is returned unchanged.
    """
    cache_stem = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}"
    suffix_to_tail = (
        (".html.images", "-images.html"),
        (".html.noimages", ".html"),
    )

    for suffix, tail in suffix_to_tail:
        if api_url.endswith(suffix):
            return cache_stem + tail

    return api_url

def download_clean_text(book):
    """
    Download one book's HTML edition and return its cleaned text.

    Args:
        book: Mapping for one Gutendex book record. "id" is required;
            "media_type" and "formats" are read and may be missing or None.
            The record is also passed to is_fiction().

    Returns:
        The cleaned text, or None when the book is skipped (audiobook, not
        fiction, no usable HTML format, no extractable text, fewer than
        1000 characters after cleaning) or when the download fails. A
        "[SKIP]" or "[ERR]" line is printed for each of these cases.
    """
    book_id = book["id"]

    # Test the media type before is_fiction(): is_fiction() rejects sound
    # recordings too, so checking afterwards left this message unreachable.
    if (book.get("media_type") or "").lower() == "sound":
        print(f"[SKIP] {book_id}: audiobook")
        return None
    if not is_fiction(book):
        print(f"[SKIP] {book_id}: not fiction")
        return None

    html_suffixes = (".html", ".html.images", ".html.noimages")
    html_url = None

    for fmt, url in (book.get("formats") or {}).items():
        if "text/html" in fmt and url.endswith(html_suffixes):
            # The shared helper rewrites ".html.images" / ".html.noimages"
            # to the gutenberg.org cache file and leaves ".html" URLs as-is.
            html_url = convert_gutenberg_html_url(url, book_id)
            break

    if not html_url:
        print(f"[SKIP] {book_id}: no HTML")
        return None

    try:
        r = requests.get(html_url, timeout=15)
        if r.status_code != 200:
            print(f"[ERR] {book_id}: failed HTML download")
            return None

        extracted = extract_gutenberg_html(r.text)
        if not extracted:
            print(f"[SKIP] {book_id}: no valid HTML text found")
            return None

        cleaned = clean_book_text(extracted)

        if len(cleaned) < 1000:
            print(f"[SKIP] {book_id}: too short after cleaning")
            return None

        return cleaned

    # Deliberately broad: a single bad book (network error, malformed HTML)
    # is reported and skipped rather than aborting the whole crawl.
    except Exception as e:
        print(f"[ERR] {book_id}: {e}")
        return None


def main(max_books=3000):
    """
    Crawl the Gutendex listing and build the text dataset.

    Pages of API_URL are fetched in turn; every book is passed to
    download_clean_text(). Each accepted text is written to
    SAVE_DIR/<book_id>.txt and appended as one row to CSV_PATH (single
    "text" column). The crawl stops once max_books texts are collected or
    the listing has no further page.

    Args:
        max_books: Maximum number of cleaned books to collect.

    Raises:
        requests.RequestException: If a listing page cannot be fetched
            (timeout, connection error, or non-success HTTP status).
    """
    print("[INFO] Fetching English book list via Gutendex...")
    collected = 0
    url = API_URL

    # os.makedirs("") raises, so only create the directory when CSV_PATH
    # actually has a directory component.
    csv_dir = os.path.dirname(CSV_PATH)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    with open(CSV_PATH, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["text"])

        while collected < max_books and url:
            # A timeout keeps a stalled connection from hanging the crawl
            # forever; raise_for_status() surfaces HTTP errors directly
            # instead of as a confusing JSON/KeyError failure below.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            page = response.json()

            books = page["results"]
            url = page["next"]

            print(f"[INFO] Loaded {len(books)} books from page...")

            for book in books:
                if collected >= max_books:
                    break

                book_id = book["id"]
                print(f"[DL] {book_id}...")

                cleaned = download_clean_text(book)
                if not cleaned:
                    continue

                txt_path = os.path.join(SAVE_DIR, f"{book_id}.txt")
                with open(txt_path, "w", encoding="utf-8") as out:
                    out.write(cleaned)

                # NOTE: each row holds a whole book in one cell; readers may
                # need to raise csv.field_size_limit to load the file.
                writer.writerow([cleaned])
                collected += 1
                print(f"[OK] Saved book {book_id} (total clean: {collected})")

            # Short pause between listing pages.
            time.sleep(0.3)

    print("\n[DONE] Dataset ready!")
    print(f"TXT dir: {SAVE_DIR}/")
    print(f"CSV: {CSV_PATH}")

In [3]:
# Notebook cell entry point: run the crawl defined by main() above.
main()

[INFO] Fetching English book list via Gutendex...
[INFO] Loaded 32 books from page...
[DL] 84...
[OK] Saved book 84 (total clean: 1)
[DL] 2701...
[OK] Saved book 2701 (total clean: 2)
[DL] 1342...
[OK] Saved book 1342 (total clean: 3)
[DL] 1513...
[SKIP] 1513: not fiction
[DL] 43...
[OK] Saved book 43 (total clean: 4)
[DL] 11...
[OK] Saved book 11 (total clean: 5)
[DL] 100...
[SKIP] 100: not fiction
[DL] 8492...
[OK] Saved book 8492 (total clean: 6)
[DL] 2641...
[OK] Saved book 2641 (total clean: 7)
[DL] 145...
[OK] Saved book 145 (total clean: 8)
[DL] 25344...
[SKIP] 25344: not fiction
[DL] 37106...
[OK] Saved book 37106 (total clean: 9)
[DL] 345...
[OK] Saved book 345 (total clean: 10)
[DL] 16328...
[SKIP] 16328: not fiction
[DL] 67979...
[SKIP] 67979: not fiction
[DL] 16389...
[OK] Saved book 16389 (total clean: 11)
[DL] 2554...
[OK] Saved book 2554 (total clean: 12)
[DL] 1260...
[OK] Saved book 1260 (total clean: 13)
[DL] 6761...
[OK] Saved book 6761 (total clean: 14)
[DL] 5197...


KeyboardInterrupt: 