In [2]:
import pandas as pd
import re
import json
from bs4 import BeautifulSoup

In [42]:
def extract_preface_info(html_file):
    """
    Read a work's metadata out of the <div id="preface"> block of an HTML file.

    Parameters:
        html_file: path of the HTML file; it is opened as UTF-8.

    Returns:
        A dict with the keys "title", "author", "summary" and "notes".
        Each value is the extracted text, or None when that piece could not
        be located. All four are None when the preface <div> (or the
        <div class="meta"> inside it) is missing.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    info = {"title": None, "author": None, "summary": None, "notes": None}

    # Everything of interest lives in <div id="preface"> -> <div class="meta">.
    preface = soup.find("div", id="preface")
    meta = preface.find("div", class_="meta") if preface else None
    if not meta:
        return info

    def text_after_label(label):
        # Locate the <p> whose whole text is `label` (case-insensitive), then
        # take the next <blockquote class="userstuff"> in document order.
        def is_label(text):
            return text and text.strip().lower() == label

        label_p = meta.find("p", string=is_label)
        if not label_p:
            return None
        quote = label_p.find_next("blockquote", class_="userstuff")
        if not quote:
            return None
        return quote.get_text("\n", strip=True)

    # Title: the <h1> inside .meta
    heading = meta.find("h1")
    if heading:
        info["title"] = heading.get_text(strip=True)

    # Author: the first <a rel="author"> link inside <div class="byline">
    byline = meta.find("div", class_="byline")
    author_link = byline.find("a", rel="author") if byline else None
    if author_link:
        info["author"] = author_link.get_text(strip=True)

    # Summary / notes: the blockquotes following the <p>Summary</p> / <p>Notes</p> labels
    info["summary"] = text_after_label("summary")
    info["notes"] = text_after_label("notes")

    return info



def extract_book_text(html_file):
    """
    Return the story text of an HTML file as a single newline-joined string.

    The file is read as UTF-8 and parsed with BeautifulSoup. Inside every
    <div class="meta"> the first <p> mentioning "Chapter Notes" and the first
    <blockquote class="userstuff"> are removed. Then, in document order, the
    text of each <h2 class="heading"> (wrapped in blank lines) and of each
    <p> that is a direct child of a <div class="userstuff"> is collected.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # --- Strip chapter-note material from the meta blocks -------------------
    def mentions_chapter_notes(text):
        return text and "Chapter Notes" in text

    for meta in soup.find_all("div", class_="meta"):
        label = meta.find("p", string=mentions_chapter_notes)
        if label:
            label.decompose()

        quote = meta.find("blockquote", class_="userstuff")
        if quote:
            quote.decompose()

    # --- Walk every remaining tag and keep headings and story paragraphs ----
    pieces = []
    for node in soup.find_all(True):
        # Chapter title
        if node.name == "h2" and "heading" in node.get("class", []):
            title = node.get_text(strip=True)
            if title:
                pieces.append(f'\n{title}\n')
            continue

        # Story paragraph: a <p> sitting directly inside <div class="userstuff">
        if node.name != "p":
            continue
        container = node.parent
        if not container or container.name != "div":
            continue
        if "userstuff" not in container.get("class", []):
            continue

        paragraph = node.get_text(strip=False)
        if paragraph:
            pieces.append(paragraph)

    return "\n".join(pieces)

In [54]:
from tqdm.auto import tqdm
from pathlib import Path

books_path = Path('../data/fanfics/')
clean_text_path = Path('../data/fanfics_clean/')
# open(..., 'w') does not create missing directories, so make sure the
# output folder exists before the first file is written.
clean_text_path.mkdir(parents=True, exist_ok=True)

for book_path in tqdm(list(books_path.glob('*html'))):
    print(book_path)
    book_text = extract_book_text(book_path)
    preface = extract_preface_info(book_path)

    # The HTML is read as UTF-8 and the story text contains non-ASCII
    # characters (curly quotes etc.), so write UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    out_path = clean_text_path / f"{book_path.stem}.txt"
    with open(out_path, 'w', encoding='utf-8') as fout:
        # Header: one "Key: value" paragraph per preface field that was found.
        for k in ['Title', 'Author', 'Summary', 'Notes']:
            value = preface[k.lower()]
            if value:
                fout.write(f"{k}: {value}\n\n")
        fout.write(book_text)

  0%|          | 0/21 [00:00<?, ?it/s]

../data/fanfics/The_Sith_Strikes_Back.html
../data/fanfics/The_Silmarillion_Simplified.html
../data/fanfics/The_Resurrection_of.html
../data/fanfics/The_Last_of_the_Jedi.html
../data/fanfics/The_Bot_the_World_Forgot.html
../data/fanfics/Sweet_Creatures.html
../data/fanfics/Shattered_Pieces_of_the.html
../data/fanfics/People_Stained_With.html
../data/fanfics/Mirror_Prism.html
../data/fanfics/Laws_of_the_Sea.html
../data/fanfics/Jay_Baby.html
../data/fanfics/In_Which_Harry_and_Ladon.html
../data/fanfics/I_Know_Where_the_Stars.html
../data/fanfics/Harry_Potter_and_the.html
../data/fanfics/Grogu_Tells_Stories.html
../data/fanfics/Flightless_Sparrows.html
../data/fanfics/Creatures_of_Truth.html
../data/fanfics/Christmas_and.html
../data/fanfics/Children_of_the_Desert.html
../data/fanfics/Adagio.html
../data/fanfics/A_Great_Eye_lidless.html


In [56]:
## sample chunks from fanfics

In [57]:
import random
from nltk import sent_tokenize
# random.seed(144)

# Function to get a random chunk from a text with a minimum chunk size
def get_random_chunk(text, min_chunk_size, min_words=12000, max_words=25000,
                     max_attempts=100000):
    """
    Return a random run of consecutive sentences taken from ``text``.

    Whitespace in ``text`` is collapsed, the result is split into sentences
    with ``sent_tokenize``, and random (start, end) sentence spans of at
    least ``min_chunk_size`` sentences are drawn until one has a word count
    strictly between ``min_words`` and ``max_words``.

    Parameters:
        text: the source text.
        min_chunk_size: minimum number of sentences in the returned chunk.
        min_words: exclusive lower bound on the chunk's word count.
        max_words: exclusive upper bound on the chunk's word count.
        max_attempts: maximum number of random spans to try.

    Returns:
        The selected sentences joined by single spaces. The whole
        (whitespace-normalised) text is returned instead when it has fewer
        than ``min_chunk_size`` sentences, or when it has at most
        ``min_words`` words and therefore cannot contain a valid chunk.

    Raises:
        RuntimeError: if no span inside the word bounds is found within
            ``max_attempts`` draws (previously this looped forever).
    """
    text = ' '.join(text.split())
    sentences = sent_tokenize(text)
    if len(sentences) < min_chunk_size:
        return ' '.join(sentences)  # Return the entire text if it's shorter than min_chunk_size

    # cumulative[i] = number of words in sentences[:i]. Joining sentences with
    # spaces and splitting again yields exactly the per-sentence words, so the
    # word count of sentences[a:b] is cumulative[b] - cumulative[a]; this
    # avoids rebuilding and re-splitting a large string on every attempt.
    cumulative = [0]
    for sentence in sentences:
        cumulative.append(cumulative[-1] + len(sentence.split()))

    # No span can exceed min_words if the whole text does not; fall back to
    # the entire text rather than sampling forever.
    if cumulative[-1] <= min_words:
        return ' '.join(sentences)

    max_start = len(sentences) - min_chunk_size
    for _ in range(max_attempts):
        start = random.randint(0, max_start)
        # randint is inclusive on both ends; end is used as a slice bound,
        # so len(sentences) is a valid value.
        end = random.randint(start + min_chunk_size, len(sentences))
        n_words = cumulative[end] - cumulative[start]
        if min_words < n_words < max_words:
            return ' '.join(sentences[start:end])

    raise RuntimeError(
        f"no chunk with more than {min_words} and fewer than {max_words} words "
        f"found in {max_attempts} attempts"
    )

In [61]:
data_path = Path('../data/fanfics_clean/')
# read_text() closes each file as soon as it has been read (a bare
# open().read() leaves the handle to the garbage collector); sorting makes
# the order of `data` independent of the filesystem's listing order.
data = [
    text_path.read_text(encoding='utf-8')
    for text_path in sorted(data_path.glob('*txt'))
]

# get_random_chunk only accepts chunks with more than 12000 words, so only
# texts longer than that can ever yield one. Filter once up front instead of
# re-drawing (and re-splitting a whole book) until a long enough text comes up.
min_words = 12000
long_texts = [text for text in data if len(text.split()) > min_words]
if not long_texts:
    raise ValueError(f"no text in {data_path} has more than {min_words} words")

texts = []
min_chunk_size = 10  # Set the minimum chunk size (in sentences)
for _ in tqdm(range(1000)):
    # Choose a random text among those long enough to be sampled from
    text = random.choice(long_texts)
    # Get a random chunk from this text with the minimum size
    chunk = get_random_chunk(text, min_chunk_size)
    texts.append(chunk)

# Preview the first few samples
for t in texts[:5]:
    print(t[:100])
    print(f"Number of words in the sample: {len(t.split())}")
    print('----------')

  0%|          | 0/1000 [00:00<?, ?it/s]

“Hey, Damian,” Jason said, trying to wrap his brain around the fact that he was smaller than his you
Number of words in the sample: 12057
----------
She’s smiling at him, playful and serious at the same time. Harry knows it really bothers her that h
Number of words in the sample: 20890
----------
“Oh my gosh, I have so much to tell you and Oh!” Her eyes trailed to the side. “Is this Jay, “ she g
Number of words in the sample: 12040
----------
“Don’t worry,” Ashla tells them dryly, “It was your chip.” Crosshair sinks a little in his chair, li
Number of words in the sample: 15152
----------
A Jedi doesn’t hate, but Anakin hates her. He couldn’t not, after everything she’s done to his broth
Number of words in the sample: 12076
----------


In [66]:
import pandas as pd
from pathlib import Path

# Build the chunk CSV once from `texts`, then always load the stored copy.
# Re-running the notebook therefore keeps using the same frozen samples
# instead of requiring the export lines to be commented in and out by hand.
chunks_csv = Path('../data/fanfics_1k_chunks.csv')
if not chunks_csv.exists():
    pd.DataFrame({'text': texts}).to_csv(chunks_csv)
df = pd.read_csv(chunks_csv, index_col=0)
df.head()

Unnamed: 0,text
0,"“Hey, Damian,” Jason said, trying to wrap his ..."
1,"She’s smiling at him, playful and serious at t..."
2,"“Oh my gosh, I have so much to tell you and Oh..."
3,"“Don’t worry,” Ashla tells them dryly, “It was..."
4,"A Jedi doesn’t hate, but Anakin hates her. He ..."
