In [16]:
import re
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk

In [17]:
# Path of the plain-text book export that the loading cell opens and splits into chapters.
txt_file = "sofs_text.txt"

In [18]:
# Chapter markers differ per book; adjust the three constants below to match.
#
#   SofS:  first chapter  ===== c01.htm =====
#          end marker     ===== cop.htm =====
#          chapter regex  r"===== (c\d+)\.htm ====="
#
#   NbutC: first chapter  ===== ch10.html =====
#          end marker     ===== cop.html =====
#          chapter regex  r"===== (ch\d+)\.html ====="
START_MARKER = "===== c01.htm ====="
END_MARKER = "===== cop.htm ====="
CHAPTER_MARKER_RE = r"===== (c\d+)\.htm ====="  # htm vs. html depends on the book

# First full date ("April 30, 1847", "May 1st, 1847", ...) found in a chapter.
DATE_RE = r"\b([A-Z][a-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})\b"

with open(txt_file, "r", encoding="utf-8") as f:
    raw_text = f.read()

# str.find returns -1 when the marker is absent; slicing with -1 would silently
# produce a wrong (almost empty or truncated) diary_text, so fail loudly instead.
start = raw_text.find(START_MARKER)
if start == -1:
    raise ValueError(f"Start marker {START_MARKER!r} not found in {txt_file!r}")

# Search for the end marker only after the start marker so an earlier
# occurrence of the same marker cannot yield an empty slice.
end = raw_text.find(END_MARKER, start)
if end == -1:
    raise ValueError(f"End marker {END_MARKER!r} not found after the start marker in {txt_file!r}")

diary_text = raw_text[start:end]

# Because the pattern has one capture group, re.split interleaves the results:
# [text before first marker, id_1, text_1, id_2, text_2, ...]
sections = re.split(CHAPTER_MARKER_RE, diary_text)

chapters = []

# Odd indices hold the captured chapter ids, the following even index its text.
for i in range(1, len(sections), 2):
    chapter_id = sections[i]
    text = sections[i + 1]

    # extract the diary date from the text (None when the chapter has no full date)
    match = re.search(DATE_RE, text)
    date = match.group(1) if match else None

    chapters.append({
        "chapter_id": chapter_id,
        "date": date,
        "text": text.strip()
    })

chapters_df = pd.DataFrame(chapters)
chapters_df.head()

Unnamed: 0,chapter_id,date,text
0,c01,"April 30, 1847",Dear Canada: A Sea of Sorrows\n\n\n\n\n\n\n\n\...
1,c02,"May 1, 1847",Dear Canada: A Sea of Sorrows\n\n\n\n\n\n\n\n\...
2,c03,"June 1, 1847",Dear Canada: A Sea of Sorrows\n\n\n\n\n\n\n\n\...
3,c04,"July 3, 1847",Dear Canada: A Sea of Sorrows\n\n\n\n\n\n\n\n\...
4,c05,"August 1, 1847",Dear Canada: A Sea of Sorrows\n\n\n\n\n\n\n\n\...


In [19]:
def clean_sentence(text):
    """Strip full dates and surplus blank lines from a piece of text.

    Every occurrence of a date shaped like "April 30, 1847" or
    "May 1st, 1847" is deleted, wherever it appears in ``text`` (not only
    when it sits on a line of its own). Runs of blank or whitespace-only
    lines are then collapsed to a single newline, and leading/trailing
    whitespace is trimmed.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    date_pattern = re.compile(r"\b[A-Z][a-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4}\b")
    blank_run_pattern = re.compile(r"\n\s*\n")

    without_dates = date_pattern.sub("", text)
    # Removing a date can leave an empty line behind; squeeze those out.
    collapsed = blank_run_pattern.sub("\n", without_dates)

    return collapsed.strip()

def is_valid_sentence(s, min_length=7):
    """Decide whether a tokenized sentence is worth keeping.

    A sentence is rejected when it is shorter than ``min_length`` characters
    or when it ends with one of a few suffixes that indicate the sentence
    tokenizer split in the wrong place (a title abbreviation or a dangling
    quotation mark).

    Parameters
    ----------
    s : str
        The candidate sentence.
    min_length : int, optional
        Minimum number of characters a sentence must have. Defaults to 7.

    Returns
    -------
    bool
        True if the sentence passes both checks, False otherwise.
    """
    # too short to be meaningful
    if len(s) < min_length:
        return False

    # obvious tokenizer fragments
    bad_endings = (
        "Mrs.",
        "Mr.",
        "Miss.",
        "Dr.",
        # Left double quotation mark. The source text is read as UTF-8, so this
        # is the form the character actually takes in the sentences.
        "\u201c",
        # The same character as cp1252 mojibake; kept so text that was already
        # mis-decoded upstream is still filtered.
        "â€œ",
        '"',
    )
    return not s.endswith(bad_endings)

In [20]:
# Split each chapter into sentences, clean every sentence, and keep only the
# ones that pass the validity filter, tagged with their chapter id and date.
sentences = []

for _idx, chapter in chapters_df.iterrows():
    for raw_sentence in sent_tokenize(chapter["text"]):
        cleaned = clean_sentence(raw_sentence)

        # Skip tokenizer fragments and too-short strings.
        if not is_valid_sentence(cleaned):
            continue

        sentences.append(
            dict(
                chapter_id=chapter["chapter_id"],
                date=chapter["date"],
                sentence=cleaned,
            )
        )

sent_df = pd.DataFrame(sentences)
sent_df.head()

Unnamed: 0,chapter_id,date,sentence
0,c01,"April 30, 1847",Dear Canada: A Sea of Sorrows\nApril 1847\nWhe...
1,c01,"April 30, 1847",I wonder what he would think if he were still ...
2,c01,"April 30, 1847",It has sent many to their deaths and now it dr...
3,c01,"April 30, 1847",That is why I have decided to write down our s...
4,c01,"April 30, 1847",I want to write about what is happening to us ...


In [21]:
# Shuffle all sentences (frac=1 keeps every row) with a fixed seed so the
# order is reproducible, then renumber the rows from zero.
sample_df = (
    sent_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the shuffled sentences without the index column.
sample_df.to_csv("book_sentences_samples.csv", index=False)