In [9]:
import os
import re
import warnings
from pathlib import Path

import pandas as pd

In [10]:
# Where the raw Gutenberg downloads live and where cleaned copies are written.
raw_dir, clean_dir = Path("data/raw_novels"), Path("data/clean")
clean_dir.mkdir(parents=True, exist_ok=True)

# Author name -> raw text file for that author's novel.
files = {
    "Jane Austen": raw_dir / "P&P-Jane Austen.txt",
    "Herman Melville": raw_dir / "BartleBy-Melville.txt",
}

In [11]:
def clean_text(raw_text):
    """Return the body of a Project Gutenberg text without its boilerplate.

    The text between the ``*** START OF ... PROJECT GUTENBERG EBOOK ... ***``
    and ``*** END OF ... PROJECT GUTENBERG EBOOK ... ***`` marker lines is
    extracted, line endings are normalised to ``\\n``, runs of three or more
    newlines are collapsed to a single blank line, and surrounding whitespace
    is stripped.

    Args:
        raw_text (str): Full contents of a Gutenberg plain-text file.

    Returns:
        str: The cleaned body text.

    Raises:
        ValueError: If the START marker is missing, or no END marker follows it.
    """
    # Accept both the current ("THE") and the older ("THIS") marker wording,
    # with or without a space after the leading asterisks.
    start_re = re.compile(
        r"\*\*\* ?START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*",
        re.IGNORECASE,
    )
    end_re = re.compile(
        r"\*\*\* ?END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*",
        re.IGNORECASE,
    )

    # Normalise line endings up front (including lone "\r") so everything
    # below only ever sees "\n".
    normalised = raw_text.replace("\r\n", "\n").replace("\r", "\n")

    start_match = start_re.search(normalised)
    # Only look for the END marker after the START marker; an earlier hit
    # would otherwise produce an empty slice without any error.
    end_match = end_re.search(normalised, start_match.end()) if start_match else None

    if not start_match or not end_match:
        raise ValueError("Gutenberg START/END markers not found")

    body = normalised[start_match.end():end_match.start()]
    body = re.sub(r'\n{3,}', '\n\n', body)

    return body.strip()

In [12]:
def extract_paras(text, min_words=50, max_words=300):
    """Split text into paragraphs and keep those within a word-count range.

    Paragraphs are separated by blank lines. A separator line that contains
    only whitespace also counts as blank, so stray spaces or tabs do not glue
    two paragraphs together. Line breaks inside a paragraph are replaced by
    spaces.

    Args:
        text (str): Text whose paragraphs are separated by blank lines.
        min_words (int): Smallest whitespace-separated word count to keep
            (inclusive).
        max_words (int): Largest whitespace-separated word count to keep
            (inclusive).

    Returns:
        list[str]: The retained paragraphs, in their original order.
    """
    final_paras = []
    # "\n\s*\n" matches a plain blank line as well as one holding only
    # whitespace; a bare split("\n\n") would miss the latter.
    for chunk in re.split(r"\n\s*\n", text):
        para = chunk.replace("\n", " ").strip()
        if min_words <= len(para.split()) <= max_words:
            final_paras.append(para)

    return final_paras

In [13]:
# Strip the Gutenberg boilerplate from every raw novel and store the result
# in clean_dir as "<author, lower-cased with underscores>_clean.txt".
for author, filepath in files.items():
    print(f"Processing: {author}")

    raw_text = filepath.read_text(encoding="utf-8")
    cleaned_text = clean_text(raw_text)

    clean_path = clean_dir / f"{author.replace(' ', '_').lower()}_clean.txt"
    clean_path.write_text(cleaned_text, encoding="utf-8")

Processing: Jane Austen
Processing: Herman Melville


In [14]:
# Same directories as in the configuration cell near the top of the notebook,
# assigned again here before the dataset-building cells.
raw_dir, clean_dir = Path("data/raw_novels"), Path("data/clean")

In [15]:
def load_human_data():
    """Build a DataFrame of human-written paragraphs from the cleaned novels.

    Each cleaned novel under ``clean_dir`` is split with ``extract_paras``
    (default word limits) and every retained paragraph becomes one row.

    Returns:
        pandas.DataFrame: Columns ``text``, ``class`` (always ``"Human"``),
        ``target_author`` and ``topic`` (always ``"Original Novel"``). The
        columns are present even when no paragraph was loaded.

    Warns:
        UserWarning: For every cleaned file that does not exist. The author is
        skipped, but the omission is reported instead of passing silently.
    """
    # NOTE(review): the sample output of this notebook suggests that editorial
    # front matter located after the Gutenberg START marker (e.g. a preface)
    # is labelled with the novel's author as well - confirm and trim if so.
    columns = ["text", "class", "target_author", "topic"]
    mapping = {
        "Jane Austen": clean_dir / "jane_austen_clean.txt",
        "Herman Melville": clean_dir / "herman_melville_clean.txt"
    }

    human_data = []
    for author, filepath in mapping.items():
        if not filepath.exists():
            # Skipping an author changes the class balance, so make it visible.
            warnings.warn(
                f"Cleaned text for {author} not found at {filepath}; skipping",
                stacklevel=2,
            )
            continue

        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()

        for p in extract_paras(text):
            human_data.append({
                "text": p,
                "class": "Human",
                "target_author": author,
                "topic": "Original Novel"  # Topic is implicit in the novel
            })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(human_data, columns=columns)
def load_ai_data():
    """Read the two AI-generated paragraph CSVs and stack them into one frame.

    Returns:
        pandas.DataFrame: Rows of ``ai_generic_final.csv`` followed by rows of
        ``ai_styled_austin.csv``, re-indexed from 0.
    """
    csv_names = ("ai_generic_final.csv", "ai_styled_austin.csv")
    frames = [pd.read_csv(name) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

# Assemble the full corpus: human paragraphs first, then the AI samples.
df_human, df_ai = load_human_data(), load_ai_data()
df_all = pd.concat((df_human, df_ai), ignore_index=True)

# Quick look at corpus size, label balance and the first rows.
print(f"Total Paragraphs: {len(df_all)}")
print(df_all["class"].value_counts())
df_all.head(5)

Total Paragraphs: 995
class
Human         875
ai_generic     60
ai_styled      60
Name: count, dtype: int64


Unnamed: 0,text,class,target_author,topic
0,_Walt Whitman has somewhere a fine and just di...,Human,Jane Austen,Original Novel
1,"_I suppose, however, that the majority of at l...",Human,Jane Austen,Original Novel
2,"_I think, however, though the thought will dou...",Human,Jane Austen,Original Novel
3,"_For if her knowledge was not very extended, s...",Human,Jane Austen,Original Novel
4,_This Swiftian quality appears in the present ...,Human,Jane Austen,Original Novel


In [16]:
# Write the combined corpus to CSV (index column omitted) and echo the
# destination so it shows up in the cell output.
output_path = "precog_task0_data.csv"
df_all.to_csv(output_path, index=False)
print(output_path)

precog_task0_data.csv
