In [1]:
import spacy
import csv
import re

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Function to read text and split into sentences
def extract_sentences_from_text(file_path):
    """Read a UTF-8 text file and return its sentences as a list of strings.

    The text is fed to the module-level spaCy pipeline ``nlp`` in chunks of
    at most ``nlp.max_length`` characters. Chunk boundaries are moved back to
    the nearest paragraph break, line break, or space so that a sentence (or
    word) is not cut in half merely because it straddles the size limit.
    Only when a chunk contains no whitespace at all is it cut at the hard
    limit.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The ``sent.text`` of every sentence spaCy finds, in document order.
        An empty file yields an empty list.

    Raises:
        ValueError: If ``nlp.max_length`` is not a positive number.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    max_length = nlp.max_length
    if max_length <= 0:
        raise ValueError(f"nlp.max_length must be positive, got {max_length}")

    sentences = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + max_length, text_length)
        if end < text_length:
            # More text follows this chunk: prefer the coarsest natural
            # boundary available so sentences stay intact.
            for separator in ('\n\n', '\n', ' '):
                cut = text.rfind(separator, start, end)
                if cut > start:
                    # Keep the separator with the preceding chunk; the chunk
                    # still fits within max_length because the separator lies
                    # entirely inside text[start:end].
                    end = cut + len(separator)
                    break

        doc = nlp(text[start:end])
        sentences.extend(sent.text for sent in doc.sents)
        start = end

    return sentences

# Deletion table for straight and curly double quotation marks.
_QUOTE_TABLE = str.maketrans('', '', '“”"')

# Matches a whole, already-stripped sentence that is pure boilerplate:
# section markers like '1.E.7', bare 'CHAPTER 34' headings, or rows of
# asterisks.
_BOILERPLATE_RE = re.compile(r'\d+\.\w+\.\d+|CHAPTER \d+|\*+')


# Function to clean sentences
def clean_sentences(sentences):
    """Return the sentences with quotes removed and boilerplate filtered out.

    Each sentence has its double quotation marks (straight and curly)
    removed and its surrounding whitespace stripped. The stripped text is
    then dropped if it is empty or consists solely of a section marker such
    as ``1.E.7``, a bare ``CHAPTER <number>`` heading, or asterisks.

    Stripping happens *before* the boilerplate check so that trailing
    newlines or leading spaces left by the sentence splitter cannot hide a
    boilerplate line from the filter.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list of cleaned, non-empty sentence strings in input order.
    """
    cleaned_sentences = []
    for sentence in sentences:
        sentence = sentence.translate(_QUOTE_TABLE).strip()
        if not sentence or _BOILERPLATE_RE.fullmatch(sentence):
            continue
        cleaned_sentences.append(sentence)
    return cleaned_sentences

# Example usage: split the book into sentences, clean them, and export
# the result as a single-column CSV file.
file_path = 'data/Moby Dick; Or, The Whale.txt'

# Extract sentences
sentences = extract_sentences_from_text(file_path)

# Clean the extracted sentences
cleaned_sentences = clean_sentences(sentences)

# Optionally, you can save the cleaned sentences to a CSV file
output_file = 'data/cleaned_sentences.csv'
# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['sentences'])  # Write header
    # One row per cleaned sentence
    writer.writerows([sentence] for sentence in cleaned_sentences)

# Report the path actually written so the message cannot drift from output_file.
print(f"Cleaned sentences have been saved to '{output_file}'.")


Cleaned sentences have been saved to 'data/cleaned_sentences.csv'.
