## Import Necessary Libraries

In [11]:
import re
import timeit
import pandas as pd
from collections import Counter

import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
import spacy

## Setup & Configuration

In [12]:
print("--- Setting up environments ---")

# NLTK Setup
# Probe each resource on its own so that one missing package (for example
# 'punkt_tab' on a machine that already has 'punkt' and 'stopwords') is
# still downloaded instead of being skipped because the others were found.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# SpaCy Setup
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Error: SpaCy model not found. Run: python -m spacy download en_core_web_sm")
    # Exit with a non-zero status; the bare exit() helper comes from the
    # `site` module (not always available) and would report success.
    raise SystemExit(1)

# Raise spaCy's per-document character limit so a long text can be
# processed in a single nlp() call.
nlp.max_length = 2500000

# Module-level stopword set used by clean_text().
stop_words = set(stopwords.words('english'))

--- Setting up environments ---


## Helper Functions

In [13]:
def read_file(filename):
    """Return the full contents of *filename* as text, or None if it is missing.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (errors='ignore'). A missing file is reported on stdout rather than
    raised.

    Args:
        filename: Path of the file to read.

    Returns:
        The file contents as a string, or None when the file does not exist.
    """
    try:
        handle = open(filename, 'r', encoding='utf-8', errors='ignore')
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None

    # The context manager closes the file once the contents are read.
    with handle:
        contents = handle.read()
    return contents

def clean_text(text):
    """Lowercase *text*, keep only a-z words, and drop stopwords.

    Every character that is neither a lowercase ASCII letter nor whitespace
    is treated as a word separator. Replacing such characters with a space
    (instead of deleting them) keeps words joined by punctuation apart:
    "distance--but" yields "distance" and "but" rather than the fused token
    "distancebut", and "don't" yields "don" and "t", which can then be
    matched against the module-level ``stop_words`` set.

    Args:
        text: Raw input string.

    Returns:
        The surviving words joined by single spaces (an empty string when
        nothing survives).
    """
    # Substitute a space, not '', so punctuation never glues two words together.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())

    # str.split() with no argument collapses whitespace runs and trims the
    # ends, so no separate whitespace-normalising pass is needed.
    kept = [word for word in letters_only.split() if word not in stop_words]

    return " ".join(kept)

## Processing Functions

In [14]:
def process_nltk(text):
    """Sentence-split *text* with NLTK and tokenize its cleaned words.

    Sentences come from ``nltk.sent_tokenize`` applied to the raw text;
    words come from splitting the output of ``clean_text`` on whitespace.

    Args:
        text: Raw input string.

    Returns:
        A 4-tuple ``(sentence_count, word_count, sentences, words)``.
    """
    sentence_list = nltk.sent_tokenize(text)
    word_list = clean_text(text).split()
    return len(sentence_list), len(word_list), sentence_list, word_list

def process_textblob(text):
    """Sentence-split *text* with TextBlob and tokenize its cleaned words.

    Sentences are taken from the ``sentences`` attribute of a ``TextBlob``
    built from the raw text; words come from splitting the output of
    ``clean_text`` on whitespace.

    Args:
        text: Raw input string.

    Returns:
        A 4-tuple ``(sentence_count, word_count, sentences, words)``.
    """
    sentence_list = TextBlob(text).sentences
    word_list = clean_text(text).split()
    return len(sentence_list), len(word_list), sentence_list, word_list

def process_spacy(text):
    """Sentence-split *text* with spaCy and tokenize its cleaned words.

    The module-level ``nlp`` pipeline is run on the raw text and its
    ``sents`` iterator is materialised into a list; words come from
    splitting the output of ``clean_text`` on whitespace.

    Args:
        text: Raw input string.

    Returns:
        A 4-tuple ``(sentence_count, word_count, sentences, words)``.
    """
    sentence_list = list(nlp(text).sents)
    word_list = clean_text(text).split()
    return len(sentence_list), len(word_list), sentence_list, word_list

## Main Execution

In [15]:
def main():
    """Benchmark the three frameworks on one text file and write the reports.

    Reads 'alice29.txt', times ``process_nltk``, ``process_textblob`` and
    ``process_spacy`` with ``timeit`` (N_LOOPS calls each), and writes four
    files to the current directory:

    * time_compares.txt - the timing/count comparison table
    * cleaned.txt       - the output of ``clean_text`` on the whole input
    * words.txt         - NLTK sentences followed by the cleaned words
    * top10words.txt    - the ten most frequent cleaned words

    Returns:
        A pandas DataFrame with one row per framework (columns 'Framework',
        'Avg Time (s)', 'Sentences', 'Words'), or None when the input file
        could not be read.
    """
    input_filename = 'alice29.txt'
    original_text = read_file(input_filename)
    if original_text is None:
        return None

    print(f"File loaded: {input_filename} (Length: {len(original_text)} chars)")

    N_LOOPS = 10
    print(f"\n[Phase 1] Benchmarking (Running {N_LOOPS} loops per framework)...")
    print("Please wait, this might take a moment...")

    # Benchmarked in this order; each entry is (display label, processor).
    frameworks = (
        ('NLTK', process_nltk),
        ('TextBlob', process_textblob),
        ('SpaCy', process_spacy),
    )

    summary_rows = []
    nltk_sentences = nltk_words = None
    for label, processor in frameworks:
        total_seconds = timeit.timeit(
            lambda: processor(original_text), number=N_LOOPS
        )
        average_seconds = total_seconds / N_LOOPS

        # One extra, untimed call to collect the counts and token lists.
        n_sentences, n_words, sentences, words = processor(original_text)
        if label == 'NLTK':
            # Only the NLTK output feeds the words/top-10 reports below.
            nltk_sentences, nltk_words = sentences, words

        summary_rows.append({
            'Framework': label,
            'Avg Time (s)': average_seconds,
            'Sentences': n_sentences,
            'Words': n_words,
        })
        print(f" -> {label} finished (Avg: {average_seconds:.4f}s)")

    df_results = pd.DataFrame(summary_rows)

    # Output 1: time_compares.txt
    with open('time_compares.txt', 'w', encoding='utf-8') as out:
        out.write(f"Framework Performance Comparison (Average of {N_LOOPS} runs)\n")
        out.write("=================================================\n\n")
        out.write(df_results.to_string(index=False))

    # Output 2: cleaned.txt
    with open('cleaned.txt', 'w', encoding='utf-8') as out:
        out.write(clean_text(original_text))

    # Output 3: words.txt
    with open('words.txt', 'w', encoding='utf-8') as out:
        out.write("--- Tokenized Sentences ---\n")
        out.writelines(str(sentence) + "\n" for sentence in nltk_sentences)
        out.write("\n--- Tokenized Words (Stopwords Removed) ---\n")
        out.write('\n'.join(nltk_words))

    # Output 4: top10words.txt
    most_frequent = Counter(nltk_words).most_common(10)
    with open('top10words.txt', 'w', encoding='utf-8') as out:
        out.write(f"{'Rank':<5} {'Word':<15} {'Frequency':<10}\n")
        out.write("-" * 30 + "\n")
        for rank, (word, freq) in enumerate(most_frequent, start=1):
            out.write(f"{rank:<5} {word:<15} {freq:<10}\n")

    print("\n[Phase 2] Output files generated successfully.")

    return df_results

if __name__ == "__main__":
    df = main()
    # main() returns None when the input file could not be read, so guard
    # against calling .to_string() on None. The table is printed only when
    # 'get_ipython' is absent from globals (i.e. outside an IPython session).
    if df is not None and 'get_ipython' not in globals():
        print("\n--- Final Comparison Table ---")
        print(df.to_string(index=False))

File loaded: alice29.txt (Length: 148481 chars)

[Phase 1] Benchmarking (Running 10 loops per framework)...
Please wait, this might take a moment...
 -> NLTK finished (Avg: 0.0394s)
 -> TextBlob finished (Avg: 0.0359s)
 -> SpaCy finished (Avg: 3.5505s)

[Phase 2] Output files generated successfully.
