## Import Necessary Libraries

In [1]:
import re
import timeit
import pandas as pd
from collections import Counter

import nltk
from textblob import TextBlob
import spacy

## Setup & Configuration

In [2]:
print("--- Setting up environments ---")

# NLTK Setup
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# SpaCy Setup
try:
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 2500000
except OSError:
    print("Error: SpaCy model not found. Run: python -m spacy download en_core_web_sm")
    exit()

--- Setting up environments ---


## Helper Functions

In [3]:
def read_file(filename):
    try:
        with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None

def clean_text(text):
    text_lower = text.lower()
    cleaned = re.sub(r'[^a-z\s]', '', text_lower)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

## Processing Functions

In [4]:
def process_nltk(text):
    sentences = nltk.sent_tokenize(text)

    cleaned = clean_text(text)
    words = nltk.word_tokenize(cleaned)
    return len(sentences), len(words), words

def process_textblob(text):
    blob = TextBlob(text)
    sentences = blob.sentences

    cleaned = clean_text(text)
    blob_cleaned = TextBlob(cleaned)
    words = blob_cleaned.words
    return len(sentences), len(words), words

def process_spacy(text):
    doc = nlp(text)

    sentences = list(doc.sents)

    cleaned = clean_text(text)
    doc_cleaned = nlp(cleaned)
    words = [token.text for token in doc_cleaned]
    return len(sentences), len(words), words

## Main Execution

In [5]:
def main():
    input_filename = 'alice29.txt'
    original_text = read_file(input_filename)

    if original_text is None:
        return

    print(f"File loaded: {input_filename} (Length: {len(original_text)} chars)")

    N_LOOPS = 10
    print(f"\n[Phase 1] Benchmarking (Running {N_LOOPS} loops per framework)...")
    print("Please wait, this might take a moment...")

    results = []


    t_nltk = timeit.timeit(lambda: process_nltk(original_text), number=N_LOOPS)
    avg_nltk = t_nltk / N_LOOPS

    s_nltk, w_nltk, words_nltk = process_nltk(original_text)
    results.append({'Framework': 'NLTK', 'Avg Time (s)': avg_nltk, 'Sentences': s_nltk, 'Words': w_nltk})
    print(f" -> NLTK finished (Avg: {avg_nltk:.4f}s)")

    t_tb = timeit.timeit(lambda: process_textblob(original_text), number=N_LOOPS)
    avg_tb = t_tb / N_LOOPS
    s_tb, w_tb, _ = process_textblob(original_text)
    results.append({'Framework': 'TextBlob', 'Avg Time (s)': avg_tb, 'Sentences': s_tb, 'Words': w_tb})
    print(f" -> TextBlob finished (Avg: {avg_tb:.4f}s)")

    t_sp = timeit.timeit(lambda: process_spacy(original_text), number=N_LOOPS)
    avg_sp = t_sp / N_LOOPS
    s_sp, w_sp, _ = process_spacy(original_text)
    results.append({'Framework': 'SpaCy', 'Avg Time (s)': avg_sp, 'Sentences': s_sp, 'Words': w_sp})
    print(f" -> SpaCy finished (Avg: {avg_sp:.4f}s)")

    df_results = pd.DataFrame(results)

    with open('time_compares.txt', 'w', encoding='utf-8') as f:
        f.write(f"Framework Performance Comparison (Average of {N_LOOPS} runs)\n")
        f.write("=================================================\n\n")
        f.write(df_results.to_string(index=False))

    cleaned_content = clean_text(original_text)

    with open('cleaned.txt', 'w', encoding='utf-8') as f:
        f.write(cleaned_content)

    with open('words.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(words_nltk))

    top_10 = Counter(words_nltk).most_common(10)
    with open('top10words.txt', 'w', encoding='utf-8') as f:
        header = f"{'Rank':<5} {'Word':<15} {'Frequency':<10}\n"
        f.write(header)
        f.write("-" * 30 + "\n")
        for i, (word, freq) in enumerate(top_10, 1):
            f.write(f"{i:<5} {word:<15} {freq:<10}\n")

    print("\n[Phase 2] Output files generated successfully.")

    return df_results

if __name__ == "__main__":
    df = main()
    if 'get_ipython' not in globals():
        print("\n--- Final Comparison Table ---")
        print(df.to_string(index=False))

File loaded: alice29.txt (Length: 148481 chars)

[Phase 1] Benchmarking (Running 10 loops per framework)...
Please wait, this might take a moment...
 -> NLTK finished (Avg: 0.0612s)
 -> TextBlob finished (Avg: 0.0851s)
 -> SpaCy finished (Avg: 5.7517s)

[Phase 2] Output files generated successfully.
