# Session 3 â€” Sentence-Level Analysis
## Measure 2: Readability Scores
This notebook computes classic readability formulas for **English** such as:
- Flesch Reading Ease
- Flesch-Kincaid Grade Level

We'll analyze and compare Lewis Carroll's two Alice books:
- Alice's Adventures in Wonderland
- Through the Looking-Glass

In [1]:
import re, os
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path

# Set plot styles
plt.rcParams["figure.figsize"] = (10, 5)
plt.rcParams["axes.grid"] = True

# --- Configuration ---
CONFIG = {
    "book1_path": "..\\data\\Fellowship.txt",
    "book2_path": "..\\data\\TwoTowers.txt",
    "book3_path": "..\\data\\TheKing.txt",
    "book1_name": "Fellowship",
    "book2_name": "Two Towers",
    "book3_name": "Return of the King",
}

# Create results directory
Path("../results").mkdir(exist_ok=True)

print("Cell 1: Imports and Config loaded.")

Cell 1: Imports and Config loaded.


In [2]:
# --- Robust Text Loading Functions (from our previous projects) ---
# We use the same functions from Notebook 1 to ensure the
# Foreword/Prologue in Fellowship are correctly stripped.

_GB_START_MARKERS = [
    r"\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK",
    r"START OF (THIS|THE) PROJECT GUTENBERG EBOOK",
]
_GB_END_MARKERS = [
    r"\*\*\*\s*END OF (THIS|THE) PROJECT GUTENBERG EBOOK",
    r"END OF (THIS|THE) PROJECT GUTENBERG EBOOK",
]
_CHAPTER_HINTS = [
    r"^\s*A LONG-EXPECTED PARTY\s*$", # Start of Fellowship
    r"^\s*The Departure of Boromir\s*$", # Start of Two Towers
    r"^\s*Minas Tirith\s*$", # Start of Return of King
    r"^\s*CHAPTER I\s*$", # Generic fallback
    r"^\s*BOOK I\s*$", # Generic fallback
]

def strip_gutenberg(text: str) -> str:
    """Heuristically strips Gutenberg headers/footers."""
    t = text.replace("\ufeff", "") 
    start_idx, end_idx = None, None
    for pat in _GB_START_MARKERS:
        m = re.search(pat, t, flags=re.IGNORECASE)
        if m:
            start_idx = t.find("\n", m.end())
            if start_idx == -1: start_idx = m.end()
            break
    for pat in _GB_END_MARKERS:
        m = re.search(pat, t, flags=re.IGNORECASE)
        if m:
            end_idx = m.start()
            break
    if start_idx is not None and end_idx is not None and end_idx > start_idx:
        core = t[start_idx:end_idx]
    else: 
        core = t
        found_start = False
        for pat in _CHAPTER_HINTS:
            m = re.search(pat, core, flags=re.IGNORECASE | re.MULTILINE)
            if m: 
                core = core[m.start():]
                found_start = True
                break
        if found_start:
            for pat in _GB_END_MARKERS:
                 m = re.search(pat, core, flags=re.IGNORECASE)
                 if m:
                     core = core[:m.start()]
                     break
    return core.strip()

def load_text(p: str) -> str:
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def normalize_text(t: str) -> str:
    """Cleans text: strips Gutenberg, normalizes whitespace."""
    t = strip_gutenberg(t)
    # Replace line breaks with spaces, then collapse whitespace
    t = re.sub(r"[\r\n]+", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t

print("Cell 2: Helper functions defined.")

Cell 2: Helper functions defined.


In [3]:
def count_syllables(word):
    """Simple syllable counter for English words"""
    word = word.lower()
    # Remove non-alphabetic characters
    word = re.sub(r'[^a-z]', '', word)
    if not word:
        return 1
    
    # Count vowel groups
    syllables = len(re.findall(r'[aeiouy]+', word))
    
    # Adjust for silent 'e' at the end
    if word.endswith('e') and syllables > 1:
        syllables -= 1
    
    # At least one syllable per word
    return max(1, syllables)

def calculate_readability(text, book_name):
    """Calculate readability scores for a given text"""
    # Split into sentences
    sentences = re.split(r'[.!?]+\s+', text.strip())
    sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 2]
    
    # Extract words (alphabetic only)
    words = re.findall(r"[A-Za-z']+", text)
    
    if not sentences or not words:
        print(f"\n{'='*70}\nERROR: No text found for {book_name}. Check stripping.\n{'='*70}")
        return 0, 0, 0, 0
    
    # Count syllables
    syllables = sum(count_syllables(w) for w in words)
    
    num_sentences = len(sentences)
    num_words = len(words)
    
    avg_words_sent = num_words / num_sentences
    avg_syll_word = syllables / num_words
    
    # Flesch Reading Ease
    flesch_ease = 206.835 - 1.015 * (avg_words_sent) - 84.6 * (avg_syll_word)
    
    # Flesch-Kincaid Grade Level
    flesch_grade = 0.39 * (avg_words_sent) + 11.8 * (avg_syll_word) - 15.59
    
    print(f"\n{'='*70}")
    print(f"{book_name}")
    print(f"{'='*70}")
    print(f"Sentences: {num_sentences:,}")
    print(f"Words: {num_words:,}")
    print(f"Syllables: {syllables:,}")
    print(f"Avg words per sentence: {avg_words_sent:.2f}")
    print(f"Avg syllables per word: {avg_syll_word:.2f}")
    
    print(f"\nFlesch Reading Ease: {flesch_ease:.2f}")
    print(f"Flesch-Kincaid Grade Level: {flesch_grade:.2f}")
    
    return flesch_ease, flesch_grade, avg_words_sent, avg_syll_word

print("Cell 3: Readability functions defined.")

Cell 3: Readability functions defined.


In [5]:
def count_syllables(word):
    """Simple syllable counter for English words"""
    word = word.lower()
    # Remove non-alphabetic characters
    word = re.sub(r'[^a-z]', '', word)
    if not word:
        return 1
    
    # Count vowel groups
    syllables = len(re.findall(r'[aeiouy]+', word))
    
    # Adjust for silent 'e' at the end
    if word.endswith('e') and syllables > 1:
        syllables -= 1
    
    # At least one syllable per word
    return max(1, syllables)

def calculate_readability(text, book_name):
    """
    Calculate readability scores for a given text.
    
    Returns 4 values: ease, grade, avg_words_sent, avg_syll_word.
    """
    # Split into sentences
    sentences = re.split(r'[.!?]+\s+', text.strip())
    # Filter for valid sentences
    sentences = [s.strip() for s in sentences if s.strip() and len(s.split()) > 2]
    
    # Extract words (alphabetic only)
    words = re.findall(r"[A-Za-z']+", text)
    
    # --- SAFETY CHECK: Prevents division by zero or errors on empty data ---
    if not sentences or not words:
        print(f"\n{'='*70}\nERROR: Not enough data for {book_name}. Returning zeros.\n{'='*70}")
        # Return 4 zeros if no data is found
        return 0, 0, 0, 0 
    
    # Count syllables
    syllables = sum(count_syllables(w) for w in words)
    
    num_sentences = len(sentences)
    num_words = len(words)
    
    avg_words_sent = num_words / num_sentences
    avg_syll_word = syllables / num_words
    
    # Flesch Reading Ease
    flesch_ease = 206.835 - 1.015 * (avg_words_sent) - 84.6 * (avg_syll_word)
    
    # Flesch-Kincaid Grade Level
    flesch_grade = 0.39 * (avg_words_sent) + 11.8 * (avg_syll_word) - 15.59
    
    print(f"\n{'='*70}")
    print(f"{book_name}")
    print(f"{'='*70}")
    print(f"Sentences: {num_sentences:,}")
    print(f"Words: {num_words:,}")
    print(f"Syllables: {syllables:,}")
    print(f"Avg words per sentence: {avg_words_sent:.2f}")
    print(f"Avg syllables per word: {avg_syll_word:.2f}")
    print(f"\nFlesch Reading Ease: {flesch_ease:.2f}")
    print(f"Flesch-Kincaid Grade Level: {flesch_grade:.2f}")
    
    # --- FIX: Return all 4 required values ---
    return flesch_ease, flesch_grade, avg_words_sent, avg_syll_word

print("Cell 3: Readability functions defined.")

Cell 3: Readability functions defined.


In [None]:
# Visualize comparison
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Flesch Reading Ease comparison
ax1 = axes[0]
books = ['Wonderland', 'Looking-Glass']
ease_scores = [wonderland_ease, looking_glass_ease]
colors = ['#e74c3c', '#3498db']
bars1 = ax1.bar(books, ease_scores, color=colors, edgecolor='black', linewidth=1.5, alpha=0.7)
ax1.set_ylabel('Flesch Reading Ease', fontsize=11)
ax1.set_title('Readability Comparison\n(Higher = Easier)', fontsize=12, fontweight='bold')
ax1.axhline(y=60, color='gray', linestyle='--', linewidth=1, label='Standard (8th-9th grade)')
ax1.legend()
ax1.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# Flesch-Kincaid Grade Level comparison
ax2 = axes[1]
grade_scores = [wonderland_grade, looking_glass_grade]
bars2 = ax2.bar(books, grade_scores, color=colors, edgecolor='black', linewidth=1.5, alpha=0.7)
ax2.set_ylabel('Grade Level', fontsize=11)
ax2.set_title('Flesch-Kincaid Grade Level\n(Lower = Easier)', fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar in bars2:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

# Summary
print("\n" + "="*70)
print("SUMMARY")
print("="*70)
print("Both Alice books show similar readability levels, suitable for")
print("middle-school readers, though they are considered children's literature.")
print("The slightly lower scores reflect Carroll's sophisticated vocabulary")
print("and complex sentence structures.")

NameError: name 'df_readability' is not defined