Libraries

In [4]:
from pathlib import Path
import re
import pandas as pd
from IPython.display import display

Corpora

In [None]:
CORPORA = {
    "Folk Fairy Tales": r"C:\Users\Sophia\Downloads\MA\CORPORA\German FFT",
    "GPT-5":            r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-5",
    "GPT-4o":           r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-4o",
}
OUTDIR = Path(r"C:\Users\Sophia\Downloads\MA\TABLES")

ROUND = 2
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)  # automatische Breite

SENT_SPLIT = re.compile(r'[.!?…]+["»”\')\]]*\s+')
WORD_RE = re.compile(r"[A-Za-zÄÖÜäöüß\-']+", re.UNICODE)

In [3]:
def _sentences(text: str):
    text = text.strip()
    if not text:
        return []
    return [s.strip() for s in SENT_SPLIT.split(text) if s.strip()]

def _words(text: str):
    return WORD_RE.findall(text)

def _count_syllables(word: str) -> int:
    if not word:
        return 0
    w = word.lower()
    groups = re.findall(r'[aeiouyäöü]+', w)  
    syl = len(groups)
    syl -= w.count("ie")   
    syl -= w.count("qu")   
    return max(1, syl)

# Flesch Reading Ease (DE) nach Amstad (1978): FRE_de = 180 - ASL - 58.5 * ASW
def flesch_reading_ease_de(text: str):
    sents = _sentences(text)
    if not sents:
        return float("nan"), float("nan"), 0, 0, 0
    ws = [w for s in sents for w in _words(s)]
    if not ws:
        return float("nan"), float("nan"), len(sents), 0, 0
    n_sent = len(sents)
    n_words = len(ws)
    n_syll = sum(_count_syllables(w) for w in ws)
    ASL = n_words / n_sent                 
    ASW = n_syll / n_words                 
    FRE = 180.0 - ASL - 58.5 * ASW
    return FRE, ASL, n_sent, n_words, n_syll

# LIX = (Wörter/Sätze) + (100 * lange Wörter / Wörter), lange Wörter = Länge ≥ 7
def lix(text: str):
    sents = _sentences(text)
    if not sents:
        return float("nan")
    ws = [w for s in sents for w in _words(s)]
    if not ws:
        return float("nan")
    n_sent = len(sents)
    n_words = len(ws)
    n_long = sum(1 for w in ws if len(w) >= 7)
    return (n_words / n_sent) + (100.0 * n_long / n_words)

#auswertung
def analyze_corpora(corpora: dict, outdir: Path):
    outdir.mkdir(parents=True, exist_ok=True)

    rows = []
    for corpus_name, corpus_dir in corpora.items():
        corpus_path = Path(corpus_dir)
        for fp in corpus_path.rglob("*.txt"):
            try:
                text = fp.read_text(encoding="utf-8")
            except Exception:
                try:
                    text = fp.read_text(encoding="latin-1")
                except Exception:
                    print(f"[WARN] kann Datei nicht lesen: {fp}")
                    continue

            fre, asl, n_sent, n_words, n_syll = flesch_reading_ease_de(text)
            lix_val = lix(text)

            rows.append({
                "corpus": corpus_name,
                "doc": str(fp.relative_to(corpus_path)),
                "n_sentences": n_sent,
                "n_words": n_words,
                "n_syllables": n_syll,
                "ASL": asl,
                "FRE_de": fre,
                "LIX": lix_val,
            })

    df = pd.DataFrame(rows)

    per_doc_path = outdir / "readability_per_doc.csv"
    df.to_csv(per_doc_path, index=False, encoding="utf-8-sig")

    agg = (df.groupby("corpus", dropna=False)
             .agg(n_docs=("doc", "count"),
                  mean_ASL=("ASL", "mean"),
                  mean_FRE=("FRE_de", "mean"),
                  mean_LIX=("LIX", "mean"),
                  mean_words=("n_words", "mean"),
                  mean_sentences=("n_sentences", "mean"))
             .reset_index())

    per_corpus_path = outdir / "readability_by_corpus.csv"
    agg.to_csv(per_corpus_path, index=False, encoding="utf-8-sig")

# ausgabe
    print(f"[OK] Gespeichert: {per_doc_path}")
    print(f"[OK] Gespeichert: {per_corpus_path}")

    print("\n=== Readability by corpus (means only) ===")
    display(agg.round(ROUND))

    print("\n=== Readability per document ===")
    display(df.round(ROUND))

    return df, agg
df, agg = analyze_corpora(CORPORA, OUTDIR)

[OK] Gespeichert: C:\Users\Sophia\Downloads\MA\TABLES\readability_per_doc.csv
[OK] Gespeichert: C:\Users\Sophia\Downloads\MA\TABLES\readability_by_corpus.csv

=== Readability by corpus (means only) ===


Unnamed: 0,corpus,n_docs,mean_ASL,mean_FRE,mean_LIX,mean_words,mean_sentences
0,Folk Fairy Tales,122,22.75,70.41,38.93,1639.07,74.43
1,GPT-4o,100,14.19,81.71,28.82,1146.99,81.98
2,GPT-5,100,15.41,79.48,31.24,1261.46,82.8



=== Readability per document ===


Unnamed: 0,corpus,doc,n_sentences,n_words,n_syllables,ASL,FRE_de,LIX
0,Folk Fairy Tales,Bechstein_Aschenbrödel_510.txt,32,918,1418,28.69,60.95,48.95
1,Folk Fairy Tales,Bechstein_Das Dornröschen_410.txt,31,1093,1763,35.26,50.38,57.40
2,Folk Fairy Tales,Bechstein_Das Dukaten-Angele_571.txt,122,2397,3747,19.65,68.91,39.67
3,Folk Fairy Tales,Bechstein_Das Märchen vom Ritter Blaubart_312.txt,51,925,1456,18.14,69.78,39.87
4,Folk Fairy Tales,Bechstein_Das Natterkrönlein_672.txt,44,1252,1860,28.45,64.64,45.39
...,...,...,...,...,...,...,...,...
317,GPT-4o,ChatGPT-4o_95.txt,86,1198,1700,13.93,83.06,26.78
318,GPT-4o,ChatGPT-4o_96.txt,82,1150,1678,14.02,80.62,31.94
319,GPT-4o,ChatGPT-4o_97.txt,83,1258,1765,15.16,82.77,27.80
320,GPT-4o,ChatGPT-4o_98.txt,85,1213,1739,14.27,81.86,29.27
