In [None]:
from pathlib import Path
from typing import Dict, List, Tuple
import os, glob, math
import numpy as np
import pandas as pd
from tqdm import tqdm

Corpora and spaCy

In [None]:
# Notebook configuration.
# CORPORA maps the group label used in the result tables to the directory
# that is searched for that group's text files.
CORPORA: Dict[str, str] = {
    "Folk Fairy Tales": r"C:\Users\Sophia\Downloads\MA\CORPORA\German FFT",
    "GPT-5":            r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-5",
    "GPT-4o":           r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-4o",
}
# Glob pattern joined onto each corpus directory; "**" makes the search recursive.
FILE_PATTERN = "**/*.txt"
# Block size (in word tokens) passed to sttr_percent for the standardised TTR.
STTR_WINDOW = 1000
# Directory that receives the CSV tables; created up front if it does not exist.
# NOTE(review): these are absolute paths on one machine — adjust before running elsewhere.
OUTDIR = Path(r"C:\Users\Sophia\Downloads\MA\TABLES")
OUTDIR.mkdir(parents=True, exist_ok=True)

import spacy
def load_de_model():
    """Load a German spaCy pipeline, preferring the largest installed model.

    The packaged models ``de_core_news_lg``, ``de_core_news_md`` and
    ``de_core_news_sm`` are tried in that order, each with the components in
    ``disabled`` switched off. If none of them can be loaded, a blank German
    pipeline with a rule-based ``sentencizer`` is returned instead.

    Loading stays best-effort (any exception moves on to the next candidate),
    but a warning is emitted whenever the first choice was not used, because
    the selected pipeline determines sentence segmentation and therefore the
    sentence-length metrics.

    Returns:
        The loaded (or blank) spaCy ``Language`` object.
    """
    import warnings

    candidates = ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"]
    # NOTE(review): presumably this leaves the parser active for sentence
    # boundaries — confirm against the installed model's pipeline.
    disabled = ["ner", "lemmatizer", "textcat", "morphologizer", "tagger", "attribute_ruler"]

    failures = []
    for name in candidates:
        try:
            model = spacy.load(name, disable=disabled)
        except Exception as exc:  # deliberate best-effort: try the next candidate
            failures.append(f"{name} ({type(exc).__name__}: {exc})")
            continue
        if failures:
            warnings.warn(
                f"Using spaCy model '{name}'; could not load: " + "; ".join(failures)
            )
        return model

    warnings.warn(
        "No packaged German spaCy model could be loaded ("
        + "; ".join(failures)
        + "); falling back to a blank 'de' pipeline with a rule-based sentencizer."
    )
    nlp = spacy.blank("de")
    if "senter" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    return nlp

# Module-level pipeline object; the processing cell below calls it on every text.
nlp = load_de_model()

In [1]:
# doc.sents needs a component that sets sentence boundaries. Add the rule-based
# sentencizer only when none of the known providers is present; checking for
# "sentencizer" as well avoids the duplicate-component error that was
# previously hidden by a blanket try/except.
if not any(name in nlp.pipe_names for name in ("senter", "parser", "sentencizer")):
    nlp.add_pipe("sentencizer")

def read_text(fp: str) -> str:
    """Read a text file as UTF-8, falling back to Latin-1 for legacy files.

    The file is first decoded strictly as UTF-8 (``utf-8-sig``, so a leading
    byte-order mark is dropped). Only if that raises ``UnicodeDecodeError`` is
    it re-read as Latin-1, which can decode any byte sequence.

    Decoding with ``errors="ignore"`` would never raise, so the fallback would
    never run and non-UTF-8 files would silently lose their umlauts and ß,
    distorting word lengths and type counts.

    Args:
        fp: Path of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    path = Path(fp)
    try:
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        return path.read_text(encoding="latin-1")

def words_from_doc(doc) -> List[str]:
    """Return the surface text of every token in ``doc`` flagged ``is_alpha``.

    Tokens whose ``is_alpha`` attribute is falsy are skipped; the original
    token order is kept.
    """
    alphabetic_words: List[str] = []
    for token in doc:
        if token.is_alpha:
            alphabetic_words.append(token.text)
    return alphabetic_words

def sentence_word_counts(doc) -> List[int]:
    """Count the ``is_alpha`` tokens in each sentence of ``doc``.

    Sentences are taken from ``doc.sents``. A sentence with no alphabetic
    token contributes nothing to the result, so every returned count is >= 1.
    """
    per_sentence = (
        len([token for token in sentence if token.is_alpha])
        for sentence in doc.sents
    )
    return [n_words for n_words in per_sentence if n_words > 0]

def mean_and_sd(arr: List[float]) -> Tuple[float, float]:
    """Return the mean and sample standard deviation of ``arr``.

    Args:
        arr: Sequence of numbers.

    Returns:
        ``(mean, sd)`` as plain floats. ``sd`` uses ``ddof=1``; it is ``0.0``
        when fewer than two values are given, and both values are ``0.0`` for
        an empty input.
    """
    if not arr:
        return 0.0, 0.0
    values = np.array(arr, dtype=float)
    mean = float(values.mean())
    # Decide the SD on its own line: written inline after "mean, ...", the
    # conditional expression binds only to the second tuple element and
    # yields a nested tuple for single-element input.
    sd = float(values.std(ddof=1)) if len(values) > 1 else 0.0
    return mean, sd

def sttr_percent(tokens: List[str], window: int = 1000) -> float:
    """Compute a standardised type-token ratio (stTTR) in percent.

    Types are counted case-insensitively. Texts with at most ``window``
    tokens get their plain TTR. Longer texts are cut into consecutive blocks
    of ``window`` tokens and the per-block TTRs are averaged.

    Note that the final block may be shorter than ``window`` and is still
    included in the average with equal weight.

    Args:
        tokens: Word tokens of one document.
        window: Block size in tokens.

    Returns:
        The stTTR as a percentage, or ``0.0`` for an empty token list.
    """
    n = len(tokens)
    if n == 0:
        return 0.0
    if n <= window:
        types = len({t.lower() for t in tokens})
        return 100.0 * types / n
    # Slices taken at range() offsets below n are never empty.
    ratios = [
        len({t.lower() for t in block}) / len(block)
        for block in (tokens[i:i + window] for i in range(0, n, window))
    ]
    return 100.0 * (sum(ratios) / len(ratios)) if ratios else 0.0

# Per-document metrics: one row per non-empty text file in every corpus.
rows_doc = []
for group, cdir in CORPORA.items():
    # Sort the matches so the row order of the CSV is the same on every run.
    files = sorted(glob.glob(os.path.join(cdir, FILE_PATTERN), recursive=True))
    for fp in tqdm(files, desc=f"Processing {group}", unit="file"):
        txt = read_text(fp)
        if not txt.strip():
            # Skip files that are empty or contain only whitespace.
            continue
        doc = nlp(txt)

        # Alphabetic tokens only; punctuation and numbers are excluded.
        words = words_from_doc(doc)
        word_lengths = [len(w) for w in words]

        # Number of alphabetic tokens per sentence.
        sent_counts = sentence_word_counts(doc)

        # Tokens / types (case-insensitive) / standardised TTR
        n_tokens = len(words)
        n_types = len(set(w.lower() for w in words))
        sttr_val = sttr_percent(words, window=STTR_WINDOW)

        # Mean and SD within the document
        mean_wlen, sd_wlen = mean_and_sd(word_lengths)
        mean_slen, sd_slen = mean_and_sd(sent_counts)

        rows_doc.append({
            "group": group,
            "path": fp,
            "n_tokens": n_tokens,
            "n_types": n_types,
            "sttr_percent": sttr_val,
            "mean_word_len": mean_wlen,
            "sd_word_len": sd_wlen,
            "mean_sent_len": mean_slen,
            "sd_sent_len": sd_slen,
        })

# Persist the per-document table so later cells can reload it from disk.
df_docs = pd.DataFrame(rows_doc)
df_docs.to_csv(OUTDIR / "per_document_metrics.csv", index=False)


def agg_complexity_table(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise word- and sentence-length metrics per group and overall.

    ``df`` must provide the columns ``group``, ``mean_word_len``,
    ``sd_word_len``, ``mean_sent_len`` and ``sd_sent_len`` (one row per
    document). For each group, and once for all rows together ("Overall
    mean"), the result reports:

    * the document count,
    * mean ± sample SD of the per-document means, formatted to two decimals,
    * the average of the per-document SDs ("variation").

    Rows are ordered Overall mean, Folk Fairy Tales, GPT-4o, GPT-5; any other
    group label is placed after these.
    """
    def _mean_sd(series: pd.Series):
        # Mean and sample standard deviation (ddof=1) of one column.
        return series.mean(), series.std(ddof=1)

    def summarize(group_df: pd.DataFrame) -> pd.Series:
        count = len(group_df)
        several = count > 1  # the SD is reported as 0.0 for a single document
        word_mean, word_sd = _mean_sd(group_df["mean_word_len"])
        sent_mean, sent_sd = _mean_sd(group_df["mean_sent_len"])
        stats = {}
        stats["N_docs"] = count
        stats["Mean word length (mean)"] = word_mean
        stats["Mean word length (sd)"] = word_sd if several else 0.0
        stats["Word length variation (sd)"] = group_df["sd_word_len"].mean()
        stats["Mean sentence length (mean)"] = sent_mean
        stats["Mean sentence length (sd)"] = sent_sd if several else 0.0
        stats["Sentence length variation (sd)"] = group_df["sd_sent_len"].mean()
        return pd.Series(stats)

    # One summary row per group ...
    per_group = df.groupby("group").apply(summarize).reset_index()
    per_group = per_group.rename(columns={"group": "Author/Group"})
    # ... plus one row pooled over all documents.
    pooled = pd.DataFrame([summarize(df)])
    pooled.insert(0, "Author/Group", "Overall mean")
    combined = pd.concat([per_group, pooled], ignore_index=True)

    def _plus_minus(mean_col: str, sd_col: str):
        # Render "mean ± sd" with two decimals for each row.
        return [
            f"{m:.2f} ± {s:.2f}"
            for m, s in zip(combined[mean_col], combined[sd_col])
        ]

    combined["Mean word length (chars) (mean ± sd)"] = _plus_minus(
        "Mean word length (mean)", "Mean word length (sd)")
    combined["Mean sentence length (words) (mean ± sd)"] = _plus_minus(
        "Mean sentence length (mean)", "Mean sentence length (sd)")

    # Fixed presentation order; unknown labels get rank 999.
    rank = {"Overall mean": 0, "Folk Fairy Tales": 1, "GPT-4o": 2, "GPT-5": 3}
    combined["__order__"] = combined["Author/Group"].apply(
        lambda label: rank.get(label, 999))
    ordered = combined.sort_values("__order__").drop(columns=["__order__"])

    return ordered[[
        "Author/Group", "N_docs",
        "Mean word length (chars) (mean ± sd)",
        "Word length variation (sd)",
        "Mean sentence length (words) (mean ± sd)",
        "Sentence length variation (sd)"
    ]]

# Build the complexity table from the per-document metrics, save it as CSV
# (without the index column), and leave it as the last expression so the
# notebook displays it.
table1 = agg_complexity_table(df_docs)
table1.to_csv(OUTDIR / "table_sentence_word_complexity.csv", index=False)
table1

Processing Folk Fairy Tales: 100%|██████████| 122/122 [00:23<00:00,  5.15file/s]
Processing GPT-5: 100%|██████████| 100/100 [00:15<00:00,  6.63file/s]
Processing GPT-4o: 100%|██████████| 100/100 [00:13<00:00,  7.31file/s]


Unnamed: 0,Author/Group,N_docs,Mean word length (chars) (mean ± sd),Word length variation (sd),Mean sentence length (words) (mean ± sd),Sentence length variation (sd)
3,Overall mean,322.0,4.58 ± 0.13,2.12117,16.62 ± 3.55,8.756184
0,Folk Fairy Tales,122.0,4.62 ± 0.19,2.247374,20.33 ± 2.72,11.267519
1,GPT-4o,100.0,4.54 ± 0.08,2.00861,14.03 ± 1.53,7.538948
2,GPT-5,100.0,4.59 ± 0.08,2.07976,14.68 ± 1.36,6.909592


Lexical Diversity

In [2]:
import pandas as pd
from pathlib import Path

# Reload the per-document metrics from the CSV in OUTDIR, so this cell works
# from the saved file rather than from variables left in the kernel.
# NOTE(review): OUTDIR repeats the absolute path defined in the configuration cell.
OUTDIR = Path(r"C:\Users\Sophia\Downloads\MA\TABLES")
df_docs = pd.read_csv(OUTDIR / "per_document_metrics.csv")

def agg_lexdiv_table(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise lexical-diversity metrics per group and overall.

    ``df`` must provide the columns ``group``, ``n_tokens``, ``n_types`` and
    ``sttr_percent`` (one row per document). For each group, and once for all
    rows together ("Overall mean"), the result reports the document count and
    mean ± sample SD of tokens, types and stTTR %, formatted to two decimals.

    Rows are ordered Overall mean, Folk Fairy Tales, GPT-4o, GPT-5; any other
    group label is placed after these.
    """
    # (source column, label used in the output column names)
    metrics = [
        ("n_tokens", "Tokens"),
        ("n_types", "Types"),
        ("sttr_percent", "stTTR %"),
    ]

    def summarize(group_df: pd.DataFrame) -> pd.Series:
        count = len(group_df)
        stats = {"N_docs": count}
        for column, label in metrics:
            spread = group_df[column].std(ddof=1)
            stats[f"{label} (mean)"] = group_df[column].mean()
            # The SD is reported as 0.0 for a single document.
            stats[f"{label} (sd)"] = spread if count > 1 else 0.0
        return pd.Series(stats)

    # One summary row per group, then one row pooled over all documents.
    per_group = df.groupby("group").apply(summarize).reset_index()
    per_group = per_group.rename(columns={"group": "Author/Group"})
    pooled = pd.DataFrame([summarize(df)])
    pooled.insert(0, "Author/Group", "Overall mean")
    combined = pd.concat([per_group, pooled], ignore_index=True)

    # Render "mean ± sd" with two decimals for every metric.
    for _, label in metrics:
        combined[f"{label} (mean ± sd)"] = [
            f"{m:.2f} ± {s:.2f}"
            for m, s in zip(combined[f"{label} (mean)"], combined[f"{label} (sd)"])
        ]

    # Fixed presentation order; unknown labels get rank 999.
    rank = {"Overall mean": 0, "Folk Fairy Tales": 1, "GPT-4o": 2, "GPT-5": 3}
    combined["__order__"] = combined["Author/Group"].apply(
        lambda label: rank.get(label, 999))
    ordered = combined.sort_values("__order__").drop(columns=["__order__"])

    return ordered[
        ["Author/Group", "N_docs", "Tokens (mean ± sd)", "Types (mean ± sd)", "stTTR % (mean ± sd)"]
    ]

# Build the lexical-diversity table from the per-document metrics, save it as
# CSV (without the index column), and leave it as the last expression so the
# notebook displays it.
table2 = agg_lexdiv_table(df_docs)
table2.to_csv(OUTDIR / "table_lexical_diversity.csv", index=False)
table2

Unnamed: 0,Author/Group,N_docs,Tokens (mean ± sd),Types (mean ± sd),stTTR % (mean ± sd)
3,Overall mean,322.0,1367.63 ± 622.34,524.07 ± 136.86,53.12 ± 7.61
0,Folk Fairy Tales,122.0,1635.79 ± 937.51,555.03 ± 210.86,47.36 ± 6.94
1,GPT-4o,100.0,1146.79 ± 99.41,483.77 ± 32.68,57.87 ± 5.95
2,GPT-5,100.0,1261.31 ± 149.65,526.60 ± 49.93,55.39 ± 4.98
