Libraries

In [39]:
import os, re, glob
import pandas as pd
from collections import Counter

Corpora

In [40]:
CORPORA = {
    "Folk Fairy Tales": r"C:\Users\Sophia\Downloads\MA\CORPORA\German FFT",
    "GPT-5":            r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-5",
    "GPT-4o":           r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-4o",
}

FILE_PATTERN = "**/*.txt"
TOP_K = 10
WORD_RE = re.compile(r"[A-Za-zÄÖÜäöüß\-]+", flags=re.UNICODE)
START_PHRASES = ["es war einmal", "es waren einmal"]

In [41]:
def norm(s: str) -> str:
    s = s.replace("\ufeff", " ").replace("\xa0", " ").replace("\t", " ")
    return s.lower()

results = {}
totals  = {}

for corpus_name, corpus_dir in CORPORA.items():
    cnt = Counter()
    for fp in glob.glob(os.path.join(corpus_dir, FILE_PATTERN), recursive=True):
        txt = open(fp, "r", encoding="utf-8").read()
        s = norm(txt).lstrip('"\':-–— \n\r')
        first = re.split(r"[.!?]\s", s, maxsplit=1)[0].strip()
        toks = WORD_RE.findall(first)
        if len(toks) >= 3:
            tri = tuple(toks[:3])
            cnt[tri] += 1
    results[corpus_name] = cnt
    totals[corpus_name]  = sum(cnt.values())

Table

In [42]:
for corpus_name, cnt in results.items():
    rows = []
    total = max(1, totals[corpus_name])
    for (tri, c) in cnt.most_common(TOP_K):
        rows.append({"trigram": " ".join(tri), "count": c, "share_%": round(c/total*100, 1)})
    pd.DataFrame(rows).to_csv(
        f"first_sentence_trigrams_{corpus_name.replace(' ', '_')}.csv",
        index=False, encoding="utf-8"
    )


all_tris = set().union(*[set(c.keys()) for c in results.values()])
rows = []
for tri in all_tris:
    row = {"trigram": " ".join(tri)}
    total_sum = 0
    for cname in CORPORA:
        c = results[cname][tri]
        row[f"{cname} (n)"] = c
        row[f"{cname} (%)"] = round(c / max(1, totals[cname]) * 100, 1) if totals[cname] else 0.0
        total_sum += c
    row["_sum"] = total_sum
    rows.append(row)

combo = pd.DataFrame(rows).sort_values("_sum", ascending=False).drop(columns=["_sum"])
combo.to_csv("first_sentence_trigrams_all_corpora.csv", index=False, encoding="utf-8")

print(combo.head(TOP_K).to_string(index=False))
print("\saved:")
print(" - first_sentence_trigrams_all_corpora.csv")
for cname in CORPORA:
    print(f" - first_sentence_trigrams_{cname.replace(' ', '_')}.csv")

          trigram  Folk Fairy Tales (n)  Folk Fairy Tales (%)  GPT-5 (n)  GPT-5 (%)  GPT-4o (n)  GPT-4o (%)
    es war einmal                    54                  44.6         43       43.0          53        53.0
  es waren einmal                     5                   4.1         57       57.0          47        47.0
       es war ein                     6                   5.0          0        0.0           0         0.0
  es lebte einmal                     3                   2.5          0        0.0           0         0.0
 vor alten zeiten                     2                   1.7          0        0.0           0         0.0
      es war eine                     2                   1.7          0        0.0           0         0.0
     es hatte ein                     2                   1.7          0        0.0           0         0.0
       es war vor                     1                   0.8          0        0.0           0         0.0
   in der schweiz           

In [43]:
def first_sentence(s: str) -> str:
    s = norm(s).lstrip('"\':-–— \n\r')
    return re.split(r"[.!?]\s", s, maxsplit=1)[0].strip()

def toks(s: str):
    return WORD_RE.findall(s)

def safe(name: str) -> str:
    return re.sub(r"[^A-Za-z0-9\-]+", "_", name).strip("_")

START_TOKS = {p: tuple(toks(p)) for p in START_PHRASES}

next3 = {c: {p: Counter() for p in START_PHRASES} for c in CORPORA}
matches_per_phrase = {c: {p: 0 for p in START_PHRASES} for c in CORPORA}

for corpus_name, corpus_dir in CORPORA.items():
    for fp in glob.glob(os.path.join(corpus_dir, FILE_PATTERN), recursive=True):
        text = open(fp, "r", encoding="utf-8").read()
        sent = first_sentence(text)
        tt = toks(sent)
        if not tt:
            continue
        for phrase, p_tok in START_TOKS.items():
            k = len(p_tok)
            if len(tt) >= k and tuple(tt[:k]) == p_tok:
                matches_per_phrase[corpus_name][phrase] += 1
                if len(tt) >= k+3:
                    tri = " ".join(tt[k:k+3])
                    next3[corpus_name][phrase][tri] += 1

# 2 tables
for phrase in START_PHRASES:
    hits_5  = matches_per_phrase["GPT-5"][phrase]
    hits_4o = matches_per_phrase["GPT-4o"][phrase]
    tris = set(next3["GPT-5"][phrase].keys()) | set(next3["GPT-4o"][phrase].keys())
    rows = []
    for tri in tris:
        n5  = next3["GPT-5"][phrase][tri]
        n4o = next3["GPT-4o"][phrase][tri]
        rows.append({
            "next_3": tri,
            "GPT-5 (n)": n5,
            "GPT-5 (%)": 0.0 if hits_5==0 else round(n5 / hits_5 * 100, 1),
            "GPT-4o (n)": n4o,
            "GPT-4o (%)": 0.0 if hits_4o==0 else round(n4o / hits_4o * 100, 1),
            "_sum": n5 + n4o
        })

    table = (pd.DataFrame(rows)
               .sort_values("_sum", ascending=False)
               .drop(columns=["_sum"])
               .head(TOP_K))

    print(f"\n=== Top-{TOP_K} nächste Trigramme nach „{phrase}“ ===")
    print(table.to_string(index=False))

    out = f"TOP10_next3_after_{safe(phrase)}_GPT5_vs_GPT4o.csv"
    table.to_csv(out, index=False, encoding="utf-8")
    print(f"(gespeichert: {out})")

print("\nFertig. Zwei CSVs (je Phrase) wurden gespeichert.")


=== Top-10 nächste Trigramme nach „es war einmal“ ===
           next_3  GPT-5 (n)  GPT-5 (%)  GPT-4o (n)  GPT-4o (%)
 ein alter müller         12       27.9          22        41.5
   ein müller der         10       23.3          17        32.1
 ein armer müller          2        4.7           4         7.5
    ein könig der          3        7.0           0         0.0
    ein bauer der          3        7.0           0         0.0
   ein altes weib          0        0.0           2         3.8
ein altes ehepaar          1        2.3           1         1.9
 ein armer köhler          1        2.3           1         1.9
  ein armer hirte          0        0.0           2         3.8
  ein alter bauer          0        0.0           2         3.8
(gespeichert: TOP10_next3_after_es_war_einmal_GPT5_vs_GPT4o.csv)

=== Top-10 nächste Trigramme nach „es waren einmal“ ===
             next_3  GPT-5 (n)  GPT-5 (%)  GPT-4o (n)  GPT-4o (%)
     ein müller und         13       22.8          22