In [40]:
import glob
import os
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

1. Loading the corpora and the large spaCy model for POS tagging and lemmatization

In [41]:
# Corpus display name -> directory that is searched recursively for *.txt files.
CORPORA = {
    "Folk Fairy Tales": r"C:\Users\Sophia\Downloads\MA\CORPORA\German FFT",
    "GPT-5":            r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-5",
    "GPT-4o":           r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-4o",
}

# Directory that receives every CSV table and figure written by this notebook.
OUTDIR = r"C:\Users\Sophia\Downloads\MA\TABLES\POS"
# Create it up front so the first to_csv()/savefig() cannot fail on a missing folder.
Path(OUTDIR).mkdir(parents=True, exist_ok=True)

# Coarse POS tags (spaCy `tok.pos_`) that are reported in the share table and plots.
REPORT_POS = ["NOUN", "PROPN", "VERB", "ADJ", "ADV", "PRON", "ADP", "AUX", "CCONJ", "SCONJ"]

EXCLUDE_STOPWORDS = False  # if True, adjectives flagged as stop words are not counted
LOWERCASE_LEMMAS  = True   # lower-case lemmas before counting
MIN_LEMMA_LEN     = 2      # adjective lemmas shorter than this are dropped
TOP_N_ADJ         = 10     # number of rows in the top-adjective table
SPACY_MODEL = "de_core_news_lg"  # large German spaCy model used for POS tagging and lemmas

print(f"[INFO] Lade spaCy Modell: {SPACY_MODEL}")
nlp = spacy.load(SPACY_MODEL)

[INFO] Lade spaCy Modell: de_core_news_lg




In [43]:
def read_texts_from_dir(d: str) -> Iterable[str]:
    """Yield the content of every ``*.txt`` file below ``d`` (searched recursively).

    Files are visited in sorted path order so that repeated runs process the
    documents in the same sequence. Each file is decoded as UTF-8, with a
    leading byte-order mark dropped if present. Files that are not valid UTF-8
    are re-read as Latin-1, which accepts any byte sequence.

    Parameters
    ----------
    d : str
        Root directory to search.

    Yields
    ------
    str
        The full text of one file.
    """
    for fp in sorted(glob.glob(str(Path(d) / "**/*.txt"), recursive=True)):
        try:
            # "utf-8-sig" behaves like UTF-8 but strips a BOM, so it does not
            # end up as a stray "\ufeff" token in the analysis.
            with open(fp, "r", encoding="utf-8-sig") as f:
                text = f.read()
        except UnicodeDecodeError:
            # Only a decoding problem triggers the fallback; other errors
            # (missing file, permissions) propagate to the caller.
            with open(fp, "r", encoding="latin-1") as f:
                text = f.read()
        yield text

def normalize_whitespace(s: str) -> str:
    """Collapse horizontal whitespace and trim it around line breaks.

    Runs of spaces, tabs, form feeds and vertical tabs become a single space;
    such whitespace directly before or after a newline is removed. Newlines
    themselves are kept.
    """
    collapsed = re.sub(r"[ \t\f\v]+", " ", s)
    return re.sub(r"[ \t\f\v]*\n[ \t\f\v]*", "\n", collapsed)

def acceptable_token(tok) -> bool:
    """Return True unless the token is whitespace, punctuation or number-like."""
    rejected = tok.is_space or tok.is_punct or tok.like_num
    return not rejected

def lemma_of(tok) -> str:
    """Return the normalised lemma used as counting key for a token.

    Falls back to the surface text when the token has no lemma, lower-cases
    the result if ``LOWERCASE_LEMMAS`` is set, and always rewrites "ß" as "ss".
    """
    base = tok.lemma_ or tok.text
    cased = base.lower() if LOWERCASE_LEMMAS else base
    return cased.replace("ß", "ss")

def analyze_corpus(name: str, dirpath: str) -> Dict[str, Any]:
    """Tag every text file of one corpus and tally POS tags and adjective lemmas.

    All texts returned by ``read_texts_from_dir(dirpath)`` are whitespace-
    normalised and run through the module-level ``nlp`` pipeline. Tokens
    rejected by ``acceptable_token`` are ignored entirely.

    Parameters
    ----------
    name : str
        Corpus label; shown on the progress bar and in the empty-corpus warning.
    dirpath : str
        Directory that holds the corpus files.

    Returns
    -------
    dict
        ``"pos_counts"``   Counter of ``tok.pos_`` over all accepted tokens,
        ``"total_tokens"`` number of accepted tokens,
        ``"adj_counts"``   Counter of normalised lemmas of tokens tagged ADJ
                           (lemmas shorter than ``MIN_LEMMA_LEN`` and, if
                           ``EXCLUDE_STOPWORDS`` is set, stop words are skipped).
    """
    pos_counts = Counter()
    adj_counts = Counter()
    total_tokens = 0

    # Materialised so the progress bar knows the number of documents.
    texts = list(read_texts_from_dir(dirpath))
    if not texts:
        # An empty or mistyped directory would otherwise show up only as an
        # all-zero column in the result tables.
        print(f"[WARN] Keine .txt-Dateien für Korpus '{name}' gefunden in: {dirpath}")
        return {"pos_counts": pos_counts, "total_tokens": 0, "adj_counts": adj_counts}

    docs = nlp.pipe((normalize_whitespace(t) for t in texts), batch_size=32)
    for doc in tqdm(docs, total=len(texts), desc=f"[{name}]"):
        for tok in doc:
            if not acceptable_token(tok):
                continue
            total_tokens += 1
            pos = tok.pos_
            pos_counts[pos] += 1

            # Everything below concerns the adjective frequency list only.
            if pos != "ADJ":
                continue
            lem = lemma_of(tok)
            if len(lem) < MIN_LEMMA_LEN:
                continue
            if EXCLUDE_STOPWORDS and tok.is_stop:
                continue
            adj_counts[lem] += 1

    return {"pos_counts": pos_counts, "total_tokens": total_tokens, "adj_counts": adj_counts}
# Analyse every configured corpus once; keys are the corpus display names.
results = {cname: analyze_corpus(cname, cdir) for cname, cdir in CORPORA.items()}

[Folk Fairy Tales]: 100%|██████████| 122/122 [00:20<00:00,  5.86it/s]
[GPT-5]: 100%|██████████| 100/100 [00:13<00:00,  7.64it/s]
[GPT-4o]: 100%|██████████| 100/100 [00:11<00:00,  8.34it/s]


Build the POS-share table (share of each POS tag among all counted tokens, per corpus)

In [44]:
# One record per reported POS tag: its share of all counted tokens in each corpus.
# `or 1` guards the division for a corpus without any tokens.
pos_records = [
    {
        "pos": tag,
        **{
            corpus: round(stats["pos_counts"].get(tag, 0) / (stats["total_tokens"] or 1), 6)
            for corpus, stats in results.items()
        },
    }
    for tag in REPORT_POS
]

# Order the tags by their mean share across corpora (largest first); the
# helper column is dropped again after sorting.
df_pos = (
    pd.DataFrame(pos_records)
    .set_index("pos")
    .assign(_mean=lambda frame: frame.mean(axis=1))
    .sort_values("_mean", ascending=False)
    .drop(columns=["_mean"])
)

pos_csv = Path(OUTDIR) / "POS_shares_by_corpus.csv"
df_pos.to_csv(pos_csv, encoding="utf-8")

print("\n=== Preview POS-share (Top 12) ===")
print(df_pos.head(12).to_string())


=== Preview POS-share (Top 12) ===
       Folk Fairy Tales     GPT-5    GPT-4o
pos                                        
NOUN           0.163053  0.181750  0.189505
VERB           0.155519  0.157595  0.148746
PRON           0.139417  0.128898  0.123340
ADV            0.120775  0.104284  0.094316
ADP            0.079172  0.081969  0.092983
CCONJ          0.061995  0.065400  0.058178
AUX            0.055431  0.048452  0.048170
ADJ            0.034549  0.031575  0.036504
SCONJ          0.026531  0.026477  0.023540
PROPN          0.010048  0.020127  0.021979


VISUALISATION

In [31]:
import numpy as np
import matplotlib.pyplot as plt


# Same table expressed in percent (two decimals), exported next to the raw shares.
df_pos_pct = df_pos.mul(100).round(2)
percent_csv = Path(OUTDIR) / "POS_shares_by_corpus_percent.csv"
df_pos_pct.to_csv(percent_csv, encoding="utf-8")

# Fixed orderings shared by all plots: POS tags as sorted in the table,
# corpora in the order they are configured.
pos_order = df_pos_pct.index.tolist()
corpora_order = list(CORPORA)

In [38]:
# Grouped bars: one cluster per POS tag, one bar per corpus within each cluster.
fig, ax = plt.subplots(figsize=(10, 5))
cluster_pos = np.arange(len(pos_order))
bar_width = 0.8 / len(corpora_order)  # each cluster spans 0.8 of a tick interval

for slot, corpus in enumerate(corpora_order):
    heights = df_pos_pct[corpus].reindex(pos_order).values
    # Shift so the bars of one cluster are centred on its tick.
    bar_centres = cluster_pos + slot*bar_width - 0.4 + bar_width/2
    ax.bar(bar_centres, heights, width=bar_width, label=corpus)

ax.set_xticks(cluster_pos)
ax.set_xticklabels(pos_order, rotation=40, ha="right")
ax.set(ylabel="Anteil (%)", title="POS-Anteile nach Corpus (gruppiert)")
ax.grid(axis="y", linestyle=":", linewidth=0.7, alpha=0.6)
ax.legend()
fig.tight_layout()

out_dir = Path(OUTDIR)
fig.savefig(out_dir / "POS_shares_grouped_bar.png", dpi=200)
fig.savefig(out_dir / "POS_shares_grouped_bar.pdf")
plt.close(fig)

# 100% stacked columns: one bar per corpus, one segment per POS tag.
fig, ax = plt.subplots(figsize=(10, 5))
bottom = np.zeros(len(corpora_order))
for pos in pos_order:
    vals = df_pos_pct.loc[pos, corpora_order].values
    ax.bar(corpora_order, vals, bottom=bottom, label=pos)
    bottom = bottom + vals

# The shares are relative to ALL counted tokens, but only the reported POS tags
# are plotted, so the segments above do not add up to 100 %. Show the remainder
# as its own segment so every bar really reaches 100 %, as the title promises.
rest = np.clip(100.0 - bottom, 0.0, None)
if (rest > 0).any():
    ax.bar(corpora_order, rest, bottom=bottom, label="Sonstige", color="lightgrey")

ax.set_ylim(0, 100)
ax.set_ylabel("Anteil (%)")
ax.set_title("POS-Anteil nach Corpus (100% gestapelt)")
ax.legend(ncol=2, fontsize=8, title="POS")
ax.grid(axis="y", linestyle=":", linewidth=0.7, alpha=0.6)
fig.tight_layout()
fig.savefig(Path(OUTDIR) / "POS_shares_stacked100.png", dpi=200)
fig.savefig(Path(OUTDIR) / "POS_shares_stacked100.pdf")
plt.close(fig)

# Heatmap: rows are POS tags, columns are corpora, colour encodes the share in percent.
fig, ax = plt.subplots(figsize=(8, 6))
share_matrix = df_pos_pct[corpora_order].values
im = ax.imshow(share_matrix, aspect="auto")

ax.set_title("POS-Share – Heatmap (%)")
ax.set_xticks(np.arange(len(corpora_order)))
ax.set_xticklabels(corpora_order, rotation=25, ha="right")
ax.set_yticks(np.arange(len(pos_order)))
ax.set_yticklabels(pos_order)

# Write the value into every cell that holds at least 1 %.
for (row_idx, col_idx), cell in np.ndenumerate(share_matrix):
    if cell >= 1.0:
        ax.text(col_idx, row_idx, f"{cell:.1f}", ha="center", va="center", fontsize=7)

fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="Share (%)")
fig.tight_layout()

heatmap_dir = Path(OUTDIR)
fig.savefig(heatmap_dir / "POS_shares_heatmap.png", dpi=200)
fig.savefig(heatmap_dir / "POS_shares_heatmap.pdf")
plt.close(fig)

# Summary of the figure files written by this cell.
print("[OK] Visualisierungen gespeichert in:", OUTDIR)
print(" - POS_shares_grouped_bar.(png|pdf)")
print(" - POS_shares_stacked100.(png|pdf)")
print(" - POS_shares_heatmap.(png|pdf)")

[OK] Visualisierungen gespeichert in: C:\Users\Sophia\Downloads\MA\TABLES\POS
 - POS_shares_grouped_bar.(png|pdf)
 - POS_shares_stacked100.(png|pdf)
 - POS_shares_heatmap.(png|pdf)


Top 10 Adjectives

In [39]:
def top_adj_table_side_by_side(n: int = TOP_N_ADJ) -> pd.DataFrame:
    """Build a rank-indexed table of the ``n`` top adjective lemmas per corpus.

    Reads the module-level ``results`` and ``CORPORA``. For every corpus the
    table holds three columns: the lemma (column named after the corpus),
    ``share_<corpus>`` (count divided by the corpus' total token count,
    rounded to 6 decimals) and ``count_<corpus>``. Lemmas are ranked by share,
    largest first. Ranks a corpus cannot fill get "" / NaN / NaN.

    Parameters
    ----------
    n : int
        Number of ranks (rows) in the table.

    Returns
    -------
    pd.DataFrame
        Indexed 1..n, index name "rank".
    """
    corpus_order = list(CORPORA.keys())

    # Per corpus: (lemma, count, share) tuples, best n by share.
    ranked = {}
    for cname, res in results.items():
        denom = res["total_tokens"] or 1  # avoid division by zero for an empty corpus
        scored = sorted(
            ((lem, cnt, cnt / denom) for lem, cnt in res["adj_counts"].items()),
            key=lambda item: item[2],
            reverse=True,
        )
        ranked[cname] = scored[:n]

    cols = [
        col
        for cname in corpus_order
        for col in (cname, f"share_{cname}", f"count_{cname}")
    ]

    table_rows = []
    for rank_idx in range(n):
        record = {}
        for cname in corpus_order:
            entries = ranked[cname]
            if rank_idx >= len(entries):
                # Corpus has fewer than n distinct adjective lemmas.
                record[cname] = ""
                record[f"share_{cname}"] = np.nan
                record[f"count_{cname}"] = np.nan
                continue
            lemma, count, share = entries[rank_idx]
            record[cname] = lemma
            record[f"share_{cname}"] = round(share, 6)
            record[f"count_{cname}"] = int(count)
        table_rows.append(record)

    table = pd.DataFrame(table_rows, index=range(1, n + 1), columns=cols)
    return table.rename_axis("rank")

# Build the top-adjective table, store it as CSV, then preview the first five ranks.
df_adj_side = top_adj_table_side_by_side(TOP_N_ADJ)

adj_side_csv = Path(OUTDIR).joinpath(f"Top{TOP_N_ADJ}_ADJ_side_by_side.csv")
df_adj_side.to_csv(adj_side_csv, encoding="utf-8")

adj_preview = df_adj_side.head(5).to_string()
print(f"\n=== Vorschau: Side-by-Side Top-{TOP_N_ADJ} ADJ (mit Counts) ===")
print(adj_preview)


=== Vorschau: Side-by-Side Top-10 ADJ (mit Counts) ===
     Folk Fairy Tales  share_Folk Fairy Tales  count_Folk Fairy Tales    GPT-5  share_GPT-5  count_GPT-5   GPT-4o  share_GPT-4o  count_GPT-4o
rank                                                                                                                                         
1               gross                0.002275                     455      alt     0.002553          322      alt      0.003801           436
2                 alt                0.001725                     345    klein     0.001974          249    klein      0.002058           236
3               schön                0.001605                     321    gross     0.001427          180   golden      0.001221           140
4             anderer                0.001450                     290   golden     0.001324          167  silbern      0.001151           132
5               klein                0.001255                     251  schwarz     0.000904 

In [36]:
def top_adj_table_side_by_side(n: int = TOP_N_ADJ) -> pd.DataFrame:
    """Build a rank-indexed table of the ``n`` top adjective lemmas per corpus (shares only).

    NOTE: this re-defines ``top_adj_table_side_by_side``. An earlier cell of
    this notebook defines a function of the same name that additionally emits
    ``count_<corpus>`` columns; whichever cell ran last is the one bound to
    the name.

    Reads the module-level ``results`` and ``CORPORA``. For every corpus the
    table holds two columns: the lemma (column named after the corpus) and
    ``share_<corpus>`` (count divided by the corpus' total token count,
    rounded to 6 decimals). Lemmas are ranked by share, largest first. Ranks a
    corpus cannot fill get "" / NaN.

    Parameters
    ----------
    n : int
        Number of ranks (rows) in the table.

    Returns
    -------
    pd.DataFrame
        Indexed 1..n, index name "rank".
    """
    per_corpus_tops = {}
    for cname, res in results.items():
        tot = res["total_tokens"] or 1  # avoid division by zero for an empty corpus
        pairs = [(lem, cnt / tot) for lem, cnt in res["adj_counts"].items()]
        pairs.sort(key=lambda x: x[1], reverse=True)
        per_corpus_tops[cname] = pairs[:n]

    corpus_order = list(CORPORA.keys())
    cols = [col for cname in corpus_order for col in (cname, f"share_{cname}")]

    data = []
    for i in range(n):
        row = {}
        for cname in corpus_order:
            if i < len(per_corpus_tops[cname]):
                lem, sh = per_corpus_tops[cname][i]
                row[cname] = lem
                row[f"share_{cname}"] = round(sh, 6)
            else:
                row[cname] = ""
                row[f"share_{cname}"] = np.nan
        data.append(row)

    # Pass the column list explicitly: the layout is then fixed even when
    # there are no rows (n == 0) instead of being inferred from the first row.
    df = pd.DataFrame(data, index=range(1, n + 1), columns=cols)
    df.index.name = "rank"
    return df

df_adj_side = top_adj_table_side_by_side(TOP_N_ADJ)
# Use a distinct file name: an earlier cell already writes
# Top{N}_ADJ_side_by_side.csv (the table that includes counts). Writing this
# shares-only table to the same path would silently replace it whenever the
# notebook is run top to bottom.
adj_side_csv = Path(OUTDIR) / f"Top{TOP_N_ADJ}_ADJ_side_by_side_shares_only.csv"
df_adj_side.to_csv(adj_side_csv, encoding="utf-8")

print(f"\n=== Vorschau: Side-by-Side Top-{TOP_N_ADJ} ADJ ===")
print(df_adj_side.head(5).to_string())


=== Vorschau: Side-by-Side Top-10 ADJ ===
     Folk Fairy Tales  share_Folk Fairy Tales    GPT-5  share_GPT-5   GPT-4o  share_GPT-4o
rank                                                                                      
1               gross                0.002275      alt     0.002553      alt      0.003801
2                 alt                0.001725    klein     0.001974    klein      0.002058
3               schön                0.001605    gross     0.001427   golden      0.001221
4             anderer                0.001450   golden     0.001324  silbern      0.001151
5               klein                0.001255  schwarz     0.000904   dunkel      0.001037
