In [53]:
import os, re, pathlib, typing
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from typing import List, Dict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

LOADING CORPORA

In [54]:
# All four corpora live below one shared root folder on the local machine.
_CORPORA_ROOT = r"C:\Users\Sophia\Downloads\MA\CORPORA"

CORPUS_FFT = _CORPORA_ROOT + "\\German FFT"
CORPUS_LFT = _CORPORA_ROOT + "\\Literary Fairytales"
CORPUS_GPT_5 = _CORPORA_ROOT + "\\GPT-5"
CORPUS_GPT_4 = _CORPORA_ROOT + "\\GPT-4o"

# Display label -> corpus folder, in presentation order.
CORPORA = dict(
    [
        ("Folk Fairy Tales", CORPUS_FFT),
        ("Literary Fairy Tales", CORPUS_LFT),
        ("GPT-5", CORPUS_GPT_5),
        ("GPT-4o", CORPUS_GPT_4),
    ]
)

In [28]:
# Sanity check: report how many *.txt files each corpus folder contains
# (sub-folders are searched recursively).
for name, path in CORPORA.items():
    files = [txt_file for txt_file in Path(path).rglob("*.txt")]
    print("{}: {} Dateien".format(name, len(files)))

Folk Fairy Tales: 123 Dateien
Literary Fairy Tales: 13 Dateien
GPT-5: 100 Dateien
GPT-4o: 100 Dateien


MOST FREQUENT WORDS

In [29]:
# A "word" is a maximal run of ASCII letters, German umlauts and ß;
# digits, punctuation and any other characters act as separators.
WORD_RE = re.compile(r"[A-Za-zÄÖÜäöüß]+", flags=re.UNICODE)


def read_text(p: Path) -> str:
    """Return the content of file ``p`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped silently (``errors="ignore"``).
    """
    with p.open(mode="r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()


def tokens(text: str):
    """Split ``text`` into lower-cased word tokens as matched by ``WORD_RE``."""
    # The pattern has no capture groups, so findall yields the full matches.
    return [word.lower() for word in WORD_RE.findall(text)]

In [30]:
# Number of most frequent words (MFW) kept as features.
TOP_MFW: int = 1000
# Culling: minimum share of documents (in percent) a word must occur in to be
# kept; 0 disables culling.
CULLING_PCT: int = 0

In [24]:
# Read every *.txt document of every corpus into memory.
docs_order = []  # document IDs in load order
groups = {}      # doc_id -> corpus label (key of CORPORA)
texts = {}       # doc_id -> raw document text

for group_name, folder in CORPORA.items():
    p = Path(folder)
    if not p.is_dir():
        # A missing folder would otherwise just contribute zero documents
        # without any hint that something is wrong.
        print(f"[WARN] Corpus folder not found: {p}")
        continue
    for f in sorted(p.rglob("*.txt")):
        # rglob descends into sub-folders, so the bare file name is not
        # guaranteed to be unique. The path relative to the corpus root is
        # unique, and equals the file name for files directly in the folder.
        doc_id = f"{group_name}__{f.relative_to(p).as_posix()}"
        try:
            txt = read_text(f)
        except Exception as e:
            # Best effort: skip unreadable files but report them.
            print(f"[WARN] Couldn't read: {f} ({e})")
            continue
        docs_order.append(doc_id)
        groups[doc_id] = group_name
        texts[doc_id] = txt

print(f"[INFO] Read: {len(docs_order)} docs from {len(set(groups.values()))} Corpora.")

[INFO] Read: 336 docs from 4 Corpora.


In [31]:
# --- Per-document token statistics -------------------------------------------
if not docs_order:
    # Without this check the failure only surfaces later as a cryptic KeyError
    # from set_index("id") on an empty DataFrame.
    raise ValueError("No documents loaded - check the corpus folders in CORPORA.")

doc_token_counts = {}      # doc_id -> Counter(token -> absolute frequency)
doc_lengths = {}           # doc_id -> number of tokens in the document
df_counter = Counter()     # token -> number of documents containing the token
global_counts = Counter()  # token -> absolute frequency over all documents

for doc_id in docs_order:
    cnt = Counter(tokens(texts[doc_id]))
    doc_token_counts[doc_id] = cnt
    doc_lengths[doc_id] = sum(cnt.values())
    global_counts.update(cnt)
    # Each distinct token of this document adds 1 to its document frequency.
    df_counter.update(cnt.keys())

# --- Culling and MFW selection -----------------------------------------------
n_docs = len(docs_order)
# Minimum number of documents a word must occur in to survive culling.
cull_threshold = int(np.ceil(CULLING_PCT / 100.0 * n_docs))
vocab_culled = [w for w, df in df_counter.items() if df >= cull_threshold]
# Stable sort: words with equal global counts keep their first-seen order.
vocab_sorted = sorted(vocab_culled, key=lambda w: global_counts[w], reverse=True)
mfw = vocab_sorted[:TOP_MFW]
print(f"[INFO] After Culling {len(vocab_culled)} words left; Chosen: Top {len(mfw)} MFW.")

# "id" and "group" are metadata columns of the matrix. A token with the same
# name would silently overwrite them and corrupt the index and the grouping.
reserved_hits = {"id", "group"}.intersection(mfw)
if reserved_hits:
    raise ValueError(
        f"MFW terms {sorted(reserved_hits)} clash with the metadata columns "
        "'id'/'group' of the frequency matrix."
    )

# --- Relative-frequency matrix (documents x MFW) ------------------------------
rows = []
for doc_id in docs_order:
    total = max(doc_lengths[doc_id], 1)  # avoid division by zero for empty documents
    cnt = doc_token_counts[doc_id]
    row = {"id": doc_id, "group": groups[doc_id]}
    row.update((w, cnt.get(w, 0) / total) for w in mfw)
    rows.append(row)

df_freq = pd.DataFrame(rows).set_index("id")
df_freq.to_csv("MFW_freq_matrix_top1000_cull0.csv", encoding="utf-8")
pd.Series(mfw, name="term").to_csv("MFW_terms_top1000_cull0.csv", index=False, encoding="utf-8")
print("[OK] Generated: MFW_freq_matrix_top1000_cull0.csv, MFW_terms_top1000_cull0.csv")

# --- Top-5 MFW per corpus (by mean relative frequency) ------------------------
mfw_cols = [c for c in df_freq.columns if c != "group"]
group_means = df_freq.groupby("group")[mfw_cols].mean()

top5_rows = []
for grp in group_means.index:
    sub = group_means.loc[grp].sort_values(ascending=False).head(5)
    for term, val in sub.items():
        top5_rows.append({"group": grp, "term": term, "mean_rel_freq": val})

df_top5 = pd.DataFrame(top5_rows).sort_values(["group", "mean_rel_freq"], ascending=[True, False])
df_top5.to_csv("Top5_MFW_per_Corpus_top1000_cull0.csv", index=False, encoding="utf-8")

print("[OK] geschrieben: Top5_MFW_per_Corpus_top1000_cull0.csv")
print("\n Top-5 (Preview) ")
print(df_top5.to_string(index=False))

[INFO] After Culling 27590 words left; Chosen: Top 1000 MFW.
[OK] Generated: MFW_freq_matrix_top1000_cull0.csv, MFW_terms_top1000_cull0.csv
[OK] geschrieben: Top5_MFW_per_Corpus_top1000_cull0.csv

 Top-5 (Preview) 
group term  mean_rel_freq
  FFT  und       0.054007
  FFT  der       0.027702
  FFT  die       0.026970
  FFT  sie       0.020478
  FFT   er       0.019475
 GPT4  und       0.041610
 GPT4  der       0.033581
 GPT4  sie       0.024131
 GPT4  die       0.023352
 GPT4  das       0.019049
 GPT5  und       0.049190
 GPT5  die       0.029715
 GPT5  sie       0.026765
 GPT5  der       0.026586
 GPT5   er       0.016876
  LFT  und       0.036767
  LFT  der       0.026749
  LFT  die       0.025189
  LFT   er       0.017495
  LFT   in       0.016414

[INFO] Docs / Corpus:
FFT     123
GPT5    100
GPT4    100
LFT      13


In [32]:
# Arrange the per-corpus top-5 lists side by side: one row per rank and one
# "<group>_term" / "<group>_mean_rel_freq" column pair per corpus.
df_top5 = pd.read_csv("Top5_MFW_per_Corpus_top1000_cull0.csv")

# Preferred left-to-right order for the short corpus labels. The labels that are
# actually shown come from the CSV, so corpora labelled differently (e.g. with
# long display names) are appended instead of silently yielding an empty table.
preferred_order = ["FFT", "LFT", "GPT4", "GPT5"]
groups_in_csv = list(dict.fromkeys(df_top5["group"]))
order = [g for g in preferred_order if g in groups_in_csv]
order += [g for g in groups_in_csv if g not in preferred_order]

blocks = []
for grp in order:
    # Select columns by name so the result does not depend on CSV column order.
    sub = (
        df_top5.loc[df_top5["group"] == grp, ["term", "mean_rel_freq"]]
        .sort_values("mean_rel_freq", ascending=False)
        .head(5)
        .reset_index(drop=True)
    )
    sub.index = pd.Index(range(1, len(sub) + 1), name="Rank")
    sub.columns = [f"{grp}_term", f"{grp}_mean_rel_freq"]
    blocks.append(sub)

# pd.concat raises on an empty list; fall back to an empty, Rank-indexed frame.
if blocks:
    side_by_side = pd.concat(blocks, axis=1)
else:
    side_by_side = pd.DataFrame(index=pd.Index([], name="Rank"))

# Display copy with frequencies rendered to four decimals; side_by_side keeps
# the numeric values.
fmt = side_by_side.copy()
for grp in order:
    col = f"{grp}_mean_rel_freq"
    fmt[col] = fmt[col].map("{:.4f}".format)

# "\T" is not a valid escape sequence; a leading newline was intended.
print("\nTop 5")
print(fmt)

\Top 5
     FFT_term FFT_mean_rel_freq LFT_term LFT_mean_rel_freq GPT4_term  \
Rank                                                                   
1         und            0.0540      und            0.0368       und   
2         der            0.0277      der            0.0267       der   
3         die            0.0270      die            0.0252       sie   
4         sie            0.0205       er            0.0175       die   
5          er            0.0195       in            0.0164       das   

     GPT4_mean_rel_freq GPT5_term GPT5_mean_rel_freq  
Rank                                                  
1                0.0416       und             0.0492  
2                0.0336       die             0.0297  
3                0.0241       sie             0.0268  
4                0.0234       der             0.0266  
5                0.0190        er             0.0169  


In [33]:
# Persist the numeric (unformatted) side-by-side table; the Rank index is kept
# as the first CSV column.
top5_csv_name = "Top5_MFW.csv"
side_by_side.to_csv(top5_csv_name, encoding="utf-8")
print("\n[OK] Generated: " + top5_csv_name)


[OK] Generated: Top5_MFW.csv
