In [11]:
import os
import re
import unicodedata
from collections import Counter
from typing import Optional

import numpy as np
import pandas as pd
import spacy

In [None]:
# Root folder that holds one sub-folder per corpus. The original local path is
# the default; set the CORPORA_ROOT environment variable to run the notebook on
# another machine without editing this cell.
CORPORA_ROOT = os.environ.get("CORPORA_ROOT", r"C:\Users\Sophia\Downloads\MA\CORPORA")

# Display name of each corpus -> directory that is scanned for *.txt files.
CORPORA = {
    "Folk Fairy Tales": os.path.join(CORPORA_ROOT, "German FFT"),
    "GPT-5":            os.path.join(CORPORA_ROOT, "GPT-5"),
    "GPT-4o":           os.path.join(CORPORA_ROOT, "GPT-4o"),
}

# Canonical colour keys that are counted; also fixes the column order of the result tables.
COLOR_KEYS = ["silber","gold","schwarz","weiss","rot","blau","gruen","gelb","pink","lila"]

def norm(s: str) -> str:
    """Return a lower-cased, ASCII-folded form of a German word.

    The string is first brought into Unicode NFC form so that umlauts stored as
    base letter + combining diaeresis (e.g. "u" followed by U+0308) are folded
    the same way as precomposed ones. Then "ß" becomes "ss" and "ä"/"ö"/"ü"
    become "ae"/"oe"/"ue".

    Args:
        s: The word (or lemma) to normalise.

    Returns:
        The normalised string.
    """
    # Without this step a decomposed "ü" would pass through the replacements
    # below untouched and never match a colour key.
    s = unicodedata.normalize("NFC", s)
    s = s.lower()
    s = s.replace("ß", "ss")
    s = (s.replace("ä","ae")
           .replace("ö","oe")
           .replace("ü","ue"))
    return s

# Maps every accepted surface/lemma variant of a colour word to its canonical key.
# Built from (canonical key, variants) pairs so each colour is listed exactly once.
CANON_MAP = {
    variant: canon
    for canon, variants in (
        ("weiss",   ("weiß", "weiss")),
        ("gruen",   ("grün", "gruen")),
        ("gold",    ("gold", "golden")),
        ("silber",  ("silber", "silbern")),
        ("schwarz", ("schwarz",)),
        ("rot",     ("rot",)),
        ("blau",    ("blau",)),
        ("gelb",    ("gelb",)),
        ("pink",    ("pink",)),
        ("lila",    ("lila",)),
    )
    for variant in variants
}

In [7]:
def to_canon(tok) -> Optional[str]:
    """Map a token to its canonical colour key, or None if it is not a tracked colour.

    The token's lemma is used (falling back to its text when the lemma is
    empty). The raw lemma is looked up in CANON_MAP first, then its
    norm()-folded form; if neither is mapped, the folded form itself is the
    candidate. Only candidates listed in COLOR_KEYS are returned.
    """
    lemma = tok.lemma_ or tok.text
    if lemma in CANON_MAP:
        candidate = CANON_MAP[lemma]
    else:
        folded = norm(lemma)
        candidate = CANON_MAP.get(folded, folded)
    if candidate in COLOR_KEYS:
        return candidate
    return None

# Small German spaCy model, loaded with the NER component disabled.
nlp = spacy.load("de_core_news_sm", disable=["ner"])

# Per-corpus tallies: colour hits by canonical key, and the number of counted tokens.
corpus_color_counts = {}
corpus_token_counts = {}
for corpus_name in CORPORA:
    corpus_color_counts[corpus_name] = Counter()
    corpus_token_counts[corpus_name] = 0

for corpus_name, corpus_dir in CORPORA.items():
    txt_files = [name for name in os.listdir(corpus_dir) if name.lower().endswith(".txt")]
    for fname in txt_files:
        path = os.path.join(corpus_dir, fname)
        # errors="ignore": undecodable bytes are dropped instead of raising.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        doc = nlp(text)
        # Whitespace and punctuation tokens count neither towards the token
        # total nor towards the colour tallies.
        words = [tok for tok in doc if not (tok.is_space or tok.is_punct)]
        corpus_token_counts[corpus_name] += len(words)
        colour_hits = (to_canon(tok) for tok in words)
        corpus_color_counts[corpus_name].update(
            key for key in colour_hits if key is not None
        )

# Table: per corpus, for every colour the raw count, the rate per 1,000 tokens
# and the share of all counted tokens in percent.
rows = []
for corpus_name in CORPORA.keys():
    total = corpus_token_counts[corpus_name]
    row = {"corpus": corpus_name, "tokens": int(total)}
    for k in COLOR_KEYS:
        c = corpus_color_counts[corpus_name].get(k, 0)
        row[f"{k}_count"]   = int(c)
        # NaN instead of a ZeroDivisionError when a corpus yielded no tokens.
        row[f"{k}_per_1k"] = (c / total * 1000) if total else np.nan
        row[f"{k}_percent"] = (c / total * 100)   if total else np.nan
    rows.append(row)

df = pd.DataFrame(rows).sort_values("corpus")

# Round all derived rate columns. The suffix must match the column names
# created above ("_per_1k"); the previous "_per_10k" filter matched nothing,
# so the per-1k rates were exported unrounded.
rate_cols = [col for col in df.columns if col.endswith(("_per_1k", "_percent"))]
df[rate_cols] = df[rate_cols].round(2)

# Excel export plus a semicolon-separated CSV with decimal commas.
df.to_excel("color_counts_per_corpus_normalized.xlsx", index=False)
df.to_csv("color_counts_per_corpus_normalized.csv",
          index=False, sep=";", decimal=",", encoding="utf-8")

print("Fertig:",
      "color_counts_per_corpus_normalized.xlsx",
      "und",
      "color_counts_per_corpus_normalized.csv")



Fertig: color_counts_per_corpus_normalized.xlsx und color_counts_per_corpus_normalized.csv


In [8]:
# Reduced table: raw counts and percentages only (no per-1k rate).
rows = []
for corpus_name in CORPORA:
    total = corpus_token_counts.get(corpus_name, 0)
    color_counts = corpus_color_counts.get(corpus_name, {})
    row = {"corpus": corpus_name, "tokens": int(total)}
    for k in COLOR_KEYS:
        c = int(color_counts.get(k, 0))
        # NaN marks corpora without any counted tokens.
        if total:
            pct = c / total * 100
        else:
            pct = np.nan
        row.update({f"{k}_count": c, f"{k}_percent": pct})
    rows.append(row)

df_counts_percent = pd.DataFrame(rows).sort_values("corpus")

pct_cols = [name for name in df_counts_percent.columns if name.endswith("_percent")]
df_counts_percent[pct_cols] = df_counts_percent[pct_cols].round(2)


# The Excel export is optional: it is skipped with a notice when openpyxl is missing.
try:
    import openpyxl
    df_counts_percent.to_excel("color_counts_per_corpus_counts_percent.xlsx", index=False)
    print("Excel exportiert: color_counts_per_corpus_counts_percent.xlsx")
except ModuleNotFoundError:
    print("Hinweis: 'openpyxl' nicht installiert – überspringe Excel-Export.")

# Semicolon-separated CSV with decimal commas.
df_counts_percent.to_csv(
    "color_counts_per_corpus_counts_percent.csv",
    index=False,
    sep=";",
    decimal=",",
    encoding="utf-8",
)
print("CSV exportiert: color_counts_per_corpus_counts_percent.csv")

Excel exportiert: color_counts_per_corpus_counts_percent.xlsx
CSV exportiert: color_counts_per_corpus_counts_percent.csv


Search for occurrences of specific colours

In [9]:
# Reuse the pipeline from an earlier cell if there is one; otherwise load it here.
try:
    nlp
except NameError:
    import spacy
    nlp = spacy.load("de_core_news_sm", disable=["ner"])
pipe_names = nlp.pipe_names

# doc.sents is used below, so add a rule-based sentencizer when the pipeline
# has neither a parser nor a senter component.
if not ("parser" in pipe_names or "senter" in pipe_names):
    nlp.add_pipe("sentencizer")

def canon_key(tok):
    """Return the canonical key for a token's lemma.

    Uses the lemma (or the token text when the lemma is empty). The raw lemma
    is looked up in CANON_MAP first, then its norm()-folded form; when neither
    is mapped, the folded form itself is returned.
    """
    lemma = tok.lemma_ or tok.text
    if lemma in CANON_MAP:
        return CANON_MAP[lemma]
    folded = norm(lemma)
    return CANON_MAP.get(folded, folded)

FFT_DIR = CORPORA["Folk Fairy Tales"]
OUT_SENT = "FFT_sentences_lemma_gruen.csv"

# Collect every sentence of the folk-fairy-tale corpus that contains at least
# one non-space, non-punctuation token whose canonical key is "gruen".
sent_rows = []
for fname in os.listdir(FFT_DIR):
    if not fname.lower().endswith(".txt"):
        continue
    path = os.path.join(FFT_DIR, fname)
    # errors="ignore": undecodable bytes are dropped instead of raising.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    doc = nlp(text)
    for idx, sent in enumerate(doc.sents):
        if any(not (t.is_space or t.is_punct) and canon_key(t) == "gruen" for t in sent):
            sent_rows.append({
                "file": fname,
                "sent_idx": idx,
                "sentence": sent.text.strip()
            })

# Explicit columns keep the frame sortable when there are no hits; a frame
# built from an empty list has no columns and sort_values would raise KeyError.
df_sent = pd.DataFrame(
    sent_rows, columns=["file", "sent_idx", "sentence"]
).sort_values(["file", "sent_idx"])
df_sent.to_csv(OUT_SENT, index=False, encoding="utf-8")
print(f"{len(df_sent)} Sätze gefunden. Export: {OUT_SENT}")

39 Sätze gefunden. Export: FFT_sentences_lemma_gruen.csv


In [10]:
# Fall back to loading the pipeline here when no earlier cell has defined `nlp`.
try:
    nlp
except NameError:
    import spacy
    nlp = spacy.load("de_core_news_sm", disable=["ner"])
pipe_names = nlp.pipe_names

# Sentence iteration needs sentence boundaries: add the sentencizer only when
# neither "parser" nor "senter" is part of the pipeline.
if {"parser", "senter"}.isdisjoint(pipe_names):
    nlp.add_pipe("sentencizer")

def canon_key(tok):
    """Canonical key of a token: CANON_MAP entry for its lemma, else the folded lemma.

    The lemma (token text if the lemma is empty) is tried as-is and then in its
    norm()-folded form; the first form found in CANON_MAP decides the result.
    Unmapped lemmas are returned in folded form.
    """
    lem = tok.lemma_ or tok.text
    lem_n = norm(lem)
    for candidate in (lem, lem_n):
        if candidate in CANON_MAP:
            return CANON_MAP[candidate]
    return lem_n

FFT_DIR = CORPORA["Folk Fairy Tales"]
OUT_SENT = "FFT_sentences_lemma_gelb.csv"

# Collect every sentence of the folk-fairy-tale corpus that contains at least
# one non-space, non-punctuation token whose canonical key is "gelb".
sent_rows = []
for fname in os.listdir(FFT_DIR):
    if not fname.lower().endswith(".txt"):
        continue
    path = os.path.join(FFT_DIR, fname)
    # errors="ignore": undecodable bytes are dropped instead of raising.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    doc = nlp(text)
    for idx, sent in enumerate(doc.sents):
        if any(not (t.is_space or t.is_punct) and canon_key(t) == "gelb" for t in sent):
            sent_rows.append({
                "file": fname,
                "sent_idx": idx,
                "sentence": sent.text.strip()
            })

# Explicit columns keep the frame sortable when there are no hits; a frame
# built from an empty list has no columns and sort_values would raise KeyError.
df_sent = pd.DataFrame(
    sent_rows, columns=["file", "sent_idx", "sentence"]
).sort_values(["file", "sent_idx"])
df_sent.to_csv(OUT_SENT, index=False, encoding="utf-8")
print(f"{len(df_sent)} Sätze gefunden. Export: {OUT_SENT}")

7 Sätze gefunden. Export: FFT_sentences_lemma_gelb.csv


In [13]:

# Canonical key to search for in all corpora.
TARGET_KEY = "gelb"

# Use the already loaded pipeline when available, otherwise load it now.
try:
    nlp
except NameError:
    import spacy
    nlp = spacy.load("de_core_news_sm", disable=["ner"])
pipe_names = nlp.pipe_names

# No parser and no senter means no sentence boundaries: put a sentencizer at
# the front of the pipeline in that case.
needs_sentencizer = all(name not in pipe_names for name in ("parser", "senter"))
if needs_sentencizer:
    nlp.add_pipe("sentencizer", first=True)

def sentence_has_target(sent, target_key: str) -> bool:
    """Tell whether any word token of `sent` has `target_key` as its canonical key.

    Whitespace and punctuation tokens are skipped; the remaining tokens are
    compared via canon_key(). Returns False for a sentence without a match.
    """
    return any(
        canon_key(token) == target_key
        for token in sent
        if not (token.is_space or token.is_punct)
    )

from IPython.display import display

# Column layout of the hit table. Passing it explicitly keeps the frame
# sortable and filterable when no sentence matches; a frame built from an
# empty list has no columns, so sort_values / df["corpus"] would raise KeyError.
HIT_COLUMNS = ["corpus", "file", "sent_idx", "sentence"]

all_rows = []
per_corpus_counts = {}

# Scan every corpus for sentences that contain TARGET_KEY.
for corpus_name, corpus_dir in CORPORA.items():
    hit_rows = []
    for fname in os.listdir(corpus_dir):
        if not fname.lower().endswith(".txt"):
            continue
        path = os.path.join(corpus_dir, fname)
        try:
            # errors="ignore": undecodable bytes are dropped instead of raising.
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except OSError as e:
            # Unreadable files are reported and skipped; other errors propagate.
            print(f"[WARN] {corpus_name}::{fname}: {e}")
            continue

        doc = nlp(text)
        for idx, sent in enumerate(doc.sents):
            if sentence_has_target(sent, TARGET_KEY):
                row = {
                    "corpus": corpus_name,
                    "file": fname,
                    "sent_idx": idx,
                    "sentence": sent.text.strip()
                }
                all_rows.append(row)
                hit_rows.append(row)

    per_corpus_counts[corpus_name] = len(hit_rows)

# One combined export over all corpora ...
df_all = pd.DataFrame(all_rows, columns=HIT_COLUMNS).sort_values(["corpus", "file", "sent_idx"])
out_all = f"sentences_lemma_{TARGET_KEY}_all_corpora.csv"
df_all.to_csv(out_all, index=False, encoding="utf-8")
print(f"[OK] Gesamt-Export: {out_all}  ({len(df_all)} Sätze)")


# ... and one file per corpus (spaces in the corpus name become underscores).
for corp in sorted(CORPORA.keys()):
    sub = df_all[df_all["corpus"] == corp]
    out_c = f"sentences_lemma_{TARGET_KEY}_{corp.replace(' ', '_')}.csv"
    sub.to_csv(out_c, index=False, encoding="utf-8")
    print(f"   └─ {corp}: {len(sub)} Sätze  →  {out_c}")

print("\n=== Treffer je Korpus ===")
display(pd.DataFrame(
    list(per_corpus_counts.items()),
    columns=["corpus", "n_sentences_with_target"],
).sort_values("corpus").reset_index(drop=True))

print("\n=== Alle Treffer (erste 50 Zeilen) ===")
display(df_all.head(50))

[OK] Gesamt-Export: sentences_lemma_gelb_all_corpora.csv  (12 Sätze)
   └─ Folk Fairy Tales: 7 Sätze  →  sentences_lemma_gelb_Folk_Fairy_Tales.csv
   └─ GPT-4o: 3 Sätze  →  sentences_lemma_gelb_GPT-4o.csv
   └─ GPT-5: 2 Sätze  →  sentences_lemma_gelb_GPT-5.csv

=== Treffer je Korpus ===


Unnamed: 0,corpus,n_sentences_with_target
0,Folk Fairy Tales,7
1,GPT-4o,3
2,GPT-5,2



=== Alle Treffer (erste 50 Zeilen) ===


Unnamed: 0,corpus,file,sent_idx,sentence
0,Folk Fairy Tales,Bechstein_Das Dukaten-Angele_571.txt,112,Und das Angele behielt seine Tugend bei und le...
1,Folk Fairy Tales,Grimm_Die drei Federn_545A.txt,32,Sie gab ihm eine ausgehöhlte gelbe Rübe mit se...
2,Folk Fairy Tales,Grimm_Die drei Federn_545A.txt,36,Da griff er auf Geratewohl eine aus dem Kreise...
3,Folk Fairy Tales,Grimm_Jorinde und Joringel_405.txt,28,Nun war die Sonne unter: die Eule flog in eine...
4,Folk Fairy Tales,Grimm_Schneewittchen_709.txt,15,"""\n\nDa erschrak die Königin und ward gelb und..."
5,Folk Fairy Tales,Grimm_Von dem Fischer und seiner Frau_555.txt,21,"Als er da nun hinkam, war die See ganz grün un..."
6,Folk Fairy Tales,Grimm_Von dem Fischer und seiner Frau_555.txt,44,"Als er an die See kam, war das Wasser ganz vio..."
9,GPT-4o,ChatGPT-4o_40.txt,14,"Dort, als sie die Feder zeigen wollte, kam ein..."
10,GPT-4o,ChatGPT-4o_80.txt,6,"Er hatte ein schmales Gesicht, durchzogen von ..."
11,GPT-4o,ChatGPT-4o_91.txt,3,"Eines Tages, als der Herbstwind durch das Laub..."
