1. Libraries

In [None]:
from pathlib import Path
import re
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

2. Corpora | Configuration

In [None]:
# --- Analysis parameters -------------------------------------------------
# Number of most positive / most negative sentences recorded per document.
TOP_K = 5

# Compound-score cut-offs used to label a sentence positive / negative.
POS_THRESH = 0.05
NEG_THRESH = -0.05

# Stricter cut-offs used for the "conf" (high-magnitude) sentence shares.
CONF_POS = 0.60
CONF_NEG = -0.60

# --- CSV export ----------------------------------------------------------
# Semicolon-separated files with a decimal comma and six decimal places.
CSV_KWARGS = {
    "index": False,
    "sep": ";",
    "encoding": "utf-8",
    "float_format": "%.6f",
    "decimal": ",",
}

# --- Input corpora (display name -> directory searched for *.txt) --------
CORPORA = {
    "Folk Fairy Tales": r"C:\Users\Sophia\Downloads\MA\CORPORA\German FFT",
    "GPT-5": r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-5",
    "GPT-4o": r"C:\Users\Sophia\Downloads\MA\CORPORA\GPT-4o",
}

# Directory holding the SentiWS lexicon files.
SWS_DIR = Path(r"C:\Users\Sophia\Downloads\MA\CODE\Sentiment Analyse\SentiWS_v2.0")

# --- Output --------------------------------------------------------------
# All tables and figures are written here; the directory is created on demand.
OUTDIR = Path(r"C:\Users\Sophia\Downloads\MA\TABLES\Sentiment\SentiWS")
OUTDIR.mkdir(parents=True, exist_ok=True)

3. Loading SentiWS (Remus et al. 2010) https://wortschatz.uni-leipzig.de/en/download

In [None]:
def load_sentiws(base: Path) -> dict:
    """Load every ``*.txt`` file below ``base`` into one word -> score table.

    Each non-empty line that does not start with ``%`` or ``#`` is parsed as
    ``head<TAB>score[<TAB>form1,form2,...]`` (falling back to whitespace
    splitting when the line has no tab). ``head`` is cut at the first ``#`` or
    ``|``; the remaining word and all listed forms receive the line's score.

    Every word is stored lower-cased under three spellings so that lookups
    succeed whether or not the caller transliterates German special
    characters:

    * the original spelling (``größe``),
    * the spelling with only ``ß`` replaced (``grösse``),
    * the fully transliterated spelling (``groesse``; ä->ae, ö->oe, ü->ue,
      ß->ss).

    When a key occurs more than once, the score with the largest absolute
    value wins; on ties the first one seen is kept.

    Args:
        base: Directory that is searched recursively for lexicon files.

    Returns:
        Mapping of lower-cased word (and spelling variants) to a float score.
    """
    lex = {}
    num_re = re.compile(r'[-+]?\d+(?:[.,]\d+)?')
    translit = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})

    def _store(key: str, value: float) -> None:
        # Keep the strongest score for a key that is seen more than once.
        prev = lex.get(key)
        if prev is None or abs(value) > abs(prev):
            lex[key] = value

    # sorted(): tie-breaking between files must not depend on directory order.
    for p in sorted(base.rglob("*.txt")):
        try:
            raw = p.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Not valid UTF-8: retry with a single-byte encoding.
            raw = p.read_text(encoding="latin-1", errors="ignore")
        for line in raw.splitlines():
            line = line.lstrip("\ufeff").strip()
            if not line or line.startswith(("%", "#")):
                continue
            parts = line.split("\t")
            if len(parts) < 2:
                parts = re.split(r"\s+", line, maxsplit=2)
            head = parts[0] if parts else ""

            score = None
            if len(parts) >= 2:
                try:
                    score = float(parts[1].strip().replace(",", "."))
                except ValueError:
                    score = None
            if score is None:
                # Second column is not a number: take the first number on the line.
                m = num_re.search(line)
                if not m:
                    continue
                score = float(m.group(0).replace(",", "."))

            if "#" in head:
                lemma = head.split("#", 1)[0]
            elif "|" in head:
                lemma = head.split("|", 1)[0]
            else:
                lemma = head

            tokens = [lemma]
            if len(parts) >= 3 and parts[2].strip():
                tokens.extend(f.strip() for f in re.split(r"[;,]", parts[2]) if f.strip())

            for t in tokens:
                t_low = t.lower()
                if not t_low:
                    continue
                # A set removes duplicates when the word has no special characters.
                for key in {t_low, t_low.replace("ß", "ss"), t_low.translate(translit)}:
                    _store(key, score)
    return lex

LEX = load_sentiws(SWS_DIR)
if not LEX:
    # With an empty lexicon no token can ever match, so every sentence would
    # be reported as neutral without any error. Stop here instead.
    raise FileNotFoundError(f"No SentiWS entries could be loaded from {SWS_DIR}")
print(f"[INFO] SentiWS geladen: {len(LEX):,} Einträge")

4. Valence Shifters and Normalisation for Historical Text

In [None]:
# Words that flip the sign of a following polar word.
NEGATIONS = {
    "nicht",
    "kein", "keine", "keiner", "keines", "keinem", "keinen",
    "nie", "niemals", "nimmer", "nimmermehr",
    "ohne", "ohn",
    "nichts", "weder", "keineswegs",
}

# Words that scale a following polar word up (factor > 1).
# Several entries are listed both with umlauts and transliterated.
INTENSIFIERS = {
    "sehr": 1.5,
    "äußerst": 1.7,
    "aeusserst": 1.7,
    "überaus": 1.7,
    "ueberaus": 1.7,
    "wirklich": 1.3,
    "gar": 1.2,
    "besonders": 1.3,
    "hoechst": 1.6,
    "höchst": 1.6,
}

# Words or two-word phrases that scale a following polar word down (factor < 1).
DIMINISHERS = {
    "kaum": 0.5,
    "wenig": 0.7,
    "ein wenig": 0.7,
    "ein bisschen": 0.7,
    "bisschen": 0.7,
    "wenigstens": 0.85,
}

# Number of preceding tokens inspected for valence shifters.
WINDOW = 3

# Archaic "th" spellings mapped to their modern "t" spellings.
TH_WORD_MAP = {
    "thun": "tun",
    "gethan": "getan",
    "gethanen": "getanen",
    "thäte": "täte",
    "thät": "tät",
    "thut": "tut",
    "that": "tat",
    "thaten": "taten",
    "thal": "tal",
    "thale": "tale",
    "thales": "tales",
    "thaler": "taler",
    "thor": "tor",
    "thore": "tore",
    "thoren": "toren",
    "thorheit": "torheit",
    "thier": "tier",
    "thiere": "tiere",
    "thieren": "tieren",
    "thräne": "träne",
    "thränen": "tränen",
}

# Archaic "sey-" forms mapped to their modern "sei-" forms.
SEYN_MAP = {
    "seyn": "sein",
    "sey": "sei",
    "seyd": "seid",
    "seyin": "sein",
    "seyet": "seiet",
    "seyest": "seiest",
    "seyende": "seiende",
    "seyender": "seiender",
}
def hist_normalize_token(tok: str) -> str:
    """Return a lower-cased, modernised, ASCII-transliterated form of ``tok``.

    Steps, in order: lower-case; replace the long s (``ſ``) with ``s``; apply
    the whole-word replacements from ``TH_WORD_MAP`` and then ``SEYN_MAP``;
    finally rewrite ä/ö/ü/ß as ae/oe/ue/ss.
    """
    norm = tok.lower().replace("ſ", "s")
    # Whole-word lookups; words not listed pass through unchanged.
    norm = TH_WORD_MAP.get(norm, norm)
    norm = SEYN_MAP.get(norm, norm)
    for special, ascii_form in (("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("ß", "ss")):
        norm = norm.replace(special, ascii_form)
    return norm

5. Tables | Figures

In [None]:
# The parser, NER and text-classification components are excluded when the
# model is loaded.
nlp = spacy.load("de_core_news_lg", exclude=["parser","ner","textcat"])
# Without a sentence-boundary component in the active pipeline, add the
# rule-based sentencizer.
if "senter" not in nlp.pipe_names and "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
print("[INFO] spaCy pipeline:", nlp.pipe_names)


# Per-word contribution accumulators filled while scoring sentences.
contrib_by_corpus = defaultdict(Counter)      # corpus -> word -> sum of |contribution|
contrib_overall = Counter()                   # word -> sum of |contribution|, all corpora
contrib_pos_by_corpus = defaultdict(Counter)  # corpus -> word -> sum of positive contributions
contrib_neg_by_corpus = defaultdict(Counter)  # corpus -> word -> sum of negative contributions (as magnitudes)
contrib_pos_overall = Counter()               # word -> sum of positive contributions, all corpora
contrib_neg_overall = Counter()               # word -> sum of negative contributions, all corpora

def sentiws_sentence_scores(
    sent_doc,
    contrib_counter_abs: Counter=None,
    corpus_key: str=None,
    contrib_pos_ctr: Counter=None,
    contrib_neg_ctr: Counter=None
) -> dict:
    """Score one sentence with the lexicon in ``LEX`` and simple valence shifters.

    Every alphabetic token is looked up in ``LEX``, first by its normalised
    lemma and then by its normalised surface form. For each hit, the up to
    ``WINDOW`` alphabetic tokens before it are inspected:

    * two-word diminishers (keys of ``DIMINISHERS`` containing a space) are
      matched as whole words on the surface forms and applied once;
    * the preceding lemmas are then scanned from nearest to farthest.
      Intensifiers and diminishers multiply the score; the first negation
      flips the sign and ends the scan. The head word of an already matched
      two-word diminisher is skipped so that it is not applied twice.

    Args:
        sent_doc: Iterable of tokens exposing ``is_alpha``, ``lower_`` and
            ``lemma_`` (e.g. a spaCy sentence span).
        contrib_counter_abs: Optional counter that receives ``abs(value)`` per
            matched word.
        corpus_key: Not used by the computation; kept for call compatibility.
        contrib_pos_ctr: Optional counter that receives positive values.
        contrib_neg_ctr: Optional counter that receives the magnitude of
            negative values.

    Returns:
        Dict with ``compound`` (tanh of the mean adjusted score), ``raw_sum``
        (sum of adjusted scores), ``n_polar`` (number of matched tokens) and
        ``label`` (``"positive"``, ``"negative"`` or ``"neutral"`` according
        to ``POS_THRESH`` / ``NEG_THRESH``). A sentence without any match
        yields zeros and ``"neutral"``.
    """
    toks = [t for t in sent_doc if t.is_alpha]
    # Normalise each token once instead of once per lexicon hit.
    norm_forms = [hist_normalize_token(t.lower_) for t in toks]
    norm_lemmas = [hist_normalize_token(t.lemma_.lower()) for t in toks]
    phrase_diminishers = [(p, f) for p, f in DIMINISHERS.items() if " " in p]

    scores = []
    for i, (key_lemma, key_form) in enumerate(zip(norm_lemmas, norm_forms)):
        if key_lemma in LEX:
            token_key = key_lemma
        elif key_form in LEX:
            token_key = key_form
        else:
            continue
        sc = LEX[token_key]

        start = max(0, i - WINDOW)
        mult = 1.0
        negate = False

        # Pad with spaces so that phrases only match at word boundaries
        # ("kein wenig" must not count as "ein wenig").
        padded_surface = " " + " ".join(norm_forms[start:i]) + " "
        covered_heads = set()
        for phrase, factor in phrase_diminishers:
            if f" {phrase} " in padded_surface:
                mult *= factor
                covered_heads.add(phrase.rsplit(" ", 1)[-1])

        for w in reversed(norm_lemmas[start:i]):
            if w in NEGATIONS:
                negate = True
                break
            if w in covered_heads:
                # Already accounted for by the two-word diminisher above.
                continue
            if w in INTENSIFIERS:
                mult *= INTENSIFIERS[w]
            if w in DIMINISHERS:
                mult *= DIMINISHERS[w]

        val = -(sc * mult) if negate else sc * mult
        scores.append(val)

        if contrib_counter_abs is not None:
            contrib_counter_abs[token_key] += abs(val)
        if val > 0 and contrib_pos_ctr is not None:
            contrib_pos_ctr[token_key] += val
        elif val < 0 and contrib_neg_ctr is not None:
            contrib_neg_ctr[token_key] += (-val)

    if not scores:
        return {"compound": 0.0, "raw_sum": 0.0, "n_polar": 0, "label": "neutral"}

    mean_score = float(np.mean(scores))
    compound = float(np.tanh(mean_score))
    if compound >= POS_THRESH:
        label = "positive"
    elif compound <= NEG_THRESH:
        label = "negative"
    else:
        label = "neutral"
    return {"compound": compound, "raw_sum": float(np.sum(scores)), "n_polar": len(scores), "label": label}

def read_text_safe(fp: Path) -> str:
    """Read a text file, trying UTF-8 first and latin-1 second.

    The UTF-8 attempt is strict, so a file in a single-byte encoding really
    falls through to the latin-1 attempt instead of silently losing its
    non-ASCII characters. A leading UTF-8 byte-order mark is removed.

    Args:
        fp: Path of the file to read.

    Returns:
        The decoded text, or ``""`` when the file cannot be read.
    """
    for enc in ("utf-8-sig", "latin-1"):
        try:
            return fp.read_text(encoding=enc)
        except UnicodeDecodeError:
            continue
        except OSError:
            # Missing or unreadable file: callers treat "" as "skip this file".
            return ""
    return ""


In [None]:
# Start from empty accumulators so that re-running this cell does not add the
# same contributions a second time.
for _per_corpus in (contrib_by_corpus, contrib_pos_by_corpus, contrib_neg_by_corpus):
    _per_corpus.clear()
for _overall in (contrib_overall, contrib_pos_overall, contrib_neg_overall):
    _overall.clear()

rows_sent, rows_doc = [], []
for corpus_name, corpus_dir in CORPORA.items():
    # sorted(): stable file order, independent of the file system.
    files = sorted(Path(corpus_dir).rglob("*.txt"))
    print(f"[INFO] {corpus_name}: {len(files)} Dateien")
    for fp in tqdm(files, desc=f"SentiWS Sätze {corpus_name}", unit="Datei"):
        text = read_text_safe(fp)
        if not text.strip():
            continue
        doc = nlp(text)
        sents = [s for s in doc.sents if s.text.strip()]
        if not sents:
            continue

        # --- sentence level ---
        comp = []
        for i, s in enumerate(sents):
            sc = sentiws_sentence_scores(
                s,
                contrib_counter_abs=contrib_by_corpus[corpus_name],
                corpus_key=corpus_name,
                contrib_pos_ctr=contrib_pos_by_corpus[corpus_name],
                contrib_neg_ctr=contrib_neg_by_corpus[corpus_name],
            )
            comp.append(sc["compound"])
            rows_sent.append({
                "corpus": corpus_name, "file": str(fp), "sent_idx": i,
                "sent_text": s.text.strip(),
                "sws_compound": sc["compound"],
                "sws_raw_sum": sc["raw_sum"],
                "sws_n_polar": sc["n_polar"],
                "sws_label": sc["label"],
            })

        # --- document level ---
        c = np.array(comp, dtype=float)
        is_pos = c >= POS_THRESH
        is_neg = (c <= NEG_THRESH) & ~is_pos
        # Stable sort: sentences with equal scores keep their document order.
        top_pos_idx = np.argsort(-c, kind="stable")[:TOP_K]
        top_neg_idx = np.argsort(c, kind="stable")[:TOP_K]

        row = {
            "corpus": corpus_name, "file": str(fp),
            "n_sents": int(len(sents)),
            "sws_label_pos_percent": float(is_pos.mean()) * 100.0,
            "sws_label_neg_percent": float(is_neg.mean()) * 100.0,
            "sws_conf_pos_percent":  float((c >= CONF_POS).mean()) * 100.0,
            "sws_conf_neg_percent":  float((c <= CONF_NEG).mean()) * 100.0,
        }
        for j, si in enumerate(top_pos_idx, start=1):
            row[f"top_pos_{j}_compound"] = float(c[si])
            row[f"top_pos_{j}_sent"]     = sents[si].text.strip()
        for j, si in enumerate(top_neg_idx, start=1):
            row[f"top_neg_{j}_compound"] = float(c[si])
            row[f"top_neg_{j}_sent"]     = sents[si].text.strip()
        rows_doc.append(row)

# The overall counters are the sum of the per-corpus counters, so they are
# derived here instead of scoring every sentence a second time.
for _per_corpus, _overall in (
    (contrib_by_corpus, contrib_overall),
    (contrib_pos_by_corpus, contrib_pos_overall),
    (contrib_neg_by_corpus, contrib_neg_overall),
):
    for _ctr in _per_corpus.values():
        _overall.update(_ctr)

df_sent = pd.DataFrame(rows_sent)
df_doc  = pd.DataFrame(rows_doc)


In [None]:
# --- Tables ---------------------------------------------------------------
sent_csv = OUTDIR / "sentiws_sentence_scores_long.csv"
doc_csv  = OUTDIR / "sentiws_doc_aggregates.csv"
df_sent.to_csv(sent_csv, **CSV_KWARGS)
df_doc.to_csv(doc_csv, **CSV_KWARGS)

if df_doc.empty:
    # Without any scored document there is no "corpus" column to group by.
    raise RuntimeError("No documents were scored; check the CORPORA directories.")

agg_cols = ["sws_label_pos_percent","sws_label_neg_percent","sws_conf_pos_percent","sws_conf_neg_percent"]
grp = df_doc.groupby("corpus")
summary = grp[agg_cols].mean().reset_index()
sum_csv = OUTDIR / "sentiws_corpus_summary.csv"
summary.to_csv(sum_csv, **CSV_KWARGS)

# --- Stacked bar chart: mean label shares per corpus ------------------------
# Corpora appear in the order they are listed in CORPORA.
corpus_order = [c for c in CORPORA.keys() if c in df_doc["corpus"].unique()]
mean_pos = grp["sws_label_pos_percent"].mean()
mean_neg = grp["sws_label_neg_percent"].mean()
mean_neu = 100.0 - mean_pos - mean_neg
x = np.arange(len(corpus_order))

pos_vals = mean_pos.reindex(corpus_order).to_numpy()
neu_vals = mean_neu.reindex(corpus_order).to_numpy()
neg_vals = mean_neg.reindex(corpus_order).to_numpy()

fig, ax = plt.subplots(figsize=(7,4))
ax.bar(x, pos_vals, label="positive")
ax.bar(x, neu_vals, bottom=pos_vals, label="neutral")
ax.bar(x, neg_vals, bottom=pos_vals + neu_vals, label="negative")
ax.set_xticks(x)
ax.set_xticklabels(corpus_order, rotation=0)
ax.set_ylabel("% sentences")
ax.set_title("SentiWS: average sentence-label distribution by corpus")
ax.legend()
fig.tight_layout()
f1 = OUTDIR / "viz_sentiws_label_share_stacked_by_corpus.png"
fig.savefig(f1, dpi=200)
plt.show()
print("[OK] Plot:", f1)

# --- Box plots: per-document share of positive / negative sentences ---------
for col, title, fname in [
    ("sws_label_pos_percent", "SentiWS: distribution % positive sentences", "viz_sentiws_pos_boxplot_by_corpus.png"),
    ("sws_label_neg_percent", "SentiWS: distribution % negative sentences", "viz_sentiws_neg_boxplot_by_corpus.png"),
]:
    box_data = [df_doc.loc[df_doc["corpus"]==c, col].dropna().values for c in corpus_order]
    fig, ax = plt.subplots(figsize=(7,4))
    ax.boxplot(box_data, showfliers=False)
    # Boxes are drawn at positions 1..N; label them explicitly rather than via
    # the boxplot keyword, whose name differs between Matplotlib versions.
    ax.set_xticks(range(1, len(corpus_order) + 1))
    ax.set_xticklabels(corpus_order, rotation=15)
    ax.set_ylabel("% sentences (doc)")
    ax.set_title(title)
    fig.tight_layout()
    plot_path = OUTDIR / fname
    fig.savefig(plot_path, dpi=200)
    plt.show()
    print("[OK] Plot:", plot_path)

In [None]:
def topk_wide(counter_by_corpus: dict, k: int = 10) -> pd.DataFrame:
    """Build a rank-by-corpus table of the ``k`` highest-valued words.

    Args:
        counter_by_corpus: Mapping of corpus name to a word -> value mapping.
        k: Number of ranks (rows) in the result.

    Returns:
        DataFrame indexed 1..k with one column per corpus, holding that
        corpus' words in descending order of value. Corpora with fewer than
        ``k`` words are padded with empty strings. Only corpora named in
        ``CORPORA`` are returned, in ``CORPORA`` order.
    """
    columns = {}
    for corpus, counter in counter_by_corpus.items():
        ranked = sorted(counter.items(), key=lambda kv: kv[1], reverse=True)[:k]
        words = [word for word, _ in ranked]
        # Pad short columns so that every column has exactly k rows.
        words.extend([""] * (k - len(words)))
        columns[corpus] = words
    wide = pd.DataFrame(columns, index=list(range(1, k + 1)))
    return wide[[name for name in CORPORA.keys() if name in wide.columns]]

# Ten highest-contributing positive and negative words per corpus.
df_pos_wide = topk_wide(contrib_pos_by_corpus, k=10)
df_neg_wide = topk_wide(contrib_neg_by_corpus, k=10)

pos_wide_csv = OUTDIR / "top10_positive_by_corpus_wide.csv"
neg_wide_csv = OUTDIR / "top10_negative_by_corpus_wide.csv"

# Write both tables first, then report both paths.
for wide_df, target in ((df_pos_wide, pos_wide_csv), (df_neg_wide, neg_wide_csv)):
    wide_df.to_csv(target, sep=";", encoding="utf-8")

for message, target in (
    ("[OK] Top-10 positive (wide):", pos_wide_csv),
    ("[OK] Top-10 negative (wide):", neg_wide_csv),
):
    print(message, target)