# Lexikonbasierte Evaluation 
### Referenz: GerVADER
- https://github.com/KarstenAMF/GerVADER/tree/master
- https://github.com/KarstenAMF/GerVADER/blob/master/vaderSentimentGER.py

-----

- **Minimaler Normalizer/Tokenizer**  
  (lowercase + einfache Tokenisierung)

- **Kleiner Stopwort-Filter**  
  (nur häufigste Funktionswörter: Artikel, Präpositionen, Pronomen)

- **Kleine Emoji-Sentimentliste**  
  (19 Emojis/Emoticons, angelehnt an GerVADER)

- **GerVADER-ähnliche Heuristiken**  
  - Intensifier / Downtoner  
  - Negation (Scope)

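- **Statische Thresholds** (je Datensatz und Lexikon, siehe `STATIC_THRESHOLDS`)
  - Weighted Average: ±0.15 auf SB10k, ±0.10 auf GermEval2017  *(breitere Score-Verteilung)*  
  - Bayesian SentiMerge: ±0.10  *(Shrinkage → kompakter um 0)*
  - Einzellexika: ±0.10, außer AffNorms und SentiWS auf SB10k (±0.15)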

- **Auswertung**
  - Vollständige Kennzahlen je Klasse (Precision / Recall / F1 + Support)  
  - Accuracy  
  - Macro-F1

- **Showcase**  
    - Konkretes Beispiel aus jedem Datensatz (SB10k, GermEval2017)


In [None]:
import re
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple


# Benchmark dataset paths (tab-separated files; relative paths resolve against
# the notebook's working directory)
GERMEVAL_PATH = "../evaluation_datasets/GermEval2017/GermEval2017.tsv"  
SB10K_PATH    = "../evaluation_datasets/SB10k/SB10k.tsv"                

# Lexicon files
# One entry per lexicon variant that is evaluated:
#   "name"      - display name; also the key looked up in STATIC_THRESHOLDS
#   "path"      - CSV file handed to load_lexicon (must contain a "Wort" column)
#   "score_col" - column of that CSV holding the sentiment score
# Several variants read different score columns of the same CSV file.
LEXICON_VARIANTS = [
    {"name": "Weighted Average",    "path": "aggregated_sentiment_scores_.csv", "score_col": "weighted_avg_score"},
    {"name": "Bayesian SentiMerge", "path": "SentiMerge_score_light.csv",       "score_col": "senti_merge_score"},
    {"name": "AffNorms", "path": "sentiment_lexika_scaled_final.csv",       "score_col": "AffNorms_Val_scaled"},
    {"name": "SentiWS", "path": "sentiment_lexika_scaled_final.csv",       "score_col":              "SentiWS"},
    {"name": "PolArt", "path": "sentiment_lexika_scaled_final.csv",       "score_col":           "PolArt_num"},
    {"name": "GPC", "path": "sentiment_lexika_scaled_final.csv",       "score_col": 
     "GPC_num"},
    {"name": "ALPIN", "path": "sentiment_lexika_scaled_final.csv",       "score_col": "ALPIN_sentiment_scaled"},
    {"name": "ANGST", "path": "sentiment_lexika_scaled_final.csv",       "score_col":  "ANGST_Valence_scaled"},
    {"name": "AffDict", "path": "sentiment_lexika_scaled_final.csv",       "score_col": "AffDict_Eval_scaled"},
]

# Benchmark column names: (text column, gold-label column) per dataset.
# eval_tsv_static falls back to a "Text" column if the configured text column
# is missing from the file.
GE_TEXT_COL, GE_LABEL_COL = "Text", "Sentiment"
SB_TEXT_COL, SB_LABEL_COL = "Normalized", "Sentiment"  


# Static thresholds for 3-way classification (positive / negative / neutral).
# Layout: STATIC_THRESHOLDS[dataset name][variant name] = (pos_thr, neg_thr).
# predict_label maps score >= pos_thr to "positive", score <= neg_thr to
# "negative" and everything in between to "neutral".
# Every (dataset, variant) pair that is evaluated needs an entry here;
# eval_tsv_static raises KeyError otherwise.
STATIC_THRESHOLDS: Dict[str, Dict[str, Tuple[float,float]]] = {
    "SB10k": {
        "Weighted Average":    ( 0.15, -0.15),
        "Bayesian SentiMerge": ( 0.1, -0.1),
        "AffNorms":            ( 0.15, -0.15),
        "SentiWS":             ( 0.15, -0.15),
        "PolArt":              ( 0.1, -0.1),
        "GPC":                 ( 0.1, -0.1),
        "ALPIN":               ( 0.1, -0.1),
        "ANGST":               ( 0.1, -0.1),
        "AffDict":             ( 0.1, -0.1),
    },
    "GermEval2017": {
        "Weighted Average":    ( 0.1, -0.1),
        "Bayesian SentiMerge": ( 0.1, -0.1),
        "AffNorms":            ( 0.1, -0.1),
        "SentiWS":             ( 0.1, -0.1),
        "PolArt":              ( 0.1, -0.1),
        "GPC":                 ( 0.1, -0.1),
        "ALPIN":               ( 0.1, -0.1),
        "ANGST":               ( 0.1, -0.1),
        "AffDict":             ( 0.1, -0.1),
    }
}


# Heuristics with switches (read by apply_heuristics)
USE_NEGATION = True
# Gates both INTENSIFIERS and DOWNTONERS.
USE_INTENSIFIERS = True
# How many tokens after a negation get their sign flipped. The count is taken
# on the token list produced by simple_tokens, i.e. after stopword removal.
NEGATION_SCOPE = 3

# Borrowed from https://github.com/KarstenAMF/GerVADER/tree/master
# Multipliers applied to the magnitude of the next sentiment-bearing token
# (the result is capped at 1.0 in apply_heuristics).
INTENSIFIERS = {
    # strong intensifiers
    "sehr": 1.5, "extrem": 1.7, "unglaublich": 1.6, "total": 1.4, "mega": 1.5, "wirklich": 1.3,
    # milder modifiers: values are still > 1.0, so they amplify, just less strongly
    "ziemlich": 1.2, "recht": 1.15, "relativ": 1.15, "etwas": 1.1, "leicht": 1.1
}
# Multipliers < 1.0: shrink the magnitude of the next sentiment-bearing token.
DOWNTONERS = {
    "kaum": 0.7, "wenig": 0.8, "einigermaßen": 0.85, "teilweise": 0.9, "bisschen": 0.9
}
# Tokens that open a negation scope (matched against lowercased tokens).
NEGATIONS = {"nicht","nie","keine","kein","keiner","keines","keinem","keinen","ohne","nichts"}

# "mean" is robust; "sum" would also work with different thresholds.
# score_text treats any value other than "mean" as "sum".
AGG_MODE = "mean"   


# Minimal tokenizer + stopword filter
USE_STOPWORDS = True
STOPWORDS = {
    # articles
    "der", "die", "das", "ein", "eine", "einen", "einem", "einer", "eines",
    # conjunctions / particles
    "und", "oder", "aber", "denn", "nur", "auch", "so", "wie", "als",
    # prepositions
    "zu", "von", "mit", "auf", "in", "im", "am", "um", "für", "an", "bei", "aus",
    # pronouns
    "ich", "du", "er", "sie", "es", "wir", "ihr",
    "mich", "dich", "ihn", "uns", "euch",
    # auxiliary verbs
    "ist", "bin", "bist", "sind", "seid", "war", "waren", "werden", "wird",
    "sein", "hat", "haben", "habe", "hatte", "hatten",
    # misc. fillers
    "man", "sich", "noch", "schon", "mal",
}

# Any run of characters that are neither word characters nor German
# umlauts/ß acts as a token separator.
SPLIT_RE = re.compile(r"[^\wäöüÄÖÜß]+", flags=re.UNICODE)


def simple_tokens(text: str) -> List[str]:
    """Lowercase *text* and split it into word tokens.

    Non-string input (e.g. NaN cells) yields an empty list. Empty fragments
    produced by the split are discarded, and stopwords are removed when
    USE_STOPWORDS is enabled.
    """
    if not isinstance(text, str):
        return []
    fragments = filter(None, SPLIT_RE.split(text.lower()))
    if not USE_STOPWORDS:
        return list(fragments)
    return [frag for frag in fragments if frag not in STOPWORDS]


# Emoji-/Emoticon-list
# Maps each symbol to a fixed sentiment score. The heart appears twice: once
# with the emoji variation selector (U+2764 U+FE0F) and once as the bare
# character (U+2764).
EMOJI_SCORES = {
    "🙂": 0.4, "😊": 0.5, "😁": 0.6, "😂": 0.5, "🤣": 0.5, "😍": 0.6, "❤️": 0.6, "❤": 0.6,
    ":)": 0.4, ":-)": 0.4,
    "😡": -0.6, "🤬": -0.7, "😠": -0.5, "😢": -0.5, "😭": -0.6,
    ":(": -0.4, ":-(": -0.4,
    "👍": 0.4, "👎": -0.4
}

def emoji_scores_from_text(text: str) -> List[float]:
    """Return one score per emoji/emoticon occurrence found in *text*.

    Non-string input yields an empty list. The text is searched as-is (not
    lowercased). Scores are returned grouped by symbol, in the order the
    symbols are listed in EMOJI_SCORES.

    Symbols are matched longest-first and every match is blanked out before
    shorter symbols are searched. Without this, a symbol that is a substring
    of another one (the bare heart inside heart + variation selector) would be
    counted a second time and the same emoji would contribute twice.
    """
    if not isinstance(text, str):
        return []

    remaining = text
    counts = {}
    for sym in sorted(EMOJI_SCORES, key=len, reverse=True):
        cnt = remaining.count(sym)
        if cnt:
            counts[sym] = cnt
            # A space keeps the neighbours apart so no new emoticon can form.
            remaining = remaining.replace(sym, " ")

    scores = []
    for sym, sc in EMOJI_SCORES.items():
        scores.extend([sc] * counts.get(sym, 0))
    return scores


# Utilities
def load_lexicon(path: str, score_col: str) -> Dict[str, float]:
    """Load a sentiment lexicon CSV into a ``{word: score}`` mapping.

    The CSV must provide a "Wort" column and the column named by *score_col*.
    Words are stripped and lowercased; if a word occurs more than once after
    normalisation, the last row wins.

    Rows lacking either the word or the score are dropped. Previously a
    missing word was stringified and ended up as the bogus entry "nan".

    Raises:
        AssertionError: if "Wort" or *score_col* is missing from the file.
            Both cases use the same exception type so that callers which skip
            a variant on AssertionError handle them alike (a missing "Wort"
            column used to surface as an uncaught KeyError).
    """
    df = pd.read_csv(path)
    # Check score_col first so its message stays the one callers already see.
    for required in (score_col, "Wort"):
        if required not in df.columns:
            raise AssertionError(f"Spalte {required} fehlt in {path}")
    return (
        df[["Wort", score_col]]
        .dropna(subset=["Wort", score_col])
        .assign(Wort=lambda x: x["Wort"].astype(str).str.strip().str.lower())
        .set_index("Wort")[score_col]
        .to_dict()
    )

def apply_heuristics(tokens: List[str], base_scores: List[float]) -> List[float]:
    """Apply intensifier/downtoner scaling and negation flipping to token scores.

    Args:
        tokens: tokens of one text, in order.
        base_scores: lexicon score per token (0.0 = no sentiment), aligned
            with *tokens*. The list is not modified; a copy is returned.

    Returns:
        The adjusted scores, same length and order as *base_scores*.

    Pass 1 (USE_INTENSIFIERS): every token found in INTENSIFIERS or DOWNTONERS
    multiplies the magnitude of the NEXT token with a non-zero score, anywhere
    later in the text. The sign is kept and the magnitude is capped at 1.0.

    Pass 2 (USE_NEGATION): every negation token flips the sign of all non-zero
    scores among the following NEGATION_SCOPE tokens.
    """
    scores = base_scores.copy()
    n = len(tokens)

    if USE_INTENSIFIERS:
        for i, tok in enumerate(tokens):
            mult = INTENSIFIERS.get(tok) or DOWNTONERS.get(tok)
            if mult is None:
                continue
            # Only the first sentiment-bearing token after the modifier is scaled.
            for j in range(i + 1, n):
                if scores[j] != 0.0:
                    scores[j] = np.sign(scores[j]) * min(1.0, abs(scores[j]) * mult)
                    break

    if USE_NEGATION:
        # Always move forward by at least one token. With NEGATION_SCOPE <= 0
        # the previous `i += NEGATION_SCOPE` never advanced (or moved
        # backwards), so the loop did not terminate.
        step = max(1, NEGATION_SCOPE)
        i = 0
        while i < n:
            if tokens[i] in NEGATIONS:
                for j in range(i + 1, min(n, i + 1 + NEGATION_SCOPE)):
                    if scores[j] != 0.0:
                        scores[j] = -scores[j]
                # NOTE(review): scanning resumes at the LAST token of the scope
                # just handled. Negations at offsets 1..scope-1 are therefore
                # skipped, while one at offset `scope` opens a new scope.
                # Kept as-is; confirm whether that is intended.
                i += step
            else:
                i += 1

    return scores

def score_text(text: str, w2s: Dict[str, float], mode: str = "mean") -> float:
    """Compute the document-level sentiment score of *text*.

    Tokens are looked up in the lexicon *w2s* (unknown tokens count as 0.0),
    adjusted by apply_heuristics, and zero scores are discarded. Emoji and
    emoticon scores found in the raw text are appended as extra contributions.

    Returns 0.0 when nothing contributes. Otherwise returns the mean of the
    contributions for ``mode == "mean"`` and their sum for any other mode.
    """
    tokens = simple_tokens(text)
    lexicon_hits = [w2s.get(tok, 0.0) for tok in tokens]

    contributions = [s for s in apply_heuristics(tokens, lexicon_hits) if s != 0.0]
    contributions += emoji_scores_from_text(text)
    if not contributions:
        return 0.0

    aggregate = np.mean if mode == "mean" else np.sum
    return float(aggregate(contributions))

def predict_label(score: float, pos_thr: float, neg_thr: float) -> str:
    """Map a sentiment score to "positive", "negative" or "neutral".

    Both thresholds are inclusive and the positive check takes precedence:
    ``score >= pos_thr`` gives "positive", otherwise ``score <= neg_thr``
    gives "negative", and everything else is "neutral".
    """
    is_positive = score >= pos_thr
    is_negative = score <= neg_thr
    return "positive" if is_positive else ("negative" if is_negative else "neutral")


# Metriken
def metrics_full(y_true: List[str], y_pred: List[str]) -> dict:
    """Compute accuracy, macro-F1 and per-class precision/recall/F1/support.

    Args:
        y_true: gold labels.
        y_pred: predicted labels, aligned with *y_true*.

    Returns:
        ``{"accuracy": float, "macro_f1": float, "per_class": {label: (P, R, F1, support)}}``
        for the fixed label set negative / neutral / positive. Labels outside
        that set only affect accuracy and the error counts of the other
        classes. Any ratio with a zero denominator is reported as 0.0.

    Empty input yields accuracy 0.0; previously it produced NaN together with
    a NumPy "mean of empty slice" warning.
    """
    labels = ["negative", "neutral", "positive"]
    pairs = list(zip(y_true, y_pred))

    # One pass over the pairs instead of three scans per label.
    n_true = dict.fromkeys(labels, 0)   # tp + fn
    n_pred = dict.fromkeys(labels, 0)   # tp + fp
    n_hit = dict.fromkeys(labels, 0)    # tp
    n_correct = 0
    for yt, yp in pairs:
        if yt in n_true:
            n_true[yt] += 1
        if yp in n_pred:
            n_pred[yp] += 1
        if yt == yp:
            n_correct += 1
            if yt in n_hit:
                n_hit[yt] += 1

    def prf(lbl):
        tp = n_hit[lbl]
        # Support is counted over all gold labels, independent of y_pred.
        sup = sum(yt == lbl for yt in y_true)
        P = tp / n_pred[lbl] if n_pred[lbl] else 0.0
        R = tp / n_true[lbl] if n_true[lbl] else 0.0
        F1 = 2 * P * R / (P + R) if (P + R) else 0.0
        return P, R, F1, sup

    per = {lbl: prf(lbl) for lbl in labels}
    macro_f1 = float(np.mean([per[l][2] for l in labels]))
    acc = n_correct / len(pairs) if pairs else 0.0
    return {"accuracy": acc, "macro_f1": macro_f1, "per_class": per}

# Evaluation with metrics
def eval_tsv_static(tsv_path: str, text_col: str, label_col: str,
                    w2s: Dict[str, float], dataset_name: str, variant_name: str,
                    agg_mode: str = "mean") -> dict:
    """Evaluate one lexicon on one benchmark TSV using the static thresholds.

    Args:
        tsv_path: tab-separated benchmark file.
        text_col: column holding the text; if it is absent and the file has a
            "Text" column, that one is used instead.
        label_col: column holding the gold label.
        w2s: lexicon mapping word -> score.
        dataset_name, variant_name: keys into STATIC_THRESHOLDS.
        agg_mode: aggregation mode forwarded to score_text.

    Returns:
        ``{"n": number of evaluated rows, "pos_thr": ..., "neg_thr": ...,
        "metrics": result of metrics_full}``.

    Raises:
        KeyError: if no thresholds are configured for the dataset/variant pair.
    """
    # Resolve the thresholds first so a misconfigured pair fails immediately
    # instead of after the whole dataset has been read and scored.
    try:
        pos_thr, neg_thr = STATIC_THRESHOLDS[dataset_name][variant_name]
    except KeyError as err:
        raise KeyError(
            f"Keine statischen Thresholds für dataset={dataset_name}, variant={variant_name}"
        ) from err

    df = pd.read_csv(tsv_path, sep="\t")
    if text_col not in df.columns and "Text" in df.columns:
        text_col = "Text"
    # Rows missing either the text or the label are excluded from evaluation.
    df = df[[text_col, label_col]].dropna()

    # Scores are kept in a separate Series; nothing is written back onto the
    # filtered frame.
    scores = df[text_col].apply(lambda x: score_text(x, w2s, mode=agg_mode))

    # Predictions + metrics
    preds = [predict_label(s, pos_thr, neg_thr) for s in scores.values]
    m = metrics_full(df[label_col].tolist(), preds)

    return {"n": len(df), "pos_thr": pos_thr, "neg_thr": neg_thr, "metrics": m}


# Run-through: all variants × both datasets
rows_long = []    # one record per (dataset, variant, class)

# The benchmark table does not depend on the lexicon, so it is built once.
_benchmarks = {
    "GermEval2017": (GERMEVAL_PATH, GE_TEXT_COL, GE_LABEL_COL),
    "SB10k":        (SB10K_PATH,     SB_TEXT_COL, SB_LABEL_COL),
}

for variant in LEXICON_VARIANTS:
    # A lexicon whose columns are missing is reported and skipped.
    try:
        w2s = load_lexicon(variant["path"], variant["score_col"])
    except AssertionError as e:
        print(f"Überspringe {variant['name']}: {e}")
        continue

    for ds_name, (path, text_col, label_col) in _benchmarks.items():
        res = eval_tsv_static(
            path, text_col, label_col, w2s,
            dataset_name=ds_name,
            variant_name=variant["name"],
            agg_mode=AGG_MODE,
        )
        acc = res["metrics"]["accuracy"]
        mF1 = res["metrics"]["macro_f1"]
        pt, nt = res["pos_thr"], res["neg_thr"]

        # Expand per class; the overall figures are repeated on every class row.
        for cls, (P, R, F1, S) in res["metrics"]["per_class"].items():
            rows_long.append({
                "Dataset": ds_name,
                "Variant": variant["name"],
                "Class": cls,
                "Support": S,
                "Precision": P,
                "Recall": R,
                "F1": F1,
                "AccuracyOverall": acc,
                "MacroF1": mF1,
                "pos_thr": pt,
                "neg_thr": nt,
            })

        res.update({"Dataset": ds_name, "Variant": variant["name"]})

# Final results table, ordered by dataset, variant and class
results_table = pd.DataFrame(rows_long)
results_table = results_table.sort_values(by=["Dataset", "Variant", "Class"])
results_table = results_table.reset_index(drop=True)
results_table


Unnamed: 0,Dataset,Variant,Class,Support,Precision,Recall,F1,AccuracyOverall,MacroF1,pos_thr,neg_thr
0,GermEval2017,ALPIN,negative,6911,0.360825,0.015193,0.029159,0.605428,0.294234,0.1,-0.1
1,GermEval2017,ALPIN,neutral,17747,0.675379,0.876543,0.762923,0.605428,0.294234,0.1,-0.1
2,GermEval2017,ALPIN,positive,1540,0.069589,0.12987,0.090621,0.605428,0.294234,0.1,-0.1
3,GermEval2017,ANGST,negative,6911,0.426181,0.125307,0.193671,0.458852,0.311815,0.1,-0.1
4,GermEval2017,ANGST,neutral,17747,0.674608,0.598355,0.634197,0.458852,0.311815,0.1,-0.1
5,GermEval2017,ANGST,positive,1540,0.06362,0.348052,0.107577,0.458852,0.311815,0.1,-0.1
6,GermEval2017,AffDict,negative,6911,0.358621,0.075242,0.124387,0.511757,0.304942,0.1,-0.1
7,GermEval2017,AffDict,neutral,17747,0.67082,0.703781,0.686905,0.511757,0.304942,0.1,-0.1
8,GermEval2017,AffDict,positive,1540,0.064774,0.257792,0.103534,0.511757,0.304942,0.1,-0.1
9,GermEval2017,AffNorms,negative,6911,0.472857,0.047895,0.086979,0.514887,0.299153,0.1,-0.1


In [None]:
import pandas as pd

def showcase_example(dataset, path, text_col, label_col):
    """Print a worked scoring example for the first usable row of a dataset.

    Loads the "Weighted Average" and "Bayesian SentiMerge" lexica, scores the
    first row of the TSV at *path* that has both *text_col* and *label_col*,
    and prints the contributing tokens, the aggregated scores and the
    predicted labels next to the gold label.

    Scoring goes through the same functions as the evaluation
    (apply_heuristics / score_text with AGG_MODE), so the displayed prediction
    is the one the evaluation produces. The former hand-rolled mean ignored
    negation, intensifiers and emoji scores. The token table shows the
    heuristic-adjusted score per token; emoji contributions are part of the
    aggregate but are not listed in the table.

    Raises:
        ValueError: if the dataset has no row with both text and label.
    """
    # loading
    w2s_avg   = load_lexicon("aggregated_sentiment_scores_.csv", "weighted_avg_score")
    w2s_bayes = load_lexicon("SentiMerge_score_light.csv", "senti_merge_score")

    # Thresholds
    pos_avg, neg_avg = STATIC_THRESHOLDS[dataset]["Weighted Average"]
    pos_b, neg_b     = STATIC_THRESHOLDS[dataset]["Bayesian SentiMerge"]

    # Example post
    df = pd.read_csv(path, sep="\t").dropna(subset=[text_col, label_col])
    if df.empty:
        raise ValueError(f"Keine verwertbaren Zeilen in {path}")
    row = df.iloc[0]
    text, gold = str(row[text_col]), str(row[label_col])

    # tokenize
    toks = simple_tokens(text)

    # per-token scores after the heuristics, i.e. what enters the aggregate
    avg_scores   = apply_heuristics(toks, [w2s_avg.get(t, 0.0) for t in toks])
    bayes_scores = apply_heuristics(toks, [w2s_bayes.get(t, 0.0) for t in toks])

    # only tokens with at least one non-zero score
    tok_table = [
        (t, a, b)
        for t, a, b in zip(toks, avg_scores, bayes_scores)
        if a != 0.0 or b != 0.0
    ]

    # aggregate exactly like the evaluation does
    agg_avg   = score_text(text, w2s_avg, mode=AGG_MODE)
    agg_bayes = score_text(text, w2s_bayes, mode=AGG_MODE)

    pred_avg   = predict_label(agg_avg, pos_avg, neg_avg)
    pred_bayes = predict_label(agg_bayes, pos_b, neg_b)

    # output
    print("="*80)
    print(f"{dataset}  |  Gold-Label: {gold}")
    print("Post/Tweet:", text, "\n")
    print("Tokens mit Scores:")
    print(f"{'Token':15s} {'Weighted Avg':>12s} {'Bayes-light':>12s}")
    for t, a, b in tok_table:
        print(f"{t:15s} {a:12.3f} {b:12.3f}")
    print("\nBerechnung:")
    print(f"  Weighted Avg : {agg_avg:.3f} → {pred_avg} (Thr {pos_avg:+.2f}/{neg_avg:+.2f})")
    print(f"  Bayes-light  : {agg_bayes:.3f} → {pred_bayes} (Thr {pos_b:+.2f}/{neg_b:+.2f})")

# Showcase: print one worked example per benchmark dataset
for _showcase_args in (
    ("SB10k", SB10K_PATH, SB_TEXT_COL, SB_LABEL_COL),
    ("GermEval2017", GERMEVAL_PATH, GE_TEXT_COL, GE_LABEL_COL),
):
    showcase_example(*_showcase_args)

SB10k  |  Gold-Label: positive
Post/Tweet: RT @TheKedosZone : so eine Hearthstone - Key von @BlizzardCSEU_EN für die nett YouTube - Onkel Kedos sein auch schon was ganz fein . * hust * 

Tokens mit Scores:
Token           Weighted Avg  Bayes-light
rt                     0.095        0.044
key                    0.078        0.036
nett                   0.652        0.544
youtube                0.010        0.015
onkel                  0.269        0.289
ganz                   0.330        0.145
fein                   0.652        0.398

Berechnung:
  Weighted Avg : 0.298 → positive (Thr +0.15/-0.15)
  Bayes-light  : 0.210 → positive (Thr +0.10/-0.10)
GermEval2017  |  Gold-Label: positive
Post/Tweet: Bahn: Streik endet vorzeitig Der Streik der Gewerkschaft Deutscher Lokomotivführer (GDL) endet am 21. Mai um 19 Uhr. Die Gewerkschaft und die Deutsche Bahn haben sich auf die tariflichen Grundlagen für einen Flächentarifvertrag für das Zugpersonal und gleichzeitig auf ein Schlic 

Tokens mi