# Zero-Shot NLI Sentiment (Dutch) — LoicDL Models (Title + Lead Only)

This notebook applies zero-shot text classification via NLI to Dutch news articles, using only the **title** and **lead** extracted in the same way as the baseline notebook.

Tested models:
- `LoicDL/bert-base-dutch-cased-finetuned-snli`
- `LoicDL/robbert-v2-dutch-finetuned-snli`
- `loicDL/robbertje-dutch-finetuned-snli`

Output: per article the predicted label and per-class scores (positief/negatief/neutraal) for each model.


In [1]:
import os, re, json
import pdfplumber
import pandas as pd
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
import numpy as np
from typing import List, Tuple, Dict, Optional
from ftfy import fix_text as _ftfy_fix
import unicodedata

DATA_DIR = "data"
OUT_CSV  = "nli_results.csv"

# LoicDL NLI models (all three)
NLI_MODELS = [
    "LoicDL/bert-base-dutch-cased-finetuned-snli",
    "LoicDL/robbert-v2-dutch-finetuned-snli",
    "loicDL/robbertje-dutch-finetuned-snli",
]

# Zero-shot labels & template (binary)
ZS_LABELS = ["positief", "negatief"]
HYPOTHESIS_TEMPLATE = "Het sentiment van deze tekst is {}"


  warn(


In [2]:
LEXIS_HEADER_PATTERNS = [
    r"^About LexisNexis.*",
    r"^Privacy Policy.*",
    r"^Terms .* Conditions.*",
    r"^Copyright.*",
    r"^User Name:.*",
    r"^Date and Time:.*",
    r"^Job Number:.*",
    r"^Documents \\(\\d+\\).*",
    r"^Client/Matter:.*",
    r"^Search Terms:.*",
    r"^Search Type:.*",
    r"^Content Type.*",
    r"^http[s]?://\\S+",
    r"^Page \\d+ of \\d+",
    r"^Load-Date:.*",
    r"^End of Document",
    r"^Classification$",
    r"^Language:.*",
    r"^Publication-Type:.*",
    r"^Subject:.*",
    r"^Industry:.*",
    r"^\\s*Bookmark_\\d+\\s*$"
]

HEADER_REGEXES = [re.compile(pat, flags=re.IGNORECASE) for pat in LEXIS_HEADER_PATTERNS]

def clean_lines(lines: List[str]) -> List[str]:
    out = []
    for ln in lines:
        s = ln.strip()
        if not s:
            out.append("")
            continue
        if any(rx.match(s) for rx in HEADER_REGEXES):
            continue
        s = re.sub(r"\\s+", " ", s)
        out.append(s)
    txt = "\\n".join(out)
    txt = re.sub(r"\\n{3,}", "\\n\\n", txt)
    return [ln for ln in txt.split("\\n")]
    
def pdf_to_text(path: str) -> str:
    texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            t = page.extract_text(x_tolerance=1, y_tolerance=1) or ""
            if t:
                texts.append(t)
    raw = "\\n".join(texts)
    raw = raw.replace("\\r\\n", "\\n").replace("\\r", "\\n")
    lines = raw.split("\\n")
    lines = clean_lines(lines)
    return "\\n".join(lines)

def normalize_text(s: str) -> str:
    if not s:
        return s
    # snelle mojibake-detectie en herstel (bv. 'patiÃ«nt' → 'patiënt')
    if re.search(r"[ÂÃ]|Ã.|â€|â€™|â€œ|â€�|â€“|â€”", s):
        try:
            s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
        except Exception:
            pass
    # unicode normalisatie + wat typografische tekens rechtzetten
    s = unicodedata.normalize("NFKC", s)
    s = (s.replace("“", "\"").replace("”", "\"")
           .replace("‘", "'").replace("’", "'")
           .replace("–", "-").replace("—", "-")
           .replace("\u00ad", ""))  # soft hyphen
    return re.sub(r"\s+", " ", s).strip()

In [3]:
STOP_MARKERS = [
    r"^Classification$", r"^End of Document$", r"^Load-Date:", r"^Subject:", r"^Industry:",
    r"^Language:", r"^Publication-Type:", r"^Graphic$", r"^Bookmark_\d+\s*$"
]
STOP_RX = re.compile("|".join(STOP_MARKERS), re.IGNORECASE)

def extract_title_lead_body(clean_text: str):
    """
    LexisNexis-structuur:
      [koppen/metadata] ... 
      Title (vaak 1-2x herhaald)
      Krant / datum / sectie / lengte / byline ...
      Body
      <artikeltekst (meerdere alinea's, soms paginabreaks)>
      Classification / End of Document / etc.

    Output:
      title: 1 regel
      lead:  korte samenvatting (eerste paragraaf na Body, overslaat 1-woord labels zoals 'Column', 'Geneesmiddelen')
      body:  rest tot aan STOP_MARKERS
    """
    # 1) naar regels
    lines = [ln.strip() for ln in clean_text.replace("\r", "\n").split("\n")]
    lines = [ln for ln in lines if ln is not None]  # behoud lege regels als scheiding

    # 2) vind titelkandidaat (eerste niet-lege regel die NIET 'Page x of y' / URL / About is)
    def is_noise_title(l):
        if not l: return True
        if re.match(r"^Page \d+ of \d+$", l): return True
        if re.match(r"^http[s]?://", l, re.I): return True
        if "About LexisNexis" in l or "Privacy Policy" in l or "Terms" in l: return True
        return False

    first_nonempty = next((i for i,l in enumerate(lines) if l and not is_noise_title(l)), None)
    title = lines[first_nonempty] if first_nonempty is not None else ""

    # 3) vind de EERSTE 'Body' regel (dit markeert begin van inhoud)
    body_idx = next((i for i,l in enumerate(lines) if l.strip().lower() == "body"), None)
    if body_idx is None:
        # fallback: soms staat 'Body' met extra tekst eronder; probeer een zachtere match
        body_idx = next((i for i,l in enumerate(lines) if re.fullmatch(r"\s*Body\s*", l, re.I)), None)

    # als geen Body gevonden: alles na title als body (zeldzaam)
    start_idx = (body_idx + 1) if body_idx is not None else ((first_nonempty + 1) if first_nonempty is not None else 0)

    # 4) knip af bij eerste STOP_MARKER
    end_idx = None
    for j in range(start_idx, len(lines)):
        if STOP_RX.match(lines[j] or ""):
            end_idx = j
            break
    content_lines = lines[start_idx:end_idx] if end_idx else lines[start_idx:]

    # 5) verwijder bekende tussenkopjes / 1-woord labels direct na Body (bv. 'Column', 'Geneesmiddelen')
    while content_lines and re.fullmatch(r"[A-Za-zÀ-ÿ\-’'`]+", content_lines[0]):
        # laat staan als het duidelijk een zin is (eindigt op .?!)
        if re.search(r"[.!?]$", content_lines[0]): break
        # anders overslaan (één-woord of korte rubriek)
        content_lines.pop(0)

    # 6) maak paragrafen (lege regel = scheiding; als die ontbreken, per “lege regel” simuleren op dubbele spaties)
    raw = "\n".join(content_lines)
    # normaliseer meerdere lege regels
    raw = re.sub(r"\n{3,}", "\n\n", raw).strip()
    paras = [p.strip() for p in re.split(r"\n\s*\n", raw) if p.strip()]
    if not paras:
        # fallback: forceer paragrafen grofweg op zinsafsluiting
        paras = [p.strip() for p in re.split(r"(?<=[.!?])\s+", raw) if p.strip()]

    # 7) lead = eerste paragraaf (max 2-3 zinnen), body = rest
    if paras:
        # knip lead op 2-3 zinnen
        sents = re.split(r"(?<=[.!?])\s+", paras[0])
        lead = " ".join(sents[:3]).strip()
        remainder = " ".join(sents[3:]).strip()
        body_paras = ([remainder] if remainder else []) + paras[1:]
        body = "\n\n".join(body_paras).strip()
    else:
        lead, body = "", ""

    title = normalize_text(title)
    lead  = normalize_text(lead)
    body  = normalize_text(body)

    return title.strip(), lead, body

In [4]:
test_files = [
    "data/Bericht 007 Nederlandse pati_nt wacht te lang op betere medicijnen tegen kanker.pdf",
    "data/Bericht 010_Nieuwe kankermedicijnen leveren meer financi_le winst op dan gezondheidswinst.pdf",
    "data/Bericht 011_Hoe controleer je verstopte moedervlekken_.pdf",
    "data/Bericht 016_Ik vind het erg als _n infuus van 25.000 euro wordt weggegooid_.pdf",
]

for fp in test_files:
    txt = pdf_to_text(fp)           # jouw bestaande pdf->text + clean
    title, lead, body = extract_title_lead_body(txt)
    print("\n===", os.path.basename(fp), "===")
    print("TITLE:", title[:120])
    print("LEAD :", lead)
    print("BODY len:", len(body), "chars")
    print("BODY preview:", body[:300].replace("\n"," ") + " ...")


=== Bericht 007 Nederlandse pati_nt wacht te lang op betere medicijnen tegen kanker.pdf ===
TITLE: Nederlandse patiënt wacht te lang op betere medicijnen tegen kanker
LEAD : Wat een prachtig bericht onlangs, dat meer kankerpatiënten de afgelopen decennia bleven leven. Twee derde van de patiënten met de diagnose kanker leeft na vijf jaar nog. Een vooruitgang die volgens het Integraal Kankercentrum Nederland mede te danken is aan innovatieve geneesmiddelen tegen gevorderde en uitgezaaide kanker.
BODY len: 3697 chars
BODY preview: Nog niet voor alle soorten, maar in ieder geval voor huid-, long-, prostaat-, bloed-, nier- en blaaskanker. Dat zijn in aantal niet de minste. Maar het is jammer dat het zo lang duurt voordat dergelijke geneesmiddelen na goedkeuring door de Amerikaanse autoriteiten in de Nederlandse praktijk terecht ...

=== Bericht 010_Nieuwe kankermedicijnen leveren meer financi_le winst op dan gezondheidswinst.pdf ===
TITLE: Nieuwe kankermedicijnen leveren meer financiële wi

In [9]:
def extract_id_from_filename(fname: str) -> int:
    m = re.search(r"bericht[_\s-]*(\d+)", fname, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))
    # fallback: eerste nummer
    m2 = re.search(r"(\d+)", fname)
    return int(m2.group(1)) if m2 else -1
    
def chunk_body(
    text: str,
    tokenizer,
    *,
    max_tokens: int = 400,
    stride: int = 50,
    prefer_paragraphs: bool = True,
    reserve_for_hypothesis: int = 48,   # marge voor [CLS]/[SEP] + hypothesis tokens
) -> List[str]:
    """
    Hakt 'text' in token-gebaseerde chunks met overlap:
      - Houdt rekening met model_max_length en extra tokens voor NLI.
      - Optioneel: eerst per paragraaf splitsen (lege regel = scheiding); 
        te lange paragrafen worden verder gechunked.
      - Geeft decoderede tekst-chunks terug (zonder special tokens).
    """
    text = (text or "").strip()
    if not text:
        return []

    # Veilig step/eff_max bepalen
    if stride >= max_tokens:
        raise ValueError(f"'stride' ({stride}) must be < 'max_tokens' ({max_tokens}).")

    model_max = getattr(tokenizer, "model_max_length", 512)
    if not isinstance(model_max, int) or model_max <= 0 or model_max > 4096:
        model_max = 512  # sane default

    eff_max = max(16, min(max_tokens, model_max - max(0, reserve_for_hypothesis)))

    def _chunk_ids(ids: List[int]) -> List[str]:
        if len(ids) <= eff_max:
            # decode één keer
            s = tokenizer.decode(ids, clean_up_tokenization_spaces=True, skip_special_tokens=True).strip()
            return [s] if s else []
        chunks = []
        i = 0
        step = max(1, eff_max - stride)
        L = len(ids)
        while i < L:
            j = min(i + eff_max, L)
            window = ids[i:j]
            if not window:
                break
            s = tokenizer.decode(window, clean_up_tokenization_spaces=True, skip_special_tokens=True).strip()
            if s:
                chunks.append(s)
            if j >= L:
                break
            i += step
        return chunks

    out: List[str] = []
    if prefer_paragraphs:
        # echte newlines gebruiken; lege regel = paragraafscheiding
        paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
        for p in paras:
            ids = tokenizer.encode(p, add_special_tokens=False)
            out.extend(_chunk_ids(ids))
    else:
        ids = tokenizer.encode(text, add_special_tokens=False)
        out.extend(_chunk_ids(ids))

    # filter eventuele lege/duplicaten door whitespace
    return [c for c in out if c]

In [10]:
pipelines = {}
for mid in NLI_MODELS:
    print(f"Loading pipeline for: {mid}")
    pipelines[mid] = pipeline(
        task="zero-shot-classification",
        model=mid
    )
print("Pipelines ready.")
tokenizer = AutoTokenizer.from_pretrained(NLI_MODELS[0])

Loading pipeline for: LoicDL/bert-base-dutch-cased-finetuned-snli


Device set to use cpu


Loading pipeline for: LoicDL/robbert-v2-dutch-finetuned-snli


Device set to use cpu


Loading pipeline for: loicDL/robbertje-dutch-finetuned-snli


Device set to use cpu


Pipelines ready.


In [11]:
def nli_score_text_bin_single(text: str, *, clf):
    """
    Binaire scores (p_pos, p_neg) met JOUW enkele template en labels.
    - gebruikt exact HYPOTHESIS_TEMPLATE en ZS_LABELS
    - case-insensitive mapping
    - normaliseert over POS/NEG
    """
    txt = (text or "").strip()
    if not txt:
        return 0.5, 0.5

    out = clf(
        txt,
        ZS_LABELS,
        multi_label=False,
        hypothesis_template=HYPOTHESIS_TEMPLATE
    )
    labels = [str(x).strip().lower() for x in out["labels"]]
    scores = [float(x) for x in out["scores"]]
    smap = dict(zip(labels, scores))

    p_pos = smap.get("positief", 0.0)
    p_neg = smap.get("negatief", 0.0)

    s = p_pos + p_neg
    if s > 0:
        p_pos /= s
        p_neg /= s
    else:
        p_pos = p_neg = 0.5
    return p_pos, p_neg


def classify_title_lead_body_bin(
    title: str,
    lead: str,
    body: str,
    *,
    clf,
    tokenizer,
    max_tokens: int = 400,
    stride: int = 50,
    max_body_chunks: int | None = None,
    debug: bool = False,
):
    """
    Binair (positief/negatief) op basis van TITLE + LEAD + BODY-CHUNKS.
    - Elke part (title, lead, elke body-chunk) telt 1x mee (gelijke weging).
    - chunk_body(...) komt uit jouw notebook.
    """
    parts: list[tuple[str, str]] = []

    t = (title or "").strip()
    l = (lead  or "").strip()
    b = (body  or "").strip()

    if t: parts.append(("title", t))
    if l: parts.append(("lead",  l))

    # Body → token-gebaseerde chunks via jouw bestaande utility
    if b:
        body_chunks = chunk_body(b, tokenizer, max_tokens=max_tokens, stride=stride)
        if max_body_chunks is not None and len(body_chunks) > max_body_chunks:
            body_chunks = body_chunks[:max_body_chunks]
        for i, ch in enumerate(body_chunks):
            ch = (ch or "").strip()
            if ch:
                parts.append((f"body[{i}]", ch))

    if not parts:
        if debug: print("[DEBUG] No text found (title/lead/body empty).")
        return "negatief", {"positief": 0.5, "negatief": 0.5}

    pos_sum = 0.0
    neg_sum = 0.0
    n = 0

    for name, txt in parts:
        p_pos, p_neg = nli_score_text_bin_single(txt, clf=clf)
        if debug:
            print(f"[DEBUG] part={name:8s} len={len(txt):4d} -> p_pos={p_pos:.4f} p_neg={p_neg:.4f}")
        pos_sum += p_pos
        neg_sum += p_neg
        n += 1

    # simpele gemiddelde (gelijke weging)
    p_pos_final = pos_sum / n
    p_neg_final = neg_sum / n
    if debug:
        print(f"[DEBUG] AGG -> p_pos={p_pos_final:.4f} p_neg={p_neg_final:.4f} (n={n})")

    label = "positief" if p_pos_final >= p_neg_final else "negatief"
    return label, {"positief": float(p_pos_final), "negatief": float(p_neg_final)}

In [12]:
PDF_PATH = "data/Bericht 007 Nederlandse pati_nt wacht te lang op betere medicijnen tegen kanker.pdf"

print("\n=== TITLE + LEAD + BODY CHUNKS (per model, met extractie in de loop) ===")
for mid in NLI_MODELS:
    # 1) PDF -> text
    clean_txt = pdf_to_text(PDF_PATH)

    # 2) Extract title / lead / body
    title, lead, body = extract_title_lead_body(clean_txt)

    # (optioneel) snelle zichtcheck
    print(f"\n[Model] {mid}")
    print("[EXTRACT] TITLE:", title)
    print("[EXTRACT] LEAD :", (lead[:220] + "…") if len(lead) > 220 else lead)
    print("[EXTRACT] BODY chars:", len(body))

    # 3) Classify (title + lead + body-chunks), gelijke weging, debug aan
    result = classify_title_lead_body_bin(
        title, lead, body,
        clf=pipelines[mid],
        tokenizer=tokenizer,   # jouw bestaande tokenizer (voor chunk_body)
        max_tokens=400,
        stride=50,
        # max_body_chunks=12,  # (optioneel) limiter om ruis/runtime te beperken
        debug=True
    )
    print("[RESULT]", result)


=== TITLE + LEAD + BODY CHUNKS (per model, met extractie in de loop) ===


Token indices sequence length is longer than the specified maximum sequence length for this model (759 > 512). Running this sequence through the model will result in indexing errors



[Model] LoicDL/bert-base-dutch-cased-finetuned-snli
[EXTRACT] TITLE: Nederlandse patiënt wacht te lang op betere medicijnen tegen kanker
[EXTRACT] LEAD : Wat een prachtig bericht onlangs, dat meer kankerpatiënten de afgelopen decennia bleven leven. Twee derde van de patiënten met de diagnose kanker leeft na vijf jaar nog. Een vooruitgang die volgens het Integraal Kankerce…
[EXTRACT] BODY chars: 3697
[DEBUG] part=title    len=  67 -> p_pos=0.6052 p_neg=0.3948
[DEBUG] part=lead     len= 324 -> p_pos=0.7392 p_neg=0.2608
[DEBUG] part=body[0]  len=1999 -> p_pos=0.5361 p_neg=0.4639
[DEBUG] part=body[1]  len=1922 -> p_pos=0.4937 p_neg=0.5063
[DEBUG] part=body[2]  len= 288 -> p_pos=0.5539 p_neg=0.4461
[DEBUG] AGG -> p_pos=0.5856 p_neg=0.4144 (n=5)
[RESULT] ('positief', {'positief': 0.5856149931827919, 'negatief': 0.4143850068172081})

[Model] LoicDL/robbert-v2-dutch-finetuned-snli
[EXTRACT] TITLE: Nederlandse patiënt wacht te lang op betere medicijnen tegen kanker
[EXTRACT] LEAD : Wat een pra

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



[Model] loicDL/robbertje-dutch-finetuned-snli
[EXTRACT] TITLE: Nederlandse patiënt wacht te lang op betere medicijnen tegen kanker
[EXTRACT] LEAD : Wat een prachtig bericht onlangs, dat meer kankerpatiënten de afgelopen decennia bleven leven. Twee derde van de patiënten met de diagnose kanker leeft na vijf jaar nog. Een vooruitgang die volgens het Integraal Kankerce…
[EXTRACT] BODY chars: 3697
[DEBUG] part=title    len=  67 -> p_pos=0.3062 p_neg=0.6938
[DEBUG] part=lead     len= 324 -> p_pos=0.5068 p_neg=0.4932
[DEBUG] part=body[0]  len=1999 -> p_pos=0.5007 p_neg=0.4993
[DEBUG] part=body[1]  len=1922 -> p_pos=0.5021 p_neg=0.4979
[DEBUG] part=body[2]  len= 288 -> p_pos=0.5078 p_neg=0.4922
[DEBUG] AGG -> p_pos=0.4647 p_neg=0.5353 (n=5)
[RESULT] ('negatief', {'positief': 0.464725296072167, 'negatief': 0.5352747039278329})


In [26]:
rows = []
pdf_files = [f for f in sorted(os.listdir(DATA_DIR)) if f.lower().endswith('.pdf')]

if not pdf_files:
    print(f"[INFO] No PDF files found in '{DATA_DIR}'.")
else:
    for fname in tqdm(pdf_files, desc="Processing PDFs"):
        fpath = os.path.join(DATA_DIR, fname)
        clean_txt = pdf_to_text(fpath)
        title, lead, body = extract_title_lead_body(clean_txt)  # body NIET meer gebruiken

        rec = {
            "id": extract_id_from_filename(fname),
            "file": fname,
            "title": title,
            "lead": lead,
        }

        for mid, clf in pipelines.items():
            # Alleen title + lead (binaire NLI, geen body)
            label, scores = classify_title_lead_body_bin(
                title, lead, body,
                clf=pipelines[mid],
                tokenizer=tokenizer,          # jouw bestaande tokenizer
                max_tokens=400,
                stride=50,
                # max_body_chunks=12,         # optioneel: limiet om runtime te beheersen
                debug=False
            )

            key = mid.split("/")[-1].replace("-", "_")
            rec[f"{key}_label"]     = label
            rec[f"{key}_positief"]  = scores["positief"]
            rec[f"{key}_negatief"]  = scores["negatief"]
        rows.append(rec)

df_out = pd.DataFrame(rows).sort_values(by=["id", "file"])
print(df_out.head())

Processing PDFs: 100%|█████████████████████████████████████████████████████████████████| 70/70 [01:53<00:00,  1.62s/it]

   id                                               file  \
0   7  Bericht 007 Nederlandse pati_nt wacht te lang ...   
1  10  Bericht 010_Nieuwe kankermedicijnen leveren me...   
2  11  Bericht 011_Hoe controleer je verstopte moeder...   
3  16  Bericht 016_Ik vind het erg als _n infuus van ...   
4  21  Bericht 021_Wachtlijsten en personeelstekort_ ...   

                                               title  \
0  Nederlandse patiënt wacht te lang op betere me...   
1  Nieuwe kankermedicijnen leveren meer financiël...   
2         Hoe controleer je verstopte moedervlekken?   
3  'Ik vind het erg als 'n infuus van 25.000 euro...   
4  Wachtlijsten en personeelstekort: het 'zorginf...   

                                                lead  \
0  Wat een prachtig bericht onlangs, dat meer kan...   
1  Vorige week verscheen in Trouw een artikel met...   
2  Preventie het consult\nMeer dan twintig jaar g...   
3  Slimme ideeën van apotheker Roelof van Leeuwen...   
4  Onbetaalbare zorg\n




In [27]:
df_out.to_csv(OUT_CSV, index=False)
print("[DONE] Wrote binary results to nli_results.csv")

[DONE] Wrote binary results to nli_results.csv


# Eval

In [28]:
# === Load results (NLI models) ===
import re
import pandas as pd

DF_PATH = "nli_results.csv"   # pas aan indien anders
df = pd.read_csv(DF_PATH)

# ---------------------------
# 1) Parse ID (robust)
# ---------------------------
def extract_id(val):
    # Als al numeriek:
    try:
        return int(val)
    except Exception:
        pass
    s = str(val)
    m = re.search(r'bericht[_\s-]*(\d+)', s, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))
    # fallback: neem eerste getal dat voorkomt
    m2 = re.search(r'(\d+)', s)
    return int(m2.group(1)) if m2 else None

if 'id' in df.columns:
    df['id'] = df['id'].apply(extract_id)
elif 'file' in df.columns:
    df['id'] = df['file'].apply(extract_id)
else:
    df['id'] = range(1, len(df) + 1)

# ---------------------------
# 2) Vind alle model-labelkolommen (eindigen op "_label")
# ---------------------------
label_cols = [c for c in df.columns if c.endswith("_label")]
if not label_cols:
    raise KeyError("Geen *_label kolommen gevonden in nli_results.csv")

# labels normaliseren (lowercase)
for c in label_cols:
    df[c] = df[c].astype(str).str.strip().str.lower()

# ---------------------------
# 3) One-hot per model
#    -> bijv. robbert_v2_dutch_finetuned_snli_positief / _neutraal / _negatief
# ---------------------------
one_hot_frames = []
for c in label_cols:
    model_prefix = c.replace("_label", "")
    oh = pd.get_dummies(df[[c]].rename(columns={c: model_prefix}),
                        columns=[model_prefix], prefix=model_prefix)
    one_hot_frames.append(oh)

df_onehot = pd.concat([df[['id']]] + one_hot_frames, axis=1)

# ---------------------------
# 4) Bekijken (eerste regels + samenvatting per model)
# ---------------------------
print(df_onehot.head())

# Samenvatting aantal labels per model
summary = {}
for c in label_cols:
    model_prefix = c.replace("_label", "")
    pos = df_onehot.filter(like=f"{model_prefix}_positief").sum().sum()
    neg = df_onehot.filter(like=f"{model_prefix}_negatief").sum().sum()
    summary[model_prefix] = {"positief": int(pos), "negatief": int(neg)}

summary_df = pd.DataFrame(summary).T
summary_df.index.name = "Model"
print("\nSamenvatting aantallen per model:")
print(summary_df)


   id  bert_base_dutch_cased_finetuned_snli_positief  \
0   7                                           True   
1  10                                           True   
2  11                                           True   
3  16                                           True   
4  21                                           True   

   robbert_v2_dutch_finetuned_snli_negatief  \
0                                     False   
1                                     False   
2                                     False   
3                                     False   
4                                      True   

   robbert_v2_dutch_finetuned_snli_positief  \
0                                      True   
1                                      True   
2                                      True   
3                                      True   
4                                     False   

   robbertje_dutch_finetuned_snli_negatief  \
0                                     True   
1    

In [29]:
# === Evaluatie: Human vs NLI (binair: positief/negatief) ===
import re
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

DF_PATH = "nli_results.csv"   # ← let op: binaire output

# 1) Load data
df_h = pd.read_excel("Human_Sentiment.xlsx")  # kolommen: Artikel, Sentiment
df_n = pd.read_csv(DF_PATH)

# 2) ID helpers
def extract_id(s):
    s = str(s)
    m = re.search(r'bericht[_\s-]*(\d+)', s, flags=re.IGNORECASE)
    if m: return int(m.group(1))
    m2 = re.search(r'(\d+)', s)
    return int(m2.group(1)) if m2 else None

if 'id' in df_n.columns:
    df_n['id'] = df_n['id'].apply(extract_id)
elif 'file' in df_n.columns:
    df_n['id'] = df_n['file'].apply(extract_id)
else:
    df_n['id'] = range(1, len(df_n) + 1)

if not pd.api.types.is_integer_dtype(df_h['Artikel']):
    try:
        df_h['Artikel'] = df_h['Artikel'].astype(int)
    except Exception:
        df_h['Artikel'] = df_h['Artikel'].apply(extract_id)

# 3) Human labels -> alleen pos/neg
df_h['Human_Label'] = df_h['Sentiment'].astype(str).str.strip().str.lower()
df_h = df_h[df_h['Human_Label'].isin({'positief','negatief'})].copy()

# 4) Merge
dfm = pd.merge(df_h, df_n, left_on='Artikel', right_on='id', how='inner')

# 5) Vind alle *_label kolommen (per model)
model_label_cols = [c for c in dfm.columns if c.endswith("_label")]
if not model_label_cols:
    raise KeyError("Geen *_label kolommen gevonden in NLI-resultaten.")

labels_order = ['positief','negatief']

# 6) Evaluatie per model
for col in model_label_cols:
    model_name = col.replace("_label","")
    y_true = dfm['Human_Label'].astype(str).str.lower()
    y_pred = dfm[col].astype(str).str.lower()

    mask = y_true.isin(labels_order) & y_pred.isin(labels_order)
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    print(f"\n=== {model_name} (Zero-Shot NLI, binary) ===")
    print(classification_report(
        y_true, y_pred,
        labels=labels_order,
        target_names=labels_order,
        digits=3
    ))

    cm = confusion_matrix(y_true, y_pred, labels=labels_order)
    cm_df = pd.DataFrame(
        cm,
        index=[f"True {l}" for l in labels_order],
        columns=[f"Pred {l}" for l in labels_order]
    )
    print("Confusion matrix:")
    print(cm_df)



=== bert_base_dutch_cased_finetuned_snli (Zero-Shot NLI, binary) ===
              precision    recall  f1-score   support

    positief      0.631     1.000     0.774        41
    negatief      0.000     0.000     0.000        24

    accuracy                          0.631        65
   macro avg      0.315     0.500     0.387        65
weighted avg      0.398     0.631     0.488        65

Confusion matrix:
               Pred positief  Pred negatief
True positief             41              0
True negatief             24              0

=== robbert_v2_dutch_finetuned_snli (Zero-Shot NLI, binary) ===
              precision    recall  f1-score   support

    positief      0.619     0.951     0.750        41
    negatief      0.000     0.000     0.000        24

    accuracy                          0.600        65
   macro avg      0.310     0.476     0.375        65
weighted avg      0.390     0.600     0.473        65

Confusion matrix:
               Pred positief  Pred negatief

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
