# Zero-Shot NLI Sentiment (Dutch) Newspapers

Dit notebook gebruikt zero-shot natural language inference (NLI) om het sentiment van de geïncludeerde artikelen te classificeren als **positief** of **negatief**.

De volgende modellen zijn getest; de eerste drie zijn gefinetuned op het Nederlands en de laatste is multilingual (meertalig):
- `LoicDL/bert-base-dutch-cased-finetuned-snli`
- `LoicDL/robbert-v2-dutch-finetuned-snli`
- `loicDL/robbertje-dutch-finetuned-snli`
- `MoritzLaurer/mDeBERTa-v3-base-mnli-xnli`

Output: per artikel het voorspelde label en per klasse de waarschijnlijkheidsscores (positief/negatief).


# Imports

In [1]:
import os, re, json
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import List, Tuple, Dict, Optional
from ftfy import fix_text as _ftfy_fix
import unicodedata
import torch
from transformers import pipeline, AutoTokenizer
from sklearn.metrics import classification_report, confusion_matrix

  warn(


# Config

In [12]:
# Output file: the per-article, per-model results are written here as CSV.
OUT_CSV = "out/nli_results.csv"

# NLI models: Hugging Face model ids, each loaded into its own zero-shot pipeline.
NLI_MODELS = [
    "LoicDL/bert-base-dutch-cased-finetuned-snli", 
    "LoicDL/robbert-v2-dutch-finetuned-snli",
    "loicDL/robbertje-dutch-finetuned-snli",
    "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
]

# Zero-shot candidate labels & hypothesis template (binary task).
# The label strings are also used as dictionary keys and output column suffixes.
ZS_LABELS = ["positief", "negatief"]
HYPOTHESIS_TEMPLATE = "Deze tekst is {}." # hypothesis passed to the zero-shot pipelines as `hypothesis_template`, together with ZS_LABELS.

# Section weights used when combining the title, lead and body scores (all 1.0 = equal weight).
TITLE_W = 1.0
LEAD_W  = 1.0
BODY_W  = 1.0  # applied to the body score, which is itself a token-length-weighted mean over the body chunks

# Load Title Lead and Body Texts

In [13]:
# === Read the article sections from Title_Lead_Body.xlsx ===
XLSX_PATH = "out/Title_Lead_Body.xlsx"

df_input = pd.read_excel(XLSX_PATH)
# Normalise the header names; str() guards against non-string headers (e.g. numbers)
# and strip() against accidental surrounding whitespace in the sheet.
df_input = df_input.rename(columns={c: str(c).strip().lower() for c in df_input.columns})
# Required columns
need = {"id", "title", "lead", "body"}
missing = need - set(df_input.columns)
if missing:
    raise ValueError(f"Ontbrekende kolommen in {XLSX_PATH}: {missing}")

df_input["id"] = df_input["id"].astype(int)

# Empty Excel cells are read as float NaN. NaN is truthy, so code of the form
# `(text or "").strip()` further down would raise AttributeError on such a cell.
# Store every text section as a plain string, with "" for missing values.
for text_col in ("title", "lead", "body"):
    df_input[text_col] = df_input[text_col].map(lambda v: "" if pd.isna(v) else str(v))

# Load Models

In [11]:
# === Prepare the tokenizer and the zero-shot pipelines ===
# First CUDA device when available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Tokenizer used only for chunking the body text: the MoritzLaurer tokenizer
# (multilingual) is shared by all models for this purpose.
chunk_tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

# One zero-shot pipeline per model id; each model uses its own tokenizer for inference.
pipelines = {
    model_id: pipeline(
        "zero-shot-classification",
        model=model_id,
        tokenizer=model_id,
        device=device,
    )
    for model_id in NLI_MODELS
}
list(pipelines)

Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


['LoicDL/bert-base-dutch-cased-finetuned-snli',
 'LoicDL/robbert-v2-dutch-finetuned-snli',
 'loicDL/robbertje-dutch-finetuned-snli',
 'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli']

# Helpers

In [12]:
# === Chunk the body (token-bounded sliding window with overlap) ===
def chunk_body(
    text: str,
    tokenizer,
    *,
    max_tokens: int = 400,
    stride: int = 50,
    prefer_paragraphs: bool = True,
    reserve_for_hypothesis: int = 48,   # headroom for [CLS]/[SEP] + hypothesis tokens
) -> List[str]:
    """Split ``text`` into decoded chunks of at most ``max_tokens`` tokens.

    Args:
        text: Text to split. ``None``, NaN and blank strings yield ``[]``.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``,
            ``decode(ids, ...)`` and optionally ``model_max_length``.
        max_tokens: Requested upper bound on tokens per chunk.
        stride: Number of tokens consecutive chunks overlap. Clamped to
            ``[0, window - 1]`` so no token is skipped and the window always advances.
        prefer_paragraphs: When true, split on blank lines first and chunk each
            paragraph separately; otherwise chunk the whole text at once.
        reserve_for_hypothesis: Tokens subtracted from the model limit to leave
            room for special tokens and the hypothesis.

    Returns:
        The non-empty decoded chunks, in document order.
    """
    # Missing values coming from pandas are float NaN, which is truthy and has no .strip().
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text or "").strip()
    if not text:
        return []

    # Model limit. Hugging Face tokenizers without a configured limit report a very
    # large sentinel integer; treat that (and missing/invalid values) as "unknown"
    # and fall back to 512 so that a large max_tokens is still capped.
    fallback_max = 512
    try:
        model_max = int(getattr(tokenizer, "model_max_length", fallback_max))
    except (TypeError, ValueError, OverflowError):
        model_max = fallback_max
    if not 0 < model_max <= 1_000_000:
        model_max = fallback_max

    # Effective window size: never below 16 tokens.
    eff_max = max(16, min(max_tokens, model_max - max(0, reserve_for_hypothesis)))
    # A negative stride would make the step larger than the window and drop tokens;
    # a stride >= window would never advance.
    overlap = min(max(0, stride), eff_max - 1)
    step = eff_max - overlap

    def _decode(ids: List[int]) -> str:
        return tokenizer.decode(
            ids, clean_up_tokenization_spaces=True, skip_special_tokens=True
        ).strip()

    def _chunk_ids(ids: List[int]) -> List[str]:
        # Short enough: a single chunk (or nothing when it decodes to an empty string).
        if len(ids) <= eff_max:
            s = _decode(ids)
            return [s] if s else []
        chunks = []
        for start in range(0, len(ids), step):
            s = _decode(ids[start:start + eff_max])
            if s:
                chunks.append(s)
            # This window reached the end; any later window would only repeat its tail.
            if start + eff_max >= len(ids):
                break
        return chunks

    out: List[str] = []
    if prefer_paragraphs:
        # Paragraphs are separated by one or more blank lines.
        paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
        for p in paras:
            out.extend(_chunk_ids(tokenizer.encode(p, add_special_tokens=False)))
    else:
        out.extend(_chunk_ids(tokenizer.encode(text, add_special_tokens=False)))

    return [c for c in out if c]

In [13]:
# === NLI score for a single text (binary probabilities, renormalised) ===
def nli_score_text_bin_single(text: str, *, clf) -> Tuple[float, float]:
    """Return ``(p_pos, p_neg)`` for ``text`` from zero-shot NLI.

    Args:
        text: Text to score. ``None``, NaN and blank strings are treated as
            "no evidence" and yield ``(0.5, 0.5)`` without calling ``clf``.
        clf: Zero-shot classification callable; it is called with the text,
            ``ZS_LABELS``, ``multi_label=False`` and ``HYPOTHESIS_TEMPLATE`` and must
            return a mapping with parallel ``"labels"`` and ``"scores"`` sequences.

    Returns:
        The scores of "positief" and "negatief", renormalised to sum to 1;
        ``(0.5, 0.5)`` when both scores are zero.
    """
    # Empty cells read by pandas are float NaN: truthy and without .strip(), so the
    # plain `(text or "")` idiom is not enough.
    if not text or (isinstance(text, float) and text != text):
        return 0.5, 0.5
    txt = str(text).strip()
    if not txt:
        return 0.5, 0.5

    out = clf(
        txt,
        ZS_LABELS,
        multi_label=False,
        hypothesis_template=HYPOTHESIS_TEMPLATE,
    )
    # out["labels"] holds the candidate labels, paired positionally with out["scores"].
    scores = {str(lbl).lower(): float(scr) for lbl, scr in zip(out["labels"], out["scores"])}
    p_pos = scores.get("positief", 0.0)
    p_neg = scores.get("negatief", 0.0)

    s = p_pos + p_neg
    if s <= 0:
        return 0.5, 0.5
    return p_pos / s, p_neg / s


# === Aggregate over title/lead/body with the configured section weights ===
def classify_title_lead_body_bin(
    title: str,
    lead: str,
    body: str,
    *,
    clf,                  # zero-shot pipeline
    tokenizer,            # used for chunking the body
    max_tokens: int = 400,
    stride: int = 50,
    debug: bool = False,
) -> Tuple[str, Dict[str, float]]:
    """Classify one article as "positief" or "negatief".

    The title and the lead are each scored once. The body is split with
    ``chunk_body``; every chunk is scored and the chunk scores are averaged with
    weights proportional to the chunk's token count (weights sum to 1). The three
    section scores are combined with ``TITLE_W``, ``LEAD_W`` and ``BODY_W`` and
    renormalised.

    A missing or blank section (``None``, NaN, "") contributes the neutral score
    ``(0.5, 0.5)`` with its full section weight.

    Args:
        title, lead, body: Section texts.
        clf: Zero-shot pipeline passed on to ``nli_score_text_bin_single``.
        tokenizer: Tokenizer used for chunking and for the chunk weights.
        max_tokens, stride: Forwarded to ``chunk_body``.
        debug: When true, print the per-section scores.

    Returns:
        ``(label, {"positief": p_pos, "negatief": p_neg})``; ties resolve to "positief".
    """
    def _as_text(value) -> str:
        # Empty cells read by pandas are float NaN, which is truthy and has no
        # .strip(); map them (and other falsy values) to "".
        if isinstance(value, float) and value != value:
            return ""
        return str(value or "").strip()

    title, lead, body = _as_text(title), _as_text(lead), _as_text(body)

    # Title
    t_pos, t_neg = nli_score_text_bin_single(title, clf=clf) if title else (0.5, 0.5)
    if debug:
        print(f"[DBG] title: p_pos={t_pos:.4f} p_neg={t_neg:.4f}")

    # Lead
    l_pos, l_neg = nli_score_text_bin_single(lead, clf=clf) if lead else (0.5, 0.5)
    if debug:
        print(f"[DBG] lead : p_pos={l_pos:.4f} p_neg={l_neg:.4f}")

    # Body -> chunks -> weighted mean with sum(chunk weights) == 1
    b_pos = b_neg = 0.5
    chunks = chunk_body(
        body,
        tokenizer,
        max_tokens=max_tokens,
        stride=stride,
        prefer_paragraphs=True
    )
    if chunks:
        # Weight chunks by their token length and normalise the weights to sum to 1.
        tok_lens = [len(tokenizer.encode(c, add_special_tokens=False)) for c in chunks]
        total = sum(tok_lens)
        if total <= 0:
            weights = [1.0 / len(chunks)] * len(chunks)
        else:
            weights = [tl / total for tl in tok_lens]

        b_pos = 0.0
        b_neg = 0.0
        for w, c in zip(weights, chunks):
            ppos, pneg = nli_score_text_bin_single(c, clf=clf)
            b_pos += w * ppos
            b_neg += w * pneg
        if debug:
            print(f"[DBG] body : chunks={len(chunks)} p_pos={b_pos:.4f} p_neg={b_neg:.4f}")

    # Section weights (module-level constants)
    pos_total = TITLE_W * t_pos + LEAD_W * l_pos + BODY_W * b_pos
    neg_total = TITLE_W * t_neg + LEAD_W * l_neg + BODY_W * b_neg

    denom = pos_total + neg_total
    if denom <= 0:
        p_pos_final = p_neg_final = 0.5
    else:
        p_pos_final = pos_total / denom
        p_neg_final = neg_total / denom

    label = "positief" if p_pos_final >= p_neg_final else "negatief"
    return label, {"positief": float(p_pos_final), "negatief": float(p_neg_final)}

# NLI Analysis

In [14]:
# Score every article with every model; one output record per article.
rows = []
for _, row in tqdm(df_input.iterrows(), total=len(df_input), desc="NLI"):
    rec = dict(id=int(row["id"]), title=row["title"], lead=row["lead"])

    for model_id, clf in pipelines.items():
        label, scores = classify_title_lead_body_bin(
            row["title"], row["lead"], row["body"],
            clf=clf,
            tokenizer=chunk_tokenizer,
            max_tokens=300,   # 400 is possible; 300 is somewhat faster and often sufficient
            stride=50,
            debug=False,
        )
        # Column prefix: the model name without the organisation, dashes -> underscores.
        key = model_id.rsplit("/", 1)[-1].replace("-", "_")
        rec.update({
            f"{key}_label": label,
            f"{key}_positief": float(scores["positief"]),
            f"{key}_negatief": float(scores["negatief"]),
        })
    rows.append(rec)

df_out = pd.DataFrame(rows).sort_values("id")
df_out.head()

NLI:   0%|                                                                                      | 0/70 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (861 > 512). Running this sequence through the model will result in indexing errors
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
NLI: 100%|██████████████████████████████████████████████████████████████████████████| 70/70 [2:13:06<00:00, 114.09s/it]


Unnamed: 0,id,title,lead,bert_base_dutch_cased_finetuned_snli_label,bert_base_dutch_cased_finetuned_snli_positief,bert_base_dutch_cased_finetuned_snli_negatief,robbert_v2_dutch_finetuned_snli_label,robbert_v2_dutch_finetuned_snli_positief,robbert_v2_dutch_finetuned_snli_negatief,robbertje_dutch_finetuned_snli_label,robbertje_dutch_finetuned_snli_positief,robbertje_dutch_finetuned_snli_negatief,mDeBERTa_v3_base_mnli_xnli_label,mDeBERTa_v3_base_mnli_xnli_positief,mDeBERTa_v3_base_mnli_xnli_negatief
0,7,Nederlandse patiënt wacht te lang op betere me...,"Wat een prachtig bericht onlangs, dat meer kan...",positief,0.681403,0.318597,positief,0.561398,0.438602,negatief,0.485963,0.514037,positief,0.659065,0.340935
1,10,Nieuwe kankermedicijnen leveren meer financiël...,Vorige week verscheen in Trouw een artikel met...,positief,0.647363,0.352637,positief,0.592988,0.407012,positief,0.52293,0.47707,negatief,0.436804,0.563196
2,11,Hoe controleer je verstopte moedervlekken?,Meer dan twintig jaar geleden ontdekte ze op h...,positief,0.646949,0.353051,positief,0.551829,0.448171,negatief,0.33901,0.66099,negatief,0.466271,0.533729
3,16,'Ik vind het erg als 'n infuus van 25.000 euro...,Waarom schrijven artsen 1005 milligram van een...,positief,0.644479,0.355521,positief,0.524401,0.475599,negatief,0.429237,0.570763,negatief,0.320823,0.679177
4,21,Wachtlijsten en personeelstekort: het 'zorginf...,"De gezondheidszorg is 'op', er zit geen rek me...",positief,0.623405,0.376595,positief,0.503445,0.496555,negatief,0.439364,0.560636,negatief,0.402679,0.597321


In [15]:
# to_csv does not create missing folders, so make sure the target directory exists.
out_dir = os.path.dirname(OUT_CSV)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df_out.to_csv(OUT_CSV, index=False)
# Report the path that was actually written rather than a hard-coded copy of it.
print(f"[DONE] Wrote binary results to {OUT_CSV}")

[DONE] Wrote binary results to out/nli_results.csv


# Evaluation

In [2]:
# === Load results (NLI models) ===

# 1) An in-memory frame takes precedence: `df_out` first, then `df`.
in_memory_names = [name for name in ("df_out", "df") if name in globals()]
df_results = globals()[in_memory_names[0]].copy() if in_memory_names else None

# 2) Otherwise read from disk; the first existing candidate wins (xlsx before csv).
if df_results is None:
    candidates = [
        "out/nli_results.xlsx",
        "nli_results.xlsx",
        "out/nli_results.csv",
        "nli_results.csv",
    ]
    path = None
    for candidate in candidates:
        if os.path.exists(candidate):
            path = candidate
            break
    if path is None:
        raise FileNotFoundError("Kon geen nli_results-(xlsx/csv) vinden of in-memory df_out/df.")
    if path.endswith(".xlsx"):
        df_results = pd.read_excel(path)
    else:
        df_results = pd.read_csv(path)

# Work on a copy so df_results itself stays untouched by the steps below.
df = df_results.copy()

# 3) Make sure there is a clean integer 'id' column
def to_int_id(v):
    """Return ``int(v)``, or ``pd.NA`` when the conversion raises."""
    try:
        converted = int(v)
    except Exception:
        converted = pd.NA
    return converted

if "id" not in df.columns and "file" in df.columns:
    # Last resort: take the first run of digits from the 'file' column.
    def extract_id_from_file(val):
        """Return the first digit run in ``str(val)`` as int, or ``pd.NA`` if there is none."""
        match = re.search(r"(\d+)", str(val))
        if match is None:
            return pd.NA
        return int(match.group(1))
    df["id"] = df["file"].apply(extract_id_from_file)
elif "id" not in df.columns:
    # No usable source at all: number the rows 1..n.
    df["id"] = np.arange(1, len(df) + 1)

df["id"] = df["id"].apply(to_int_id)

# 4) Find the *_label columns
label_cols = [name for name in df.columns if name.endswith("_label")]
if not label_cols:
    raise KeyError("Geen *_label kolommen gevonden in de NLI-resultaten.")

# 5) Per model: normalise the labels (string, trimmed, lower case) and one-hot encode
#    them, guaranteeing that both <model>_positief and <model>_negatief exist.
one_hot_frames = []
for c in label_cols:
    df[c] = df[c].astype(str).str.strip().str.lower()

    model_prefix = c.replace("_label", "")
    renamed = df[[c]].rename(columns={c: model_prefix})
    oh = pd.get_dummies(renamed, columns=[model_prefix], prefix=model_prefix)

    # A label that never occurs produces no dummy column; add it filled with zeros.
    expected = [f"{model_prefix}_{lab}" for lab in ("positief", "negatief")]
    for col in expected:
        if col not in oh.columns:
            oh[col] = 0

    # Alphabetical column order for a tidy layout.
    oh = oh.reindex(columns=sorted(oh.columns))
    one_hot_frames.append(oh)

df_onehot = pd.concat([df[["id"]], *one_hot_frames], axis=1)

print(df_onehot.head())

# 6) Summary per model: number of articles labelled positief / negatief
def _count_exact(frame, column_name):
    """Sum the column(s) of ``frame`` whose name equals ``column_name`` exactly."""
    # `filter(like=...)` matches substrings, so a prefix such as "bert" would also
    # pick up "robbert_..." columns and inflate the counts; compare names exactly.
    return int(frame.loc[:, frame.columns == column_name].sum().sum())

summary = {}
for c in label_cols:
    model_prefix = c.replace("_label", "")
    summary[model_prefix] = {
        "positief": _count_exact(df_onehot, f"{model_prefix}_positief"),
        "negatief": _count_exact(df_onehot, f"{model_prefix}_negatief"),
    }

# One row per model, one column per label.
summary_df = pd.DataFrame(summary).T.astype(int)
summary_df.index.name = "Model"
print("\nSamenvatting aantallen per model:")
print(summary_df)


   id  bert_base_dutch_cased_finetuned_snli_negatief  \
0   7                                          False   
1  10                                          False   
2  11                                          False   
3  16                                          False   
4  21                                          False   

   bert_base_dutch_cased_finetuned_snli_positief  \
0                                           True   
1                                           True   
2                                           True   
3                                           True   
4                                           True   

   robbert_v2_dutch_finetuned_snli_negatief  \
0                                     False   
1                                     False   
2                                     False   
3                                     False   
4                                     False   

   robbert_v2_dutch_finetuned_snli_positief  \
0                   

In [3]:
# === Evaluation: human vs NLI labels (binary: positief/negatief) ===

# 1) Reuse df_results from the previous cell when it exists; otherwise load it from
#    disk with the same candidate order. An explicit membership test replaces the
#    try/except NameError, which would also hide a NameError raised inside .copy().
#    `os` comes from the imports cell at the top of the notebook.
if "df_results" in globals():
    df_n = df_results.copy()
else:
    candidates = [
        "out/nli_results.xlsx",
        "nli_results.xlsx",
        "out/nli_results.csv",
        "nli_results.csv",
    ]
    path = next((p for p in candidates if os.path.exists(p)), None)
    if path is None:
        raise FileNotFoundError("Kon geen NLI-resultaten vinden voor evaluatie.")
    df_n = pd.read_excel(path) if path.endswith(".xlsx") else pd.read_csv(path)

# 2) Human labels (expected columns: Artikel, Sentiment)
df_h = pd.read_excel("out/Human_Sentiment.xlsx")

# 3) Convert the ids on both sides to int
def to_int_or_none(x):
    """Return ``int(x)``, or ``None`` when the conversion raises."""
    try:
        value = int(x)
    except Exception:
        return None
    else:
        return value

if "id" not in df_n.columns:
    raise KeyError("Resultaten missen een 'id' kolom.")
df_n["id"] = df_n["id"].apply(to_int_or_none)

# Only convert the human ids when the column is not already an integer dtype.
artikel_is_int = pd.api.types.is_integer_dtype(df_h["Artikel"])
if not artikel_is_int:
    df_h["Artikel"] = df_h["Artikel"].apply(to_int_or_none)

# 4) Normalise the human labels and keep only the positief/negatief rows
df_h["Human_Label"] = df_h["Sentiment"].astype(str).str.strip().str.lower()
is_binary_label = df_h["Human_Label"].isin({"positief", "negatief"})
df_h = df_h.loc[is_binary_label].copy()

# 5) Inner join: keep only articles that have both a human label and NLI results
human_labels = df_h[["Artikel", "Human_Label"]]
dfm = human_labels.merge(df_n, how="inner", left_on="Artikel", right_on="id")

# 6) Collect every *_label column (one per model); the match is case-sensitive,
#    so "Human_Label" is not included.
model_label_cols = [name for name in dfm.columns if name.endswith("_label")]
if len(model_label_cols) == 0:
    raise KeyError("Geen *_label kolommen gevonden in NLI-resultaten voor evaluatie.")

# Fixed label order for the reports and confusion matrices.
labels_order = ["positief", "negatief"]

# 7) Evaluate every model against the human labels
for col in model_label_cols:
    model_name = col.replace("_label", "")
    y_true = dfm["Human_Label"].astype(str).str.lower()
    y_pred = dfm[col].astype(str).str.lower()

    # Keep only rows where both the human and the predicted label are in labels_order.
    both_valid = y_true.isin(labels_order) & y_pred.isin(labels_order)
    y_true_m, y_pred_m = y_true[both_valid], y_pred[both_valid]

    report = classification_report(
        y_true_m,
        y_pred_m,
        labels=labels_order,
        target_names=labels_order,
        digits=3,
    )
    print(f"\n=== {model_name} (Zero-Shot NLI, binary) ===")
    print(report)

    # Rows are the true labels, columns the predicted labels.
    row_names = [f"True {l}" for l in labels_order]
    col_names = [f"Pred {l}" for l in labels_order]
    cm = confusion_matrix(y_true_m, y_pred_m, labels=labels_order)
    cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
    print("Confusion matrix:")
    print(cm_df)



=== bert_base_dutch_cased_finetuned_snli (Zero-Shot NLI, binary) ===
              precision    recall  f1-score   support

    positief      0.625     1.000     0.769        40
    negatief      1.000     0.040     0.077        25

    accuracy                          0.631        65
   macro avg      0.812     0.520     0.423        65
weighted avg      0.769     0.631     0.503        65

Confusion matrix:
               Pred positief  Pred negatief
True positief             40              0
True negatief             24              1

=== robbert_v2_dutch_finetuned_snli (Zero-Shot NLI, binary) ===
              precision    recall  f1-score   support

    positief      0.655     0.950     0.776        40
    negatief      0.714     0.200     0.312        25

    accuracy                          0.662        65
   macro avg      0.685     0.575     0.544        65
weighted avg      0.678     0.662     0.597        65

Confusion matrix:
               Pred positief  Pred negatief

In [14]:
# Rows where the mDeBERTa prediction differs from the human label.
is_mismatch = dfm['Human_Label'].ne(dfm['mDeBERTa_v3_base_mnli_xnli_label'])
errors = dfm.loc[is_mismatch]
# Columns shown when inspecting those rows.
cols = [
    'id',
    'title',
    'mDeBERTa_v3_base_mnli_xnli_label',
    'Human_Label',
]

In [15]:
# Preview the first rows where the human label and the mDeBERTa label disagree,
# restricted to the columns listed in `cols`.
print(errors[cols].head())

    id                                              title  \
2   11         Hoe controleer je verstopte moedervlekken?   
3   16  'Ik vind het erg als 'n infuus van 25.000 euro...   
4   26  Tijd om te kiezen: dure behandelingen of voldo...   
8   39                      Het dna van de tumor in kaart   
13  66       'Mijn belangrijkste vraag: wat wil je echt?'   

   mDeBERTa_v3_base_mnli_xnli_label Human_Label  
2                          negatief    positief  
3                          negatief    positief  
4                          positief    negatief  
8                          positief    negatief  
13                         positief    negatief  
