In [28]:
import os, re, math
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm
import pdfplumber

from transformers import pipeline, AutoTokenizer

ROB_BERT_SENTIMENT = "DTAI-KULeuven/robbert-v2-dutch-sentiment"
BERTJE_SENTIMENT   = "DTAI-KULeuven/robbertje-merged-dutch-sentiment"

DATA_DIR = "data"
OUT_CSV  = "sentiment_results.csv"

@dataclass
class Weights:
    title: float = 3.0
    lead: float  = 2.0
    body: float  = 1.0
WEIGHTS = Weights()

NEUTRAL_BAND = 0.10
MAX_TOKENS = 400
STRIDE     = 50


In [29]:
LEXIS_HEADER_PATTERNS = [
    r"^About LexisNexis.*",
    r"^Privacy Policy.*",
    r"^Terms .* Conditions.*",
    r"^Copyright.*",
    r"^User Name:.*",
    r"^Date and Time:.*",
    r"^Job Number:.*",
    r"^Documents \\(\\d+\\).*",
    r"^Client/Matter:.*",
    r"^Search Terms:.*",
    r"^Search Type:.*",
    r"^Content Type.*",
    r"^http[s]?://\\S+",
    r"^Page \\d+ of \\d+",
    r"^Load-Date:.*",
    r"^End of Document",
    r"^Classification$",
    r"^Language:.*",
    r"^Publication-Type:.*",
    r"^Subject:.*",
    r"^Industry:.*",
    r"^\\s*Bookmark_\\d+\\s*$"
]

HEADER_REGEXES = [re.compile(pat, flags=re.IGNORECASE) for pat in LEXIS_HEADER_PATTERNS]

def clean_lines(lines: List[str]) -> List[str]:
    out = []
    for ln in lines:
        s = ln.strip()
        if not s:
            out.append("")
            continue
        if any(rx.match(s) for rx in HEADER_REGEXES):
            continue
        s = re.sub(r"\\s+", " ", s)
        out.append(s)
    txt = "\\n".join(out)
    txt = re.sub(r"\\n{3,}", "\\n\\n", txt)
    return [ln for ln in txt.split("\\n")]
    
def pdf_to_text(path: str) -> str:
    texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            t = page.extract_text(x_tolerance=1, y_tolerance=1) or ""
            if t:
                texts.append(t)
    raw = "\\n".join(texts)
    raw = raw.replace("\\r\\n", "\\n").replace("\\r", "\\n")
    lines = raw.split("\\n")
    lines = clean_lines(lines)
    return "\\n".join(lines)


In [30]:
STOP_MARKERS = [
    r"^Classification$", r"^End of Document$", r"^Load-Date:", r"^Subject:", r"^Industry:",
    r"^Language:", r"^Publication-Type:", r"^Graphic$", r"^Bookmark_\d+\s*$"
]
STOP_RX = re.compile("|".join(STOP_MARKERS), re.IGNORECASE)

def extract_title_lead_body(clean_text: str):
    """
    LexisNexis-structuur:
      [koppen/metadata] ... 
      Title (vaak 1-2x herhaald)
      Krant / datum / sectie / lengte / byline ...
      Body
      <artikeltekst (meerdere alinea's, soms paginabreaks)>
      Classification / End of Document / etc.

    Output:
      title: 1 regel
      lead:  korte samenvatting (eerste paragraaf na Body, overslaat 1-woord labels zoals 'Column', 'Geneesmiddelen')
      body:  rest tot aan STOP_MARKERS
    """
    # 1) naar regels
    lines = [ln.strip() for ln in clean_text.replace("\r", "\n").split("\n")]
    lines = [ln for ln in lines if ln is not None]  # behoud lege regels als scheiding

    # 2) vind titelkandidaat (eerste niet-lege regel die NIET 'Page x of y' / URL / About is)
    def is_noise_title(l):
        if not l: return True
        if re.match(r"^Page \d+ of \d+$", l): return True
        if re.match(r"^http[s]?://", l, re.I): return True
        if "About LexisNexis" in l or "Privacy Policy" in l or "Terms" in l: return True
        return False

    first_nonempty = next((i for i,l in enumerate(lines) if l and not is_noise_title(l)), None)
    title = lines[first_nonempty] if first_nonempty is not None else ""

    # 3) vind de EERSTE 'Body' regel (dit markeert begin van inhoud)
    body_idx = next((i for i,l in enumerate(lines) if l.strip().lower() == "body"), None)
    if body_idx is None:
        # fallback: soms staat 'Body' met extra tekst eronder; probeer een zachtere match
        body_idx = next((i for i,l in enumerate(lines) if re.fullmatch(r"\s*Body\s*", l, re.I)), None)

    # als geen Body gevonden: alles na title als body (zeldzaam)
    start_idx = (body_idx + 1) if body_idx is not None else ((first_nonempty + 1) if first_nonempty is not None else 0)

    # 4) knip af bij eerste STOP_MARKER
    end_idx = None
    for j in range(start_idx, len(lines)):
        if STOP_RX.match(lines[j] or ""):
            end_idx = j
            break
    content_lines = lines[start_idx:end_idx] if end_idx else lines[start_idx:]

    # 5) verwijder bekende tussenkopjes / 1-woord labels direct na Body (bv. 'Column', 'Geneesmiddelen')
    while content_lines and re.fullmatch(r"[A-Za-zÀ-ÿ\-’'`]+", content_lines[0]):
        # laat staan als het duidelijk een zin is (eindigt op .?!)
        if re.search(r"[.!?]$", content_lines[0]): break
        # anders overslaan (één-woord of korte rubriek)
        content_lines.pop(0)

    # 6) maak paragrafen (lege regel = scheiding; als die ontbreken, per “lege regel” simuleren op dubbele spaties)
    raw = "\n".join(content_lines)
    # normaliseer meerdere lege regels
    raw = re.sub(r"\n{3,}", "\n\n", raw).strip()
    paras = [p.strip() for p in re.split(r"\n\s*\n", raw) if p.strip()]
    if not paras:
        # fallback: forceer paragrafen grofweg op zinsafsluiting
        paras = [p.strip() for p in re.split(r"(?<=[.!?])\s+", raw) if p.strip()]

    # 7) lead = eerste paragraaf (max 2-3 zinnen), body = rest
    if paras:
        # knip lead op 2-3 zinnen
        sents = re.split(r"(?<=[.!?])\s+", paras[0])
        lead = " ".join(sents[:3]).strip()
        remainder = " ".join(sents[3:]).strip()
        body_paras = ([remainder] if remainder else []) + paras[1:]
        body = "\n\n".join(body_paras).strip()
    else:
        lead, body = "", ""

    return title.strip(), lead, body

In [31]:
test_files = [
    "data/Bericht 007 Nederlandse pati_nt wacht te lang op betere medicijnen tegen kanker.pdf",
    "data/Bericht 010_Nieuwe kankermedicijnen leveren meer financi_le winst op dan gezondheidswinst.pdf",
    "data/Bericht 011_Hoe controleer je verstopte moedervlekken_.pdf",
    "data/Bericht 016_Ik vind het erg als _n infuus van 25.000 euro wordt weggegooid_.pdf",
]

for fp in test_files:
    txt = pdf_to_text(fp)           # jouw bestaande pdf->text + clean
    title, lead, body = extract_title_lead_body(txt)
    print("\n===", os.path.basename(fp), "===")
    print("TITLE:", title[:120])
    print("LEAD :", lead)
    print("BODY len:", len(body), "chars")
    print("BODY preview:", body[:300].replace("\n"," ") + " ...")


=== Bericht 007 Nederlandse pati_nt wacht te lang op betere medicijnen tegen kanker.pdf ===
TITLE: Nederlandse patiënt wacht te lang op betere medicijnen tegen kanker
LEAD : Wat een prachtig bericht onlangs, dat meer kankerpatiënten de afgelopen decennia bleven leven. Twee derde van
de patiënten met de diagnose kanker leeft na vijf jaar nog. Een vooruitgang die volgens het Integraal
Kankercentrum Nederland mede te danken is aan innovatieve geneesmiddelen tegen gevorderde en uitgezaaide
kanker.
BODY len: 3697 chars
BODY preview: Nog niet voor alle soorten, maar in ieder geval voor huid-, long-, prostaat-, bloed-, nier- en blaaskanker. Dat zijn in aantal niet de minste. Maar het is jammer dat het zo lang duurt voordat dergelijke geneesmiddelen na goedkeuring door de Amerikaanse autoriteiten in de Nederlandse praktijk terecht ...

=== Bericht 010_Nieuwe kankermedicijnen leveren meer financi_le winst op dan gezondheidswinst.pdf ===
TITLE: Nieuwe kankermedicijnen leveren meer financiële wi

In [32]:
def make_chunks_by_tokens(text: str, tokenizer, max_tokens: int = 400, stride: int = 50) -> List[str]:
    if not text.strip():
        return []
    toks = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    i = 0
    while i < len(toks):
        window = toks[i:i+max_tokens]
        if not window:
            break
        chunk = tokenizer.decode(window, skip_special_tokens=True)
        chunks.append(chunk)
        if i + max_tokens >= len(toks):
            break
        i += max_tokens - stride
    return chunks

def chunk_body(text: str, tokenizer, prefer_paragraphs: bool = True, max_tokens: int = 400, stride: int = 50) -> List[str]:
    if not text.strip():
        return []
    if prefer_paragraphs:
        paras = re.split(r"\\n\\s*\\n", text.strip())
        paras = [p.strip() for p in paras if p.strip()]
        chunks = []
        for p in paras:
            if len(tokenizer.encode(p, add_special_tokens=False)) <= max_tokens:
                chunks.append(p)
            else:
                chunks.extend(make_chunks_by_tokens(p, tokenizer, max_tokens=max_tokens, stride=stride))
        return chunks
    else:
        return make_chunks_by_tokens(text, tokenizer, max_tokens=max_tokens, stride=stride)


In [33]:
ROB_NAME = "DTAI-KULeuven/robbert-v2-dutch-sentiment"
BER_NAME = "DTAI-KULeuven/robbertje-merged-dutch-sentiment"

def load_pipe(name, tries=2):
    last = None
    for i in range(tries):
        try:
            clf = pipeline(
                task="sentiment-analysis",
                model=name,
                tokenizer=name,
                top_k=None,          # vervangt return_all_scores=True
                truncation=True
            )
            print(f"[OK] Loaded: {name}")
            return clf
        except Exception as e:
            last = e
            print(f"[WARN] {name} attempt {i+1} failed: {e}")
            time.sleep(3)
    raise RuntimeError(f"Kon {name} niet laden: {last}")

rob_pipe = load_pipe(ROB_NAME)
ber_pipe = load_pipe(BER_NAME)

# Tokenizer voor chunking (pak RobBERT als die live is, anders BERTje)
try:
    TOKENIZER = AutoTokenizer.from_pretrained(ROB_NAME)
except Exception:
    TOKENIZER = AutoTokenizer.from_pretrained(BER_NAME)
print("[OK] Chunking tokenizer ready")

Device set to use cpu


[OK] Loaded: DTAI-KULeuven/robbert-v2-dutch-sentiment


Device set to use cpu


[OK] Loaded: DTAI-KULeuven/robbertje-merged-dutch-sentiment
[OK] Chunking tokenizer ready


In [34]:
MAX_MODEL_TOKENS = 512       # limiet model
HEADROOM = 8                  # veiligheidsmarge
SAFE_MAX = MAX_MODEL_TOKENS - HEADROOM  # 504 tokens

def token_chunks(text, tokenizer, max_tokens=SAFE_MAX, stride=50):
    """
    Splits lange tekst in chunks zodat ze <512 tokens blijven.
    """
    if not text or not text.strip():
        return []
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return [text]

    out = []
    i = 0
    while i < len(ids):
        window = ids[i:i+max_tokens]
        if not window: 
            break
        out.append(tokenizer.decode(window, skip_special_tokens=True))
        if i + max_tokens >= len(ids): 
            break
        i += max_tokens - stride
    return out

def split_title_lead_if_needed(title, lead, tokenizer):
    title_chs = token_chunks(title, tokenizer) if title else []
    lead_chs  = token_chunks(lead, tokenizer) if lead  else []
    return title_chs, lead_chs


LABEL_MAP = {
    "POSITIVE": "positief", "NEGATIVE": "negatief", "NEUTRAL": "neutraal",
    "Positive": "positief", "Negative": "negatief", "Neutral": "neutraal",
    "positief": "positief", "negatief": "negatief", "neutraal": "neutraal"
}

def normalize_pnn(probs):
    arr = np.array([probs.get("positief",0), probs.get("negatief",0), probs.get("neutraal",0)], dtype=float)
    s = arr.sum()
    if s <= 0: return {"positief":0.0, "negatief":0.0, "neutraal":1.0}
    arr = arr/s
    return {"positief": float(arr[0]), "negatief": float(arr[1]), "neutraal": float(arr[2])}

def score_text_with_pipe(text: str, clf) -> dict:
    if not text.strip():
        return {"positief":0.0, "negatief":0.0, "neutraal":1.0}
    out = clf(text, truncation=True)   # bij top_k=None: lijst van dicts
    scores = out[0]
    probs = {"positief":0.0, "negatief":0.0, "neutraal":0.0}
    for item in scores:
        lab = LABEL_MAP.get(item["label"])
        if lab: probs[lab] = float(item["score"])
    return normalize_pnn(probs)

def aggregate_article_with_pipe(title, lead, body_chunks, clf, tokenizer,
                                weights=WEIGHTS, neutral_band=NEUTRAL_BAND):
    """
    - Chunk ook titel en lead naar <512 tokens.
    - Verdeel gewicht gelijkmatig over alle chunks.
    - Zorgt dat het model veilig kan draaien zonder warnings/errors.
    """
    # Chunk titel en lead
    title_chunks, lead_chunks = split_title_lead_if_needed(title, lead, tokenizer)

    parts = []
    if title_chunks:
        w_each = weights.title / len(title_chunks)
        for t in title_chunks:
            parts.append((t, w_each))
    if lead_chunks:
        w_each = weights.lead / len(lead_chunks)
        for l in lead_chunks:
            parts.append((l, w_each))
    for ch in body_chunks:
        if ch.strip():
            parts.append((ch.strip(), weights.body))

    if not parts:
        return {"p_pos":0.0, "p_neg":0.0, "p_neu":1.0, "score":0.0, "label":"neutraal"}

    acc = np.zeros(3, dtype=float)
    diffs, wts = [], []
    for txt, w in parts:
        # model call, forceren max_length
        out = clf(txt, truncation=True, max_length=MAX_MODEL_TOKENS, padding=False)
        scores = out[0]  # top_k=None -> lijst van dicts
        probs = {"positief":0.0,"negatief":0.0,"neutraal":0.0}
        for item in scores:
            lab = LABEL_MAP.get(item["label"])
            if lab: probs[lab] = float(item["score"])
        probs = normalize_pnn(probs)

        acc   += np.array([probs["positief"], probs["negatief"], probs["neutraal"]]) * w
        diffs += [(probs["positief"] - probs["negatief"]) * w]
        wts   += [w]

    W = max(sum(wts), 1e-9)
    p_pos, p_neg, p_neu = (acc / W).tolist()
    signed = float(sum(diffs) / W)
    label = "neutraal" if abs(signed) < neutral_band else ("positief" if signed > 0 else "negatief")
    return {"p_pos":p_pos, "p_neg":p_neg, "p_neu":p_neu, "score":signed, "label":label}

def avg_two(prob_a: dict, prob_b: dict) -> dict:
    # neem gemiddelde van p_pos/p_neg/p_neu uit twee modellen
    p = {
        "p_pos": (prob_a["p_pos"] + prob_b["p_pos"]) / 2.0,
        "p_neg": (prob_a["p_neg"] + prob_b["p_neg"]) / 2.0,
        "p_neu": (prob_a["p_neu"] + prob_b["p_neu"]) / 2.0,
    }
    signed = (p["p_pos"] - p["p_neg"])
    p["score"] = signed
    p["label"] = "neutraal" if abs(signed) < NEUTRAL_BAND else ("positief" if signed > 0 else "negatief")
    return p

In [35]:
rows = []
pdf_files = [f for f in sorted(os.listdir(DATA_DIR)) if f.lower().endswith('.pdf')]
if not pdf_files:
    print(f"[INFO] Geen PDF-bestanden gevonden in '{DATA_DIR}'.")
else:
    for fname in tqdm(pdf_files, desc="PDFs verwerken"):
        fpath = os.path.join(DATA_DIR, fname)
        clean_txt = pdf_to_text(fpath)
        title, lead, body = extract_title_lead_body(clean_txt)
        body_chunks = chunk_body(body, TOKENIZER, prefer_paragraphs=True, max_tokens=MAX_TOKENS, stride=STRIDE)

        # per model
        rob = aggregate_article_with_pipe(title, lead, body_chunks, rob_pipe, TOKENIZER)
        ber = aggregate_article_with_pipe(title, lead, body_chunks, ber_pipe, TOKENIZER)
        ens = avg_two({"p_pos":rob["p_pos"], "p_neg":rob["p_neg"], "p_neu":rob["p_neu"]},
                      {"p_pos":ber["p_pos"], "p_neg":ber["p_neg"], "p_neu":ber["p_neu"]})

        rows.append({
            "id": os.path.splitext(fname)[0],
            "title": title,
            "lead": lead,
            "n_body_chunks": len(body_chunks),

            # RobBERT
            "rob_p_pos": rob["p_pos"], "rob_p_neg": rob["p_neg"], "rob_p_neu": rob["p_neu"],
            "rob_score": rob["score"], "rob_label": rob["label"],

            # BERTje
            "ber_p_pos": ber["p_pos"], "ber_p_neg": ber["p_neg"], "ber_p_neu": ber["p_neu"],
            "ber_score": ber["score"], "ber_label": ber["label"],

            # Ensemble (gemiddelde van probs)
            "ens_p_pos": ens["p_pos"], "ens_p_neg": ens["p_neg"], "ens_p_neu": ens["p_neu"],
            "ens_score": ens["score"], "ens_label": ens["label"],
        })

    df = pd.DataFrame(rows)
    display(df.head())

PDFs verwerken:   0%|                                                                           | 0/70 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (750 > 512). Running this sequence through the model will result in indexing errors
PDFs verwerken: 100%|██████████████████████████████████████████████████████████████████| 70/70 [03:28<00:00,  2.97s/it]


Unnamed: 0,id,title,lead,n_body_chunks,rob_p_pos,rob_p_neg,rob_p_neu,rob_score,rob_label,ber_p_pos,ber_p_neg,ber_p_neu,ber_score,ber_label,ens_p_pos,ens_p_neg,ens_p_neu,ens_score,ens_label
0,Bericht 007 Nederlandse pati_nt wacht te lang ...,Nederlandse patiënt wacht te lang op betere me...,"Wat een prachtig bericht onlangs, dat meer kan...",2,0.302636,0.697364,0.0,-0.394728,negatief,0.285786,0.714214,0.0,-0.428428,negatief,0.294211,0.705789,0.0,-0.411578,negatief
1,Bericht 010_Nieuwe kankermedicijnen leveren me...,Nieuwe kankermedicijnen leveren meer financiël...,Vorige week verscheen in Trouw een artikel met...,3,0.432665,0.567335,0.0,-0.13467,negatief,0.005389,0.994611,0.0,-0.989222,negatief,0.219027,0.780973,0.0,-0.561946,negatief
2,Bericht 011_Hoe controleer je verstopte moeder...,Hoe controleer je verstopte moedervlekken?,Preventie het consult\nMeer dan twintig jaar g...,2,0.882563,0.117437,0.0,0.765126,positief,0.664079,0.335921,0.0,0.328157,positief,0.773321,0.226679,0.0,0.546642,positief
3,Bericht 016_Ik vind het erg als _n infuus van ...,'Ik vind het erg als 'n infuus van 25.000 euro...,Slimme ideeën van apotheker Roelof van Leeuwen...,3,0.502075,0.497925,0.0,0.004151,neutraal,0.218087,0.781913,0.0,-0.563827,negatief,0.360081,0.639919,0.0,-0.279838,negatief
4,Bericht 021_Wachtlijsten en personeelstekort_ ...,Wachtlijsten en personeelstekort: het 'zorginf...,Onbetaalbare zorg\nDe gezondheidszorg kan het ...,8,0.611262,0.388738,0.0,0.222525,positief,0.313417,0.686583,0.0,-0.373166,negatief,0.46234,0.53766,0.0,-0.075321,neutraal


In [36]:
    df.to_csv("sentiment_results.csv", index=False, encoding="utf-8")
    print("[DONE] Geschreven naar sentiment_results.csv")

[DONE] Geschreven naar sentiment_results.csv
