In [1]:
# Install necessary libraries
!python -m pip install pandas spacy tqdm
!python -m spacy download nl_core_news_sm

# Import
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from tqdm import tqdm
import spacy


Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ------ --------------------------------- 1.8/11.6 MB 10.1 MB/s eta 0:00:01
   ---------- ----------------------------- 3.1/11.6 MB 7.7 MB/s eta 0:00:02
   ------------ --------------------------- 3.7/11.6 MB 5.7 MB/s eta 0:00:02
   --------------- ------------------------ 4.5/11.6 MB 5.4 MB/s eta 0:00:02
   ------------------ --------------------- 5.2/11.6 MB 4.9 MB/s eta 0:00:02
   -------------------- ------------------- 6.0/11.6 MB 4.7 MB/s eta 0:00:02
   ----------------------- ---------------- 6.8/11.6 MB 4.6 MB/s eta 0:00:02
   --------------------------- -

In [None]:
# Colab-only cell: mounts Google Drive and asks the user to upload the inputs.
# `files` is imported here as well; the original cell imported only `drive`,
# so `files.upload()` raised NameError on a fresh kernel.
from google.colab import drive, files
drive.mount('/content/drive')

# Upload policy documents and dictionary
uploaded = files.upload()

# Select files by name from the uploaded dict: the dictionary workbook is the
# .xlsx whose name contains 'dict'; the policy file is any other .csv/.xlsx.
policy_candidates = [f for f in uploaded if f.endswith(('.csv', '.xlsx')) and 'dict' not in f]
dictionary_candidates = [f for f in uploaded if f.endswith('.xlsx') and 'dict' in f]

# Fail with a readable message instead of a bare IndexError from `[0]`.
if not policy_candidates:
    raise ValueError("No policy file uploaded: expected a .csv/.xlsx whose name does not contain 'dict'.")
if not dictionary_candidates:
    raise ValueError("No dictionary file uploaded: expected an .xlsx whose name contains 'dict'.")

policy_path = policy_candidates[0]
dictionary_path = dictionary_candidates[0]


ModuleNotFoundError: No module named 'google'

In [None]:
# Load policy documents: only .csv and .xlsx inputs are supported.
if not policy_path.endswith(('.csv', '.xlsx')):
    raise ValueError("Unsupported file format for policy documents.")
policy_df = (
    pd.read_csv(policy_path)
    if policy_path.endswith('.csv')
    else pd.read_excel(policy_path)
)

# Load dictionary
dictionary_df = pd.read_excel(dictionary_path)

# Standardize columns (change if your dict structure differs):
# the first column is treated as the topic, the second as the (wildcard) word.
dictionary_df.columns = ['category', 'words']


In [None]:
def wildcard_to_regex(word):
    """Turn a dictionary entry with `*` wildcards into a regex fragment.

    The entry is coerced to `str`; every literal piece is regex-escaped and
    each `*` becomes `.*`.
    """
    literal_parts = str(word).split('*')
    return '.*'.join(re.escape(part) for part in literal_parts)

def safer_pattern(p):
    """Narrow a trailing `.*` wildcard to word characters only.

    Strings are stripped of surrounding whitespace; if the stripped pattern
    ends in `.*`, that suffix is swapped for the regex `\\w*`. Non-string
    values are returned untouched.
    """
    if not isinstance(p, str):
        return p
    trimmed = p.strip()
    if trimmed.endswith('.*'):
        return trimmed[:-2] + r'\w*'
    return trimmed

# Process dictionary for regex: wildcard words -> escaped regex fragments, with a
# trailing wildcard narrowed to word characters.
# NOTE(review): `.astype(str)` presumably turns empty cells into the literal
# string 'nan', which would then match the token "nan" — confirm the dictionary
# has no blank rows.
dictionary_df['regex_pattern'] = dictionary_df['words'].astype(str).apply(wildcard_to_regex)
dictionary_df['regex_pattern'] = dictionary_df['regex_pattern'].apply(safer_pattern)
topic_dict_clean = dictionary_df.groupby('category')['regex_pattern'].apply(list).to_dict()

# Precompile regex patterns for fast lookup.
# Each entry is (compiled regex anchored to the whole token, topic).
compiled_patterns = []
skipped_patterns = []
for topic, patterns in topic_dict_clean.items():
    for pattern in patterns:
        try:
            regex = re.compile(f'^{pattern}$', re.IGNORECASE)
        except re.error as err:
            # Only invalid regexes are skipped (the original bare `except:` also
            # hid unrelated errors), and they are reported below.
            skipped_patterns.append((topic, pattern, str(err)))
            continue
        compiled_patterns.append((regex, topic))

if skipped_patterns:
    print(f"Skipped {len(skipped_patterns)} invalid dictionary pattern(s):")
    for topic, pattern, reason in skipped_patterns:
        print(f"  [{topic}] {pattern!r}: {reason}")


In [None]:
# Load SpaCy Dutch model
nlp = spacy.load("nl_core_news_sm")


def split_sentences(text):
    """Return the stripped sentence strings spaCy finds in `text`.

    `text` is coerced to `str` before parsing.
    """
    parsed = nlp(str(text))
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text.strip())
    return sentences


In [None]:
def count_topic_hits_sentence_regex(text):
    """Count, per topic, the sentences of `text` that are assigned to that topic.

    A sentence is assigned to a topic when it has at least five
    whitespace-separated words, its tokens match patterns of exactly one
    topic, and that topic is matched at least twice.

    Returns a `defaultdict(int)` mapping topic -> number of such sentences;
    it is empty for missing or blank text.
    """
    hits_per_topic = defaultdict(int)
    if pd.isna(text) or not text.strip():
        return hits_per_topic

    for sentence in split_sentences(text):
        # Very short fragments are ignored.
        if len(sentence.split()) < 5:
            continue

        per_sentence = defaultdict(int)
        for word in re.findall(r'\w+', sentence, re.UNICODE):
            for matcher, label in compiled_patterns:
                if matcher.fullmatch(word):
                    per_sentence[label] += 1

        # Keep only unambiguous sentences: one topic, hit two or more times.
        if len(per_sentence) != 1:
            continue
        (label, n_hits), = per_sentence.items()
        if n_hits >= 2:
            hits_per_topic[label] += 1

    return hits_per_topic


In [None]:
# Per document: count topic sentences and keep the matching sentences for review.
# The original called count_topic_hits_sentence_regex(text) and then re-split and
# re-matched the same text, so every document went through spaCy twice. One pass
# below applies the same rule (>= 5 words, exactly one topic, >= 2 hits) and
# produces the same counts and matched sentences.
topic_counts_list = []
matched_sentences = []

for i, row in tqdm(policy_df.iterrows(), total=len(policy_df), desc="Matching Topics"):
    text = row['clean_text']
    filename = row['filename'] if 'filename' in row else f'doc_{i}'
    if pd.isna(text) or not text.strip():
        # Keep one (empty) entry per document so rows stay aligned with policy_df.
        topic_counts_list.append({})
        continue

    counts = defaultdict(int)
    for sent in split_sentences(text):
        if len(sent.split()) < 5:
            continue
        topic_match_counts = defaultdict(int)
        for token in re.findall(r'\w+', sent, re.UNICODE):
            for regex, topic in compiled_patterns:
                if regex.fullmatch(token):
                    topic_match_counts[topic] += 1
        if len(topic_match_counts) == 1:
            topic, count = next(iter(topic_match_counts.items()))
            if count >= 2:
                counts[topic] += 1
                # Collect matching sentences for inspection
                matched_sentences.append({'sentence': sent.strip(), 'topic': topic, 'document': filename})

    topic_counts_list.append(counts)


In [None]:
# `files.download` is used below; import it here as the later download cells do,
# so this cell does not depend on a name that may never have been imported.
from google.colab import files

# Topic counts per document: one row per document, one integer column per topic
# (topics a document never hit are filled with 0).
topic_counts_df = pd.DataFrame(topic_counts_list).fillna(0).astype(int)
result_df = pd.concat([policy_df.reset_index(drop=True), topic_counts_df], axis=1)
print("✅ Topic matching completed.")
display(result_df.head())

# Save to CSV
result_df.to_csv('topic_counts_per_document.csv', index=False)
files.download('topic_counts_per_document.csv')

# (Optional) Export matched sentences for review
matched_df = pd.DataFrame(matched_sentences)
matched_df.to_csv('matched_sentences.csv', index=False)
files.download('matched_sentences.csv')


TOPIC
# Topic classifier: fine-tune BERTje on the regex-labelled sentences

In [None]:
import pandas as pd

# Read the regex-matched sentences written earlier and keep only the text and
# its topic label.
labeled_df = pd.read_csv('matched_sentences.csv')[['sentence', 'topic']]
print(labeled_df.head())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode topic names as integer class ids and keep the reverse mapping
# (id -> topic name) for decoding predictions later.
le = LabelEncoder()
labeled_df['label'] = le.fit_transform(labeled_df['topic'])
label2topic = {
    label_id: topic_name
    for label_id, topic_name in zip(labeled_df['label'], labeled_df['topic'])
}
print(label2topic)


In [None]:
from collections import Counter

# Count class frequencies
label_counts = labeled_df['label'].value_counts()

# Classes with exactly one example (the next cell does a stratified split on 'label').
singleton_labels = label_counts.index[(label_counts == 1).to_numpy()].tolist()

# Show the removed classes and their topic names
print("The following topic classes have only one example and will be removed:")
for lbl in singleton_labels:
    topic_name = le.inverse_transform([lbl])[0]
    print(f"  Label: {lbl}, Topic: '{topic_name}'")

# Remove singleton classes from the DataFrame (label ids are NOT re-encoded)
keep_mask = ~labeled_df['label'].isin(singleton_labels)
filtered_df = labeled_df.loc[keep_mask].copy()
print(f"\nKept {len(filtered_df)} sentences across {filtered_df['label'].nunique()} topics.")


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split, stratified on the class id, with a fixed seed.
train_df, val_df = train_test_split(
    filtered_df,
    stratify=filtered_df['label'],
    test_size=0.2,
    random_state=42,
)
print(train_df.shape, val_df.shape)


In [None]:
!pip install transformers datasets accelerate

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


In [None]:
MODEL_NAME = "GroNLP/bert-base-dutch-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(batch):
    """Tokenize the "sentence" field of a batch, padded/truncated to 128 tokens."""
    return tokenizer(
        batch["sentence"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )


# Build the datasets from the (sentence, label) columns and tokenize them in batches.
train_dataset = Dataset.from_pandas(train_df[['sentence', 'label']]).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df[['sentence', 'label']]).map(tokenize_function, batched=True)


In [None]:
import os

# Presumably keeps the Trainer from reporting to Weights & Biases — confirm
# against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

# The head is sized from the unfiltered frame: label ids were assigned before
# the singleton classes were dropped and were not re-encoded afterwards.
num_labels = labeled_df['label'].nunique()
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

training_args = TrainingArguments(
    output_dir="./bertje-finetuned",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a `(logits, labels)` evaluation pair.

    Predictions are the argmax over the last axis of the logits.
    """
    from sklearn.metrics import accuracy_score, f1_score

    scores, gold = eval_pred
    predicted = scores.argmax(-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted, average="weighted")
    return metrics

# Wire the model, training arguments, tokenized datasets and metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the settings in `training_args`
# (3 epochs, output under ./bertje-finetuned).
trainer.train()


In [None]:
import os
# List the contents of the training output directory (e.g. to find the
# checkpoint folder name used by the inference cell).
print(os.listdir('./bertje-finetuned'))


In [None]:
# Colab-only: open the upload dialog; `uploaded` holds whatever files.upload() returns.
# NOTE(review): the next cell reads its workbook by a hard-coded filename rather
# than from `uploaded`.
from google.colab import files
uploaded = files.upload()


In [None]:
# Reload model for inference
from transformers import AutoModelForSequenceClassification, AutoTokenizer

BASE_MODEL = "GroNLP/bert-base-dutch-cased"  # The original model
MODEL_DIR = "./bertje-finetuned/checkpoint-81"

# The tokenizer is loaded from the base model; the checkpoint directory is used
# for the classifier weights only.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# Example: Apply to all sentences in your (unlabeled) policy documents
unseen_policy_df = pd.read_excel("2022_docs_301_to_377 (3)_cleaned.xlsx")

# Split to sentences
# (re-loads the spaCy model and re-defines split_sentences exactly as in the
# earlier cell, so this cell can run on its own)
import spacy
nlp = spacy.load("nl_core_news_sm")


def split_sentences(text):
    """Return the stripped sentence strings spaCy finds in `str(text)`."""
    parsed = nlp(str(text))
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text.strip())
    return sentences

# Flatten the documents into two parallel lists: every sentence and the id of
# the document it came from. Rows with missing text are skipped.
all_sentences, doc_ids = [], []
for i, row in unseen_policy_df.iterrows():
    doc_id = row['filename'] if 'filename' in row else f'doc_{i}'
    text = row['clean_text']
    if pd.isna(text):
        continue
    sents = split_sentences(text)
    all_sentences += sents
    doc_ids += [doc_id] * len(sents)

from transformers import pipeline

# Wrap the fine-tuned classifier in a text-classification pipeline.
# device=0 selects the first GPU when CUDA is available, -1 means CPU.
# NOTE: `torch` is not imported in this cell; it relies on the earlier import.
nlp_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Predict topics
# Inputs are truncated to 128 tokens, the same max_length used in tokenize_function.
results = nlp_pipe(all_sentences, truncation=True, max_length=128, batch_size=32)

# Combine results: one row per sentence with its source document, the raw
# pipeline label and the score.
results_df = pd.DataFrame(
    {
        'document': doc_ids,
        'sentence': all_sentences,
        'label': [prediction['label'] for prediction in results],
        'score': [prediction['score'] for prediction in results],
    }
)

# Decode numeric label to topic name: strip the 'LABEL_' prefix from the
# pipeline label and look the id up in label2topic.
results_df['topic'] = results_df['label'].map(
    lambda raw: label2topic[int(raw.replace('LABEL_', ''))]
)
display(results_df.head())

# Save
from google.colab import files
results_df.to_csv('bertje_topic_predictions.csv', index=False)
files.download('bertje_topic_predictions.csv')


In [None]:
import re
import spacy
from collections import defaultdict
import pandas as pd

# Third copy of the spaCy loader / sentence splitter (same behavior as the
# earlier definitions).
nlp = spacy.load("nl_core_news_sm")


def split_sentences(text):
    """Return the stripped sentence strings spaCy finds in `str(text)`."""
    parsed = nlp(str(text))
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text.strip())
    return sentences

def regex_label_sentence(sent):
    """Label one sentence with the dictionary regexes.

    Returns `(topic, matched_tokens)` when the sentence's tokens match
    patterns of exactly one topic and do so at least twice; otherwise
    `(None, [])`.
    """
    hits = defaultdict(list)
    for word in re.findall(r'\w+', sent, re.UNICODE):
        for matcher, label in compiled_patterns:
            if matcher.fullmatch(word):
                hits[label].append(word)

    if len(hits) == 1:
        (label, words), = hits.items()
        if len(words) >= 2:
            return label, words
    return None, []

# For all sentences: one record per sentence, including sentences the regex
# rule leaves unlabelled (topic_regex is None, regex_words is []).
sent_records = []
for i, row in unseen_policy_df.iterrows():
    doc_id = row['filename'] if 'filename' in row else f'doc_{i}'
    text = row['clean_text']
    if pd.isna(text):
        continue
    for sent in split_sentences(text):
        topic, match_words = regex_label_sentence(sent)
        sent_records.append(
            dict(document=doc_id, sentence=sent, topic_regex=topic, regex_words=match_words)
        )

sent_df = pd.DataFrame(sent_records)


In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

# Set up base BERTje
# (the encoder without a classification head, moved to GPU when CUDA is available)
BASE_MODEL = "GroNLP/bert-base-dutch-cased"
tokenizer_base = AutoTokenizer.from_pretrained(BASE_MODEL)
model_base = AutoModel.from_pretrained(BASE_MODEL).to("cuda" if torch.cuda.is_available() else "cpu")

def bertje_embed(texts, tokenizer, model):
    """Embed each text as the mean of its token vectors from `model`.

    Args:
        texts: iterable of strings; tokenized as one padded batch, truncated
            to 64 tokens.
        tokenizer: callable returning a mapping with an 'attention_mask'
            entry and a `.to(device)` method.
        model: callable with a `.device` attribute whose output exposes
            `last_hidden_state`.

    Returns:
        numpy array with one row per input text.

    The mean is weighted by the attention mask. The original averaged over
    every position, padding included, so a text's embedding depended on the
    longest text in its batch.
    """
    encoded = tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True, max_length=64).to(model.device)
    with torch.no_grad():
        output = model(**encoded)
    hidden = output.last_hidden_state
    # 1 for real tokens, 0 for padding; the extra axis lets it broadcast over the hidden size.
    mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against an all-padding row
    return (token_sum / token_count).cpu().numpy()

# Create topic centroids from seed words (dictionary)
# NOTE(review): the raw dictionary entries are embedded as-is, so any '*'
# wildcard characters are presumably part of the embedded text — confirm this is intended.
embedding_dict = defaultdict(list)
for _, row in dictionary_df.iterrows():
    seed = row['words']
    # Skip non-string cells and blank strings
    if not isinstance(seed, str) or not seed.strip():
        continue
    embedding_dict[row['category']].append(seed)

# One centroid per topic: the mean of its seed-word embeddings.
topic_centroids_base = {}
for topic, words in embedding_dict.items():
    if len(words) == 0:
        print(f"Skipping topic '{topic}' (empty word list)")
        continue
    try:
        seed_vectors = bertje_embed(words, tokenizer_base, model_base)
        topic_centroids_base[topic] = seed_vectors.mean(axis=0)
    except Exception as e:
        # Best effort: report the failing topic and carry on with the rest.
        print(f"Error embedding for topic '{topic}': {e}")
def bertje_label_sentence(sent, centroids, tokenizer, model):
    """Assign `sent` to the topic whose centroid is most cosine-similar.

    Returns `(topic, score)` when the best similarity exceeds 0.65, otherwise
    `(None, score)`. Similarities that are not above 0 never win, so the
    result is `(None, 0)` when no centroid scores positively.
    """
    sent_vec = bertje_embed([sent], tokenizer, model)[0]

    winner, top = None, 0
    for name, centre in centroids.items():
        similarity = cosine_similarity([sent_vec], [centre])[0][0]
        if similarity > top:
            winner, top = name, similarity

    if top > 0.65:
        return winner, top
    return None, top

# Apply to sentences: create both result columns first, then fill them row by
# row (one embedding call per sentence).
sent_df['topic_bertje_base'] = None
sent_df['bertje_base_score'] = None

for i, row in sent_df.iterrows():
    sent_df.at[i, 'topic_bertje_base'], sent_df.at[i, 'bertje_base_score'] = bertje_label_sentence(
        row['sentence'], topic_centroids_base, tokenizer_base, model_base
    )


In [None]:
# Write the per-sentence labels (regex and base-BERTje columns) to CSV and
# trigger a browser download (Colab-only).
sent_df.to_csv('all_sentence_topic_labels.csv', index=False)
from google.colab import files
files.download('all_sentence_topic_labels.csv')


visualisatie
# Visualisation: semantic networks of the dictionary words

In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Choose base or fine-tuned
# BASE_MODEL is a model id; FINETUNED_MODEL is the local checkpoint directory
# that the inference cell also loads.
BASE_MODEL = "GroNLP/bert-base-dutch-cased"
FINETUNED_MODEL = "./bertje-finetuned/checkpoint-81"

def get_word_embeddings(words, model_path, tokenizer_path=None):
    """Embed the unique, non-blank string entries of `words`.

    Args:
        words: iterable of dictionary entries; non-strings and blank strings
            are dropped, duplicates are removed.
        model_path: model id or directory passed to `AutoModel.from_pretrained`.
        tokenizer_path: tokenizer id or directory; defaults to `model_path`.

    Returns:
        `(words, embeddings)`: the kept words in sorted order and a numpy
        array with one row per kept word, in the same order.

    Two fixes over the original:
    * words are sorted, because iterating a `set` of strings gives an
      order that can change between runs;
    * the mean over tokens is weighted by the attention mask, so padding
      added for longer words in the batch no longer affects a word's vector.
    """
    if tokenizer_path is None:
        tokenizer_path = model_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModel.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
    words = sorted({w for w in words if isinstance(w, str) and w.strip()})
    encoded = tokenizer(words, return_tensors='pt', padding=True, truncation=True, max_length=12).to(model.device)
    with torch.no_grad():
        output = model(**encoded)
    hidden = output.last_hidden_state
    # 1 for real tokens, 0 for padding; the extra axis broadcasts over the hidden size.
    mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return words, pooled.cpu().numpy()


# Unique dictionary entries, embedded once with the base model and once with
# the fine-tuned checkpoint (which re-uses the base model's tokenizer).
word_list = list(set(dictionary_df['words']))
words_base, embeddings_base = get_word_embeddings(word_list, BASE_MODEL)
words_ft, embeddings_ft = get_word_embeddings(word_list, FINETUNED_MODEL, tokenizer_path=BASE_MODEL)


def build_semantic_graph(words, embeddings, topic_map, threshold=0.7):
    """Build a similarity graph over `words`.

    Args:
        words: node names; `embeddings[i]` is the vector for `words[i]`.
        embeddings: 2-D array passed to `cosine_similarity`.
        topic_map: mapping word -> topic, stored as the node attribute `topic`.
        threshold: an edge is added when the cosine similarity of a pair is
            strictly greater than this value.

    Returns:
        `networkx.Graph` with one node per word and weighted edges.
    """
    # Imported locally: at module level `nx` is only imported in a later cell,
    # so this function raised NameError when the notebook ran top to bottom.
    import networkx as nx

    G = nx.Graph()
    for w in words:
        G.add_node(w, topic=topic_map[w])
    sims = cosine_similarity(embeddings)
    # Upper triangle only: each unordered pair is considered once, no self-loops.
    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            if sims[i, j] > threshold:
                G.add_edge(words[i], words[j], weight=sims[i, j])
    return G

# Map words to their topics (for a word listed more than once, the last row wins)
topic_map = dict(zip(dictionary_df['words'], dictionary_df['category']))

# One similarity graph per embedding space, both with the default threshold.
G_base = build_semantic_graph(words_base, embeddings_base, topic_map)
G_ft = build_semantic_graph(words_ft, embeddings_ft, topic_map)

def plot_semantic_graph(G, title):
    """Draw graph `G` as an interactive Plotly figure titled `title`.

    Nodes are placed with a seeded spring layout and coloured by topic.
    Reads the module-level `topic_map` (word -> topic) and `dictionary_df`
    (for the full topic list), so every node of `G` must be a key of
    `topic_map`.
    """
    # Imported locally: at module level `nx` and `go` are only imported in a
    # later cell, so this function raised NameError on a top-to-bottom run.
    import networkx as nx
    import plotly.graph_objects as go

    pos = nx.spring_layout(G, k=0.85, iterations=40, seed=44)

    # Edges are drawn as one line trace; `None` separates the segments.
    edge_x, edge_y = [], []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.7, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    node_x, node_y, node_color, node_text = [], [], [], []
    topics = list(set(dictionary_df['category']))
    # Evenly spaced hues, assigned in sorted topic order so colours are stable.
    topic_colors = {topic: f"hsl({i*360//len(topics)},60%,60%)" for i, topic in enumerate(sorted(topics))}
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        color = topic_colors[topic_map[node]]
        node_color.append(color)
        node_text.append(f"{node} ({topic_map[node]})")

    node_trace = go.Scatter(
        x=node_x, y=node_y, mode='markers+text',
        text=node_text, hoverinfo='text',
        marker=dict(color=node_color, size=13)
    )

    fig = go.Figure([edge_trace, node_trace])
    fig.update_layout(
        title=title,
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20,l=5,r=5,t=40),
        annotations=[dict(
            text="Edges = high similarity (> threshold). Color = topic. Tooltip = word+topic",
            showarrow=False, xref="paper", yref="paper", x=0, y=1.1, font=dict(size=12))]
    )
    fig.show()

# Render the two networks one after the other for visual comparison.
plot_semantic_graph(G_base, "Base BERTje Semantic Network of Dictionary Words")
plot_semantic_graph(G_ft, "Fine-tuned BERTje Semantic Network of Dictionary Words")


In [None]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go

# Star-shaped network of the regex dictionary: every word is linked to its topic.
# Example: dictionary_df = pd.DataFrame({'category': [...], 'words': [...]})

G = nx.Graph()
# Evenly spaced hues, one per topic, assigned in sorted topic order.
# NOTE: `topic_colors` is also read by the next cell.
topic_colors = {topic: f"hsl({i*360//len(set(dictionary_df['category']))},60%,60%)"
                for i, topic in enumerate(sorted(dictionary_df['category'].unique()))}

# Add nodes
for idx, row in dictionary_df.iterrows():
    topic = row['category']
    word = row['words']
    G.add_node(word, topic=topic)

# Link each word to its topic
# (topic nodes are created implicitly by add_edge)
# NOTE(review): a word spelled exactly like a topic name would share that
# topic's node — confirm this cannot happen in the dictionary.
for topic in dictionary_df['category'].unique():
    words = dictionary_df[dictionary_df['category'] == topic]['words'].tolist()
    for word in words:
        G.add_edge(topic, word)

# Layout
pos = nx.spring_layout(G, k=0.5, iterations=30, seed=42)
# Edge coordinates for a single line trace; `None` separates the segments.
edge_x, edge_y = [], []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

# Edge trace
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node trace
node_x, node_y, node_color, node_text = [], [], [], []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    # Word nodes take their topic's colour; any other node is a topic hub, drawn black.
    if node in dictionary_df['words'].values:
        # A word listed under several topics is coloured by its first row.
        topic = dictionary_df[dictionary_df['words'] == node]['category'].values[0]
        color = topic_colors[topic]
        text = f"{node} ({topic})"
    else:
        color = 'black'
        text = f"Topic: {node}"
    node_color.append(color)
    node_text.append(text)

node_trace = go.Scatter(
    x=node_x, y=node_y, mode='markers+text',
    text=node_text, hoverinfo='text',
    marker=dict(color=node_color, size=12, line=None))

# Assemble and show the figure.
fig = go.Figure([edge_trace, node_trace])
fig.update_layout(
    title='Regex Dictionary Topic-Word Network',
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20,l=5,r=5,t=40),
    annotations=[dict(text="Each color is a topic; tooltip shows the word", showarrow=False, xref="paper", yref="paper", x=0, y=1.1, font=dict(size=12))]
)
fig.show()


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Choose which BERTje to use (pretrained or fine-tuned)
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names to the
# bare encoder chosen here.
MODEL_PATH = "GroNLP/bert-base-dutch-cased"  # or "./bertje-finetuned"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def bertje_embed(words, tok=None, mdl=None):
    """Embed each entry of `words` as the mean of its token vectors.

    Args:
        words: iterable of strings; tokenized as one padded batch, truncated
            to 12 tokens.
        tok: optional tokenizer; defaults to the module-level `tokenizer`.
        mdl: optional model; defaults to the module-level `model` on the
            module-level `device`. An explicit model is used on its own
            `.device`.

    Returns:
        numpy array with one row per input string.

    Two fixes over the original:
    * this definition replaces the earlier `bertje_embed(texts, tokenizer,
      model)`; the optional arguments keep three-argument positional calls
      (as made by `bertje_label_sentence`) working;
    * the mean is weighted by the attention mask, so padding positions no
      longer affect the result.
    """
    if mdl is None:
        mdl, target_device = model, device
    else:
        target_device = mdl.device
    if tok is None:
        tok = tokenizer

    encoded = tok(list(words), return_tensors='pt', padding=True, truncation=True, max_length=12).to(target_device)
    with torch.no_grad():
        output = mdl(**encoded)
    hidden = output.last_hidden_state
    # 1 for real tokens, 0 for padding; the extra axis broadcasts over the hidden size.
    mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().numpy()

# Embed every dictionary entry (duplicates included) in one batch.
word_list = dictionary_df['words'].tolist()
embeddings = bertje_embed(word_list)

# Pairwise similarity
sim_matrix = cosine_similarity(embeddings)

# Build network
# NOTE: `nx`, `go` and `topic_colors` are not defined in this cell; they come
# from the previous cell.
G = nx.Graph()
for idx, row in dictionary_df.iterrows():
    G.add_node(row['words'], topic=row['category'])

threshold = 0.7  # Tune for sparsity/density
# Only pairs with i < j are considered, so each unordered pair is checked once.
for i, word1 in enumerate(word_list):
    for j, word2 in enumerate(word_list):
        if i < j and sim_matrix[i, j] > threshold:
            G.add_edge(word1, word2, weight=sim_matrix[i, j])

# Layout and plot as above
pos = nx.spring_layout(G, k=0.7, iterations=30, seed=42)
# NOTE(review): `edge_text` is filled below but never passed to a trace.
edge_x, edge_y, edge_text = [], [], []
for edge in G.edges(data=True):
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]
    edge_text.append(f"{edge[0]} ↔ {edge[1]}<br>Similarity: {edge[2]['weight']:.2f}")

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.7, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x, node_y, node_color, node_text = [], [], [], []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    # A word listed under several topics is coloured by its first row.
    topic = dictionary_df[dictionary_df['words'] == node]['category'].values[0]
    color = topic_colors[topic]
    node_color.append(color)
    node_text.append(f"{node} ({topic})")

node_trace = go.Scatter(
    x=node_x, y=node_y, mode='markers+text',
    text=node_text, hoverinfo='text',
    marker=dict(color=node_color, size=12))

fig = go.Figure([edge_trace, node_trace])
fig.update_layout(
    # The title says 'Pretrained' only when MODEL_PATH ends with 'dutch-cased'.
    title=f"BERTje Topic-Word Network ({'Pretrained' if MODEL_PATH.endswith('dutch-cased') else 'Fine-tuned'})",
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20,l=5,r=5,t=40),
)
fig.show()


In [None]:
from IPython.display import display, HTML

def highlight_text_spans(text, matches, color_map):
    """Render `text` as HTML with the matched spans highlighted.

    text: original string
    matches: list of dicts [{'start':int, 'end':int, 'topic':str, 'words':[str]}]
    color_map: topic->color hex
    Returns HTML for display.

    Three fixes over the original:
    * text and tooltip content are HTML-escaped, so characters such as
      `<`, `&` or quotes in the source text cannot break the markup;
    * the tooltip uses a newline instead of `<br>`, which is not rendered
      inside a `title` attribute;
    * overlapping matches are clipped to start where the previous span
      ended (and dropped if nothing is left), so no text is emitted twice.
    """
    import html  # stdlib; local so the function does not rely on another cell

    last_idx = 0
    html_chunks = []
    for m in sorted(matches, key=lambda x: x['start']):
        start = max(m['start'], last_idx)
        end = m['end']
        if end <= start:
            # Empty span, or fully covered by the previous one.
            continue
        if start > last_idx:
            html_chunks.append(html.escape(text[last_idx:start]))
        topic = m['topic']
        color = color_map[topic]
        flagged_words = ", ".join(m['words'])
        tooltip = html.escape(f"Topic: {topic}\nMatched: {flagged_words}", quote=True)
        chunk = f"<span style='background:{color};' title='{tooltip}'>{html.escape(text[start:end])}</span>"
        html_chunks.append(chunk)
        last_idx = end
    html_chunks.append(html.escape(text[last_idx:]))
    return "".join(html_chunks)


In [None]:
# Distinct topic names, in order of first appearance in the dictionary.
topics = dictionary_df['category'].unique().tolist()
import colorsys
def color_palette(n):
    """Return `n` light HSL colour strings with evenly spaced hues.

    Hue `i` is `int(i*360/n)`; saturation and lightness are fixed at
    60% and 85%. `n == 0` yields an empty list.
    """
    hues = (int(i*360/n) for i in range(n))
    return [f"hsl({hue},60%,85%)" for hue in hues]
# topic -> highlight colour, pairing topics with palette entries by position.
color_map = dict(zip(topics, color_palette(len(topics))))
