### **Mount Google Drive**
(This lets me save files directly to my Drive.)

In [1]:
# Mount Google Drive at /content/drive; later cells write their output files
# under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### **Download the JSON Lemma List**



In [2]:
import requests
import json

# URL for the JSON lemma list
lemmalist_url = "https://anw.ivdnt.org/backend/lemmalist?output=json"

# Download the JSON lemma list.
# An explicit timeout is passed because requests waits indefinitely by default,
# which would leave this cell hanging forever if the server stalls.
response = requests.get(lemmalist_url, timeout=60)
response.raise_for_status()  # Stop if there is an error

# Later cells index each entry positionally: entry[0] is read as the lemma and
# entry[-1] as the "has no definition" flag.
lemmalist = response.json()

# Print out the total number of lemmas to confirm download
print(f"Total lemmas in the list: {len(lemmalist)}")

Total lemmas in the list: 87196


In [None]:
# Peek at the first ten entries to see how each lemma record is laid out.
lemmalist[:10]

## Test Set

### **Extract Definitions**

This cell loops over the first 100 lemmas, skips those flagged as having no definition, fetches the XML page of each remaining lemma, extracts definitions from both `<Kernbetekenis>` and `<Subbetekenis>` elements, and stores the results.

In [19]:
# Index into lemmalist at which the scraping loop starts.
# If resuming after a failure, set start_index to the lemma index where you left off.
start_index = 0

# Maximum number of lemma-list entries to walk through, counted from start_index
# (entries flagged as having no definition count towards this limit as well).
# Set test_limit to a number for a partial run, or None to run through all lemmas.
test_limit = 100  # Test run: only the first 100 entries are considered.


In [18]:
import time
from urllib.parse import quote  # percent-encodes lemmas before they are placed in the URL path

from lxml import etree  # lxml is more tolerant than xml.etree.ElementTree

# Seconds to wait on a single article request before giving up on that lemma.
request_timeout = 10

results = []  # This list will hold our output rows (one row per meaning).
# List to track lemmas that are expected to have definitions (per JSON) but yield none.
missing_definitions = []

# Exclusive upper bound of the index range handled in this run.
stop_index = len(lemmalist)
if test_limit is not None:
    stop_index = min(stop_index, start_index + test_limit)

# Loop through lemmas from the specified start index.
for i in range(start_index, stop_index):
    entry = lemmalist[i]
    lemma = entry[0]
    has_no_definition = entry[-1]  # true means no definition available

    # Skip lemmas that JSON says don't have a definition.
    if has_no_definition:
        continue

    print(f"Processing lemma: {lemma} (index {i})")

    # Fetch the XML for this lemma.
    # The lemma must be percent-encoded: characters such as '#', '?' or '/' would
    # otherwise be interpreted as URL syntax (e.g. "#MeToo" becomes a fragment and
    # the request silently goes to a different resource).
    encoded_lemma = quote(lemma, safe="")
    article_url = f"https://anw.ivdnt.org/article/{encoded_lemma}?output=xml"
    try:
        xml_resp = requests.get(article_url, timeout=request_timeout)
    except requests.RequestException as e:
        # A network hiccup on one lemma should not abort the whole run.
        print(f"Failed to fetch XML for {lemma}: {e}")
        missing_definitions.append(lemma)
        continue

    if xml_resp.status_code != 200:
        print(f"Failed to fetch XML for {lemma}")
        missing_definitions.append(lemma)
        continue

    try:
        # Use lxml's parser in recover mode to handle unescaped characters.
        parser = etree.XMLParser(recover=True)
        root = etree.fromstring(xml_resp.content, parser=parser)
    except Exception as e:
        print(f"Error parsing XML for {lemma}: {e}")
        missing_definitions.append(lemma)
        continue

    # In recover mode the parser can hand back None when nothing usable was found.
    if root is None:
        print(f"Error parsing XML for {lemma}: no recoverable XML content")
        missing_definitions.append(lemma)
        continue

    # Get the article-level lemma_id from the root element's pid attribute.
    article_lemma_id = root.attrib.get("pid", "")

    # Extract meanings from both <Kernbetekenis> and <Subbetekenis>.
    meaning_elements = root.xpath(".//Kernbetekenis") + root.xpath(".//Subbetekenis")

    if not meaning_elements:
        # If no meaning elements are found, track this lemma.
        missing_definitions.append(lemma)
    else:
        for me in meaning_elements:
            # Get POS from <Woordsoort>/<Type> (if present).
            pos_list = me.xpath(".//Woordsoort/Type/text()")
            pos = " ".join(pos_list).strip() if pos_list else ""

            # Get the meaning number from the element's id attribute (e.g., "bet1.0").
            betekenisnummer = me.attrib.get("id", "")
            if betekenisnummer.startswith("bet"):
                betekenisnummer = betekenisnummer[3:]

            # Get the meaning ID from the element's pid attribute.
            betekenis_id = me.attrib.get("pid", "")

            # Get the full definition (long) from <Definitie> (all nested text).
            # NOTE(review): ".//" also matches <Definitie> nodes of any meaning element
            # nested inside this one — confirm against the ANW XML structure.
            definitie_text = " ".join(me.xpath(".//Definitie//text()")).strip()
            # Get the mini definition (short) from <Minidefinitie> (all nested text).
            minidef_text = " ".join(me.xpath(".//Minidefinitie//text()")).strip()

            results.append([lemma, pos, betekenisnummer, article_lemma_id, betekenis_id, definitie_text, minidef_text])

    # Small pause between successful requests to stay polite to the server.
    time.sleep(0.1)

# Report the index range that was actually covered (the last index is inclusive).
if stop_index > start_index:
    print(f"Processed records for lemmas with indices {start_index} to {stop_index - 1}. Total rows: {len(results)}")
else:
    print(f"No lemmas in the requested range. Total rows: {len(results)}")

# Verify that every lemma expected to have a definition (per JSON) is in the results.
# Define the end index based on test_limit.
end_index = start_index + test_limit if test_limit is not None else None
expected_lemmas = {entry[0] for entry in lemmalist[start_index:end_index] if not entry[-1]}
actual_lemmas = {row[0] for row in results}
lemmas_missing_in_results = expected_lemmas - actual_lemmas

if lemmas_missing_in_results:
    print("The following lemmas were expected to have definitions but are missing in the results:")
    print(lemmas_missing_in_results)
else:
    print("All lemmas expected to have definitions are present in the results.")

print("Lemmas with issues during processing (missing definitions):")
print(missing_definitions)


Processing lemma: #MeToo (index 0)
Processing lemma: 06-dealer (index 2)
Processing lemma: 06-nummer (index 3)
Processing lemma: 06-prostitutie (index 4)
Processing lemma: 1,5 metereconomie (index 7)
Processing lemma: 1,5 metermaatschappij (index 8)
Processing lemma: 1,5 metersamenleving (index 9)
Processing lemma: 1 april (index 10)


KeyboardInterrupt: 

### **Save the Results to a TSV File**

In [15]:
import csv
import os  # Used for building the output path and creating the target folder.

# Destination on the mounted Drive; the folder is created on first use.
output_folder = "/content/drive/My Drive/Colab Notebooks/Thesis"
os.makedirs(output_folder, exist_ok=True)
output_tsv = os.path.join(output_folder, "anw_definitions_test.tsv")

# Column names, in the same order as the fields of every row in `results`.
header = (
    "Lemma",            # The headword
    "POS",              # Part of Speech
    "MeaningNumber",    # e.g., 1.0 or 1.1 (taken from the meaning element's id)
    "LemmaID",          # Article-level ID
    "MeaningID",        # ID of the individual meaning element
    "DefinitionFull",   # Full (long) definition text
    "DefinitionShort",  # Mini (short) definition text
)

# newline="" lets the csv module control line endings itself.
with open(output_tsv, mode="w", encoding="utf-8", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    writer.writerows([header, *results])

print(f"Results written to {output_tsv}")


Results written to /content/drive/My Drive/Colab Notebooks/Thesis/anw_definitions_test.tsv


## Full Dataset

In [32]:
# Index into lemmalist at which the full run starts.
# Note: the batch-processing cell overrides this value with the contents of
# checkpoint.txt whenever that file exists in the output folder.
start_index = 0

# Set test_limit to a number for a partial run, or None to run through all lemmas.
test_limit = None  # None = complete run; set an integer (e.g. 100) to cap the run while testing.

# Define batch size for processing (number of lemmas per batch).
batch_size = 1000  # Adjust based on your needs; a larger batch means fewer checkpoints


### **Set Up Environment & Helper Function**

In [33]:
import os
import time
import csv
from lxml import etree
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Shared HTTP session for all article requests. Requests that fail with one of
# the listed status codes are retried (up to 5 times in total, with backoff)
# before the error reaches the caller.
session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
# Register the same retrying adapter for both URL schemes.
for url_prefix in ("http://", "https://"):
    session.mount(url_prefix, adapter)

def process_batch(lemmalist, start_index, batch_size):
    """
    Fetch and parse the ANW article XML for one batch of lemmas.

    Entries are taken from ``lemmalist[start_index:start_index + batch_size]``.
    If the notebook-level ``test_limit`` is not None, the batch is additionally
    cut off at ``start_index + test_limit``. Requests go through the
    notebook-level ``session``.

    Args:
      lemmalist: sequence of lemma entries; ``entry[0]`` is used as the lemma
        and ``entry[-1]`` as the "has no definition" flag (truthy = skip).
      start_index: index of the first entry of this batch.
      batch_size: maximum number of entries to look at.

    Returns:
      batch_results: list of rows (one per meaning) with:
         [Lemma, POS, MeaningNumber, LemmaID, MeaningID, DefinitionFull, DefinitionShort]
      batch_missing: list of lemmas (strings) that were expected to have
        definitions but yielded none (fetch failure, parse failure, or no
        meaning elements in the XML).
    """
    # Local import keeps this function usable without touching the import cell.
    from urllib.parse import quote

    batch_results = []
    batch_missing = []
    # If test_limit is set, restrict the end index further.
    end_index = start_index + batch_size
    if test_limit is not None:
        end_index = min(end_index, start_index + test_limit)

    for i, entry in enumerate(lemmalist[start_index:end_index], start=start_index):
        lemma = entry[0]
        has_no_definition = entry[-1]  # True means JSON says no definition

        # Skip lemmas that JSON indicates don't have a definition.
        if has_no_definition:
            continue

        print(f"Processing lemma: {lemma} (index {i})")

        # Percent-encode the lemma: characters like '#', '?' or '/' would otherwise
        # be parsed as URL syntax (e.g. "#MeToo" turns into a fragment and the
        # request goes to a different resource).
        encoded_lemma = quote(lemma, safe="")
        article_url = f"https://anw.ivdnt.org/article/{encoded_lemma}?output=xml"
        try:
            xml_resp = session.get(article_url, timeout=10)
            xml_resp.raise_for_status()
        except Exception as e:
            # Best effort: record the lemma and carry on with the rest of the batch.
            print(f"Failed to fetch XML for {lemma}: {e}")
            batch_missing.append(lemma)
            continue

        try:
            # Recover mode tolerates malformed XML such as unescaped characters.
            parser = etree.XMLParser(recover=True)
            root = etree.fromstring(xml_resp.content, parser=parser)
        except Exception as e:
            print(f"Error parsing XML for {lemma}: {e}")
            batch_missing.append(lemma)
            continue

        # In recover mode the parser can hand back None when nothing usable was found.
        if root is None:
            print(f"Error parsing XML for {lemma}: no recoverable XML content")
            batch_missing.append(lemma)
            continue

        # Get article-level lemma ID from the root element's pid attribute.
        article_lemma_id = root.attrib.get("pid", "")
        # Extract meanings from both <Kernbetekenis> and <Subbetekenis>.
        meaning_elements = root.xpath(".//Kernbetekenis") + root.xpath(".//Subbetekenis")

        if not meaning_elements:
            batch_missing.append(lemma)
        else:
            for me in meaning_elements:
                # Part of speech from <Woordsoort>/<Type>, if present.
                pos_list = me.xpath(".//Woordsoort/Type/text()")
                pos = " ".join(pos_list).strip() if pos_list else ""
                # Meaning number: the id attribute without its "bet" prefix.
                betekenisnummer = me.attrib.get("id", "")
                if betekenisnummer.startswith("bet"):
                    betekenisnummer = betekenisnummer[3:]
                betekenis_id = me.attrib.get("pid", "")
                # Long and short definition: all nested text joined with spaces.
                definitie_text = " ".join(me.xpath(".//Definitie//text()")).strip()
                minidef_text = " ".join(me.xpath(".//Minidefinitie//text()")).strip()
                batch_results.append([lemma, pos, betekenisnummer, article_lemma_id, betekenis_id, definitie_text, minidef_text])

        # Small pause between successful requests to stay polite to the server.
        time.sleep(0.1)

    return batch_results, batch_missing


### **Batch Processing Loop with Checkpointing**

In [34]:
# Parameters and output file paths
total_lemmas = len(lemmalist)
output_folder = "/content/drive/My Drive/Colab Notebooks/Thesis"
os.makedirs(output_folder, exist_ok=True)
output_tsv = os.path.join(output_folder, "definitions.tsv")
checkpoint_file = os.path.join(output_folder, "checkpoint.txt")
missing_file = os.path.join(output_folder, "missing_definitions.txt")


def _save_missing(lemmas, path):
    """Write the de-duplicated, sorted lemmas to `path`, one per line."""
    with open(path, "w", encoding="utf-8") as mf:
        for lemma in sorted(set(lemmas)):
            mf.write(lemma + "\n")


# Determine starting point from checkpoint, if available.
# A checkpoint, when present, takes precedence over the configured start_index.
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, "r") as cp:
        try:
            start_index = int(cp.read().strip())
            print(f"Resuming from index {start_index}")
        except ValueError:
            # Only a malformed number is tolerated here; other errors should surface.
            print("Could not parse checkpoint; starting from 0.")
            start_index = 0

# Determine end_index based on test_limit (if set); never run past the list.
if test_limit is not None:
    end_index = min(start_index + test_limit, total_lemmas)
else:
    end_index = total_lemmas

# Start a fresh file (with header) when starting from 0, or when there is no
# existing output to append to; otherwise append to what earlier runs wrote.
tsv_has_content = os.path.exists(output_tsv) and os.path.getsize(output_tsv) > 0
write_header = start_index == 0 or not tsv_has_content
mode = "w" if write_header else "a"

all_missing = []  # To collect missing definitions across all batches.
# When resuming, keep the lemmas that earlier batches already reported as missing.
if not write_header and os.path.exists(missing_file):
    with open(missing_file, "r", encoding="utf-8") as mf:
        all_missing.extend(line.rstrip("\n") for line in mf if line.strip())

with open(output_tsv, mode=mode, encoding="utf-8", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    if write_header:
        writer.writerow(["Lemma", "POS", "MeaningNumber", "LemmaID", "MeaningID", "DefinitionFull", "DefinitionShort"])

    # Process batches until we reach end_index (end of the list, or test_limit if set).
    for batch_start in range(start_index, end_index, batch_size):
        # The last batch may be shorter so that the run stops exactly at end_index.
        current_batch_size = min(batch_size, end_index - batch_start)

        print(f"Processing batch starting at index {batch_start}")
        batch_results, batch_missing = process_batch(lemmalist, batch_start, current_batch_size)
        writer.writerows(batch_results)
        all_missing.extend(batch_missing)

        # Persist the rows and the missing list BEFORE advancing the checkpoint, so
        # that a crash can never leave a checkpoint pointing past unsaved data.
        tsvfile.flush()
        os.fsync(tsvfile.fileno())
        _save_missing(all_missing, missing_file)

        # Update checkpoint after each batch with the next index to process.
        current_index = batch_start + current_batch_size
        with open(checkpoint_file, "w") as cp:
            cp.write(str(current_index))
        print(f"Batch complete. Checkpoint updated to {current_index}.")

print("All batches processed.")

# Write out missing definitions to a file (also covers runs with zero batches).
_save_missing(all_missing, missing_file)

print(f"Results written to {output_tsv}")
print(f"Lemmas missing definitions have been saved to {missing_file}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing lemma: toesnibben (index 71179)
Processing lemma: toestel (index 71187)
Processing lemma: toeter (index 71194)
Processing lemma: toeters en bellen (index 71196)
Processing lemma: toetje (index 71198)
Processing lemma: toetsenbordterrorist (index 71210)
Processing lemma: toeval (index 71225)
Processing lemma: toevalsfactor (index 71229)
Processing lemma: toezichthouder (index 71258)
Processing lemma: TOFA (index 71262)
Processing lemma: tofoe (index 71264)
Processing lemma: tofu (index 71265)
Processing lemma: toiletangst (index 71268)
Processing lemma: toiletbezoek (index 71269)
Processing lemma: toiletbezoeker (index 71270)
Processing lemma: toiletborstel (index 71271)
Processing lemma: toiletbril (index 71272)
Processing lemma: toiletpapier (index 71277)
Processing lemma: toiletpapiertje (index 71278)
Processing lemma: toiletpot (index 71279)
Processing lemma: toiletrol (index 71280)
Processing lemma: toiletr

## Convert to a Hugging Face Dataset

In [35]:
!pip install datasets  # If you haven't already installed HF datasets

import pandas as pd
from datasets import Dataset, DatasetDict

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [36]:
# Path of the TSV to convert. The scraping cell writes 'definitions.tsv'; this
# points at 'definitions_run1.tsv' (presumably a renamed copy of that output —
# adjust the path to wherever your file is located).
tsv_path = "/content/drive/My Drive/Colab Notebooks/Thesis/definitions_run1.tsv"

df = pd.read_csv(
    tsv_path,
    sep="\t",
    # Meaning numbers are labels, not numbers: as floats, "1.10" would collapse to 1.1.
    dtype={"MeaningNumber": str},
    # Only treat truly empty cells as missing. With pandas' default NA tokens,
    # text cells that literally read "NA", "null", "nan", ... would be dropped to NaN.
    keep_default_na=False,
    na_values=[""],
)

print("DataFrame columns:", df.columns.tolist())
print("Number of rows:", len(df))
df.head()


DataFrame columns: ['Lemma', 'POS', 'MeaningNumber', 'LemmaID', 'MeaningID', 'DefinitionFull', 'DefinitionShort']
Number of rows: 43217


Unnamed: 0,Lemma,POS,MeaningNumber,LemmaID,MeaningID,DefinitionFull,DefinitionShort
0,06-dealer,substantief,1.0,190832,190834,drugsdealer die via zijn mobiele telefoon een ...,drugsdealer die via zijn mobiel werkt
1,06-nummer,substantief,1.0,207,208,nummer van een mobiele telefoon; mobieletelefo...,mobieletelefoonnummer
2,06-prostitutie,substantief,1.0,285832,285833,vorm van illegale prostitutie waarbij prostitu...,prostitutie via een mobiele telefoon
3,"1,5 metereconomie",substantief,1.0,902410,907694,economisch systeem waarin mensen die niet tot ...,economie waarin fysieke afstand de norm is
4,"1,5 metermaatschappij",substantief,1.0,909355,909359,maatschappij waarin mensen die niet tot hetzel...,maatschappij waarin fysieke afstand nodig is


In [37]:
# Convert the entire DataFrame to a Dataset
hf_dataset = Dataset.from_pandas(df)

# Display the dataset summary (feature names and number of rows).
hf_dataset


Dataset({
    features: ['Lemma', 'POS', 'MeaningNumber', 'LemmaID', 'MeaningID', 'DefinitionFull', 'DefinitionShort'],
    num_rows: 43217
})

In [38]:
hf_dataset[0]  # Inspect the first record (shown as a dict of column name -> value)


{'Lemma': '06-dealer',
 'POS': 'substantief',
 'MeaningNumber': '1.0',
 'LemmaID': 190832,
 'MeaningID': 190834,
 'DefinitionFull': 'drugsdealer die via zijn mobiele telefoon een bestelling voor drugs doorkrijgt en deze vervolgens op afspraak aflevert',
 'DefinitionShort': 'drugsdealer die via zijn mobiel werkt'}

In [40]:
# Save the dataset to Drive in the Arrow format (the default for HF Datasets),
# so it can be reloaded later with load_from_disk.
hf_dataset.save_to_disk("/content/drive/My Drive/Colab Notebooks/Thesis/definitions_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/43217 [00:00<?, ? examples/s]

In [None]:
# Reload the dataset from the folder written by save_to_disk and display its
# summary as a quick check.
from datasets import load_from_disk
loaded_dataset = load_from_disk("/content/drive/My Drive/Colab Notebooks/Thesis/definitions_dataset")
loaded_dataset


In [42]:
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [43]:
# Upload the dataset to the Hugging Face Hub under the given repo id.
# private=True requests a private dataset repository.
hf_dataset.push_to_hub("RobbedoesHF/definitions-dataset1", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/RobbedoesHF/definitions-dataset1/commit/d9f3954b9f07a39dd3e70d964d835cff6ff61e66', commit_message='Upload dataset', commit_description='', oid='d9f3954b9f07a39dd3e70d964d835cff6ff61e66', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/RobbedoesHF/definitions-dataset1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='RobbedoesHF/definitions-dataset1'), pr_revision=None, pr_num=None)