# Dutch Corpora

### Install tempo-embeddings

In [1]:
# Editable install of the package from the parent directory of this notebook.
%pip install -U -e ..

# Optional: install tempo-embeddings from GitHub instead of the local checkout.
# The URL can also refer to a specific version or branch.

# %pip install --upgrade pip  # Required for properly resolving dependencies
# %pip uninstall -y tempo_embeddings  # Remove existing installation
# %pip install --upgrade git+https://github.com/Semantics-of-Sustainability/tempo-embeddings.git

Obtaining file:///Users/carstenschnober/Documents/SemanticsOfSustainability/workspace/tempo-embeddings
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Collecting weaviate-client~=4.6.5 (from tempo_embeddings==0.0.1)
  Using cached weaviate_client-4.6.7-py3-none-any.whl.metadata (3.3 kB)
Using cached weaviate_client-4.6.7-py3-none-any.whl (328 kB)
Building wheels for collected packages: tempo_embeddings
  Building editable for tempo_embeddings (pyproject.toml) ... [?25ldone
[?25h  Created wheel for tempo_embeddings: filename=tempo_embeddings-0.0.1-0.editable-py3-none-any.whl size=9565 sha256=9b3a402d87942ec563d815b05d6300b7b5e18446018b71a2754bb4080bc4617b
  Stored in directory: /private/var/folders/d8/j5_fyf8941j_492zvf8948y40000gn/T/pip-ephem-wheel-cache-_f8bfysn/wheels/9d/dd/1c/d0e8ab

## Load Data

The data needs to be downloaded and provided in the path configured in the next cell.

NOTE: You have to manually adapt the `DATA_DIR` below.

In [5]:
import operator
from functools import reduce

from tqdm import tqdm

from tempo_embeddings.text.corpus import Corpus

In [6]:
WINDOW_SIZE = 300  # Size of passages in characters
# For now, this parameter overrides the window size: instead of splitting the text
# arbitrarily into WINDOW_SIZE chunks, each sentence becomes a Passage.
USE_FULL_SENTENCES = False

# Maximum number of files to sample per collection; 0 disables sampling.
RANDOM_SAMPLE_ANP = 200
RANDOM_SAMPLE_STATEN_GENERAAL = 200

# Staten Generaal files whose name contains one of these substrings are skipped.
STATEN_GENERAAL_BLACKLIST = ["1987"]

# Search term(s) for filtering the corpus.
# NOTE(review): presumably only text matching one of the terms becomes a Passage;
# confirm against Corpus.from_csv_file().
FILTER_TERMS = [
    "duurzaam",
    "milieu",
]
# The default above is replaced by the word list file (one term per line).
# The context manager closes the file handle; surrounding whitespace is stripped
# and blank lines are dropped so that no empty or padded term ends up in the filter.
with open("sustainability-filter-words.txt") as filter_words_file:
    FILTER_TERMS = [
        term.strip() for term in filter_words_file.read().split("\n") if term.strip()
    ]
FILTER_TERMS

['milieuproblemen',
 'Milieunormen',
 'Milieubeweging ',
 'Milieu-aspecten',
 'milieueffecten',
 'Milieumaatregelen',
 'Milieuvriendelijk ',
 'Milieubeleid',
 'milieuoogpunt',
 'koolzuur',
 'koolzuurgas',
 'stikstofdioxide',
 'zwaveldioxide',
 'isolatie',
 'Rookgassen',
 'verzuring',
 'Zuinig',
 'vervuilde',
 'niet-vervuilde',
 'luchtvervuiling',
 'luchtverontreiniging',
 'energieverbruik',
 'electriciteitsverbruik',
 'energieverspilling',
 'isolatie',
 'energieverslindend',
 'heffing',
 'Broeikaseffect',
 'broeikas-effect',
 'broeikasgassen',
 'Energiebesparingsmogelijkheden',
 'CO2-uitstoot',
 'wereldklimaat']

In [7]:
## NOTE(review): this cell used to ask for a manual `DATA_DIR` change, but no `DATA_DIR`
## exists in this notebook; the corpus location is taken from
## `tempo_embeddings.settings.CORPUS_DIR` -- confirm how that value is configured.
## For a shared Google Drive, create a shortcut into your own Google Drive
## See https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab

from tempo_embeddings.settings import CORPUS_DIR

# Detect Google Colab by trying to import its runtime package.
try:
    import google.colab  # noqa: F401
except ModuleNotFoundError:
    IN_COLAB = False
else:
    IN_COLAB = True

# Fail early with an actionable message instead of a bare AssertionError.
assert (
    CORPUS_DIR is not None
), "CORPUS_DIR is None: no corpus directory is configured in tempo_embeddings.settings."

## Load Model

In [13]:
from tempo_embeddings.embeddings.model import (
    EmbeddingsMethod,
    SentenceTransformerModelWrapper,
)

### Sentence Transformers
# Wrapper class and checkpoint used for computing the embeddings;
# the commented-out name is an alternative checkpoint.
model_class = SentenceTransformerModelWrapper
MODEL_NAME = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
# MODEL_NAME = "textgain/allnli-GroNLP-bert-base-dutch-cased"

# Extra keyword arguments forwarded to from_pretrained().
kwargs = dict(accelerate=True)

model = model_class.from_pretrained(MODEL_NAME, **kwargs)
model.embeddings_method = EmbeddingsMethod.MEAN
model

<tempo_embeddings.embeddings.model.SentenceTransformerModelWrapper at 0x334782ed0>

## Database Access

In [14]:
import weaviate
from tempo_embeddings.embeddings import WeaviateDatabaseManager

# Connect to the local Weaviate instance on port 8087 and wrap the client,
# together with the embedding model, in the project's database manager.
weaviate_client = weaviate.connect_to_local(port=8087)
db = WeaviateDatabaseManager(client=weaviate_client, model=model)

12:04:57 INFO:HTTP Request: GET http://localhost:8087/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"


12:04:57 INFO:HTTP Request: GET http://localhost:8087/v1/meta "HTTP/1.1 200 OK"
12:04:57 INFO:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
12:04:57 INFO:HTTP Request: GET http://localhost:8087/v1/schema/TempoEmbeddings "HTTP/1.1 200 OK"
12:04:57 INFO:HTTP Request: GET http://localhost:8087/v1/schema/TempoEmbeddings "HTTP/1.1 200 OK"


### ANP

In [15]:
# The collection name doubles as the sub-directory name below CORPUS_DIR.
COLLECTION_NAME = "ANP"
ANP_DIR = CORPUS_DIR.joinpath(COLLECTION_NAME)

# The directory is only required when the ANP sample size is non-zero.
assert (
    RANDOM_SAMPLE_ANP == 0 or ANP_DIR.is_dir()
), f"{ANP_DIR} not found."

In [16]:
import random

# Fixed seed so that the random file sample below is reproducible.
random.seed(0)

# glob() yields files in a filesystem-dependent order; sorting first makes the
# seeded sample (and the preview below) identical across machines and runs.
anp_files = sorted(ANP_DIR.glob("ANP_????.csv.gz"))

# Down-sample only if a sample size is set and there are more files than that.
if RANDOM_SAMPLE_ANP and len(anp_files) > RANDOM_SAMPLE_ANP:
    anp_files = random.sample(anp_files, k=RANDOM_SAMPLE_ANP)

print(f"Found {len(anp_files)} ANP Files")
anp_files[:10]

Found 48 ANP Files


[PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1937.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1938.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1939.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1940.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1941.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1942.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1943.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantic

In [17]:
import logging

# Names of the files that are already stored in the current collection.
ingested_files: set[str] = set()
try:
    # Pass the iterable itself: unpacking it with `*` would hand every file name
    # to set.update() as a separate iterable and add its single characters.
    ingested_files.update(db.filenames(COLLECTION_NAME))
except ValueError:
    # NOTE(review): presumably raised when the collection does not exist yet -- confirm.
    logging.warning(f"No '{COLLECTION_NAME}' files ingested yet")

print(f"Already ingested '{COLLECTION_NAME}' files: {ingested_files}")

12:05:17 INFO:HTTP Request: POST http://localhost:8087/v1/graphql "HTTP/1.1 200 OK"


Already ingested 'ANP' files: set()


In [18]:
# Read every ANP file that is not marked as ingested and merge the per-file
# corpora into one. Starting the reduction from an empty Corpus() makes the
# result an empty corpus when there is nothing to read (no files found, or all
# files skipped) instead of raising
# "TypeError: reduce() of empty iterable with no initial value".
# NOTE(review): the stored file names listed by db.filenames() look like
# 'anp_1949_06_28_54_ocr.xml' while path.name is e.g. 'ANP_1937.csv.gz', so the
# skip condition below may never match -- verify.
anp_corpus = reduce(
    operator.add,
    (
        Corpus.from_csv_file(
            path,
            filter_terms=FILTER_TERMS,
            text_columns=["content"],
            encoding="iso8859_15",
            compression="gzip",
            delimiter=";",
            window_size=WINDOW_SIZE,
            nlp_pipeline=None,
        )
        for path in tqdm(anp_files, unit="file")
        if path.name not in ingested_files
    ),
    Corpus(),
)

len(anp_corpus)

100%|██████████| 48/48 [11:05<00:00, 13.85s/file]


26728

In [19]:
# Ingest the ANP corpus into the database under the current collection name.
db.ingest(corpus=anp_corpus, name=COLLECTION_NAME)

12:16:30 INFO:HTTP Request: GET http://localhost:8087/v1/schema/ANP "HTTP/1.1 404 Not Found"
12:16:30 INFO:HTTP Request: POST http://localhost:8087/v1/objects "HTTP/1.1 200 OK"
12:16:30 INFO:HTTP Request: POST http://localhost:8087/v1/schema "HTTP/1.1 200 OK"
Embeddings: 100%|█████████▉| 3341/3342 [06:43<00:00,  8.27batch/s]
Embeddings: 100%|█████████▉| 3340/3342 [06:43<00:00,  8.27batch/s]


In [20]:
# Sanity check after ingestion: the distinct file names stored for this collection.
set(db.filenames(COLLECTION_NAME))

01:13:40 INFO:HTTP Request: POST http://localhost:8087/v1/graphql "HTTP/1.1 200 OK"
01:13:40 INFO:HTTP Request: POST http://localhost:8087/v1/graphql "HTTP/1.1 200 OK"


{'anp_1949_06_28_54_ocr.xml',
 'anp_1954_03_03_14_ocr.xml',
 'anp_1958_09_29_56_ocr.xml',
 'anp_1961_01_20_94_ocr.xml',
 'anp_1962_01_19_43_ocr.xml',
 'anp_1962_10_01_56_ocr.xml',
 'anp_1965_01_29_79_ocr.xml',
 'anp_1968_09_26_100_ocr.xml',
 'anp_1968_09_26_106_ocr.xml',
 'anp_1969_10_06_121_ocr.xml',
 'anp_1969_10_27_99_ocr.xml',
 'anp_1970_06_24_35_ocr.xml',
 'anp_1970_10_14_58_ocr.xml',
 'anp_1972_09_15_86_ocr.xml',
 'anp_1973_11_08_23_ocr.xml',
 'anp_1973_11_08_86_ocr.xml',
 'anp_1974_06_02_104_ocr.xml',
 'anp_1975_06_17_118_ocr.xml',
 'anp_1976_04_28_12_ocr.xml',
 'anp_1976_06_09_142_ocr.xml',
 'anp_1976_06_24_129_ocr.xml',
 'anp_1979_09_19_56_ocr.xml',
 'anp_1979_10_02_142_ocr.xml',
 'anp_1979_11_06_14_ocr.xml',
 'anp_1979_11_30_69_ocr.xml',
 'anp_1980_05_13_155_ocr.xml',
 'anp_1980_05_13_1_ocr.xml',
 'anp_1980_06_21_21_ocr.xml',
 'anp_1980_07_05_70_ocr.xml',
 'anp_1980_11_06_14_ocr.xml',
 'anp_1980_11_14_32_ocr.xml',
 'anp_1981_02_11_100_ocr.xml',
 'anp_1981_02_11_110_ocr.xml',


### Staten Generaal

In [21]:
# The collection name doubles as the sub-directory name below CORPUS_DIR.
COLLECTION_NAME = "StatenGeneraal"
STATEN_GENERAAL_DIR = CORPUS_DIR / COLLECTION_NAME

# The directory is only required when the sample size is non-zero; the message
# mirrors the equivalent check for the ANP directory.
assert (
    RANDOM_SAMPLE_STATEN_GENERAAL == 0 or STATEN_GENERAAL_DIR.is_dir()
), f"{STATEN_GENERAAL_DIR} not found."

In [22]:
# NOTE: despite the variable name, this pattern matches every year 1900-1999.
glob195x = "StatenGeneraal_19[0-9]?.csv.gz"  # Pattern for files from 1900-1999
glob20xx = "StatenGeneraal_2???.csv.gz"  # Pattern for files from 2000

# glob() order is filesystem-dependent; sort so that the random sample below is
# reproducible for a fixed seed.
files_195x = sorted(STATEN_GENERAAL_DIR.glob(glob195x))
files_20xx = sorted(STATEN_GENERAAL_DIR.glob(glob20xx))

sg_files = [
    file
    # Merge files from patterns
    for file in files_20xx + files_195x
    # Remove blacklisted files: keep a file once, and only if none of the
    # blacklist entries occurs in its name. (Iterating over the blacklist in the
    # comprehension itself drops every file for an empty blacklist and
    # duplicates files when there is more than one entry.)
    if not any(blacklisted in file.name for blacklisted in STATEN_GENERAAL_BLACKLIST)
]

# Down-sample only if a sample size is set and there are more files than that.
if RANDOM_SAMPLE_STATEN_GENERAAL and RANDOM_SAMPLE_STATEN_GENERAAL < len(sg_files):
    sg_files = random.sample(sg_files, k=RANDOM_SAMPLE_STATEN_GENERAAL)

print(f"Found {len(sg_files)} STAATEN_G Files")
sorted(sg_files[:10])

Found 118 STAATEN_G Files


[PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2000.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2001.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2002.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2003.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2004.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2005.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semanti

In [23]:
import logging

# Collect the names of the files already stored in the current collection.
ingested_files: set[str] = set()
try:
    for stored_name in db.filenames(COLLECTION_NAME):
        ingested_files.add(stored_name)
except ValueError:
    # NOTE(review): presumably raised when the collection does not exist yet -- confirm.
    logging.warning(f"No '{COLLECTION_NAME}' files ingested yet")

ingested_files

01:15:23 INFO:HTTP Request: POST http://localhost:8087/v1/graphql "HTTP/1.1 200 OK"


set()

In [24]:
import csv

# Raise the csv module's maximum field size to 100 million characters.
# NOTE(review): presumably needed because single content fields exceed the default
# limit -- confirm.
csv.field_size_limit(100_000_000)

# Read every Staten Generaal file that is not marked as ingested and merge the
# per-file corpora into one. Starting the reduction from an empty Corpus() makes
# the result an empty corpus when there is nothing to read (no files found, or
# all files skipped) instead of raising
# "TypeError: reduce() of empty iterable with no initial value".
sg_corpus = reduce(
    operator.add,
    (
        Corpus.from_csv_file(
            path,
            filter_terms=FILTER_TERMS,
            text_columns=["Content"],
            encoding="utf-8",
            compression="gzip",
            delimiter=";",
            window_size=WINDOW_SIZE,
            nlp_pipeline=None,
        )
        for path in tqdm(sg_files, unit="file")
        if path.name not in ingested_files
    ),
    Corpus(),
)

len(sg_corpus)

100%|██████████| 118/118 [52:09<00:00, 26.52s/file]


478029

In [25]:
# Ingest the Staten Generaal corpus into the database under the current collection name.
db.ingest(corpus=sg_corpus, name=COLLECTION_NAME)

02:07:39 INFO:HTTP Request: GET http://localhost:8087/v1/schema/StatenGeneraal "HTTP/1.1 404 Not Found"
02:07:39 INFO:HTTP Request: POST http://localhost:8087/v1/objects "HTTP/1.1 200 OK"
02:07:39 INFO:HTTP Request: POST http://localhost:8087/v1/schema "HTTP/1.1 200 OK"
Embeddings: 100%|██████████| 59754/59754 [2:00:26<00:00,  8.27batch/s]    
Embeddings: 100%|█████████▉| 59753/59754 [2:00:26<00:00,  8.27batch/s]


### Show the Passages that were created in the corpus

In [27]:
# Print the length and the representation of the first 20 passages.
for passage in sg_corpus.passages[:20]:
    print(len(passage), passage)

303 Passage('dat de taakstraf een praktisch voordeel heeft en dat deze goedkoper is dan vrijheidsontneming. In deze tijd van bezuinigingen, vervolgt de minister, wegen deze argumenten zeker mee. Nu kun je hier in een periode van begrotingsoverschotten sowieso vraagtekens bij stellen, maar belangrijker bezwaar tegen', {'RecId': 'h-tk-19992000-2851-2868', 'chamber': 'TweedeKamer', 'date': '2000-01-18', 'speakers': 'De heer Dittrich D66|De heer Rouvoet RPF|De heer Dittrich D66|De heer Van de Camp CDA|De heer Dittrich D66|De heer Van de Camp CDA|De heer Dittrich D66|De heer Niederer VVD|De heer Dittrich D66|De heer Van de Camp CDA|De heer Dittrich D66|De heer Van de Camp CDA|De heer Dittrich D66|De heer Van de Camp CDA|De heer Dittrich D66|Mevrouw Halsema GroenLinks|De heer Dittrich D66|Mevrouw Halsema GroenLinks|De heer Dittrich D66|De heer Niederer VVD|De heer Rouvoet RPF|De heer Niederer VVD|De heer Rouvoet RPF|De heer Niederer VVD|De heer Dittrich D66|De heer Niederer VVD|De heer Dittr

## TEST: Retrieve Records from Database

In [34]:
filter_words = ["duurzaam"]
collections = ["ANP", "StatenGeneraal"]

# Fetch the matching records from each collection and merge them,
# starting from an empty corpus.
corpus_mini = Corpus()
for collection in collections:
    corpus_mini = corpus_mini + db.get_corpus(collection, filter_words=filter_words)

# Show the first ten passages, one per line.
print("\n".join(str(passage) for passage in corpus_mini.passages[:10]))

Passage('vrouw, maar ook om die van * 5. duurzaam samenwonende of samenlevende mensen. De heffing van a. loon- en inkomstenbelasting zal in de toekomst moeten plaats7- . vinden op basis van individuele personen, zij het met enige 8. beperkingen. Premier Van Agt heeft dit meegedeeld op zijn 9. wekelijkse persconferentie.', {'year': '1979', 'date': '09-28-1979', 'highlighting': '81_88', 'day': '28', 'month': '9', 'filename': 'anp_1979_09_28_162_ocr.xml', 'issue': '162'}, Highlighting(start=81, end=88))
Passage('hebben-  de doelmatigheid doen blijken van een politiek die tegelijk vrede en onafhankelijkheid nastreeft. NEDERLAND, zei Uwe Majesteit, wil zichzelf blijven. Heerlijke en fiere leus, die NEDERLAND allang heeft toegepast en die toeliet- , dat het ongedeerd de tragische krisissen kon ontwijken, die zijne naburen geteisterd hebben. De toenadering van kleine staten verruimt de sfeer van hun actie en van hun gezag. Een solidaire houding kan den oorlog-  buiten onze grenzen weeren. Maa