# Dutch Corpora

### Instal tempo-embeddings

In [1]:
%pip install -U -e ..

# Optional Install tempo-embeddings from GitHub
# This can also refer to a specific version or branch

# %pip install --upgrade pip  # Required for properly resolving dependencies
# %pip uninstall -y tempo_embeddings  # Remove existing installation
# %pip install --upgrade git+https://github.com/Semantics-of-Sustainability/tempo-embeddings.git

Obtaining file:///Users/carstenschnober/Documents/SemanticsOfSustainability/workspace/tempo-embeddings
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tempo_embeddings
  Building editable for tempo_embeddings (pyproject.toml) ... [?25ldone
[?25h  Created wheel for tempo_embeddings: filename=tempo_embeddings-0.0.1-0.editable-py3-none-any.whl size=9565 sha256=fa834471280044a61bd297e2448e925e59a4d81e502c5ebc139981f86625e92b
  Stored in directory: /private/var/folders/d8/j5_fyf8941j_492zvf8948y40000gn/T/pip-ephem-wheel-cache-v7j1x778/wheels/9d/dd/1c/d0e8ab6bde11a84556c00bd645333aaad0b90e2c38e11b0ed1
Successfully built tempo_embeddings
Installing collected packages: tempo_embeddings
  Attempting uninstall: tempo_embeddings
    Found existing installat

## Load Data

The data needs to be downloaded and provided in the path configured in the next cell.

NOTE: You have to manually adapt the `DATA_DIR` below.

In [2]:
from tqdm import tqdm

from tempo_embeddings.text.corpus import Corpus

In [3]:
WINDOW_SIZE = 300  # Size of passages in characters
USE_FULL_SENTENCES = False  # For now, this parameter overrides the window size. Instead of splitting arbitrarily the passages in WINDO_SIZE chunks, each Sentence will be a Passage

RANDOM_SAMPLE_ANP = 200
RANDOM_SAMPLE_STATEN_GENERAAL = 200

STATEN_GENERAAL_BLACKLIST = ["1987"]

FILTER_TERMS = [
    "duurzaam",
    "milieu",
]  # Search term(s) for filtering the corpus. If empty, it will only create Passages for the pieces of text that match the FILTER_TERMS
FILTER_TERMS = open("seed_terms/sustainability-filter-words.txt").read().split("\n")
FILTER_TERMS

['milieuproblemen',
 'Milieunormen',
 'Milieubeweging ',
 'Milieu-aspecten',
 'milieueffecten',
 'Milieumaatregelen',
 'Milieuvriendelijk ',
 'Milieubeleid',
 'milieuoogpunt',
 'koolzuur',
 'koolzuurgas',
 'stikstofdioxide',
 'zwaveldioxide',
 'isolatie',
 'Rookgassen',
 'verzuring',
 'Zuinig',
 'vervuilde',
 'niet-vervuilde',
 'luchtvervuiling',
 'luchtverontreiniging',
 'energieverbruik',
 'electriciteitsverbruik',
 'energieverspilling',
 'isolatie',
 'energieverslindend',
 'heffing',
 'Broeikaseffect',
 'broeikas-effect',
 'broeikasgassen',
 'Energiebesparingsmogelijkheden',
 'CO2-uitstoot',
 'wereldklimaat']

In [4]:
## NOTE: Adapt the `DATA_DIR` below manually!
## For a shared Google Drive, create a shortcut into your own Google Drive
## See https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab

from tempo_embeddings.settings import CORPUS_DIR

try:
    import google.colab  # noqa: F401

    IN_COLAB = True
except ModuleNotFoundError:
    IN_COLAB = False

assert CORPUS_DIR is not None

## Load Model

In [5]:
from tempo_embeddings.embeddings.model import (
    EmbeddingsMethod,
    SentenceTransformerModelWrapper,
)

kwargs = {"accelerate": True}

### Sentence Transformers
MODEL_NAME = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
# MODEL_NAME = "textgain/allnli-GroNLP-bert-base-dutch-cased"
model_class = SentenceTransformerModelWrapper

model = model_class.from_pretrained(MODEL_NAME, **kwargs)
model.embeddings_method = EmbeddingsMethod.MEAN
model

  torch.utils._pytree._register_pytree_node(
  warn(


<tempo_embeddings.embeddings.model.SentenceTransformerModelWrapper at 0x3390a1650>

# Database Access

In [6]:
import weaviate
from tempo_embeddings.embeddings import WeaviateDatabaseManager

db = WeaviateDatabaseManager(client=weaviate.connect_to_local(port=8087), model=model)

### ANP

In [7]:
COLLECTION_NAME = "ANP"
ANP_DIR = CORPUS_DIR / COLLECTION_NAME
assert RANDOM_SAMPLE_ANP == 0 or ANP_DIR.is_dir(), f"{ANP_DIR} not found."

In [8]:
import random

random.seed(0)

anp_files = list(ANP_DIR.glob("ANP_????.csv.gz"))

if RANDOM_SAMPLE_ANP and len(anp_files) > RANDOM_SAMPLE_ANP:
    anp_files = random.sample(anp_files, k=RANDOM_SAMPLE_ANP)

print(f"Found {len(anp_files)} ANP Files")
anp_files[:10]

Found 48 ANP Files


[PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1937.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1938.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1939.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1940.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1941.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1942.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1943.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantic

In [11]:
ingested_files: set[str] = set(db.provenances(COLLECTION_NAME))

print(f"Already ingested '{COLLECTION_NAME}' files: {ingested_files}")

Already ingested 'ANP' files: {'/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1958.csv.gz', 'ANP_1937.csv.gz', '/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1974.csv.gz', '/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1966.csv.gz', 'ANP_1962.csv.gz', 'ANP_1958.csv.gz', 'ANP_1964.csv.gz', '/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1954.csv.gz', 'ANP_1972.csv.gz', '/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1956.csv.gz', '/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1967.csv.gz', 'ANP_1973.csv.gz', 'ANP_1959.csv.gz', '/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/ANP/ANP_1948.csv.gz', '/V

In [12]:
# FIXME: this does not work any more, should use the CorpusReader (see scripts/build_sos_wv_database.py)

for path in tqdm(anp_files, desc=COLLECTION_NAME, unit="file"):
    if path.name not in ingested_files:
        corpus = Corpus.from_csv_file(
            path,
            filter_terms=FILTER_TERMS,
            text_columns=["content"],
            encoding="iso8859_15",
            compression="gzip",
            delimiter=";",
            window_size=WINDOW_SIZE,
            nlp_pipeline=None,
        )
        db.ingest(corpus, COLLECTION_NAME)

ANP:   0%|          | 0/48 [00:00<?, ?file/s]No passages to ingest into collection 'ANP'
ANP:   8%|▊         | 4/48 [00:02<00:25,  1.74file/s]No passages to ingest into collection 'ANP'
ANP:  10%|█         | 5/48 [00:02<00:19,  2.21file/s]No passages to ingest into collection 'ANP'
No passages to ingest into collection 'ANP'
ANP:  15%|█▍        | 7/48 [00:02<00:12,  3.38file/s]No passages to ingest into collection 'ANP'
No passages to ingest into collection 'ANP'
ANP:  19%|█▉        | 9/48 [00:02<00:08,  4.54file/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Embeddings:  99%|█████████▉| 80/81 [00:12<00:00,  6.50batch/s]
Embeddings:  98%|█████████▊| 79/81 [00:12<00:00,  6.42batch/s]
ANP:  85%|████████▌ | 41/48 [00:19<00:03,  1.99file/s]
[A
[A
[A
[A
[A
[A
[A
[A


In [20]:
set(db.provenances(COLLECTION_NAME))

{'ANP_1937.csv.gz',
 'ANP_1938.csv.gz',
 'ANP_1939.csv.gz',
 'ANP_1946.csv.gz',
 'ANP_1947.csv.gz',
 'ANP_1948.csv.gz',
 'ANP_1949.csv.gz',
 'ANP_1950.csv.gz',
 'ANP_1951.csv.gz',
 'ANP_1952.csv.gz',
 'ANP_1953.csv.gz',
 'ANP_1954.csv.gz',
 'ANP_1955.csv.gz',
 'ANP_1956.csv.gz',
 'ANP_1957.csv.gz',
 'ANP_1958.csv.gz',
 'ANP_1959.csv.gz',
 'ANP_1960.csv.gz',
 'ANP_1961.csv.gz',
 'ANP_1962.csv.gz',
 'ANP_1963.csv.gz',
 'ANP_1964.csv.gz',
 'ANP_1965.csv.gz',
 'ANP_1966.csv.gz',
 'ANP_1967.csv.gz',
 'ANP_1968.csv.gz',
 'ANP_1969.csv.gz',
 'ANP_1970.csv.gz',
 'ANP_1971.csv.gz',
 'ANP_1972.csv.gz',
 'ANP_1973.csv.gz',
 'ANP_1974.csv.gz',
 'ANP_1975.csv.gz',
 'ANP_1976.csv.gz',
 'ANP_1977.csv.gz',
 'ANP_1978.csv.gz',
 'ANP_1979.csv.gz',
 'ANP_1980.csv.gz',
 'ANP_1981.csv.gz',
 'ANP_1982.csv.gz',
 'ANP_1983.csv.gz',
 'ANP_1984.csv.gz'}

### Staten Generaal

In [21]:
COLLECTION_NAME = "StatenGeneraal"
STATEN_GENERAAL_DIR = CORPUS_DIR / COLLECTION_NAME

assert RANDOM_SAMPLE_STATEN_GENERAAL == 0 or STATEN_GENERAAL_DIR.is_dir()

In [22]:
glob195x = "StatenGeneraal_19[0-9]?.csv.gz"  # Pattern for files from 1950-1999
glob20xx = "StatenGeneraal_2???.csv.gz"  # Pattern for files from 2000

files_195x = list(STATEN_GENERAAL_DIR.glob(glob195x))
files_20xx = list(STATEN_GENERAAL_DIR.glob(glob20xx))

sg_files = [
    file
    # Merge files from patterns
    for file in files_20xx + files_195x
    # Remove blacklisted files:
    for blacklisted in STATEN_GENERAAL_BLACKLIST
    if blacklisted not in file.name
]

if RANDOM_SAMPLE_STATEN_GENERAAL and RANDOM_SAMPLE_STATEN_GENERAAL < len(sg_files):
    sg_files = random.sample(sg_files, k=RANDOM_SAMPLE_STATEN_GENERAAL)

print(f"Found {len(sg_files)} STAATEN_G Files")
sorted(sg_files[:10])

Found 118 STAATEN_G Files


[PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2000.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2001.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2002.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2003.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2004.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semantics-of-sustainability/data/StatenGeneraal/StatenGeneraal_2005.csv.gz'),
 PosixPath('/Volumes/i-lab.data.uu.nl/research-semantics-of-sustainability/semanti

In [23]:
ingested_files: set[str] = set(db.provenances(COLLECTION_NAME))

print(f"Already ingested '{COLLECTION_NAME}' files: {ingested_files}")

Already ingested 'StatenGeneraal' files: set()


In [24]:
# FIXME: this does not work any more, should use the CorpusReader (see scripts/build_sos_wv_database.py)
for path in tqdm(anp_files, desc=COLLECTION_NAME, unit="file"):
    if path.name not in ingested_files:
        corpus = Corpus.from_csv_file(
            path,
            filter_terms=FILTER_TERMS,
            text_columns=["content"],
            encoding="iso8859_15",
            compression="gzip",
            delimiter=";",
            window_size=WINDOW_SIZE,
            segmenter=None,
        )
        db.ingest(corpus, COLLECTION_NAME)

StatenGeneraal:   0%|          | 0/48 [00:00<?, ?file/s]

[A
[A
[A
[A
[A
Embeddings: 100%|██████████| 7/7 [00:02<00:00,  3.04batch/s]
Embeddings:  86%|████████▌ | 6/7 [00:02<00:00,  2.61batch/s]
StatenGeneraal:   2%|▏         | 1/48 [00:05<03:59,  5.09s/file]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Embeddings: 100%|██████████| 15/15 [00:02<00:00,  5.39batch/s]
Embeddings:  93%|█████████▎| 14/15 [00:02<00:00,  5.04batch/s]
StatenGeneraal:   4%|▍         | 2/48 [00:09<03:27,  4.52s/file]
[A
[A
[A
[A
[A
[A
[A
[A
Embeddings: 100%|██████████| 10/10 [00:01<00:00,  5.85batch/s]
Embeddings:  90%|█████████ | 9/10 [00:01<00:00,  5.27batch/s]
StatenGeneraal:   6%|▋         | 3/48 [00:11<02:47,  3.71s/file]No passages to ingest into collection 'StatenGeneraal'
No passages to ingest into collection 'StatenGeneraal'
No passages to ingest into collection 'StatenGeneraal'
StatenGeneraal:  12%|█▎        | 6/48 [00:12<00:53,  1.28s/file]No passages to ingest into collection 'StatenG

## TEST: Retrieve Records from Database

In [26]:
filter_words = ["duurzaam"]
collections = ["ANP", "StatenGeneraal"]

corpus_mini = sum(
    (
        db.get_corpus(collection, filter_words=filter_words)
        for collection in collections
    ),
    start=Corpus(),
)
print(*corpus_mini.passages[:10], sep="\n")

Passage('vrouw, maar ook om die van * 5. duurzaam samenwonende of samenlevende mensen. De heffing van a. loon- en inkomstenbelasting zal in de toekomst moeten plaats7- . vinden op basis van individuele personen, zij het met enige 8. beperkingen. Premier Van Agt heeft dit meegedeeld op zijn 9. wekelijkse persconferentie.', {'year': '1979', 'highlighting': '81_88', 'date': '09-28-1979', 'day': '28', 'provenance': None, 'month': '9', 'filename': 'anp_1979_09_28_162_ocr.xml', 'issue': '162'}, Highlighting(start=81, end=88))
Passage('hebben-  de doelmatigheid doen blijken van een politiek die tegelijk vrede en onafhankelijkheid nastreeft. NEDERLAND, zei Uwe Majesteit, wil zichzelf blijven. Heerlijke en fiere leus, die NEDERLAND allang heeft toegepast en die toeliet- , dat het ongedeerd de tragische krisissen kon ontwijken, die zijne naburen geteisterd hebben. De toenadering van kleine staten verruimt de sfeer van hun actie en van hun gezag. Een solidaire houding kan den oorlog-  buiten onze