# Dutch Corpora

In [1]:
# Install tempo-embeddings from GitHub
# This can also refer to a specific version or branch

# %pip install --upgrade pip  # Required for properly resolving dependencies
# %pip uninstall -y tempo_embeddings  # Remove existing installation
# %pip install --upgrade git+https://github.com/Semantics-of-Sustainability/tempo-embeddings.git

In [2]:
# make sure installation has succeeded
import tempo_embeddings
%load_ext autoreload

In [3]:
# Optional: uncomment the three lines below to build a Stanza tokenizer pipeline for Dutch.
# import stanza
# stanza.download('nl')
# nlp_pipeline = stanza.Pipeline("nl", processors='tokenize')
# Forwarded as `nlp_pipeline=` to `Corpus.from_csv_file`; None means no Stanza pipeline is supplied.
nlp_pipeline = None

In [4]:
# Detect Google Colab by probing for its runtime-only package.
try:
    import google.colab
except ModuleNotFoundError:
    # Package missing: running locally or on another host.
    IN_COLAB = False
else:
    IN_COLAB = True

## Load Data

The data needs to be downloaded and provided in the path configured in the next cell.

NOTE: You have to manually adapt the `DATA_DIR` below.

In [5]:
%autoreload now

import operator
from functools import reduce
from pathlib import Path
from tqdm import tqdm
from tempo_embeddings.text.corpus import Corpus

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Passed as `window_size` to `Corpus.from_csv_file` when loading the corpora.
WINDOW_SIZE = 200
# NOTE(review): this flag is not referenced by any other cell visible in this notebook — confirm it is still needed.
USE_FULL_SENTENCES = False # For now, this parameter overrides the window size

# Maximum number of files to randomly sample per corpus.
# A value of 0 disables the sampling and also skips the directory-exists check for that corpus.
RANDOM_SAMPLE_ANP = 200
RANDOM_SAMPLE_STATEN_GENERAAL = 200

# Staten Generaal files whose name contains one of these strings are meant to be excluded.
STATEN_GENERAAL_BLACKLIST = ["1987"]

# Passed as `filter_terms` to `Corpus.from_csv_file`; an empty list is used here, the commented value is an example.
FILTER_TERMS = [] # ["duurzaam"]  # Search term(s) for filtering the corpus

In [7]:
## NOTE: Adapt the `DATA_DIR` below manually!
## For a shared Google Drive, create a shortcut into your own Google Drive
## See https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab

# Candidate data directories, checked in order; the first one that exists is used.
LOCAL_PATHS: list[Path] = [
    Path.home() / "Documents" / "SemanticsOfSustainability" / "data" / "Joris",
    Path.home() / "SEED_DATA" / "SemanticsSustainability", # local angel
    Path("/data/volume_2/data"),  # Research Cloud
    Path("/home/cschnober/data/"),  # Snellius
]

if IN_COLAB:
    from google.colab import drive

    drive.mount("/content/drive")

    DATA_DIR = Path("/content/drive/MyDrive/Data/")
else:
    # `next` with a default avoids a bare StopIteration when no candidate exists.
    DATA_DIR = next((path for path in LOCAL_PATHS if path.is_dir()), None)
    if DATA_DIR is None:
        # The previous code raised `DirectoryNotFoundError`, which is not a Python
        # builtin and therefore surfaced as a confusing NameError.
        searched = ", ".join(str(path) for path in LOCAL_PATHS)
        raise FileNotFoundError(f"Data directory not found. Searched: {searched}")

assert DATA_DIR.is_dir(), f"Data dir '{DATA_DIR}' not found."

### ANP

In [8]:
# Location of the ANP files inside the data directory.
ANP_DIR = DATA_DIR.joinpath("ANP")

# The directory is only required when ANP sampling is enabled (non-zero sample size).
assert (
    RANDOM_SAMPLE_ANP == 0 or ANP_DIR.is_dir()
), f"{ANP_DIR} not found."

In [9]:
import random


# Fixed seed so that the random file sample is the same on every run.
random.seed(0)

# Glob results come back in arbitrary (filesystem-dependent) order; sorting them
# makes the seeded sample below reproducible across machines.
anp_files = sorted(ANP_DIR.glob("ANP_????.csv.gz"))

# Down-sample only if sampling is enabled (non-zero) and there are more files than requested.
if RANDOM_SAMPLE_ANP and len(anp_files) > RANDOM_SAMPLE_ANP:
    # Sample from the list collected above instead of globbing the directory a second time.
    anp_files = random.sample(anp_files, k=RANDOM_SAMPLE_ANP)

print(f"Found {len(anp_files)} ANP Files")
anp_files[:10]

Found 2 ANP Files


[PosixPath('/Users/jose/SEED_DATA/SemanticsSustainability/ANP/ANP_1983.csv.gz'),
 PosixPath('/Users/jose/SEED_DATA/SemanticsSustainability/ANP/ANP_1984.csv.gz')]

In [10]:
if anp_files:
    # Lazily build one corpus per file; `reduce` then joins them pairwise with `+`.
    anp_file_corpora = (
        Corpus.from_csv_file(
            csv_path,
            filter_terms=FILTER_TERMS,
            text_columns=["content"],
            encoding="iso8859_15",
            compression="gzip",
            delimiter=";",
            window_size=WINDOW_SIZE,
            nlp_pipeline=nlp_pipeline,
        )
        for csv_path in tqdm(anp_files, unit="file")
    )
    anp_corpus = reduce(operator.add, anp_file_corpora)
else:
    # No files selected: fall back to an empty corpus.
    anp_corpus = Corpus()

len(anp_corpus)

100%|██████████| 2/2 [00:01<00:00,  1.08file/s]


383554

### Staten Generaal

In [11]:
# Location of the Staten Generaal files inside the data directory.
STATEN_GENERAAL_DIR = DATA_DIR / "StatenGeneraal"

# The directory is only required when sampling is enabled (non-zero sample size).
assert (
    RANDOM_SAMPLE_STATEN_GENERAAL == 0 or STATEN_GENERAAL_DIR.is_dir()
), f"{STATEN_GENERAAL_DIR} not found."

In [12]:
# NOTE(review): despite the variable name, "19[0-9]?" matches every year 1900-1999;
# use "19[5-9]?" if only 1950-1999 is intended.
glob195x = "StatenGeneraal_19[0-9]?.csv.gz"  # Pattern for files from 1900-1999
glob20xx = "StatenGeneraal_2???.csv.gz"  # Pattern for files from 2000

files_195x = list(STATEN_GENERAAL_DIR.glob(glob195x))
files_20xx = list(STATEN_GENERAAL_DIR.glob(glob20xx))

# Merge the files from both patterns and drop every file whose name contains ANY
# blacklisted string. (A nested `for blacklisted in ...` clause would drop all files
# for an empty blacklist and duplicate files for more than one entry.)
# Sorting makes the seeded random sample below independent of the glob order.
sg_files = sorted(
    file
    for file in files_20xx + files_195x
    if not any(blacklisted in file.name for blacklisted in STATEN_GENERAAL_BLACKLIST)
)

# Down-sample only if sampling is enabled (non-zero) and there are more files than requested.
if RANDOM_SAMPLE_STATEN_GENERAAL and RANDOM_SAMPLE_STATEN_GENERAAL < len(sg_files):
    sg_files = random.sample(sg_files, k=RANDOM_SAMPLE_STATEN_GENERAAL)

print(f"Found {len(sg_files)} STAATEN_G Files")
sorted(sg_files[:10])

Found 2 STAATEN_G Files


[PosixPath('/Users/jose/SEED_DATA/SemanticsSustainability/StatenGeneraal/StatenGeneraal_2017.csv.gz'),
 PosixPath('/Users/jose/SEED_DATA/SemanticsSustainability/StatenGeneraal/StatenGeneraal_2018.csv.gz')]

In [13]:
%autoreload now

import csv

csv.field_size_limit(100000000)

sg_corpus = (
    reduce(
        operator.add,
        (
            Corpus.from_csv_file(
                path,
                filter_terms=FILTER_TERMS,
                text_columns=["Content"],
                encoding="utf-8",
                compression="gzip",
                delimiter=";",
                window_size=WINDOW_SIZE,
                nlp_pipeline=nlp_pipeline
            )
            for path in tqdm(sg_files, unit="file")
        ),
    )
    if sg_files
    else Corpus()
)

len(sg_corpus)

100%|██████████| 2/2 [00:02<00:00,  1.13s/file]


559435

In [14]:
# Preview the first 20 passages together with their lengths.
for passage in sg_corpus.passages[:20]:
    print(len(passage), passage)

206 Passage('14e vergadering Dinsdag 17 januari 2017 Aanvang 10:15 uur 1 Voorzitter: Broekers-Knol Tegenwoordig zijn 67 leden, te weten: Van Apeldoorn, Atsma, Backer, Barth, Beuving, Van Bijsterveld, Bikker, Bredenoord,', {'': '202', 'RecId': 'h-ek-20162017-14-1', 'chamber': 'EersteKamer', 'date': '2017-01-17', 'speakers': 'no speakers', 'title': 'handelingen', 'year': '2017'}, None)
220 Passage('Bredenoord, Brinkman, Broekers-Knol, Bruijn, Dercksen, Diederik van Dijk, Don, Duthler, Engels, Ester, Faber-van de Klashorst, Flierman, Ganzevoort, Gerkens, De Graaf, De Grave, Van Hattem, Hoekstra, Huijbregts-Schiedon,', {'': '202', 'RecId': 'h-ek-20162017-14-1', 'chamber': 'EersteKamer', 'date': '2017-01-17', 'speakers': 'no speakers', 'title': 'handelingen', 'year': '2017'}, None)
204 Passage('Huijbregts-Schiedon, Jorritsma-Lebbink, Van Kappen, Van Kesteren, Knapen, Knip, Koffeman, Köhler, Kok, Kops, Kox, Krikke, Kuiper, Lintmeijer, Markuszower, Martens, Meijer, Nooren, Oomen-Ruijten, Over

### Merge

In [15]:
# Combine the ANP and Staten Generaal corpora into a single corpus using `+`.
corpus = anp_corpus + sg_corpus
# Display the size of the merged corpus as the cell output.
len(corpus)

942989

## Load Model

In [16]:
%autoreload now

LAYER = 9

from tempo_embeddings.embeddings.model import EmbeddingsMethod

from tempo_embeddings.embeddings.model import (
    RobertaModelWrapper,
    TransformerModelWrapper,
    XModModelWrapper,
    SentenceTransformerModelWrapper,
)

kwargs = {"accelerate": True}

### RoBERTa Models
# MODEL_NAME = "DTAI-KULeuven/robbertje-1-gb-non-shuffled"
# model_class = RobertaModelWrapper

### XMod Models
# MODEL_NAME = "facebook/xmod-base"
# kwargs["default_language"] = "nl_XX"
# model_class = XModModelWrapper

### BERT Models
# MODEL_NAME = "GroNLP/bert-base-dutch-cased"
# MODEL_NAME = "xlm-roberta-base"
# MODEL_NAME = "xlm-mlm-100-1280"
# model_class = TransformerModelWrapper

### Sentence Transformers
MODEL_NAME = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
# MODEL_NAME = "textgain/allnli-GroNLP-bert-base-dutch-cased"
model_class = SentenceTransformerModelWrapper

model = model_class.from_pretrained(MODEL_NAME, layer=LAYER, **kwargs)
model.embeddings_method = EmbeddingsMethod.MEAN
model



<tempo_embeddings.embeddings.model.SentenceTransformerModelWrapper at 0x2b8704f70>

## Create or Open existing Database

In [17]:
from tempo_embeddings.embeddings.vector_database import ChromaDatabaseManager

# Embedder configuration wrapping the model loaded above.
EMBED_CONFIG = {"type":"custom_model", "model": model}
BATCH_SIZE = 32

db = ChromaDatabaseManager(
    db_path="testing_db",
    embedder_name=MODEL_NAME,
    embedder_config=EMBED_CONFIG,
    batch_size=BATCH_SIZE,
)
db.connect()

collection_name = "anp_sg_corpus"

# Try to create the collection; on ValueError open the existing one instead.
# `created_new_collection` records which of the two happened.
try:
    anp_sg_collection = db.create_new_collection(collection_name)
except ValueError:
    anp_sg_collection = db.get_existing_collection(collection_name)
    created_new_collection = False
else:
    created_new_collection = True

Created NEW collection 'anp_sg_corpus'


In [18]:
%autoreload now

## Compute & Save Embeddings (If the DB Collection Did Not Exist Yet)

In [19]:
# Only a freshly created collection needs to be filled with the corpus passages.
if not created_new_collection:
    print("No new passages to insert")
else:
    db.insert_passages(anp_sg_collection, corpus.passages)
    

Embeddings Batches: 100%|██████████| 29469/29469 [2:31:28<00:00,  3.24batch/s]   


Added 942986 new documents. Total = 942986


## TEST: Retrieve Records from Database

In [20]:
%autoreload now

# passages = db.get_passages(anp_sg_collection, filter_words=["toekomst"], where_obj={'$and': [{'year': {'$eq': '1983'}}, {'month': {'$eq': '7'}}]})
# passages = db.get_passages(anp_sg_collection, limit=100)
passages = db.get_passages(anp_sg_collection, filter_words=["duurzaam"])
for p in passages:
    print(p.text)

handelsakkoorden een bijdrage leveren aan de internationale duurzaamheidsagenda, inclusief natuurlijk uitvoering van het Parijse klimaatakkoord. Duurzaamheidshoofdstukken zijn in principe altijd onderdeel
nieuwe bebouwing extra energiezuinig is. Dat is een goede stap op het gebied van duurzaamheid. Daarom hebben wij de volgende motie. Motie De Kamer, gehoord de beraadslaging, constaterende dat de Crisis-
mogelijk is over welke ondersteuning hier eigenlijk mee bedoeld zou zijn, gaan wij niet in één keer een handel overhalen. Wij zetten in op een transitie om weg te geraken van fossiel en voor duurzaam te
duurzaamheid. Hij was en bleef daarbij zijn hele leven een Rotterdammer. Indrukwekkend waren de honderden geëmotioneerde mensen die hem deze week in zijn stad in de Laurentius- en Elisabethkathedraal de
verhouding en in de vijfde plaats duurzaamheid. Dat betekent dat we er volgend jaar niet weer hoeven te staan omdat het eigenlijk een, wellicht noodzakelijke, tussenstap was maar niet ee

In [21]:
# Name of the collection holding only the passages retrieved by the query above.
corpus_label = "anp_duurzam"

# Use the label variable instead of repeating the literal, and fall back to the
# existing collection on ValueError (same handling as for the main collection),
# so that re-running the notebook against an existing database does not fail here.
try:
    duurzaam_collection = db.create_new_collection(corpus_label, passages)
except ValueError:
    duurzaam_collection = db.get_existing_collection(corpus_label)

duurzaam_collection

Embeddings Batches: 100%|██████████| 44/44 [00:32<00:00,  1.37batch/s]

Added 1396 new documents. Total = 1396
Created NEW collection 'anp_duurzam'





Collection(name=anp_duurzam)