# ANP Corpus

## Load Data

The data needs to be downloaded and provided in the path configured in the next cell.

In [1]:
%load_ext autoreload

In [2]:
from pathlib import Path


DATA_DIR = Path.home() / "Documents" / "SemanticsOfSustainability" / "data" / "Joris"
assert DATA_DIR.is_dir()

In [3]:
ANP_DIR = DATA_DIR / "ANP" / "CleanFiles_perYear"
assert ANP_DIR.is_dir()


In [4]:
files = list(ANP_DIR.glob("ANP_????.csv.gz"))
files[:10]

[PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1952.csv.gz'),
 PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1940.csv.gz'),
 PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1976.csv.gz'),
 PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1968.csv.gz'),
 PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1964.csv.gz'),
 PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1942.csv.gz'),
 PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1950.csv.gz'),
 PosixPath('/Users/carstenschnober/Documents/SemanticsOfSustainability/data/Joris/ANP/CleanFiles_perYear/ANP_1978.csv.gz'),
 PosixPa

In [5]:
%autoreload now

import operator
from functools import reduce
from tqdm import tqdm
from tempo_embeddings.text.corpus import Corpus, Passage


corpus = reduce(
    operator.add,
    (
        Corpus.from_csv_file(
            path,
            text_columns=["content"],
            encoding="iso8859_15",
            compression="gzip",
            delimiter=";",
        )
        for path in tqdm(files, unit="file")
    ),
)

corpus

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
100%|██████████| 48/48 [00:31<00:00,  1.51file/s]


Corpus([(Passage('Red.:  Datum:  veldman 1-1-52  Tijd: 13 ttt^.  Onderwerp: pan moen jom  De noordelijke onderhandelaars in Pan-Koen-Jom hebben vandaag Z^*^ ^ zullen j tocgoFcgd gegevens t^verstrekken ever de vijftig duizend  geallieerde krijgsgevangenen, die niet op de ingediende lijsten  voorkomen. Het g  : i\' ootste ^^^1 -.\'" - rt ren  L^A< ?-, ts^-aijn Zuidkoreanen.  De woordvoerder van de Verenigde Naties verklaarde, dat de  eerste bijeenkomst in het nieuwe jaar in een vriendschappelijke  sfeer is gehouden.  De commissie die sich bezig houdt met de vraagstukken betreffende-  de naleving van het bestand, heeft vandaag geen vorderingen  te.  gemaakt. De onderhandelingen worden morgen voortgezet.  Aan de fronten op Korea is het betrekkelijk rustig. XBMai De  gevechten beperken zich tot patrouilleactiviteit. Vliegtuigen van de Noordelijken hebben enkele bommen laten vallen op de  haven en het vliegveld van Seoul. De schade is gering  Bron: rtran  Regels: 13^', {'filename': 'anp_1952

## Load Model

In [6]:
MODEL_NAME = "DTAI-KULeuven/robbertje-1-gb-non-shuffled"  # NL MODEL TODO: is the model trained on lowercased data?


In [7]:
%autoreload now

from tempo_embeddings.embeddings.model import RobertaModelWrapper

model = RobertaModelWrapper.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at DTAI-KULeuven/robbertje-1-gb-non-shuffled were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DTAI-KULeuven/robbertje-1-gb-non-shuffled and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this mode

## Compute Embeddings

In [8]:
TERM = "duurzaam"

In [9]:
subcorpus = corpus.subcorpus(TERM)

In [10]:
model.add_embeddings(subcorpus)

Computing embeddings: 100%|██████████| 273/273 [00:46<00:00,  5.81passage/s]


## Visualize Embeddings

In [11]:
from tempo_embeddings.visualization.wizmap import WizmapVisualizer

port = 8000

if "visualizer" in locals():
    # Cleanup previous run
    visualizer.cleanup()

visualizer = WizmapVisualizer(subcorpus, title=TERM)

visualizer.visualize(port=port)

Starting server on port 8000


In [12]:
#visualizer.cleanup()

127.0.0.1 - - [14/Jul/2023 15:20:35] "GET /grid.json HTTP/1.1" 200 -
127.0.0.1 - - [14/Jul/2023 15:20:35] "GET /data.ndjson HTTP/1.1" 200 -
