# Install requirements

In [1]:
# Install the third-party libraries imported further down into the kernel's environment.
%pip install spacy pandas
# ipywidgets is pulled from the conda-forge channel.
# NOTE(review): presumably needed for widget-based progress bars; no cell visible here imports it -- confirm it is required.
%conda install -c conda-forge ipywidgets

Note: you may need to restart the kernel to use updated packages.
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 22.9.0
  latest version: 22.11.1

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Pipeline

In [3]:
import spacy
from spacy.lang.nl import Dutch

# Name of the pretrained spaCy pipeline package to load.
SPACY_MODEL = "nl_core_news_sm"

# Load the pipeline; if loading fails with OSError (the model package is not
# installed), download it once and retry.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Only sentence boundaries and tokens are needed below, so sentence splitting
# is done by the "sentencizer" component.
nlp.add_pipe("sentencizer")

# Remove every other component. Iterating over the pipeline's actual contents
# (instead of a hard-coded list of names) avoids a ValueError when a model
# version lacks one of the expected components, and guarantees no unlisted
# component is left active.
for component in list(nlp.pipe_names):
    if component != "sentencizer":
        nlp.remove_pipe(component)


In [4]:
# Show which components remain in the pipeline after the setup cell above.
nlp.pipe_names

['sentencizer']

# Corpus Location

In [5]:
from pathlib import Path


# Root folder of the corpus data, located below the current user's home directory.
DATA_DIR = Path.home().joinpath(
    "Documents", "SemanticsOfSustainability", "data", "Joris"
)
# Stop right away if the corpus folder is not available on this machine.
assert DATA_DIR.is_dir()


## Read ANP

In [6]:
# Folder with the ANP files; it must exist before anything is read from it.
ANP_DIR = DATA_DIR.joinpath("ANP", "CleanFiles_perYear")
assert ANP_DIR.is_dir()

# Matches file names of the form ANP_19dd.csv.gz, where each d is one digit.
ANP_FILE_GLOB = "ANP_19[0-9][0-9].csv.gz"
# Text encoding passed to pandas when reading the ANP files.
ANP_ENCODING = "iso8859_15"


### Count Sentences and Tokens

In [8]:
from tqdm import tqdm

# Running totals over all ANP files.
n_sentences = 0
n_tokens = 0
n_docs = 0

# Sorted so the files are processed in a reproducible order; glob itself
# gives no ordering guarantee.
files = sorted(ANP_DIR.glob(ANP_FILE_GLOB))


for f in tqdm(files, unit="file", position=0):
    # Rows without text cannot be processed, so they are dropped up front.
    # `subset` is given as a list, which works across pandas versions.
    df = pd.read_csv(
        f, compression="gzip", sep=";", encoding=ANP_ENCODING, parse_dates=["date"]
    ).dropna(subset=["content"])

    for value in tqdm(df.content, unit="doc", total=len(df), position=1):
        n_docs += 1
        doc = nlp(value)
        for sent in doc.sents:
            n_sentences += 1
            # The length of a sentence span is its number of tokens, so there
            # is no need to walk over every token in Python just to count.
            n_tokens += len(sent)

print(f"Number of documents:\t{n_docs}")
print(f"Number of sentences:\t{n_sentences}")
print(f"Number of tokens:\t{n_tokens}")


100%|██████████| 28067/28067 [00:35<00:00, 779.89doc/s]
0doc [00:00, ?doc/s]8 [00:36<28:27, 36.34s/file]
100%|██████████| 49629/49629 [01:10<00:00, 707.94doc/s]
100%|██████████| 43100/43100 [01:03<00:00, 682.23doc/s]
100%|██████████| 31069/31069 [00:45<00:00, 680.49doc/s]
0doc [00:00, ?doc/s]8 [03:36<32:37, 45.53s/file]
100%|██████████| 29361/29361 [00:38<00:00, 755.00doc/s]
100%|██████████| 49204/49204 [01:06<00:00, 734.62doc/s]
100%|██████████| 44971/44971 [01:00<00:00, 738.84doc/s]
100%|██████████| 56348/56348 [01:14<00:00, 754.99doc/s]
100%|██████████| 59280/59280 [01:29<00:00, 665.66doc/s]
100%|██████████| 45629/45629 [01:02<00:00, 729.39doc/s]
100%|██████████| 33397/33397 [00:49<00:00, 668.90doc/s]
100%|██████████| 2143/2143 [00:08<00:00, 267.31doc/s]
100%|██████████| 24412/24412 [00:35<00:00, 689.04doc/s]
100%|██████████| 28546/28546 [00:43<00:00, 654.48doc/s]
100%|██████████| 413/413 [00:00<00:00, 502.72doc/s]
100%|██████████| 33874/33874 [00:48<00:00, 701.14doc/s]
100%|███████

Number of documents:	: 1474649
Number of sentences:	: 23880052
Number of tokens:	: 226754135





## Staten Generaal

In [11]:
# Folder with the Staten Generaal files.
SG_DIR = DATA_DIR.joinpath("StatenGeneraal")

# Stop right away if that folder is missing.
assert SG_DIR.is_dir()

# Matches StatenGeneraal_<4 characters>.csv.gz where the first character is
# 1 or 2, the second is 8, 9 or 0, and the last two are digits.
SG_GLOB = "StatenGeneraal_[12][890][0-9][0-9].csv.gz"
# Text encoding passed to pandas when reading the Staten Generaal files.
SG_ENCODING = "iso8859_15"


In [15]:
import logging


# Running totals over all Staten Generaal files (these names are reset here,
# so earlier totals stored under them are overwritten).
n_sentences = 0
n_tokens = 0
n_docs = 0

# Sorted so the files are processed in a reproducible order; glob itself
# gives no ordering guarantee.
files = sorted(SG_DIR.glob(SG_GLOB))


for f in tqdm(files, unit="file", position=0):
    # Rows without text cannot be processed, so they are dropped up front.
    # `subset` is given as a list, which works across pandas versions.
    df = pd.read_csv(
        f, compression="gzip", sep=";", encoding=SG_ENCODING, parse_dates=["date"]
    ).dropna(subset=["Content"])

    for index, value in tqdm(df.Content.items(), unit="doc", total=len(df), position=1):
        # spaCy refuses texts longer than `nlp.max_length` characters
        # (1,000,000 by default), so such rows are skipped with a warning
        # instead of aborting the whole run. Skipped rows are not counted.
        # NOTE(review): with only the sentencizer in the pipeline it may be
        # acceptable to raise `nlp.max_length` instead of skipping -- confirm.
        if len(value) > nlp.max_length:
            logging.warning(
                "Skipping long text (%d characters) in row %s, file '%s'.",
                len(value),
                index,
                f,
            )
            continue

        n_docs += 1
        doc = nlp(value)
        for sent in doc.sents:
            n_sentences += 1
            # The length of a sentence span is its number of tokens, so there
            # is no need to walk over every token in Python just to count.
            n_tokens += len(sent)

print(f"Number of documents:\t{n_docs}")
print(f"Number of sentences:\t{n_sentences}")
print(f"Number of tokens:\t{n_tokens}")


100%|██████████| 2906/2906 [01:04<00:00, 45.05doc/s]
100%|██████████| 142/142 [00:10<00:00, 13.93doc/s]]
100%|██████████| 1196/1196 [00:15<00:00, 76.46doc/s] 
100%|██████████| 1886/1886 [00:28<00:00, 66.69doc/s]
100%|██████████| 138/138 [00:06<00:00, 20.18doc/s]]
100%|██████████| 132/132 [00:10<00:00, 12.44doc/s]]
100%|██████████| 94/94 [00:01<00:00, 83.36doc/s]   
100%|██████████| 115/115 [00:04<00:00, 26.75doc/s]
100%|██████████| 1654/1654 [00:22<00:00, 73.08doc/s]
100%|██████████| 2160/2160 [00:23<00:00, 93.27doc/s] 
100%|██████████| 54/54 [00:04<00:00, 12.72doc/s]e]
100%|██████████| 172/172 [00:17<00:00,  9.78doc/s]
100%|██████████| 132/132 [00:11<00:00, 11.49doc/s]
100%|██████████| 161/161 [00:07<00:00, 22.45doc/s]
100%|██████████| 179/179 [00:13<00:00, 13.69doc/s]
100%|██████████| 102/102 [00:01<00:00, 57.58doc/s]
100%|██████████| 146/146 [00:13<00:00, 10.52doc/s]
100%|██████████| 51/51 [00:03<00:00, 15.19doc/s]e]
100%|██████████| 90/90 [00:02<00:00, 44.80doc/s]e]
100%|██████████

Number of documents:	: 187133
Number of sentences:	: 47302585
Number of tokens:	: 1167766456



