### Import

In [34]:
%load_ext autoreload
%autoreload 2

import os
from os.path import isfile
import re
import sys
from pathlib import Path

import pandas as pd
import nltk
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading merged articles

It is important to load the CSV with the `converters={'p': eval}` option, in order to preserve the division into paragraphs that exists in the original articles.

In [5]:
# Merged-articles CSV to load, and how many leading rows to keep for this demo.
MERGED_ARTICLES_CSV = "/home/leonardovida/data/volume_1/data-histaware/merged_articles/1970s/merged_1970s_20.csv"
N_DEMO_ROWS = 1500

# NOTE(security): `eval` executes whatever Python expression is stored in the
# 'p' cells, so this is only acceptable for trusted, self-generated files.
# If 'p' always holds plain list literals, `ast.literal_eval` would be a safer
# replacement -- TODO confirm against the data before switching.
# `nrows` makes pandas parse (and eval) only the demo rows, instead of loading
# the whole file and then discarding everything after the first rows.
data = pd.read_csv(MERGED_ARTICLES_CSV, converters={'p': eval}, nrows=N_DEMO_ROWS)
# Select only the "artikel"s
data = data[data["subject"] == "artikel"]

Explode the "p" (paragraph) column, so that each row holds exactly one paragraph.

In [28]:
# Turn each element of the list stored in 'p' into a row of its own; the
# other columns are repeated for every paragraph of the same article.
# ignore_index=True renumbers the resulting rows from 0.
# NOTE(review): per the pandas docs an empty list becomes a NaN row -- this is
# presumably why nulls in 'p' are checked for further down; confirm.
data = data.explode('p', ignore_index = True)

### Create BasicTokenizer

In [30]:
import unicodedata


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, optional lower casing, etc.)."""

    def __init__(self, do_lower_case=False):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input and strip accents.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text, return_str=False):
        """Tokenizes a piece of text.

        The text is cleaned, split on whitespace, optionally lower-cased and
        accent-stripped, and every punctuation character is split off into a
        token of its own.

        Args:
          text: The string to tokenize.
          return_str: If True, return the tokens joined by single spaces as
            one string. If False (the default), return the list of tokens.

        Returns:
          A list of token strings, or one space-joined string when
          `return_str` is True.
        """
        text = self._clean_text(text)

        split_tokens = []
        # str.split() without arguments already ignores leading/trailing
        # whitespace and collapses runs of whitespace.
        for token in text.split():
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        if return_str:
            return " ".join(split_tokens)
        return split_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text.

        The text is decomposed (NFD) so that accents become separate
        combining marks, which are then dropped (Unicode category "Mn").
        """
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            char for char in decomposed if unicodedata.category(char) != "Mn"
        )

    def _run_split_on_punc(self, text):
        """Splits a piece of text on punctuation.

        Every punctuation character becomes a token of its own; consecutive
        non-punctuation characters are grouped into one token.

        Returns:
          The list of resulting token strings, in their original order.
        """
        output = []
        start_new_word = True
        for char in text:
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                    start_new_word = False
                output[-1].append(char)

        return ["".join(chars) for chars in output]

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text.

        NUL, the Unicode replacement character (U+FFFD) and control characters
        are dropped; every whitespace character is replaced by a plain space.
        """
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            # Another branch can be added here to filter out further
            # characters (e.g. as part of stop-word removal).
            else:
                output.append(char)
        return "".join(output)


def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat in ("Cc", "Cf"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    return cat.startswith("P")


In [33]:
# Load NLTK's Dutch Punkt sentence tokenizer. The resource has to be
# downloaded into the NLTK data directory beforehand, otherwise the load fails.
# It is only needed when paragraphs are to be split into sentences.
sent_detector = nltk.data.load('tokenizers/punkt/dutch.pickle')
# Default BasicTokenizer (do_lower_case=False): case and accents are kept.
tokenizer = BasicTokenizer()

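Here I create a new TXT file every 500k lines (BERT is better trained on plain-text files), but the output could be written to CSV as well. Note that the demo cell below only builds the text in memory and prints it; it does not write any file yet.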

In [46]:
# Set to True to also split every paragraph into sentences with the Dutch
# Punkt model, producing one tokenized sentence per line. With False (the
# behaviour of this demo so far) each paragraph stays on a single line.
SPLIT_SENTENCES = False

# Marker line written after every paragraph.
PARAGRAPH_SEPARATOR = '\n-----\n'


def _paragraph_to_lines(paragraph, split_sentences=False):
    """Returns the tokenized output line(s) for one paragraph.

    Args:
      paragraph: The paragraph text.
      split_sentences: If True, the paragraph is first split into sentences
        with `sent_detector` and each sentence yields one line; otherwise
        the whole paragraph yields a single line.

    Returns:
      A list of strings, each being the space-joined tokens of one piece.
    """
    pieces = sent_detector.tokenize(paragraph) if split_sentences else [paragraph]
    return [' '.join(tokenizer.tokenize(piece)) for piece in pieces]


chunks = []
# Only the 'p' column is needed, so iterate over it directly instead of
# building a full row object per paragraph with iterrows().
for paragraph in tqdm(data['p'], total=data.shape[0], ncols=80):
    # If the paragraph is null, skip it
    if pd.isnull(paragraph):
        continue
    lines = _paragraph_to_lines(paragraph, SPLIT_SENTENCES)
    chunks.append('\n'.join(lines) + PARAGRAPH_SEPARATOR)

# Join once at the end rather than growing a string inside the loop.
out = ''.join(chunks)
print(out)

100%|█████████████████████████████████████| 1797/1797 [00:01<00:00, 1053.50it/s]

SNEEK — Wij , hier in Nederland , moeten een les trekken uit wat er nu in Chili is gebeurd , zei de heer Piet Reekman , leider van de Sjaloomgroep in Odijk , gisteravond in de Ichtuskerk in Sneek . Wij moeten uit al die dingen die zijn gebeurd om Allende en zijn volksbeweging tegen te werken , op opzij te zetten en uit te moorden , iets leren hier in Nederland . Want als we niet oppassen worden de democratische krachten die hier streven voor een rechtvaardiger samenleving ook opzij gezet , zei hij .
-----
De heer Reekman was naar Sneek uitgenodigd om over Chili te praten voor leden van de Rotonde - gespreksgroepen . Aan de hand van zijn inleiding zullen de komende weken discussieavonden worden gehouden . Aan het einde van zijn inleiding trok Piet Reekman gisteravond een parallel tussen de aarzelende ( iiristen - dernocratische partijen , in Chili en de christen - democratische partijen in Nederland , die volgens hem ook nooit een keuze durven maken .
-----
het volk kan weer in armoede 


