In [128]:
import io
from collections import defaultdict
import os, os.path
import sys
sys.path.insert(0, "..")

import numpy as np
import pandas as pd

import spacy
from spacy.lemmatizer import Lemmatizer
import nl_core_news_lg

from tqdm import tqdm_notebook as tqdm
from pprint import pprint
import spacy
import enchant
from enchant.checker import SpellChecker

from string import punctuation

import torch
import re
import nltk

from difflib import SequenceMatcher
from transformers import AutoTokenizer, AutoModel

from src import iterators


#### Settings

In [129]:
#stopword_list = nltk.corpus.stopwords.words('dutch')
from spacy.lang.nl.stop_words import STOP_WORDS
limit = 0 # limit the data
stopwords = set(STOP_WORDS) # set of stopwords for performance

#### Import the data

In [130]:
# Load csv
csv = iterators.iterate_directory("../data/processed/selected_articles/", ".csv")
df = pd.concat([pd.read_csv(c["article_path"]) for c in csv],ignore_index=True)
df.sort_values(by=["count"], ascending=False, inplace=True)

### Clean the dataset

#### Remove non Chinese, Japanese and Korean char

In [131]:
def jkc_detect(texts):
    # korean
    if re.search("[\uac00-\ud7a3]", texts):
        return False
    # japanese
    if re.search("[\u3040-\u30ff]", texts):
        return False
    # chinese
    if re.search("[\u4e00-\u9FFF]", texts):
        return False
    return True

In [132]:
# Only non jkc
df['non_jkc'] = df['text'].apply(jkc_detect)
df = df[df['non_jkc'] == True]

In [133]:
df.tail(3)

Unnamed: 0,Unnamed: 0_x,type,text,article_name,date,index_article,article_filepath,dir,Unnamed: 0_y,metadata_title,...,newspaper_title,newspaper_date,newspaper_city,newspaper_publisher,newspaper_source,newspaper_volume,newspaper_issuenumber,newspaper_language,count,non_jkc
2516,129590,p,--—————— — ' ' Van harte geluk gewenst OU Part...,DDD_010850933_0074_articletext.xml,1950-02-08,147528,../data/1950/02-08/DDD_010850933/DDD_010850933...,../data/1950/02-08/DDD_010850933,139.0,DDD:ddd:010850933:mpeg21.didl.xml.gz.xml,...,De waarheid,1950-02-08,Amsterdam,s.n.,Internationaal Instituut voor Sociale Geschied...,9.0,235.0,nl,0,True
2518,49694,p,! Nieuwe ontdekking ' ! i behoedt UW meubelen ...,DDD_110585156_0045_articletext.xml,1950-09-25,118470,../data/1950/09-25/DDD_110585156/DDD_110585156...,../data/1950/09-25/DDD_110585156,918.0,DDD:ddd:110585156:mpeg21.didl.xml.gz.xml,...,De Telegraaf,1950-09-25,Amsterdam,Dagblad De Telegraaf,KB C 98,53.0,19635.0,nl,0,True
2517,41222,p,W.J. NiJBOER KLEERMAKERIJ TAMBLQNGWEG 28 wenst...,DDD_010896645_0029_articletext.xml,1950-12-31,115413,../data/1950/12-31/DDD_010896645/DDD_010896645...,../data/1950/12-31/DDD_010896645,,,...,,,,,,,,,0,True


#### Create functions for preprocessing

In [134]:
# Keep "." "!" and "?" to define end of sentence.

punctuation = ",/<>;':\"[]\\{}|`~@#$%^&*()_+-="

def remove_punctuation(text):
    """Remove punctuation"""
    no_punct = "".join([c for c in text if c not in punctuation])
    return(no_punct)

In [136]:
df["text_clean"] = df["text"].apply(lambda x: remove_punctuation(x))

In [137]:
def remove_stopwords(text):
    """Remove stopwords as defined by Spacy stopwords"""
    words = "".join([w for w in text if w not in stopwords])
    return words

In [138]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_stopwords(x))

KeyError: 'clean'

In [140]:
df["text_clean"][1]

'ning tijdens de ontgassing normaal kan doorgaan. In de Belgische mijn „Le Grand Trait te Frameries in Henegowen „oogstte men op deze wijze in 2 maanden tijds 378.000 m 3 methaangas in de mijn „Saint Albert te Ressaix in een iets langere periode 428.650 m 3 methaan. In Henegowen wordt het gas reeds naar biten geleverd via de lichtgasfabrieken te Tertre. Methaangas levert 8000 tot 9000 caloriën warmte hetgeen tweemaal zoveel is als gewoon cokesovengas. In vele andere mijnen waaronder de Kempische neemt men proeven. Er bestaan plannen in Belgisch Limbrg een leidingermet aan te leggen voor de distribtie van het gas aan de bevolking. Een probleem vormt echter de vrij onregelmatige toevoer waarmee men ongetwijfeld te kampen zal krijgen. In de mijn Hirschbach in het Saargebied heeft men een andere methode gevolgd. In deze mijn ontsnapte zoveel gas dat met lchtverversing niet voldoende te bereiken was. Een gedeelte van de mijn werd daarom met dammen van de rest afgesloten. Door hel verrichten

In [148]:
def cleaner(df):
    "Extract relevant text from DataFrame using a regex"
    # Regex pattern for only alphanumeric, hyphenated text with 3 or more chars
    pattern = re.compile(r"[A-Za-z\-]{2,40}")
    df['text_clean'] = df['text_clean'].str.findall(pattern).str.join(' ')
    if limit > 0:
        return df.iloc[:limit, :].copy()
    else:
        return df

In [149]:
cleaner(df)

In [156]:
df["text_clean"][1]

'ning tijdens de ontgassing normaal kan doorgaan In de Belgische mijn Le Grand Trait te Frameries in Henegowen oogstte men op deze wijze in maanden tijds methaangas in de mijn Saint Albert te Ressaix in een iets langere periode methaan In Henegowen wordt het gas reeds naar biten geleverd via de lichtgasfabrieken te Tertre Methaangas levert tot calori warmte hetgeen tweemaal zoveel is als gewoon cokesovengas In vele andere mijnen waaronder de Kempische neemt men proeven Er bestaan plannen in Belgisch Limbrg een leidingermet aan te leggen voor de distribtie van het gas aan de bevolking Een probleem vormt echter de vrij onregelmatige toevoer waarmee men ongetwijfeld te kampen zal krijgen In de mijn Hirschbach in het Saargebied heeft men een andere methode gevolgd In deze mijn ontsnapte zoveel gas dat met lchtverversing niet voldoende te bereiken was Een gedeelte van de mijn werd daarom met dammen van de rest afgesloten Door hel verrichten van boringen en door het afbowen van een dieper ge

### Split long text

In [152]:
def get_split(txt, length):
  len_tot = []
  len_partial = []
  if len(txt.split())//length >0:
    n = len(txt.split())//length
  else: 
    n = 1
  for w in range(n):
    if w == 0:
      len_partial = txt.split()[:length]
      len_tot.append(" ".join(len_partial))
    else:
      len_partial = txt.split()[w*length:w*length + length]
      len_tot.append(" ".join(len_partial))
  return len_tot

In [153]:
df["text_split"] = df["text"].apply(get_split, length=500)

Select the divided text and retrieve also article_ids and article_name

In [154]:
divided_texts = []
idx_texts = []
name_texts = []
dfids_texts = []
for idx, row in df.iterrows():
  for text in row['text_split']:
    divided_texts.append(text)
    idx_texts.append(idx)
    name_texts.append(row["article_name"])
    dfids_texts.append(row["Unnamed: 0_x"])

Create smaller dataframe for analysis

In [155]:
df_texts = pd.DataFrame({"text":divided_texts, "article_id":dfids_texts, "article_name":name_texts})
df_texts.head(5)

Unnamed: 0,text,article_id,article_name
0,Aoiang er mijnen bestaan is het mijngas de gro...,36288,DDD_010417712_0100_articletext.xml
1,ning tijdens de ontgassing normaal kan doorgaa...,36381,DDD_010417712_0102_articletext.xml
2,"""W/ij eijn deze keer op een Joodse bruiloft, ""...",107454,DDD_010612570_0079_articletext.xml
3,Het is een spannende geschiedenis met de gasvo...,122625,DDD_010417601_0094_articletext.xml
4,"In elk geval, meende de archivaris, heeft pate...",125000,DDD_011199673_0059_articletext.xml


### NLP Pipe

In [157]:
def lemmatize_pipe(doc):
    lemma_list = [str(tok.lemma_).lower() for tok in doc
                  if tok.is_alpha and tok.text.lower() not in stopwords] 
    return lemma_list

def preprocess_pipe(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=50):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

In [159]:
%%time
df['preproc_pipe'] = preprocess_pipe(df['text_clean'])
df[['date', 'newspaper_source', 'text_clean', 'preproc_pipe']].head(3)

CPU times: user 15.9 s, sys: 59.9 ms, total: 16 s
Wall time: 16 s


Unnamed: 0,date,newspaper_source,text_clean,preproc_pipe
0,1950-11-16,Sociaal Historisch centrum voor Limburg T 501,Aoiang er mijnen bestaan is het mijngas de gro...,"[aoiang, mijnen, bestaan, mijngas, grootste, v..."
1,1950-11-16,Sociaal Historisch centrum voor Limburg T 501,ning tijdens de ontgassing normaal kan doorgaa...,"[ning, ontgassing, normaal, doorgaan, belgisch..."
2,1950-01-28,,Wij eijn deze keer op een Joodse briloft die z...,"[eijn, keer, joodse, briloft, zeven, dagen, dr..."


Find all incorrect words in the df

In [28]:
nlp = spacy.load('nl_core_news_lg', disable=['tagger', 'parser', 'ner'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))
#The sentencizer pipeline simply performs tokenization and sentence boundary detection

In [88]:
for doc in nlp.pipe(df["text"].astype('unicode').values,
                    batch_size=50,
                    n_threads=3,
                    disable=["tagger", "parser"]):
    print([(ent.text, ent.label_) for ent in doc.ents])

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[

KeyboardInterrupt: 

In [102]:
predictions = []
MASKIDS = []
# Load, train and predict using pre-trained model
tokenizer = AutoTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased")
# Load pre-trained model
model = AutoModel.from_pretrained("wietsedv/bert-base-dutch-cased")

for ids, row in tqdm(df_texts.iterrows(), total=df_texts.shape[0]):
    text = row["masked_texts"]
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    MASKIDS.append([i for i, e in enumerate(tokenized_text) if e == '[MASK]'])

    # Create the segments
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Predict all tokens
    with torch.no_grad():
        predictions.append(model(tokens_tensor, segments_tensors))

HBox(children=(FloatProgress(value=0.0, max=375.0), HTML(value='')))




In [192]:
from transformers import pipeline
for ids, row in tqdm(df_texts.iterrows(), total=df_texts.shape[0]):
    text = row["masked_texts"]
    nlp_fill = pipeline(
        'fill-mask',
        model="wietsedv/bert-base-dutch-cased",
        tokenizer="wietsedv/bert-base-dutch-cased",
        topk=5)
    pprint(nlp_fill(text))

HBox(children=(FloatProgress(value=0.0, max=375.0), HTML(value='')))

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


PipelineException: More than one mask_token ([MASK]) is not supported

In [187]:
a = predictions[0][0]
print(a[MASKIDS[0][i]])
#preds = torch.topk(a[MASKIDS[0][i]], k=25)
#indices = preds.indices.tolist()
#print(indices)
#list1 = tokenizer.convert_ids_to_tokens(indices)
#for i in range(len(MASKIDS[0])):
    #print(i)
    #torch.topk(predictions[0][0, MASKIDS[0][i]], k=50)

tensor([[ 0.1571, -0.6540, -0.1885,  ...,  0.0497,  0.1652, -0.4206],
        [-0.2000,  0.4086, -0.7478,  ..., -0.1650, -0.1866,  0.1263],
        [ 0.8099, -1.8925,  0.0474,  ...,  0.4340,  0.3164,  0.6638],
        ...,
        [-0.3609, -0.5708,  0.0787,  ...,  0.1913, -0.3615,  0.0275],
        [-0.9404, -0.7848, -0.5773,  ..., -0.1136, -0.0910,  0.1418],
        [ 1.0243, -0.8926, -0.2411,  ..., -0.3097,  0.3824,  0.2190]])


In [176]:
#Predict words for mask using BERT; 
def predict_word(text, predictions, maskids):
    pred_words=[]
    for item in range(len(predictions)):
        tns = predictions[item][0]
        for i in range(len(maskids)):
            preds = torch.topk(tns[maskids[item][i]], k=10) 
            indices = preds.indices.tolist()
            list1 = tokenizer.convert_ids_to_tokens(indices)
            list2 = suggestedwords[item][i]
            simmax=0
            predicted_token=''
            for word1 in list1:
                for word2 in list2:
                    s = SequenceMatcher(None, word1, word2).ratio()
                    if s is not None and s > simmax:
                        simmax = s
                        predicted_token = word1
            text = text.replace('[MASK]', predicted_token, 1)
    return text

In [177]:
text_cleaned = []
for ids, row in tqdm(df_texts.iterrows(), total=df_texts.shape[0]):
    masked_text = row["masked_texts"]
    text_cleaned.append(predict_word(masked_text, predictions, MASKIDS))

HBox(children=(FloatProgress(value=0.0, max=375.0), HTML(value='')))

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'