# Preprocess text

In [1]:
import io
from collections import defaultdict
import os, os.path
import sys
sys.path.insert(0, "..")

import numpy as np
import pandas as pd

import spacy
from spacy.lemmatizer import Lemmatizer
import nl_core_news_lg

from tqdm import tqdm_notebook as tqdm
from pprint import pprint
import spacy
import enchant
from enchant.checker import SpellChecker

from string import punctuation

import torch
import re
import nltk

from difflib import SequenceMatcher
from transformers import AutoTokenizer, AutoModel

from src import iterators

#### Settings

In [2]:
#stopword_list = nltk.corpus.stopwords.words('dutch')
from spacy.lang.nl.stop_words import STOP_WORDS
limit = 0 # limit the data
stopwords = set(STOP_WORDS) # set of stopwords for performance

#### Import the data

In [3]:
# Load csv
csv = iterators.iterate_directory("../data/processed/selected_articles/", ".csv")
df = pd.concat([pd.read_csv(c["article_path"]) for c in csv],ignore_index=True)
df.sort_values(by=["count"], ascending=False, inplace=True)

### Clean the dataset

#### Remove non Chinese, Japanese and Korean char

In [4]:
def jkc_detect(texts):
    # korean
    if re.search("[\uac00-\ud7a3]", texts):
        return False
    # japanese
    if re.search("[\u3040-\u30ff]", texts):
        return False
    # chinese
    if re.search("[\u4e00-\u9FFF]", texts):
        return False
    return True

In [5]:
# Only non jkc
df['non_jkc'] = df['text'].apply(jkc_detect)
df = df[df['non_jkc'] == True]

In [6]:
df.tail(3)

Unnamed: 0,Unnamed: 0_x,type,text,article_name,date,index_article,article_filepath,dir,Unnamed: 0_y,metadata_title,...,newspaper_title,newspaper_date,newspaper_city,newspaper_publisher,newspaper_source,newspaper_volume,newspaper_issuenumber,newspaper_language,count,non_jkc
390,64399,p,Donderdag 14 September. 15.00 uur Faun: Algeme...,DDD_010887169_0040_articletext.xml,1950-09-13,123813,../data/1950/09-13/DDD_010887169/DDD_010887169...,../data/1950/09-13/DDD_010887169,957.0,DDD:ddd:010887169:mpeg21.didl.xml.gz.xml,...,Nieuwsblad van het Noorden,1950-09-13,Groningen,Nieuwenhuis,Groninger archieven 649-1318,63.0,214.0,nl,1,True
391,63841,p,N Gist *So A 315* 312' i N. Instr. Nieaf 140 t...,DDD_110585146_0145_articletext.xml,1950-09-13,123589,../data/1950/09-13/DDD_110585146/DDD_110585146...,../data/1950/09-13/DDD_110585146,955.0,DDD:ddd:110585146:mpeg21.didl.xml.gz.xml,...,De Telegraaf,1950-09-13,Amsterdam,Dagblad De Telegraaf,KB C 98,53.0,19625.0,nl,1,True
914,135657,p,Donderdag 1 en Vrijdag 2 Juni zal het Koninkli...,DDD_010537284_0012_articletext.xml,1950-05-16,149828,../data/1950/05-16/DDD_010537284/DDD_010537284...,../data/1950/05-16/DDD_010537284,,,...,,,,,,,,,1,True


#### Create functions for preprocessing

In [7]:
# Keep "." "!" and "?" to define end of sentence.

punctuation = ",/<>;':\"[]\\{}|`~@#$%^&*()_+-="

def remove_punctuation(text):
    """Remove punctuation"""
    no_punct = "".join([c for c in text if c not in punctuation])
    return(no_punct)

In [8]:
df["text_clean"] = df["text"].apply(lambda x: remove_punctuation(x))

In [9]:
def remove_stopwords(text):
    """Remove stopwords as defined by Spacy stopwords"""
    words = "".join([w for w in text if w not in stopwords])
    return words

In [10]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_stopwords(x))

In [11]:
def remove_numeric(text):
    """Remove numbers"""
    words = ''.join([c for c in text if not c.isdigit()])
    return words

In [12]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_numeric(x))

In [13]:
df["text"][1]

"AMSTERDAM, 28 Nov. — Groente. tCoöp. Tuindersveilingverg. Amsterdam en omstreken G.A.) Andijvie 10—2 O, spinazie 20—66, prei B—l 4. uien s—6'i. broeivet 20—50, boerenkool 13—17. rode kool 10—12. groene kool 7—lo, gele kool 4—6 spruitkool 33—36. stoofsla 16—18, waspeen 14— 25, witlof A I 45—51. A II 34—41. B I 41—48. B II 26—36 et per kg, sla A 7—9 li, knolselderij 7—12 et. p. st., selderij B—l 7. peterselie 11—26. bospeen 12—18. kervel 2—3 et n. bos. MARKTBERICHT „DE TUINBOUW. GROOTEBROEK. 27 Nov. — Groente: 1000 kg Rode kool 8.50—9.80, 3600 kg Witte kool 4; 1500 kg Gele kool 3.50—4 40, 8500 kg Groene kool 5.90— 6.60. 9000 kg Bieten A 9 40—9 80. B 660—6.90; 6500 kg Uien grof 8.40— 860. groot 5.50—5.90. 50.000 stuks Bloemkool B 34—18. B II 25—34. C 8—22. C II B—lo. stek B—l 9. VEILING BROEK OP LANGENDHK. BROEK OP LANGENDUK 28 Nov. — Groente - 2500 kg Witlof (Al 9.80 —10; 2500 kg Uien 7.20. drielingen 9.40—10.10, nep 10—11.80; 800 kg Peen (C) 4.20; 13.000 kg Andijvie 19.20— 23.90: 30.00

In [14]:
def cleaner(df):
    "Extract relevant text from DataFrame using a regex"
    # Regex pattern for only alphanumeric, hyphenated text with 3 or more chars
    pattern = re.compile(r"[A-Za-z\-]{2,40}")
    df['text_no_point'] = df['text_clean'].str.findall(pattern).str.join(' ')
    if limit > 0:
        return df.iloc[:limit, :].copy()
    else:
        return df

In [15]:
cleaner(df)

Unnamed: 0,Unnamed: 0_x,type,text,article_name,date,index_article,article_filepath,dir,Unnamed: 0_y,metadata_title,...,newspaper_city,newspaper_publisher,newspaper_source,newspaper_volume,newspaper_issuenumber,newspaper_language,count,non_jkc,text_clean,text_no_point
0,40995,p,AMSTERDAM. 13 Nov. — (Coörj. Tuir.dersveilingv...,DDD_110585201_0106_articletext.xml,1950-11-16,115308,../data/1950/11-16/DDD_110585201/DDD_110585201...,../data/1950/11-16/DDD_110585201,896.0,DDD:ddd:110585201:mpeg21.didl.xml.gz.xml,...,Amsterdam,Dagblad De Telegraaf,KB C 98,53.0,19680.0,nl,22,True,AMSTERDAM. Nov. — Coörj. Tir.dersveilingverg....,AMSTERDAM Nov Co rj Tir dersveilingverg Amster...
1,27756,p,"AMSTERDAM, 28 Nov. — Groente. tCoöp. Tuindersv...",DDD_110585212_0127_articletext.xml,1950-11-29,110407,../data/1950/11-29/DDD_110585212/DDD_110585212...,../data/1950/11-29/DDD_110585212,859.0,DDD:ddd:110585212:mpeg21.didl.xml.gz.xml,...,Amsterdam,Dagblad De Telegraaf,KB C 98,53.0,19691.0,nl,19,True,AMSTERDAM Nov. — Groente. tCoöp. Tindersveili...,AMSTERDAM Nov Groente tCo Tindersveilingverg A...
2,122267,p,GROENTENVEILING LEEUWARDEN 5 Juni. Andijvie B—...,DDD_010612675_0107_articletext.xml,1950-06-07,144734,../data/1950/06-07/DDD_010612675/DDD_010612675...,../data/1950/06-07/DDD_010612675,,,...,,,,,,,13,True,GROENTENVEILING LEEUWARDEN Jni. Andijvie B—l ...,GROENTENVEILING LEEUWARDEN Jni Andijvie et per...
3,36288,p,Aoiang er mijnen bestaan is het mijngas de gro...,DDD_010417712_0100_articletext.xml,1950-11-16,113601,../data/1950/11-16/DDD_010417712/DDD_010417712...,../data/1950/11-16/DDD_010417712,882.0,DDD:ddd:010417712:mpeg21.didl.xml.gz.xml,...,Heerlen,Nieuwe Limburger koerier;Uitgeversmaatschappĳ ...,Sociaal Historisch centrum voor Limburg T 501,33.0,269.0,nl,13,True,Aoiang er mijnen bestaan is het mijngas de gro...,Aoiang er mijnen bestaan is het mijngas de gro...
4,90947,p,"NAALDWIJK, 6 Dec. — Groente Alleanten 1.09—1.3...",DDD_110585219_0073_articletext.xml,1950-12-07,133379,../data/1950/12-07/DDD_110585219/DDD_110585219...,../data/1950/12-07/DDD_110585219,26.0,DDD:ddd:110585219:mpeg21.didl.xml.gz.xml,...,Amsterdam,Dagblad De Telegraaf,KB C 98,53.0,19698.0,nl,12,True,NAALDWIJK Dec. — Groente Alleanten .—.. handa...,NAALDWIJK Dec Groente Alleanten handappelen an...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,66431,p,(Advertentie) CarboTit beschermt leTen en gezo...,DDD_010417714_0020_articletext.xml,1950-11-18,124556,../data/1950/11-18/DDD_010417714/DDD_010417714...,../data/1950/11-18/DDD_010417714,964.0,DDD:ddd:010417714:mpeg21.didl.xml.gz.xml,...,Heerlen,Nieuwe Limburger koerier;Uitgeversmaatschappĳ ...,Sociaal Historisch centrum voor Limburg T 501,33.0,271.0,nl,1,True,Advertentie CarboTit beschermt leTen en gezond...,Advertentie CarboTit beschermt leTen en gezond...
389,64745,p,"Aan vrienden en kennissen,. £:sl9rst*^Sfi2'ffl...",DDD_011210810_0003_articletext.xml,1950-09-13,123963,../data/1950/09-13/DDD_011210810/DDD_011210810...,../data/1950/09-13/DDD_011210810,958.0,DDD:ddd:011210810:mpeg21.didl.xml.gz.xml,...,[Soerabaia],De Vrije Pers,Koninklijke Bibliotheek NBM C 1136,2.0,294.0,nl,1,True,Aan vrienden en kennissen. £slrstSfifflS®®fS£S...,Aan vrienden en kennissen slrstSfifflS fS Sysi...
390,64399,p,Donderdag 14 September. 15.00 uur Faun: Algeme...,DDD_010887169_0040_articletext.xml,1950-09-13,123813,../data/1950/09-13/DDD_010887169/DDD_010887169...,../data/1950/09-13/DDD_010887169,957.0,DDD:ddd:010887169:mpeg21.didl.xml.gz.xml,...,Groningen,Nieuwenhuis,Groninger archieven 649-1318,63.0,214.0,nl,1,True,Donderdag September. . r Fan Algemene vergade...,Donderdag September Fan Algemene vergadering v...
391,63841,p,N Gist *So A 315* 312' i N. Instr. Nieaf 140 t...,DDD_110585146_0145_articletext.xml,1950-09-13,123589,../data/1950/09-13/DDD_110585146/DDD_110585146...,../data/1950/09-13/DDD_110585146,955.0,DDD:ddd:110585146:mpeg21.didl.xml.gz.xml,...,Amsterdam,Dagblad De Telegraaf,KB C 98,53.0,19625.0,nl,1,True,N Gist So A i N. Instr. Nieaf t t Ned Kabe...,Gist So Instr Nieaf Ned Kabelt cA Ned Mij Hav ...


In [17]:
df["text"][10]

'Andijvie fs—fB per 100 kg., Bloemkool A f22—f37 per 100 stuks, B f 18—f20 per 100 stuks, C f 10—f 16 per 100 stuks. Bospeen f 9 per 100 bos. Waspeen f6—f 10 per 100 kg., Groene kool fB—f 16 per 100 kg., Groentjes f 12—f 13 per 100 kg., Knolselderij fs—f7 per 100 stuks, Komkommers f7—fB per 100 stuks, Kroten f7—f 10 per 100 kg., Princessebonen (dubbel) f2l—f33 per 100 kg., Stokprincessert f 47 per 100 kg., Stoksnljbonen fs6—f6o per 100 kg.. Sla f3.70—f4.30 per 100 stuks, Rode kool fB—f 10 per 100 kg., savoye kool fs—fB per 100 kg., Spruitkool f 35 per 100 kg.. Uien fs—f7 per 100 kg., Witte kool f 4 per 100 kg.. Tomaten f 12—f 15 per 100 kg., Druiven fso— fBl per 100 kg., pruimen f 37—f 58 per 100 kg.. Meloenen f26—fso per 100 kg.'

### Split long text

In [18]:
def get_split(txt, length):
  len_tot = []
  len_partial = []
  if len(txt.split())//length >0:
    n = len(txt.split())//length
  else: 
    n = 1
  for w in range(n):
    if w == 0:
      len_partial = txt.split()[:length]
      len_tot.append(" ".join(len_partial))
    else:
      len_partial = txt.split()[w*length:w*length + length]
      len_tot.append(" ".join(len_partial))
  return len_tot

In [19]:
df["text_clean_split"] = df["text_clean"].apply(get_split, length=500)
df["text_clean_split_no_point"] = df["text_no_point"].apply(get_split, length=500)

### Save cleaned df

In [20]:
df.to_csv("cleaned_df.csv")

----

### Additional subsetting only to relevant part

Select the divided text and retrieve also article_ids and article_name

In [154]:
divided_texts = []
idx_texts = []
name_texts = []
dfids_texts = []
for idx, row in df.iterrows():
  for text in row['text_split']:
    divided_texts.append(text)
    idx_texts.append(idx)
    name_texts.append(row["article_name"])
    dfids_texts.append(row["Unnamed: 0_x"])

Create smaller dataframe for analysis

In [155]:
df_texts = pd.DataFrame({"text":divided_texts, "article_id":dfids_texts, "article_name":name_texts})
df_texts.head(5)

Unnamed: 0,text,article_id,article_name
0,Aoiang er mijnen bestaan is het mijngas de gro...,36288,DDD_010417712_0100_articletext.xml
1,ning tijdens de ontgassing normaal kan doorgaa...,36381,DDD_010417712_0102_articletext.xml
2,"""W/ij eijn deze keer op een Joodse bruiloft, ""...",107454,DDD_010612570_0079_articletext.xml
3,Het is een spannende geschiedenis met de gasvo...,122625,DDD_010417601_0094_articletext.xml
4,"In elk geval, meende de archivaris, heeft pate...",125000,DDD_011199673_0059_articletext.xml
