# Preprocess text

In [16]:
import io
from collections import defaultdict
from string import punctuation
import os, os.path
import re
import sys
import string
sys.path.insert(0, "..")

import numpy as np
import pandas as pd

import spacy
from spacy.lemmatizer import Lemmatizer
import nl_core_news_lg
from spacy.lang.nl.stop_words import STOP_WORDS
nlp = nl_core_news_lg.load()

from tqdm import tqdm_notebook as tqdm
from pprint import pprint
import nltk

from collections import Counter

from src import iterators

#### Settings

In [17]:
stopword_list = nltk.corpus.stopwords.words('dutch')
STOPWORDS = set(stopword_list) # set of stopwords for performance
DECADE = "1990s"
TOPIC = "kool"
PUNCT_TO_REMOVE = ",/<>;':\"[]\\{}|`~@#$%^&*()_+-="

#### Import and prepare data

- Drop duplicates
- Reset index
- Rename columns

In [18]:
csv = iterators.iterate_directory(
    os.path.join("../data/processed/selected_articles/", DECADE), ".csv"
)
df = pd.concat(
    [pd.read_csv(c["article_path"]) for c in csv],
    ignore_index=True,
)
df.drop_duplicates(subset=["text"], inplace=True)
df.sort_values(by=["count"], ascending=False, inplace=True)
df.reset_index(inplace=True)
df.drop(columns={"index", "Unnamed: 0_x", "Unnamed: 0_y"}, inplace=True)

In [21]:
csv_temp = []
[csv_temp.append(c["article_name"]) for c in csv if "kool" in c["article_name"]]
csv_temp
#[csv_selected.append(c) for c in csv_temp if "kool" in c]
#csv_selected

['2020-11-17_kool_0.csv',
 '2020-11-17_kool_20.csv',
 '2020-11-17_kool_40.csv',
 '2020-11-17_kool_60.csv']

In [8]:
from collections import Counter
from os import sep
from string import punctuation
import re
import pandas as pd

import enchant
from pandarallel import pandarallel
import nltk


class TextCleaner:
    def __init__(self):
        self.d = enchant.Dict("nl_NL")
        self.stopword_list = nltk.corpus.stopwords.words("dutch")
        self.STOPWORDS = set(self.stopword_list)

    def get_words(self):
        self.text = " ".join([c for c in nltk.word_tokenize(self.text)])
        return self

    def lower(self):
        """Lower case the text"""
        self.text = "".join([t.lower() for t in self.text])
        return self

    def remove_stopwords(self):
        """custom function to remove the stopwords"""
        self.text = "".join([t for t in self.text if t not in self.STOPWORDS])
        return self

    def remove_numeric(self):
        """Remove numbers"""
        self.text = "".join([c for c in self.text if not c.isdigit()])
        return self

    def remove_non_ascii(self):
        """Remove non ASCII chars"""
        self.text = "".join([re.sub(r"[^\x00-\x7f]", r" ", c) for c in self.text])
        return self

    def remove_extra_whitespace_tabs(self):
        """Remove extra whitespaces and tabs"""
        self.text = re.sub(r"^\s*|\s\s*", " ", self.text).strip()
        return self

    def remove_one_char(self):
        self.text = " ".join([w for w in self.text.split() if len(w) > 1])
        return self

    def remove_non_words(self):
        """custom function to remove the rare words"""
        self.text = " ".join(
            [word for word in str(self.text).split() if self.d.check(word)]
        )
        return self

    def preprocess(self, text):
        self.text = text
        self = self.get_words()
        self = self.lower()
        self = self.remove_stopwords()
        self = self.remove_numeric()
        self = self.remove_non_ascii()
        self = self.remove_extra_whitespace_tabs()
        self = self.remove_one_char()
        self = self.remove_non_words()
        return self.text


In [9]:
tc = TextCleaner()

In [10]:
df["text_clean"] = df["text"].apply_progress(tc.preprocess)

In [11]:
df

Unnamed: 0,type,text,article_name,date,index_article,article_filepath,dir,metadata_title,index_metadata,metadata_filepath,newspaper_title,newspaper_date,newspaper_city,newspaper_publisher,newspaper_source,newspaper_volume,newspaper_issuenumber,newspaper_language,count,text_clean
0,p,JE VRAAGT JE WEL EENS AF: HOE KAN HET ZO GROEI...,DDD_011020088_0166_articletext.xml,1992-09-04,2251957,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:011020088:mpeg21.didl.xml.gz.xml,5720.0,/Users/leonardovida/dev/HistAware/data/raw/del...,Nieuwsblad van het Noorden,1992-09-04,Groningen,Nieuwenhuis,Groninger archieven,105.0,209.0,nl,27,je vraagt je wel eens af hoe kan het zo groeie...
1,p,ROTTERDAM — De raffinaderij van Shell in Perni...,DDD_010963661_0208_articletext.xml,1991-01-17,3384105,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:010963661:mpeg21.didl.xml.gz.xml,8705.0,/Users/leonardovida/dev/HistAware/data/raw/del...,Het vrĳe volk : democratisch-socialistisch dag...,1991-01-17,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,46.0,13383.0,nl,27,de raffinaderij van in moet de komende jaren d...
2,p,SHELL HELPT ZELFS ALS U UW PORTEMONNEE EN'T MI...,DDD_011019277_0037_articletext.xml,1990-03-02,2838400,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:011019277:mpeg21.didl.xml.gz.xml,7261.0,/Users/leonardovida/dev/HistAware/data/raw/del...,Nieuwsblad van het Noorden,1990-03-02,Groningen,Nieuwenhuis,Groninger archieven,103.0,52.0,nl,26,helpt zelfs als portemonnee wilt sparen we ver...
3,p,SHELL HELPT ZELFS ALSÜ] UW PORTEMONNEE EN 'T M...,DDD_010645642_0658_articletext.xml,1990-03-03,2860895,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:010645642:mpeg21.didl.xml.gz.xml,7330.0,/Users/leonardovida/dev/HistAware/data/raw/del...,De Telegraaf,1990-03-03,Amsterdam,Dagblad De Telegraaf,KB C 98,98.0,31764.0,nl,26,helpt zelfs als portemonnee en mie wilt sparen...
4,p,JE VRAAGT JE WEL EENS AF: HOE KAN HET ZO GROEI...,DDD_010646599_0098_articletext.xml,1992-09-04,2251164,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:010646599:mpeg21.didl.xml.gz.xml,5717.0,/Users/leonardovida/dev/HistAware/data/raw/del...,De Telegraaf,1992-09-04,Amsterdam,Dagblad De Telegraaf,KB C 98,100.0,32535.0,nl,25,je vraagt je wel eens af hoe kan het zo groeie...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112422,p,"T.k. BMW 320 6-cil., bj. '80, mot. '85, NETTE ...",DDD_010646237_0160_articletext.xml,1990-11-30,3113278,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:010646237:mpeg21.didl.xml.gz.xml,8016.0,/Users/leonardovida/dev/HistAware/data/raw/del...,De Telegraaf,1990-11-30,Amsterdam,Dagblad De Telegraaf,KB C 98,98.0,31994.0,nl,1,mot nette zwart grille wal bren met aangeboden...
112423,p,", Opel Omega 20 SE, LPG, grijs . met, m. '88, ...",DDD_010646237_0236_articletext.xml,1990-11-30,3113443,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:010646237:mpeg21.didl.xml.gz.xml,8016.0,/Users/leonardovida/dev/HistAware/data/raw/del...,De Telegraaf,1990-11-30,Amsterdam,Dagblad De Telegraaf,KB C 98,98.0,31994.0,nl,1,omega se grijs met diesel vele extra als dag v...
112424,p,"Am. mod. Ford LTD, 4-drs., .m. '83, LPG, ZEER ...",DDD_010646237_0206_articletext.xml,1990-11-30,3113641,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:010646237:mpeg21.didl.xml.gz.xml,8016.0,/Users/leonardovida/dev/HistAware/data/raw/del...,De Telegraaf,1990-11-30,Amsterdam,Dagblad De Telegraaf,KB C 98,98.0,31994.0,nl,1,ford zeer mooi ford zilvergrijs ford en ford e...
112425,p,"Honda PRELUDE EX, 1987, 55.000 km. Prelude EX ...",DDD_010646237_0208_articletext.xml,1990-11-30,3113690,/Users/leonardovida/dev/HistAware/data/raw/del...,/Users/leonardovida/dev/HistAware/data/raw/del...,DDD:ddd:010646237:mpeg21.didl.xml.gz.xml,8016.0,/Users/leonardovida/dev/HistAware/data/raw/del...,De Telegraaf,1990-11-30,Amsterdam,Dagblad De Telegraaf,KB C 98,98.0,31994.0,nl,1,ex km ex at km steen ex model ex zilver met ze...


In [323]:
df.to_csv("test_df")

In [325]:
from spacy import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 n_jobs=4):
        """
        Text preprocessing transformer includes steps:
            2. Punctuation removal
            3. Stop words removal
        
        n_jobs - parallel jobs to run
        """
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

ImportError: cannot import name 'BaseEstimator' from 'spacy' (/Users/leonardovida/Library/Caches/pypoetry/virtualenvs/histaware-RplM6c3o-py3.8/lib/python3.8/site-packages/spacy/__init__.py)

In [317]:
text = TextPreprocessor(n_jobs=-1).transform(text['text'])

NameError: name 'TextPreprocessor' is not defined

In [326]:
def preprocess_spacy(text):
    doc = nlp(text)
    tokens = [t.text.lower() for t in doc]
    tokens_without_punct_spacy = [t.text for t in doc if t.pos_ != 'PUNCT']
    text_without_stop_words = [t.text for t in nlp(text) if not t.is_stop]
    text_without_stop_words = [t.text for t in text if re.match('[a-z]+', item)])
    return text_without_stop_words

In [358]:
%%timeit
df["tokens"] = df["text"].apply(lambda x: preprocess_spacy(x))

KeyboardInterrupt: 

In [328]:
df["tokens"][1]

['IB',
 'ÜiiT',
 'nmtifear-',
 '•i4B3blt^l',
 'BBBhBI',
 'BBBBilißßaWttßMa',
 '«',
 'ft',
 '«',
 'ia^^',
 '«',
 'B^BBBfißßßßsaa^^^^^^i^i',
 'v<.v',
 '•.',
 '-',
 '„',
 ';',
 '&',
 '■',
 'BBBBBBBbIÉBBBBBÉÈÏ^',
 'BBBBBBBHKÉ',
 ':',
 '-',
 '■',
 '■',
 '^i->^,^^>^^',
 'BBBBBBBSBBBB^',
 '&',
 ':',
 '>',
 'Së',
 "~-'ï",
 'w',
 '&',
 '.',
 '■',
 'Bwê',
 'v',
 '&',
 'c',
 'bbVAf',
 'BBrT^^l^',
 'vs',
 '>',
 ':',
 "'",
 "'",
 'J^',
 ':',
 'SfiBBBBBBBff',
 '*',
 '*',
 '*',
 'm/',
 '*',
 'geheime',
 'wapen',
 'Michiel',
 'Adriaansz',
 '.',
 'Ruyter',
 '.',
 'Michiel',
 'Ruyter',
 'inderdaad',
 'heel',
 'zuurkoolsla',
 'is-ie',
 'heel',
 'erg',
 'proeven',
 ',',
 'had-ie',
 'vingers',
 'witte',
 'kooi',
 'boord',
 '.',
 'schoot',
 'lekker',
 '.',
 'rauw',
 'fijngesneden',
 'erbij',
 'afgelikt',
 '.',
 'mee',
 '.',
 'zuurkool',
 'appelen',
 ',',
 'rozijnen',
 ',',
 'zure',
 'room',
 ',',
 'ui',
 "'m",
 'verteld',
 'maken',
 'at',
 '.',
 'Samen',
 'zn',
 'geknipte',
 'bieslook',
 '.',
 'recept',
 's

### Clean the dataset

In [7]:
import enchant
d = enchant.Dict("nl_NL")
FREQWORDS = set([w for (w, wc) in cnt.most_common(20)])

def remove_punctuation(text):
    """Remove punctuation"""
    return "".join([c.lower() for c in text if c not in punctuation])

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def remove_numeric(text):
    """Remove numbers"""
    return "".join([c for c in text if not c.isdigit()])

def remove_non_ascii(text):
    """Remove non ASCII chars"""
    return "".join([re.sub(r'[^\x00-\x7f]',r' ',c) for c in text])

def remove_extra_whitespace_tabs(text):
    """Remove extra whitespaces and tabs"""
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()

def remove_one_char(text):
    return " ".join([w for w in text.split() if len(w)>1])

def remove_non_words(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if d.check(word)])

df["text_wop"] = df["text"].apply(lambda x: remove_punctuation(x))
df["text_wop_clean"] = df["text_wop"].apply(lambda x: remove_stopwords(x))
df["text_wop_clean"] = df["text_wop_clean"].apply(lambda x: remove_numeric(x))
df["text_wop_clean"] =  df["text_wop_clean"].apply(lambda x: remove_non_ascii(x))
df["text_wop_clean"] =  df["text_wop_clean"].apply(lambda x: remove_extra_whitespace_tabs(x))
df["text_wop_clean"] = df["text_wop_clean"].apply(lambda x: remove_one_char(x))
df["text_wop_clean"] = df["text_wop_clean"].apply(lambda text: remove_non_words(text))

NameError: name 'cnt' is not defined

In [387]:
df["label"] = None

### Save cleaned df

In [388]:
df.to_csv("df_to_label.csv")

----

### Split long text

In [21]:
def get_split(txt, length):
  len_tot = []
  len_partial = []
  if len(txt.split())//length >0:
    n = len(txt.split())//length
  else: 
    n = 1
  for w in range(n):
    if w == 0:
      len_partial = txt.split()[:length]
      len_tot.append(" ".join(len_partial))
    else:
      len_partial = txt.split()[w*length:w*length + length]
      len_tot.append(" ".join(len_partial))
  return len_tot

In [22]:
df["text_clean_split"] = df["text_clean"].apply(get_split, length=500)
df["text_clean_split_no_point"] = df["text_no_point"].apply(get_split, length=500)

---

### Scikit learn pipeline

In [36]:
from sklearn.preprocessing import FunctionTransformer
def pipelinize(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            return [function(i) for i in list_or_series]
        else: # if it's not active, just pass it right back
            return list_or_series
    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})

In [66]:
import re
def preprocessor(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', '', text) # All html tags word characters 
        text = re.sub('[\W]+', '', text.lower()) # All the non word characters
        text = re.sub('[/\b(\w)\b/]', '', text)
        return text
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            #temp_text = re.sub('[/\b(\w)\b/]', '', temp_text)
            return_list.append(temp_text)
        return(return_list)
    else:
        pass

In [67]:
from nltk.tokenize import WhitespaceTokenizer
def w_tokenizer(text):
    
    tokenizer = WhitespaceTokenizer()   
# Use tokenize method 
    tokenized_list = tokenizer.tokenize(text) 
    return(tokenized_list)

In [82]:
import spacy
def remove_stopwords(text_list):
    spacy_stopwords = spacy.lang.nl.stop_words.STOP_WORDS
    return_list = []
    for i in range(len(text_list)):
        if text_list[i] not in spacy_stopwords and len(text_list[i]) > 1:
            return_list.append(text_list[i])
    return(return_list)

In [86]:
from sklearn.pipeline import Pipeline
estimators = [
    ('tokenizer', pipelinize(w_tokenizer)),
    ('stopwordremoval', pipelinize(remove_stopwords)),
    ('preprocessor', pipelinize(preprocessor))
]
pipe = Pipeline(estimators)

In [87]:
test = df["text"][1]
pipe.transform([test])

[['aaamm',
  '__',
  'ha',
  '______',
  'i',
  'mssbjh',
  '',
  '',
  'f',
  'lljß',
  'het',
  'geheime',
  'wapen',
  'michiel',
  'adriaansz',
  'ruyter',
  'michiel',
  'ruyter',
  'inderdaad',
  'heel',
  'maar',
  'zuurkoolsla',
  'isie',
  'heel',
  'erg',
  'proeven',
  'hadie',
  'vingers',
  'witte',
  'kool',
  'boord',
  'alleen',
  'schoot',
  'lekker',
  'gewoon',
  'rauw',
  'fijngesneden',
  'erbij',
  'afgelikt',
  'mee',
  'hij',
  'zuurkool',
  'appelen',
  'rozijnen',
  'zure',
  'room',
  'ui',
  'en',
  'ais',
  'm',
  'verteld',
  'maken',
  'at',
  'op',
  'samen',
  'zn',
  'geknipte',
  'bieslook',
  'het',
  'receptstaat',
  'heerlijke',
  'dingen',
  'mannen',
  'zakje',
  'allemaal',
  'zuurkool',
  'en',
  'geheime',
  'wapen',
  'want',
  'oh',
  'ja',
  'door',
  'zuur',
  'zakjes',
  'lezen',
  'wasie',
  'eten',
  'zuurkool',
  'kregen',
  'kool',
  'week',
  'aanbieden',
  'waarschijnlijk',
  'schieten',
  'toeze',
  'last',
  'scheurbuik',
  'zakje

### Additional subsetting only to relevant part

Select the divided text and retrieve also article_ids and article_name

In [154]:
divided_texts = []
idx_texts = []
name_texts = []
dfids_texts = []
for idx, row in df.iterrows():
  for text in row['text_split']:
    divided_texts.append(text)
    idx_texts.append(idx)
    name_texts.append(row["article_name"])
    dfids_texts.append(row["Unnamed: 0_x"])

Create smaller dataframe for analysis

In [155]:
df_texts = pd.DataFrame({"text":divided_texts, "article_id":dfids_texts, "article_name":name_texts})
df_texts.head(5)

Unnamed: 0,text,article_id,article_name
0,Aoiang er mijnen bestaan is het mijngas de gro...,36288,DDD_010417712_0100_articletext.xml
1,ning tijdens de ontgassing normaal kan doorgaa...,36381,DDD_010417712_0102_articletext.xml
2,"""W/ij eijn deze keer op een Joodse bruiloft, ""...",107454,DDD_010612570_0079_articletext.xml
3,Het is een spannende geschiedenis met de gasvo...,122625,DDD_010417601_0094_articletext.xml
4,"In elk geval, meende de archivaris, heeft pate...",125000,DDD_011199673_0059_articletext.xml
