In [1]:
# Importing the libraries needed
import os, os.path
import sys
sys.path.insert(0, "..")
from string import punctuation
import re

import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, LoggingHandler, util
from transformers import BertTokenizer, BertModel

import nltk.data
from tqdm.notebook import tqdm
from pprint import pprint
import spacy
import enchant
from enchant.checker import SpellChecker
from nltk.sentiment import vader
from nltk.corpus import stopwords
import nltk
from gensim import corpora
from gensim import models

import pyLDAvis
import pyLDAvis.gensim
import pandas as pd
import seaborn as sns
%load_ext autoreload
%autoreload 2

from src import iterators

In [2]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
# Setup the file
csv = iterators.iterate_directory("../data/processed/selected_articles/", ".csv")
df = pd.concat([pd.read_csv(c["article_path"]) for c in csv],ignore_index=True)
df.sort_values(by=["count"], ascending=False, inplace=True)

In [4]:
word_nl=[]
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    word_nl.append(row["text"])

HBox(children=(FloatProgress(value=0.0, max=5744.0), HTML(value='')))




We will take this text as a basis to check our progress in cleaning the text

In [5]:
word_nl[1][:500]

'ning tijdens de ontgassing normaal kan doorgaan. In de Belgische mijn „Le Grand Trait" te Frameries in Henegouwen „oogstte" men op deze wijze in 2 maanden tijds 378.000 m 3 methaangas, in de mijn „Saint Albert" te Ressaix in een iets langere periode 428.650 m 3 methaan. In Henegouwen wordt het gas reeds naar buiten geleverd via de lichtgasfabrieken te Tertre. Methaangas levert 8000 tot 9000 caloriën warmte, hetgeen tweemaal zoveel is als gewoon cokesovengas. In vele andere mijnen, waaronder de K'

Clean-up

In [6]:
word_nl=["".join([l for l in word if l not in punctuation]) for word in word_nl]  #remove punctuation
word_nl=[word.lower() for word in word_nl]  # convert to lower case
word_nl=[" ".join(word.split()) for word in word_nl]   # remove double spaces by splitting the strings into words and joining these words again
word_nl=[re.sub(r'[^a-zA-z\s]', '', word) for word in word_nl]  # to remove special characters and symbols

In [7]:
word_nl[1][:500]

'ning tijdens de ontgassing normaal kan doorgaan in de belgische mijn le grand trait te frameries in henegouwen oogstte men op deze wijze in  maanden tijds  m  methaangas in de mijn saint albert te ressaix in een iets langere periode  m  methaan in henegouwen wordt het gas reeds naar buiten geleverd via de lichtgasfabrieken te tertre methaangas levert  tot  calorin warmte hetgeen tweemaal zoveel is als gewoon cokesovengas in vele andere mijnen waaronder de kempische neemt men proeven er bestaan p'

Convert a list of words (tokens) to (token_id, token_count) tuples.

In [9]:
ldainput_m1 = [word.split() for word in word_nl]           # convert all strings to list of words
id2word_m1 = corpora.Dictionary(ldainput_m1)                       # assign a token_id to each word
ldacorpus_m1 = [id2word_m1.doc2bow(doc) for doc in ldainput_m1]       # represent each word by (token_id, token_count) tuples

In [10]:
lda_m1 = models.LdaModel(ldacorpus_m1, id2word=id2word_m1, num_topics=3)

In [11]:
lda_m1.print_topics()

[(0,
  '0.027*"de" + 0.022*"en" + 0.015*"in" + 0.012*"te" + 0.012*"een" + 0.011*"van" + 0.010*"het" + 0.008*"op" + 0.007*"met" + 0.006*"is"'),
 (1,
  '0.018*"en" + 0.018*"v" + 0.016*"de" + 0.015*"te" + 0.015*"i" + 0.014*"van" + 0.011*"a" + 0.010*"j" + 0.009*"n" + 0.009*"t"'),
 (2,
  '0.070*"de" + 0.033*"van" + 0.028*"het" + 0.024*"en" + 0.022*"een" + 0.021*"in" + 0.012*"te" + 0.010*"dat" + 0.010*"op" + 0.008*"met"')]

All topics are stopwords! We don't want this, right?
Let's try to remove them.

In [8]:
stopword_list = nltk.corpus.stopwords.words('dutch')
words_nl_clean = [" ".join([w for w in word.split() if w not in stopword_list]) for word in word_nl] # eliminate all stopwords
words_nl_clean = [" ".join([w for w in word.split() if len(w)>1]) for word in words_nl_clean] # eliminate all one char words

In [13]:
words_nl_clean[1][:500]

'ning tijdens ontgassing normaal doorgaan belgische le grand trait frameries henegouwen oogstte wijze maanden tijds methaangas saint albert ressaix langere periode methaan henegouwen gas buiten geleverd via lichtgasfabrieken tertre methaangas levert calorin warmte hetgeen tweemaal zoveel gewoon cokesovengas vele mijnen waaronder kempische neemt proeven bestaan plannen belgisch limburg leidingermet leggen distributie gas bevolking probleem vormt echter vrij onregelmatige toevoer waarmee ongetwijfe'

In [12]:
ldainput_m2 = [word.split() for word in words_nl_clean]      # words_nl_clean instead of words_nl
id2word_m2 = corpora.Dictionary(ldainput_m2)                       
ldacorpus_m2 = [id2word_m2.doc2bow(doc) for doc in ldainput_m2]  
lda_m2 = models.LdaModel(ldacorpus_m2, id2word=id2word_m2, num_topics=3)
lda_m2.print_topics(num_words=5)

[(0,
  '0.004*"stg" + 0.003*"ned" + 0.003*"amsterdam" + 0.002*"ledig" + 0.002*"mei"'),
 (1, '0.003*"tel" + 0.003*"koop" + 0.003*"grote" + 0.003*"wel" + 0.003*"wij"'),
 (2, '0.005*"no" + 0.004*"uur" + 0.003*"ca" + 0.002*"br" + 0.002*"ned"')]

### Stopwords commentary!
- Is kind of arbitrary what is on the stopword list and what not
- Depending on the research question one is interested in, it might differ what words are 'meaningful'

Let's try with tf-idf

In [94]:
#ldacorpus_m3 = ldacorpus_m1       # reuse corpus from Model 1 
#id2word_m3 = id2word_m1           # and thus, also use id2word-mapping
#tfidfcorpus_m3 = models.TfidfModel(ldacorpus_m3)
#lda_m3 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m3[ldacorpus_m3],id2word=id2word_m3,num_topics=10)
#lda_m3.print_topics(num_words=5)

### Using cleaned with stopwords

ldacorpus_m3 = ldacorpus_m2       # reuse corpus from Model 2
id2word_m3 = id2word_m2           # and thus, also use id2word-mapping
tfidfcorpus_m3 = models.TfidfModel(ldacorpus_m3)
lda_m3 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m3[ldacorpus_m3],id2word=id2word_m3,num_topics=10)
lda_m3.print_topics(num_words=5)

  and should_run_async(code)


[(0,
  '0.001*"uur" + 0.001*"gasten" + 0.001*"koop" + 0.001*"wij" + 0.001*"wel"'),
 (1, '0.000*"it" + 0.000*"yn" + 0.000*"fan" + 0.000*"mar" + 0.000*"mei"'),
 (2,
  '0.001*"gasille" + 0.001*"frau" + 0.000*"arnhem" + 0.000*"stg" + 0.000*"gastvrouw"'),
 (3,
  '0.000*"gasten" + 0.000*"pakistan" + 0.000*"bank" + 0.000*"olie" + 0.000*"gasperi"'),
 (4,
  '0.001*"mrs" + 0.000*"prins" + 0.000*"mei" + 0.000*"brand" + 0.000*"pegasus"'),
 (5,
  '0.001*"olies" + 0.000*"hilv" + 0.000*"per" + 0.000*"hoger" + 0.000*"zullen"'),
 (6,
  '0.000*"knil" + 0.000*"liter" + 0.000*"introductie" + 0.000*"per" + 0.000*"alcide"'),
 (7,
  '0.000*"rooms" + 0.000*"pnt" + 0.000*"katholieke" + 0.000*"kerkdiensten" + 0.000*"gerecht"'),
 (8, '0.001*"br" + 0.001*"no" + 0.000*"per" + 0.000*"gevr" + 0.000*"tel"'),
 (9,
  '0.001*"pholien" + 0.001*"ned" + 0.001*"ca" + 0.001*"stg" + 0.000*"ledig"')]

Filtering extremes

In [93]:
#id2word_m4 = corpora.Dictionary(ldainput_m1)        # reuse input from M1     

#id2word_m4.filter_extremes(no_below=10, no_above=0.5)   # do not consider all words that occur in less than n=5 documents
                                                    # or in more than 50% of all documents.

#ldacorpus_m4 = [id2word_m4.doc2bow(doc) for doc in ldainput_m1]
#tfidfcorpus_m4 = models.TfidfModel(ldacorpus_m4)
#lda_m4 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m4[ldacorpus_m4],id2word=id2word_m4,num_topics=10)
#lda_m4.print_topics(num_words=5)


### Using cleaned with stopwords

id2word_m4 = corpora.Dictionary(ldainput_m2)        # reuse input from M2 

id2word_m4.filter_extremes(no_below=20, no_above=0.5)   # do not consider all words that occur in less than n=5 documents
                                                    # or in more than 50% of all documents.

ldacorpus_m4 = [id2word_m4.doc2bow(doc) for doc in ldainput_m2]
tfidfcorpus_m4 = models.TfidfModel(ldacorpus_m4)
lda_m4 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m4[ldacorpus_m4],id2word=id2word_m4,num_topics=10)
lda_m4.print_topics(num_words=5)

  and should_run_async(code)


[(0, '0.009*"br" + 0.009*"no" + 0.008*"koop" + 0.005*"tel" + 0.004*"heerlen"'),
 (1,
  '0.004*"franse" + 0.003*"regering" + 0.002*"ca" + 0.002*"wel" + 0.002*"gasten"'),
 (2,
  '0.005*"tel" + 0.005*"koop" + 0.004*"nieuws" + 0.004*"telef" + 0.004*"pholien"'),
 (3,
  '0.004*"mrs" + 0.002*"bezoek" + 0.002*"mr" + 0.002*"regering" + 0.002*"belgische"'),
 (4,
  '0.006*"bank" + 0.005*"olies" + 0.005*"punten" + 0.005*"gas" + 0.004*"markt"'),
 (5,
  '0.008*"gasten" + 0.006*"rust" + 0.004*"minuten" + 0.004*"bal" + 0.004*"spel"'),
 (6,
  '0.003*"heer" + 0.003*"gooi" + 0.003*"prins" + 0.003*"uur" + 0.002*"boord"'),
 (7, '0.008*"it" + 0.005*"fan" + 0.004*"yn" + 0.003*"and" + 0.003*"mar"'),
 (8,
  '0.004*"onze" + 0.004*"uur" + 0.004*"gast" + 0.003*"gasille" + 0.003*"mei"'),
 (9, '0.004*"ft" + 0.003*"ned" + 0.003*"aand" + 0.003*"kisten" + 0.003*"djl"')]

Bigrams

In [76]:
words_nl_bigrams = [["_".join(tup) for tup in nltk.ngrams(word.split(),2)] for word in words_nl_clean]

In [77]:
assert len(words_nl_clean)==len(words_nl_bigrams)
words_nl_uniandbigrams = []
for a,b in zip([word.split() for word in words_nl_clean],words_nl_bigrams):
    words_nl_uniandbigrams.append(a + b)

In [78]:
len(words_nl_uniandbigrams[6]),len(words_nl_bigrams[6]),len(words_nl_clean[6].split())

(721, 360, 361)

In [96]:
id2word_m5 = corpora.Dictionary(words_nl_uniandbigrams)                       
id2word_m5.filter_extremes(no_below=20, no_above=0.5)
ldacorpus_m5 = [id2word_m5.doc2bow(doc) for doc in words_nl_uniandbigrams]
tfidfcorpus_m5 = models.TfidfModel(ldacorpus_m5)
lda_m5 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m5[ldacorpus_m5],id2word=id2word_m5,num_topics=10)
lda_m5.print_topics(num_words=5)

  and should_run_async(code)


[(0,
  '0.004*"nieuws" + 0.002*"steenkool" + 0.002*"ten" + 0.002*"ton" + 0.002*"water"'),
 (1, '0.006*"tel" + 0.003*"ca" + 0.003*"br" + 0.003*"uur" + 0.003*"koop"'),
 (2, '0.004*"it" + 0.004*"franse" + 0.003*"fan" + 0.003*"onze" + 0.002*"yn"'),
 (3,
  '0.003*"wel" + 0.002*"gast" + 0.002*"gasten" + 0.002*"weer" + 0.002*"echter"'),
 (4, '0.004*"mrs" + 0.003*"mr" + 0.003*"mr_mrs" + 0.003*"heer" + 0.003*"we"'),
 (5,
  '0.006*"bank" + 0.003*"olies" + 0.003*"punten" + 0.003*"philips" + 0.003*"vandaag"'),
 (6,
  '0.003*"gasten" + 0.002*"gasfabriek" + 0.002*"minuten" + 0.002*"doelpunt" + 0.002*"aardolie"'),
 (7,
  '0.010*"pholien" + 0.003*"mrt" + 0.003*"wonnen" + 0.003*"rooms" + 0.003*"katholieken"'),
 (8, '0.006*"koop" + 0.006*"gasten" + 0.006*"no" + 0.004*"br" + 0.004*"hilv"'),
 (9, '0.005*"am" + 0.004*"stg" + 0.004*"mei" + 0.003*"st" + 0.003*"prins"')]

In [97]:
cm1 = models.CoherenceModel(model=lda_m1, corpus=ldacorpus_m1, dictionary= id2word_m1, coherence='u_mass')  
naivecoh = cm1.get_coherence()
cm2 = models.CoherenceModel(model=lda_m2, corpus=ldacorpus_m2, dictionary= id2word_m2, coherence='u_mass')  
cleancoh = cm2.get_coherence()
#cm3 = models.CoherenceModel(model=lda_m3, corpus=ldacorpus_m3, coherence='u_mass')
cm3 = models.CoherenceModel(model=lda_m3, corpus=tfidfcorpus_m3[ldacorpus_m3], dictionary= id2word_m3, coherence='u_mass')
tfidfcoh = cm3.get_coherence()
cm4 = models.CoherenceModel(model=lda_m4, corpus=tfidfcorpus_m4[ldacorpus_m4], dictionary= id2word_m4, coherence='u_mass')
tfidffiltercoh = cm4.get_coherence()
cm5 = models.CoherenceModel(model=lda_m5, corpus=tfidfcorpus_m5[ldacorpus_m5], dictionary= id2word_m5, coherence='u_mass')
tfidffiltercohbi = cm5.get_coherence()
print("Coherence of naive model = {}\nCoherence of clean model = {}\nCoherence of tf-idf model = {}\nCoherence of tf-idf model without extreme words {}\nCoherence of tf-idf model without extreme words with bigrams {}".format(naivecoh, cleancoh, tfidfcoh,tfidffiltercoh, tfidffiltercohbi))
print("NB: Note that it may not make too much sense to compare these vaues across different corpora")

  and should_run_async(code)
Coherence of naive model = -1.0065971499963493
Coherence of clean model = -2.4950369875396925
Coherence of tf-idf model = -10.175816384984412
Coherence of tf-idf model without extreme words -4.512475279764333
Coherence of tf-idf model without extreme words with bigrams -5.234766037827193
NB: Note that it may not make too much sense to compare these vaues across different corpora


In [98]:
lda_m3_good = models.LdaModel(tfidfcorpus_m3[ldacorpus_m3], id2word=id2word_m3, num_topics=10, iterations=50, passes=5, eta='auto',alpha='auto')
print(models.CoherenceModel(model=lda_m3_good, corpus=tfidfcorpus_m3[ldacorpus_m3], coherence='u_mass').get_coherence())

lda_m4_good = models.LdaModel(tfidfcorpus_m4[ldacorpus_m4], id2word=id2word_m4, num_topics=10, iterations=50, passes=5, eta='auto',alpha='auto')
print(models.CoherenceModel(model=lda_m4_good, corpus=tfidfcorpus_m4[ldacorpus_m4], coherence='u_mass').get_coherence())

lda_m5_good = models.LdaModel(tfidfcorpus_m5[ldacorpus_m5], id2word=id2word_m5, num_topics=10, iterations=50, passes=5, eta='auto',alpha='auto')
print(models.CoherenceModel(model=lda_m5_good, corpus=tfidfcorpus_m5[ldacorpus_m5], coherence='u_mass').get_coherence())

  and should_run_async(code)
-11.009042078816773
-3.8527509763853436
-6.9572462986021675


In [101]:
lda_m5_good.top_topics(tfidfcorpus_m5[ldacorpus_m5])

  and should_run_async(code)


[([(0.013298701, 'tel'),
   (0.011875413, 'koop'),
   (0.011725163, 'no'),
   (0.010293188, 'br'),
   (0.006389606, 'telefoon'),
   (0.005796473, 'br_no'),
   (0.0056766374, 'enz'),
   (0.005346738, 'prima'),
   (0.0047042477, 'telef'),
   (0.0045381156, 'gr'),
   (0.004282774, 'uur'),
   (0.004157889, 'per'),
   (0.003971332, 'vanaf'),
   (0.0039335587, 'st'),
   (0.0038672606, 'gevraagd'),
   (0.0038622823, 'bur'),
   (0.0037630112, 'gevr'),
   (0.003719777, 'brieven'),
   (0.003695346, 'prijs'),
   (0.00368805, 'heerlen')],
  -1.4226505475735711),
 ([(0.00242109, 'wij'),
   (0.00241874, 'heer'),
   (0.0024047955, 'wel'),
   (0.0024030686, 'onze'),
   (0.0022985316, 'jaar'),
   (0.0022102934, 'uur'),
   (0.0021385876, 'waar'),
   (0.0021326314, 'grote'),
   (0.0021229677, 'weer'),
   (0.002104246, 'gasten'),
   (0.0020525474, 'we'),
   (0.002016045, 'twee'),
   (0.0018666487, 'ten'),
   (0.0018544492, 'zullen'),
   (0.0017914749, 'alle'),
   (0.0016847705, 'nederlandse'),
   (0.00168

In [88]:
#vis_data = pyLDAvis.gensim.prepare(lda_m5,ldacorpus_m5,id2word_m5)
#pyLDAvis.display(vis_data)

  and should_run_async(code)


In [117]:
dictionary = corpora.Dictionary(ldainput_m1)
dictionary.save('/Users/leonardovida/nltk_data/corpora/nl_1950/nl_1950_stop.dict')
dictionary = corpora.Dictionary(ldainput_m2)
dictionary.save('/Users/leonardovida/nltk_data/corpora/nl_1950/nl_1950_nostop.dict')

  and should_run_async(code)


In [121]:
import collections 
collections.Counter(ldainput_m2)

  and should_run_async(code)


TypeError: unhashable type: 'list'