# Creating a Corpus for German Political Speeches

The vector space model (VSM) is a common representation for documents in order to perform clustering, topic modeling, classification, similarity search etc.

In this case, we want to represent documents as bag-of-words. Therefore, the documents in the input text files (CSV, one document per line) are tokenized and converted into a corpus of indexed terms/tokens.

### Pre-processing Steps

The input data is usually messy. But instead of extensive pre-processing, e.g. cleaning of markup, punctuation, etc., we will simply extract all alphabetic sequences as tokens and normalize them using the following steps:

  * convert to lower case
  * remove stopwords
  * create n-grams
  * (stemming - not so useful for interpreting topics)
  * **`TODO:`** lemmatization (more complicated for German)

### References

* [Tutorial on Corpora and Vector Spaces](https://radimrehurek.com/gensim/tut1.html) from [Gensim](https://radimrehurek.com/gensim/index.html).
* [Tutorial on Topic Modeling with Gensim](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/) from [Machine Learning Plus](https://www.machinelearningplus.com/).

## Prerequisites

### Libraries and Constants

In [None]:
from collections import defaultdict
from pprint import pprint
import pandas as pd
import string
import os
import re
import time

# input files
data_dir        = '../data/'
filename        = data_dir + 'Bundesregierung.csv'

# output files
corpus_dir      = '../corpus/'
dict_filename   = corpus_dir + 'gps_ngrams.dict'
corpus_filename = corpus_dir + 'gps_ngrams_bow.mm'

# ensure output directory exists; exist_ok=True avoids the check-then-create
# race and makes re-running this cell a no-op
os.makedirs(corpus_dir, exist_ok=True)

### Helper Functions

In [None]:
def print_diff(start_time):
    """Print the wall-clock time elapsed since *start_time* as ``minutes:seconds``.

    *start_time* is a timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - start_time
    minutes, seconds = divmod(elapsed, 60)
    print("--- took %d:%.2d minutes ---" % (minutes, seconds))

def most_frequent(tokens, topn=10):
    """Return the *topn* most frequent terms as ``(term, count)`` pairs.

    *tokens* is an iterable of documents, each an iterable of terms. Counts
    are summed over all documents; pairs are ordered by descending count,
    and terms with equal counts keep the order in which they were first seen.
    """
    counts = defaultdict(int)
    for term in (t for doc in tokens for t in doc):
        counts[term] += 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:topn]

## Read the Documents

In [None]:
# load the input CSV into a DataFrame and report how long the import took
start_time = time.time()
df = pd.read_csv(filename)
print_diff(start_time)

# number of imported rows, followed by a preview of the first rows
print(len(df), 'documents imported')
df.head()

## Analyse Compound Words

We want to recognize these compound words later in the n-gram detection.

In [None]:
# two runs of word characters joined by a single hyphen
PAT_COMPOUND = re.compile(r'\w+[-]\w+')

# per document: every non-overlapping match of the pattern, in order of appearance
compounds = df['text'].apply(PAT_COMPOUND.findall)
pprint(most_frequent(compounds, 10))

### Tokenize

In [None]:
from gensim.utils import tokenize

def tokens(documents):
    """Convert each document into a list of lowercase tokens using Gensim's tokenize().

    Returns one list of tokens per document, in input order.

    NOTE: the name ``tokens`` is rebound to the tokenized corpus by the
    assignment further down in this cell, which shadows this function.
    """
    # tokenize() produces its tokens lazily; materialise them so that every
    # document can be indexed and iterated more than once
    return [list(tokenize(doc, lower=True)) for doc in documents]

# explicit tokenization: one list of lowercase tokens per document
start_time = time.time()
tokens = [list(tokenize(doc, lower=True)) for doc in df['text']]
print_diff(start_time)

# preview: the first 100 tokens of the first document
pprint(" ".join(tokens[0][:100]))

## Create N-Gram Model

In [None]:
from gensim.models.phrases import Phrases, Phraser

# train the bigram model on the plain token lists
print('Building bigrams...')
start_time = time.time()
bigram_model = Phrases(tokens, min_count=1, threshold=100)
print_diff(start_time)

print(bigram_model)

# train the trigram model on the bigram-transformed documents
print('Building trigrams...')
start_time = time.time()
bigrams = list(bigram_model[tokens])
trigram_model = Phrases(bigrams, min_count=1, threshold=100)
print_diff(start_time)

print(trigram_model)

print('Optimizing bigram/trigram models...')
# optimize bigram, trigram models
# (both names are rebound to the Phraser wrappers from here on)
start_time = time.time()
bigram_model = Phraser(bigram_model)
trigram_model = Phraser(trigram_model)
print_diff(start_time)

In [None]:
def n_grams(documents):
    """Apply the bigram model and then the trigram model to *documents*."""
    with_bigrams = list(bigram_model[documents])
    return trigram_model[with_bigrams]

In [None]:
# most frequent n-grams (joined tokens are recognisable by their '_' separator)
start_time = time.time()
pprint(most_frequent(
    [token for token in document if '_' in token]
    for document in n_grams(tokens)
))
print_diff(start_time)

## Load Stopwords

In [None]:
stopwords_filename = '../data/stopwords-de.txt'

# Decode explicitly instead of relying on the platform default encoding, which
# would garble non-ASCII stopwords (umlauts, ß) on non-UTF-8 locales.
# NOTE(review): assumes the stopword file is UTF-8 encoded — confirm.
with open(stopwords_filename, encoding='utf-8') as f:
    # one stopword per line; lines starting with ';' are skipped
    stopwords = [line for line in f.read().splitlines() if not line.startswith(';')]

In [None]:
# add more stopwords
more_stopwords = 'anbelangt dingen genauso gerne hierzu hinzu liebe nahezu nunmehr punkt relativ sodass sozusagen trotz übrigen vielfach vielfache vielmehr voraussichtlich wahrlich wahrscheinlich zuvor'
stopwords += more_stopwords.split()

# use dictionary for better performance (hash lookup instead of list scan)
stopwordsdict = {word: 1 for word in stopwords}

print(len(stopwordsdict), "stopwords")

In [None]:
def remove_stopwords(tokens):
    """Return a new list of documents with every word in ``stopwordsdict`` dropped."""
    def is_content_word(word):
        return word not in stopwordsdict

    return [list(filter(is_content_word, doc)) for doc in tokens]

## Stemming

In [None]:
from nltk.stem.cistem import Cistem

stemmer = Cistem(True)


def stemming(tokens):
    """Stem every token of every document.

    N-grams (parts joined by '_') are split, each part is stemmed on its own,
    and the stemmed parts are joined with '_' again.
    """
    def stem_parts(word):
        return '_'.join(stemmer.stem(part) for part in word.split('_'))

    return [[stem_parts(word) for word in doc] for doc in tokens]

## Lemmatization

This is more complicated for German than English as there are fewer good algorithms available. Usually, the results are less accurate.

Note: Lemmatization requires that the text has been POS tagged.

  * nltk
  * spacy
  * pattern.de
  * stanford NLP

See also:

  * https://datascience.blog.wzb.eu/2017/05/19/lemmatization-of-german-language-text/
    * https://github.com/WZBSocialScienceCenter/germalemma
    

## All Pre-processing together

In [None]:
# 1. drop stopwords from the plain token lists
tokens_nostop = remove_stopwords(tokens)
# 2. join detected bigrams/trigrams in the stopword-free tokens
# NOTE(review): the phrase models were trained on tokens that still contained
# stopwords but are applied to stopword-free tokens here — confirm this is intended
tokens_ngram  = n_grams(tokens_nostop)
# 3. stem every token (and every part of an n-gram)
tokens_stem   = stemming(tokens_ngram)

# the unstemmed n-gram tokens are the ones used as `texts` from here on
texts = tokens_ngram

In [None]:
# first five tokens of the second document after each pre-processing stage
for stage_tokens in (tokens_nostop, tokens_ngram, tokens_stem):
    print(stage_tokens[1][:5])

In [None]:
# most frequent compound words after pre-processing
def _joined_only(docs):
    """Keep only the tokens that contain the n-gram separator '_'."""
    return [[word for word in doc if '_' in word] for doc in docs]


ngrams = _joined_only(tokens_ngram)
stems = _joined_only(tokens_stem)

for compound_docs in (ngrams, stems):
    pprint(most_frequent(compound_docs, 10))

## Create Dictionary

In [None]:
from gensim import corpora

print('Creating Dictionary...')

# build the token <-> id mapping from the pre-processed texts and time it
start_time = time.time()
dictionary = corpora.Dictionary(texts)
print_diff(start_time)

print(dictionary)

In [None]:
# (token id, frequency) pairs from dictionary.dfs, highest frequency first
dfs_desc = sorted(dictionary.dfs.items(), key=lambda t: t[1], reverse=True)

# Headers match the report printed after filtering; the previous text contained
# a leftover placeholder ("in X Documents <n> Documents").
print('--- Most Frequent Token Occurrences in', dictionary.num_docs, 'Documents ---')
for k, v in dfs_desc[:10]:
    print('{freq}: {token}'.format(token=dictionary[k], freq=v))

print('--- Least Frequent Token Occurrences in', dictionary.num_docs, 'Documents ---')
for k, v in dfs_desc[-10:]:
    print('{freq}: {token}'.format(token=dictionary[k], freq=v))

### Filter extreme tokens

* tokens which occur in more than 30% of all documents.
* tokens which occur in fewer than 5 documents.

In [None]:
# prune the vocabulary in place and report its size before and after
print('Filtering extreme tokens')
freq_before = len(dictionary)
# no_below=5: drop tokens occurring in fewer than 5 documents
# no_above=0.3: drop tokens occurring in more than 30% of all documents
# NOTE(review): filter_extremes has further defaults (e.g. keep_n) — check the
# gensim documentation if the vocabulary is very large
dictionary.filter_extremes(no_below=5, no_above=0.3)
print('{} token before -> {} after'.format(freq_before, len(dictionary)))

In [None]:
# (token id, frequency) pairs from dictionary.dfs, highest frequency first
dfs_desc = sorted(dictionary.dfs.items(), key=lambda entry: entry[1], reverse=True)

print('--- Most Frequent Token Occurrences in', dictionary.num_docs, 'Documents ---')
for token_id, doc_freq in dfs_desc[:10]:
    print('{freq}: {token}'.format(token=dictionary[token_id], freq=doc_freq))

print('--- Least Frequent Token Occurrences in', dictionary.num_docs, 'Documents ---')
for token_id, doc_freq in dfs_desc[-10:]:
    print('{freq}: {token}'.format(token=dictionary[token_id], freq=doc_freq))

### Saving Dictionary

In [None]:
# persist the (filtered) dictionary to dict_filename and time the write
print('Saving Dictionary to', dict_filename)
start_time = time.time()
dictionary.save(dict_filename)
print_diff(start_time)

## Create Corpus

Use the n-gram tokens to construct the model.

In [None]:
print('Creating Corpus')
start_time = time.time()
# one bag-of-words vector per document, in the order of `texts`
corpus_bow = list(map(dictionary.doc2bow, texts))
print_diff(start_time)

In [None]:
# write the bag-of-words corpus to corpus_filename via MmCorpus and time the write
print('Saving Corpus to', corpus_filename)
start_time = time.time()
corpora.MmCorpus.serialize(corpus_filename, corpus_bow)
print_diff(start_time)

## LDA Model

In [None]:
from gensim.models import LdaModel, CoherenceModel

num_topics = 150

# Create the model directory up front (as is done for the corpus directory):
# saving into a missing directory would otherwise fail only after training.
model_dir = '../model/{}_topics/'.format(num_topics)
os.makedirs(model_dir, exist_ok=True)

# train the LDA model on the bag-of-words corpus and time it
start_time = time.time()
model_lda = LdaModel(corpus_bow, id2word=dictionary, num_topics=num_topics)
print_diff(start_time)
print(model_lda)

model_lda.save(model_dir + 'topic_model.lda')

### Compute Coherence

Remove words that are not in the dictionary from the texts.

In [None]:
# keep only tokens that still have an id in the dictionary
# (filter_extremes removed some tokens from it earlier)
texts = [[token for token in text if token in dictionary.token2id] for text in texts]

In [None]:
# u_mass coherence: given the model, the bag-of-words corpus and the dictionary
start_time = time.time()
cm = CoherenceModel(model=model_lda, corpus=corpus_bow, dictionary=dictionary, coherence='u_mass')
print('u_mass: {:0.3f}'.format(cm.get_coherence()))
print_diff(start_time)

# c_v coherence: additionally given the tokenized texts; `cm` is rebound
start_time = time.time()
cm = CoherenceModel(texts=texts, model=model_lda, corpus=corpus_bow, dictionary=dictionary, coherence='c_v')
print('c_v: {:0.3f}'.format(cm.get_coherence()))
print_diff(start_time)