In [None]:
import pandas as pd
import numpy as np
import nltk
from tqdm.auto import tqdm
from pandarallel import pandarallel
import string
from HanTa import HanoverTagger as ht
import mgzip
import pickle

# Initialise pandarallel (progress bars enabled); required for the
# `parallel_apply` call used during lemmatization.
pandarallel.initialize(progress_bar=True)
# Register tqdm with pandas; required for the `progress_apply` calls below.
tqdm.pandas()

# Preprocessing of data sets
This notebook contains the necessary steps for the preprocessing of the data sets. It processes the data sets for both types of data, comments and articles.

## Processing of articles

In [None]:
# Load the scraped article dumps of the three newspapers.
# All files are read with the same options: gzip-compressed CSV, restricted to
# the three columns that the preprocessing below works with.
scraped_read_options = {
    'compression': 'gzip',
    'low_memory': False,
    'usecols': ["title", "date", "combined_text"],
}
zeit = pd.read_csv('../data/zeit_scraped.gzip', **scraped_read_options)
welt = pd.read_csv('../data/welt_scraped.gzip', **scraped_read_options)
tagesspiegel = pd.read_csv('../data/tagesspiegel_scraped.gzip', **scraped_read_options)

In [None]:
# Tag every frame with its source so the origin survives the concatenation.
# NOTE(review): the Tagesspiegel label is spelled 'tagespiegel' (single 's').
# It is kept unchanged because other code may match on this exact value -
# confirm before correcting it.
for _frame, _label in ((zeit, 'zeit'), (welt, 'welt'), (tagesspiegel, 'tagespiegel')):
    _frame['newspaper'] = _label

In [None]:
# Stack the three sources into one frame. `ignore_index` is not set, so each
# row keeps the index label it had in its source frame.
combined_news = pd.concat([zeit, welt, tagesspiegel])

In [None]:
# One document consisted largely of Arabic characters and is removed here.
# The filter keeps only rows whose text explicitly does NOT contain the marker
# string; rows for which `str.contains` yields a missing value (presumably rows
# without text - verify for the pandas version in use) do not compare equal to
# False and are therefore dropped as well.
contains_arabic_marker = combined_news['combined_text'].str.contains('في')
combined_news = combined_news[contains_arabic_marker.eq(False)]

### Tokenize, remove punctuation & lower casing

In [None]:
# Split every article text into a list of tokens (progress bar via tqdm).
# NOTE(review): `nltk.word_tokenize` is called with its default language
# setting - confirm whether `language='german'` is wanted for these texts.
combined_news['text_token'] = combined_news['combined_text'].progress_apply(nltk.word_tokenize)

In [None]:
# Characters stripped from every token: ASCII punctuation plus a few
# typographic marks (low/high double quotes, en dash, bullet).
punctuation_custom = list(string.punctuation) + ['„', '“', '–', '•']


def remove_punctuation(text):
    """Strip punctuation from tokens, drop emptied tokens and lower-case the rest.

    Args:
        text: Iterable of string tokens (here: the output of
            ``nltk.word_tokenize``).

    Returns:
        list[str]: The lower-cased tokens with every character listed in
        ``punctuation_custom`` removed. Tokens that consist only of such
        characters are omitted.
    """
    # A single translation table replaces the chain of ``str.replace`` calls
    # that was previously executed for every token. The table is rebuilt on
    # each call so that later edits to ``punctuation_custom`` still take
    # effect. Every entry is expected to be a single character.
    delete_table = str.maketrans('', '', ''.join(punctuation_custom))
    stripped_tokens = (word.translate(delete_table) for word in text)
    return [word.lower() for word in stripped_tokens if word]

In [None]:
# Replace the token lists with their punctuation-free, lower-cased versions.
combined_news['text_token'] = combined_news['text_token'].progress_apply(remove_punctuation)

### Stop word removal, lemmatization of comments & emotion lexicon
In the following, stop words that carry little meaning ('der', 'wo', etc.) are removed from the comments (the same steps are applied to the articles in this section). I edited the sourced stop word list and removed all negations from it, since negations carry emotional information that I want to capture later in the workflow. Afterwards, the remaining tokens of each text are lemmatized, i.e. reduced to their base form. To do this I use the 'Hanover Tagger' (HanTa), which also provides part-of-speech (PoS) information. I do not use the PoS information, mainly because the German emotion and sentiment lexicons I rely on use different PoS abbreviations, which would result in no matches.

In [None]:
# Load the stop word list, one entry per line. The context manager closes the
# file handle immediately instead of leaving it open until garbage collection.
with open('../resources/german_stopwords-master/german_stopwords_topic.txt') as stopword_file:
    stopwords = stopword_file.read().splitlines()

In [None]:
def stop_word_removal(x):
    """Return the tokens of ``x`` that are not listed in ``stopwords``.

    Args:
        x: Iterable of tokens.

    Returns:
        list[str]: The remaining tokens in their original order, each
        converted with ``str``.
    """
    # Hash-based lookup instead of scanning the whole stop word list once per
    # token. The set is built per call so later changes to the module-level
    # ``stopwords`` list are still honoured.
    stopword_lookup = frozenset(stopwords)
    return [str(w) for w in x if w not in stopword_lookup]

In [None]:
combined_news['text_token'] = combined_news['text_token'].progress_apply(stop_word_removal)

In [None]:
# Instantiate the HanTa tagger from the 'morphmodel_ger.pgz' model file;
# `tagger` is used by `tagger_custom` for the lemmatization step.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

In [None]:
def tagger_custom(input):
    """Map every token to the lower-cased first element of ``tagger.analyze``.

    Args:
        input: Iterable of string tokens.

    Returns:
        list[str]: One entry per token, in input order. The first element of
        the ``analyze`` result is presumably the lemma (the notebook uses this
        step for lemmatization) - confirm against the HanTa documentation.
    """
    return [tagger.analyze(token)[0].lower() for token in input]

In [None]:
# Run `tagger_custom` over all token lists via pandarallel's `parallel_apply`.
combined_news['text_token'] = combined_news['text_token'].parallel_apply(tagger_custom)

In [None]:
# Join each processed token list back into one space-separated string.
combined_news['combined_text_joined'] = combined_news['text_token'].apply(' '.join)

In [None]:
# Reduced view of the preprocessed data: title, joined processed text, date and
# source label (the raw text and the token lists are left out).
combined_news_pre = combined_news[['title', 'combined_text_joined', 'date', 'newspaper']]

In [None]:
# saving preprocessed file
# Persist the reduced frame `combined_news_pre` (title, joined text, date,
# newspaper), matching the output file name. Previously the full working frame
# `combined_news` was pickled here and `combined_news_pre` was never used.
with mgzip.open('../data/combined_news_pre.mgzip', 'wb') as handle:
    pickle.dump(combined_news_pre, handle)

## Processing of comments