# Topic classification using cosine similarity on the word embeddings

## Preprocessing
Same as in the data_exploration notebook

In [230]:
# some general imports
from enum import Enum

from nltk.corpus import stopwords
from spacy.lang.de.stop_words import STOP_WORDS

import re
import spacy

In [231]:
class Party(Enum):
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5

In [232]:
# https://spacy.io/models/de#de_core_news_md
nlp = spacy.load('de_core_news_md')

# stopwords
nltk_stopwords = stopwords.words('german')

# build stopwords list
all_stopwords = list(set(STOP_WORDS) | set(nltk_stopwords))
with open('data_exploration/custom_stopwords.txt', 'r', encoding='utf-8') as f:
    all_stopwords += [line.strip() for line in f.readlines()]

# Load files
party_text = {}
for party in Party:
    all_stopwords.extend(['{}'.format(party.name.lower())])
    with open('resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        file = " ".join(l for l in txt)
        # remove gender *
        file = re.sub(r'\*innen(\w*)\s', r'\1 ', file)
    party_text[party] = file

In [233]:
import gensim


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(" ".join(texts))
    texts_out = [token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ not in all_stopwords]

    return texts_out


def prepare_data(party: Party):
    """
    Prepare data for topic classification.
    Split into sections and lemmatize.

    :param party: The party to get the text from
    :return: a dict containing the text and the lemmatized text seperated in sections
    """
    # get sections
    sections = re.split(r'\n\s\n', party_text[party])
    cleaned_sections = {}
    for section in sections:
        wordbag = gensim.utils.simple_preprocess(section)
        #lemmatize
        # wordbag = lemmatization(wordbag)
        cleaned_sections[section] = wordbag

    # return all cleaned sections except empty ones
    return dict(filter(lambda x: len(x[1]) > 0, cleaned_sections.items()))


## Create target clusters
based on lda from the [data_exploration notebook](data_exploration/topic_modeling_playground.ipynb)

In [234]:
import numpy as np
from sklearn import metrics
from transformers import BertTokenizer, TFBertModel

In [235]:
# based on lda from the data_exploration notebook
cluster_dict = {
    "UMWELT":
        ["umwelt", "klima", "klimaschutz", "ökologie", "co2", "co", "landwirtschaft", "klimakrise", "treibhaus",
         "emissionen", "ausbau", "innovative", "natur", "wasser", "landwirtschaft", "nachhaltig", "strom",
         "solar", "windkraft", "wasserkraft", "kohlekraft", "atomkraft", "kernenergie", "energiewende",
         "erneuerbar", "öl", "gas", "energie", "nachhaltig", "wasserstoff", "luftqualität", "umwelt", "wälder",
         "erneuerbare energie"],

    "WIRTSCHAFT":
        ["unternehmen", "selbstständige", "firma", "markt", "kapital", "finanzierung", "ezb", "banken",
         "staatsanleihen", "währungsfonds", "staatsanleihen", "kredit", "euro", "industrie", "schulden", "steuern",
         "konzern", "kapital", "finanzen"],

    "BILDUNG":
        ["student", "schüler", "schule", "gesamtschule", "lehrer", "universität", "lehre", "elternunabhängig",
         "bildung", "hochschulen", "wissenschaft", "bildung", "erasmus", "forschung", "lehre", "ausbildung",
         "weiterbildung", "aufstiegsmöglichkeit", "bildungsstandard", "innovationen", "bafög", "studium"],

    "GESELLSCHAFT":
        ["gesellschaft", "kultur", "freiheit", "privat", "kulturelle", "antisemitismus",
         "vorbild", "frauen", "familie", "identität", "gender", "sprache", "leben", "religion", "christentum", "islam",
         "diskriminierung", "menschenrechte", "kunst", "adoption"],

    "INNEN":
        ["schutz", "polizei", "schützen", "überwachung", "datenschutz", "sicherheit", "bundeswehr", "asyl",
         "integration", "migrant", "flüchtling", "immigrant", "toleranz", "zuwanderung", "asylbewerber",
         "krimminalität", "zuwanderung", "kontrolle", "bundespolizei", "gefahr", "terroristen", "gewalt"],

    "ARBEIT_UND_SOZIALES":
        ["rente", "harz4", "arbeitslosengeld", "pflege", "wohngeld", "familie", "arbeitslos", "sozial", "bauen",
         "wohnungen", "sozialbau", "kinder", "pflegen", "arbeitssuchende", "grundsicherung", "eltern", "jugendlich",
         "gesundheit", "arzt", "armut", "einkommen", "löhne", "tarifvertrag", "bürgergeld",
         "sozialstaat", "teilhabe", "bezahlbar"]
}

In [236]:
# Model from https://huggingface.co/dbmdz/bert-base-german-uncased
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-uncased", do_lower_case=True)
model = TFBertModel.from_pretrained("dbmdz/bert-base-german-uncased")

Some layers from the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-german-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [242]:
def convert_to_embeddings(cluster):
    idx = tokenizer.encode(cluster)
    idx = np.array(idx)[None, :]
    embedding = model(idx)
    tensor = np.array(embedding[0][0][1:-1])
    return tensor

## Classify text

In [243]:
 results = {}
 for party in Party:
    # create list of vectors
    section_dict = prepare_data(party)
    mean_vectors = [convert_to_embeddings(txt).mean(0) for txt in section_dict.values()]

    feature_matrix = np.array(mean_vectors)

    # vector space for dict (maybe mean(0) ?)
    dic_vectors = {key: convert_to_embeddings(value) for key, value in cluster_dict.items()}

    similarities = np.array(
        [metrics.pairwise.cosine_similarity(feature_matrix, dic_y).T.tolist()[0] for dic_y in dic_vectors.values()]).T

    labels = list(dic_vectors.keys())
    for i in range(len(similarities)):

        # if there is a similarity to a cluster -> random cluster
        if sum(similarities[i]) == 0:
            similarities[i] = [0] * len(labels)
            similarities[i][np.random.choice(range(len(labels)))] = 1

        # rescale to 1
        similarities[i] = similarities[i] / sum(similarities[i])

    #classify based on similarities
    prediction = [(labels[np.argmax(pred)], max(pred)) for pred in similarities]

    # print number of topics for each topic
    print(party)
    for topic in labels:
        print(topic, ":", len(list(filter(lambda x: x[0] == topic, prediction))))
    print()

    results[party] = (section_dict, prediction)

Party.AFD
UMWELT : 12
WIRTSCHAFT : 103
BILDUNG : 2
GESELLSCHAFT : 82
INNEN : 210
ARBEIT_UND_SOZIALES : 7

Party.CDU
UMWELT : 98
WIRTSCHAFT : 400
BILDUNG : 14
GESELLSCHAFT : 156
INNEN : 452
ARBEIT_UND_SOZIALES : 7

Party.FDP
UMWELT : 41
WIRTSCHAFT : 91
BILDUNG : 13
GESELLSCHAFT : 31
INNEN : 190
ARBEIT_UND_SOZIALES : 7

Party.GRUENE
UMWELT : 45
WIRTSCHAFT : 108
BILDUNG : 1
GESELLSCHAFT : 15
INNEN : 130
ARBEIT_UND_SOZIALES : 8

Party.LINKE
UMWELT : 25
WIRTSCHAFT : 171
BILDUNG : 6
GESELLSCHAFT : 44
INNEN : 194
ARBEIT_UND_SOZIALES : 27

Party.SPD
UMWELT : 21
WIRTSCHAFT : 44
BILDUNG : 1
GESELLSCHAFT : 10
INNEN : 47
ARBEIT_UND_SOZIALES : 2

