# Topic classification using cosine similarity on the word embeddings

## Preprocessing
Same as in the data_exploration notebook

In [196]:
from enum import Enum

from nltk.corpus import stopwords
from spacy.lang.de.stop_words import STOP_WORDS

import re
import spacy

In [197]:
class Party(Enum):
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5

In [198]:
nlp = spacy.load('de_core_news_md')

# stopwords
nltk_stopwords = stopwords.words('german')

# build stopwords list
all_stopwords = list(set(STOP_WORDS) | set(nltk_stopwords))
with open('custom_stopwords.txt', 'r', encoding='utf-8') as f:
    all_stopwords += [line.strip() for line in f.readlines()]

# Load files
party_text = {}
for party in Party:
    all_stopwords.extend(['{}'.format(party.name.lower())])
    with open('resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        file = " ".join(l for l in txt)
        # remove gender *
        file = re.sub(r'\*innen(\w*)\s', r'\1 ', file)
    party_text[party] = file

In [199]:
import gensim


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(" ".join(texts))
    texts_out = [token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ not in all_stopwords]

    return texts_out


def prepare_data(party: Party):
    """
    Prepare data for topic classification.
    Split into sections and lemmatize.

    :param party: The party to get the text from
    :return: a dict containing the text and the lemmatized text seperated in sections
    """
    # get sections
    sections = re.split(r'\n\s\n', party_text[party])
    cleaned_sections = {}
    for section in sections:
        wordbag = gensim.utils.simple_preprocess(section)
        #lemmatize
        # wordbag = lemmatization(wordbag)
        cleaned_sections[section] = wordbag

    # return all cleaned sections except empty ones
    return dict(filter(lambda x: len(x[1]) > 0, cleaned_sections.items()))


## Create target clusters

In [200]:
import numpy as np
from sklearn import metrics
from transformers import BertTokenizer, TFBertModel

In [201]:
# based on lda from the data_exploration notebook
cluster_dict = {
    "UMWELT":
        ["umwelt", "klima", "klimaschutz", "ökologie", "co2", "co", "landwirtschaft", "klimakrise", "treibhaus",
         "emissionen", "ausbau", "innovative", "natur", "wasser", "landwirtschaft", "nachhaltig", "strom",
         "solar", "windkraft", "wasserkraft", "kohlekraft", "atomkraft", "kernenergie", "energiewende",
         "erneuerbar", "öl", "gas", "energie", "nachhaltig", "wasserstoff", "luftqualität", "umwelt", "wälder",
         "erneuerbare energie"],

    "WIRTSCHAFT":
        ["unternehmen", "wettbewerb", "digitale", "selbstständige", "firma", "markt",
         "kapital", "finanzierung", "ezb", "banken", "staatsanleihen", "währungsfonds",
         "staatsanleihen", "kredit", "investitionen", "euro", "industrie", "schulden", "steuern",
         "konzern", "kapital", "finanzen"],

    "BILDUNG":
        ["schule", "gymnasium", "realschule", "grundschule", "mittelschule", "lehrer", "universität", "lehre",
         "gesamtschule", "bildung", "hochschulen", "wissenschaft", "bildung", "erasmus", "forschung",
         "lehre", "entwickeln", "weiterbildung", "aufstiegsmöglichkeit", "bildungsstandard","innovationen"],

    "GESELLSCHAFT":
        ["gesellschaft", "kultur", "freiheit", "privat", "kulturelle", "antisemitismus",
         "vorbild", "frauen", "familie", "identität", "gender", "sprache", "leben", "religion", "christentum", "islam", "diskriminierung",
         "menschenrechte", "kunst", "adoption"],

    "INNEN":
        ["schutz", "polizei", "schützen", "überwachung", "datenschutz", "sicherheit", "bundeswehr", "asyl",
         "integration", "migrant", "flüchtling", "immigrant", "toleranz", "zuwanderung", "asylbewerber", "innere",
         "krimminalität", "zuwanderung", "kontrolle", "bundespolizei", "rechtsstaat", "gefahr", "terroristen",
         "gewalt"],

    "ARBEIT_UND_SOZIALES":
        ["rente", "harz4", "arbeitslosengeld", "pflege", "wohngeld", "familie", "arbeitslos", "sozial", "bauen",
         "wohnungen", "sozialbau", "kinder", "pflegen", "arbeitssuchende", "grundsicherung", "eltern", "jugendlich",
         "gesundheit", "arzt", "armut", "einkommen", "löhne", "tarifvertrag", "bürgergeld",
         "sozialstaat", "teilhabe", "bezahlbar"],

    # default:
    #"KEIN_THEMA": []

}

In [202]:
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-uncased", do_lower_case=True)
model = TFBertModel.from_pretrained("dbmdz/bert-base-german-uncased")

Some layers from the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-german-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [203]:
def convert_to_embeddings(cluster):
    idx = tokenizer.encode(cluster)
    idx = np.array(idx)[None, :]
    embedding = model(idx)
    tensor = np.array(embedding[0][0][1:-1])
    return tensor

In [204]:
# create list of vectors
section_dict = prepare_data(Party.AFD)
mean_vectors = [convert_to_embeddings(txt).mean(0) for txt in section_dict.values()]

feature_matrix = np.array(mean_vectors)

# vector space for dict (maybe mean(0) ?)
dic_vectors = {key: convert_to_embeddings(value) for key, value in cluster_dict.items()}


In [205]:
similarities = np.array(
    [metrics.pairwise.cosine_similarity(feature_matrix, dic_y).T.tolist()[0] for dic_y in dic_vectors.values()]).T

labels = list(dic_vectors.keys())
for i in range(len(similarities)):

    # if there is a similarity to a cluster -> random cluster
    if sum(similarities[i]) == 0:
        similarities[i] = [0] * len(labels)
        similarities[i][np.random.choice(range(len(labels)))] = 1

    # rescale to 1
    similarities[i] = similarities[i] / sum(similarities[i])

#classify based on similarities
prediction = [(labels[np.argmax(pred)], max(pred)) for pred in similarities]

In [206]:

for topic in labels:
    print(topic, ":", len(list(filter(lambda x: x[0] == topic, prediction))))
    print()

UMWELT : 11

WIRTSCHAFT : 135

BILDUNG : 2

GESELLSCHAFT : 109

INNEN : 166

ARBEIT_UND_SOZIALES : 7



In [207]:
#k = 4
#print(list(section_dict.keys())[k], '\n --> ', prediction[k])

# print first occurence of each label
for label in labels:
    for i in range(len(prediction)):
        if prediction[i][0] == label:
            print(list(section_dict.keys())[i], "\n -->", prediction[i])
            print()
            break

 Eine existentielle Frage
 wie die Zuwanderung
 muss in demokratischer
 Selbstbestimmung
 auf nationaler Ebene
 entschieden werden. 
 --> ('UMWELT', 0.18235809743927947)

 Die Regierungspolitiker in Bund und Ländern haben mit
 ihrer Flüchtlings-, Europa- und Corona-Politik die
 Prinzipien der deutschen Staatlichkeit, des Rechts und
 der Verfassung vielfach verletzt.
 Zugleich haben sich die Volksvertreter der etablierten
 Parteien den grundgesetzlich garantierten Parlamentsvorbehalt für alle wichtigen Entscheidungen im Staat
 ohne Widerstand nehmen lassen.
 Die Bundesregierung kommt ihrer Pflicht, Vertragsbrüchen und Selbstermächtigungen durch
 EU-Institutionen entgegenzutreten, nicht nach. Einzelne
 rechts- und verfassungswidrige Maßnahmen wurden
 zwar durch mutige Richter in Hunderten von Urteilen zu
 Fall gebracht. In unserem Land hat sich aber eine
 politische Klasse herausgebildet, deren vordringliches
 Interesse ihrer Macht, ihrem Status und ihrem
 materiellen Wohlergehen gilt. D