In [102]:
from enum import Enum
from transformers import BertTokenizer, TFBertModel

import re
import spacy
import numpy as np
from sklearn import metrics
import pandas
import matplotlib as plt
import seaborn as sns

In [103]:
class Party(Enum):
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5

In [104]:
nlp = spacy.load('de_core_news_md')
from nltk.corpus import stopwords
from spacy.lang.de.stop_words import STOP_WORDS

# stopwords
nltk_stopwords = stopwords.words('german')

# build stopwords list
all_stopwords = list(set(STOP_WORDS) | set(nltk_stopwords))
with open('custom_stopwords.txt', 'r', encoding='utf-8') as f:
    all_stopwords += [line.strip() for line in f.readlines()]

# Load files
party_text = {}
for party in Party:
    all_stopwords.extend(['{}'.format(party.name.lower())])
    with open('resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        file = " ".join(l for l in txt)
        # remove gender *
        file = re.sub(r'\*innen(\w*)\s', r'\1 ', file)
    party_text[party] = file

In [105]:
topic_clusters = {}

topic_clusters["JUGEND_UND_FAMILIE"] = ["Kinder", "Jugend", "Familie", "Senioren", "Eltern", "Erziehung"]
topic_clusters["BILDUNG"] = ["Schule", "Gymnasium", "Realschule", "Grundschule", "Mittelschule", "Lehrer", "Universität", "Lehre", "Gesamtschule", "Bildung"]
topic_clusters["SPORT_UND_FREIZEIT"] = ["Sport", "Freizeit", "Aktivität", "Fitness"]
topic_clusters["WIRTSCHAFT"] = ["Wirtschaft", "Ökonomie", "Geld", "Inflation", "Preis", "Währung", "Aktien", "Fonds", "Börse", "Unternehmen", "Gehalt", "Kredit"]
topic_clusters["GESUNDHEIT"] = ["Gesundheit", "Krankenhaus", "Arzt", "Doktor", "Medizin", "Versorgung", "Corona", "Pflege", "Intensiv", ""]
topic_clusters["ARBEIT_UND_SOZIALES"] = ["Arbeit", "Gehalt", "Arbeitsstelle", "Arbeitgeber", "Arbeitnehmen"]
topic_clusters["INNERE_SICHERHEIT_UND_DATENSCHUTZ"] = ["Datenschutz"]
topic_clusters["ZUWANDERUNG_UND_INTEGRATION"] = ["Diversität", "Flüchtlinge", "Asyl", "Migrant", "Immigrant", "Toleranz", "Zuwanderung", "Integration"]
#topic_clusters["AUSSENPOLITIK"] = []
topic_clusters["UMWELT"] = ["Umwelt", "Klima", "Klimaschutz", "Ökologie", "CO2", "Landwirtschaft", "Klimakrise", "Treibhaus", "Emissionen"]
topic_clusters["VERKEHR"] = ["Auto", "Straße", "Flug", "Schiff", "Reise", "Verkehr", "Stau", "Autobahn", "Landstraße", "Bundesstraße"]
topic_clusters["ENERGIE"] = ["Strom", "Solar", "Windkraft", "Wasserkraft", "Kohlekraft", "Atomkraft", "Kernenergie", "Energiewende", "erneuerbar", "Öl", "Gas"]
#topic_clusters["VERBRAUCHERSCHUTZ"] = [""]

In [106]:
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")
model = TFBertModel.from_pretrained("dbmdz/bert-base-german-uncased")

Some layers from the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-german-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [107]:
def convert_to_embeddings(cluster):
    idx = tokenizer.encode(cluster)
    idx = np.array(idx)[None,:]
    embedding = model(idx)
    tensor = np.array(embedding[0][0][1:-1])
    return tensor

In [108]:
def split_in_512_chunks(text):
    chunks = []
    tokens = text.split(' ')
    for i in range(0, len(tokens), 256):
        chunks.append(" ".join(tokens[i: min(i+255, len(tokens) - 1)]))
    return chunks

In [109]:
mean_vectors = [convert_to_embeddings(txt).mean(0) for txt in split_in_512_chunks(party_text[Party.FDP])[:3]]

feature_matrix = np.array(mean_vectors)

topic_cluster_embeddings = { t: convert_to_embeddings(w) for t, w in topic_clusters.items() }

In [110]:
similarities = np.array([metrics.pairwise.cosine_similarity(feature_matrix, y).T.tolist()[0]
                         for y in topic_cluster_embeddings.values()])

topics = list(topic_cluster_embeddings.keys())
for i in range(len(similarities)):
    if sum(similarities[i]) == 0:
       similarities[i] = [0]*len(topics)
       similarities[i][np.random.choice(range(len(topics)))] = 1
    similarities[i] = similarities[i] / sum(similarities[i])

predicted_prob = similarities
predicted = [topics[np.argmax(pred)] for pred in predicted_prob]

[[0.32906326 0.32893802 0.34199872]
 [0.34191655 0.33127416 0.3268093 ]
 [0.33173463 0.32768345 0.34058191]
 [0.34235638 0.33067732 0.32696629]
 [0.34196647 0.33004942 0.32798411]
 [0.32981197 0.32843035 0.34175768]
 [0.34397971 0.33197895 0.32404134]
 [0.32801349 0.32972886 0.34225765]
 [0.32744162 0.32993859 0.34261979]
 [0.34191655 0.33127416 0.3268093 ]
 [0.34278387 0.33027306 0.32694306]]


In [111]:
classes = np.unique(topics)
y_test_array = pandas.get_dummies(topics, drop_first=False).values
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(topics, predicted)
auc = metrics.roc_auc_score(topics, predicted_prob,
                            multi_class="ovr")
print("Accuracy:",  round(accuracy,2))
print("Auc:", round(auc,2))
print("Detail:")
print(metrics.classification_report(topics, predicted))
    ## Plot confusion matrix
cm = metrics.confusion_matrix(topics, predicted)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
            cbar=False)
ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
       yticklabels=classes, title="Confusion matrix")
plt.yticks(rotation=0)
fig, ax = plt.subplots(nrows=1, ncols=2)
## Plot roc
for i in range(len(classes)):
    fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],
                           predicted_prob[:,i])
    ax[0].plot(fpr, tpr, lw=3,
              label='{0} (area={1:0.2f})'.format(classes[i],
                              metrics.auc(fpr, tpr))
               )
ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
          xlabel='False Positive Rate',
          ylabel="True Positive Rate (Recall)",
          title="Receiver operating characteristic")
ax[0].legend(loc="lower right")
ax[0].grid(True)
    ## Plot precision-recall curve
for i in range(len(classes)):
    precision, recall, thresholds = metrics.precision_recall_curve(
                 y_test_array[:,i], predicted_prob[:,i])
    ax[1].plot(recall, precision, lw=3,
               label='{0} (area={1:0.2f})'.format(classes[i],
                                  metrics.auc(recall, precision))
              )
ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
          ylabel="Precision", title="Precision-Recall curve")
ax[1].legend(loc="best")
ax[1].grid(True)
plt.show()

ValueError: Number of classes in y_true not equal to the number of columns in 'y_score'