In [1]:
import re
import spacy
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

As always, we start by loading the spaCy model; we need it to compute a vector for the content of each speech.

In [2]:
# German spaCy pipeline; `vectorize` uses it to turn each speech into a vector.
nlp = spacy.load(name='de_core_news_lg')

This function first cleans up the text, then passes it to the nlp model and returns the vector of the text.

In [3]:
def vectorize(text, nlp_model=None):
    """Clean a raw speech text and return its spaCy document vector.

    Cleaning steps, in order:
      * re-join words that were hyphenated across a line or paragraph break,
      * turn the remaining paragraph and line breaks into single spaces,
      * delete non-breaking spaces and en dashes,
      * replace contribution placeholders such as ``({0})`` or ``({12})``
        (any number of digits) with a space,
      * collapse every run of spaces into a single space.

    Args:
        text: Raw speech content as a string.
        nlp_model: Optional callable used instead of the notebook-level ``nlp``
            pipeline. It is called as ``nlp_model(text, disable=[...])`` and
            must return an object exposing a ``.vector`` attribute.

    Returns:
        The ``.vector`` attribute of the processed document.
    """
    if nlp_model is None:
        # Looked up at call time so the function can be defined before `nlp` is loaded.
        nlp_model = nlp
    text = text.replace('-\n\n', '').replace('\n\n', ' ').replace('-\n', '')
    text = text.replace('\n', ' ').replace('\xa0', '').replace('–', '')
    # `\d+` (not `\d`) so placeholders with two or more digits are removed as well.
    text = re.sub(r'\(\{\d+\}\)', ' ', text)
    # A single str.replace pass leaves double spaces behind for runs of 3+ spaces.
    text = re.sub(r' {2,}', ' ', text)
    # Get the SpaCy vector -- turning off other processing to speed things up
    return nlp_model(text, disable=['parser', 'tagger', 'ner']).vector

Then, we load the contributions table, one-hot encode the contribution types, and merge the result with the speeches table.

In [2]:
contri_df = pd.read_feather('../contributions.feather')
# Keep each (speech, contribution type) pair once, then one-hot encode the type.
contri_encoded = (
    contri_df[['speechId', 'type']]
    .drop_duplicates()
    .pipe(pd.get_dummies)
)
# Collapse to a single row of indicator sums per speech.
contri_encoded_summed = contri_encoded.groupby(by='speechId').sum()

In [3]:
speeches = pd.read_feather('../speeches_2020.feather')
# Attach the per-speech contribution indicators to each speech's id and text.
merged = speeches[['id', 'speechContent']].merge(
    contri_encoded_summed,
    how='inner',
    left_on='id',
    right_on='speechId',
)

In [4]:
# Show the column names of the merged frame.
merged.columns

Index(['id', 'speechContent', 'type_Beifall', 'type_Heiterkeit', 'type_Lachen',
       'type_Personen-Einruf', 'type_Unruhe', 'type_Widerspruch', 'type_Zuruf',
       'type_Zustimmung'],
      dtype='object')

In [5]:
# The speech id was only the join key for the merge. `errors='ignore'` keeps this
# cell idempotent: re-running it after 'id' is already gone no longer raises KeyError.
merged = merged.drop(columns='id', errors='ignore')

We save the merged data to a CSV file with new column names.

In [7]:
# Header aliases are applied by position, so the order must match `merged.columns`.
csv_header = [
    'text',
    'Beifall',
    'Heiterkeit',
    'Lachen',
    'Personen_Einruf',
    'Unruhe',
    'Widerspruch',
    'Zuruf',
    'Zustimmung',
]
merged.to_csv('merged.csv', header=csv_header, index=False)

Then we load the saved CSV and display the merged data, which contains the speech content (`text`) and the encoded contribution types.

In [8]:
# Read the saved file back; `m` is the frame the feature and label cells work from.
m = pd.read_csv(filepath_or_buffer='merged.csv')
m

Unnamed: 0,text,Beifall,Heiterkeit,Lachen,Personen_Einruf,Unruhe,Widerspruch,Zuruf,Zustimmung
0,"Vielen Dank, Herr Bundesminister. – Ich bedank...",0,0,0,1,0,0,1,0
1,Danke sehr. – Jetzt möchte dazu der Kollege Ka...,0,0,0,1,0,0,0,0
2,Er wirkt so verwirrt.({0}),0,1,0,0,0,0,0,0
3,Als nächster Redner hat das Wort der Kollege D...,1,0,0,0,0,0,0,0
4,Als nächster Redner erhält das Wort der Kolleg...,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6158,Sehr geehrter Herr Präsident! Meine Damen und ...,1,0,0,1,0,0,1,0
6159,Sehr geehrter Herr Präsident! Sehr geehrte Abg...,1,0,0,1,0,0,0,0
6160,Sehr geehrter Herr Präsident! Liebe Kolleginne...,1,0,0,1,0,0,0,0
6161,Sehr geehrter Herr Präsident! Liebe Kolleginne...,1,0,0,0,0,0,0,0


Then we create our input features by vectorizing the text field.

In [21]:
# When spaCy runs on the GPU, `Doc.vector` is a CuPy array and needs `.get()` to be
# copied to host memory; on the CPU it is a NumPy array, which has no `.get()`.
# Handle both so this cell also runs on a CPU-only kernel.
X = np.stack([
    vec.get() if hasattr(vec, 'get') else vec
    for vec in map(vectorize, m['text'])
])

In [7]:
# One row per text in `m`, one column per vector component.
X.shape

(6163, 300)

We drop the text field; the remaining fields are the output labels.

In [8]:
# Every column except the raw text is a target.
y = m.drop('text', axis='columns')

We split the dataset to create training and test sets.

In [23]:
# NOTE(review): test_size=0.83 holds out 83% of the rows and trains on only 17% --
# confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.83,
    random_state=1,
)

We then train our MLP classifier with the training set.

In [41]:
# Two hidden layers (1024 and 128 units); fixed seed so the fit is repeatable.
clf = MLPClassifier(
    hidden_layer_sizes=(1024, 128),
    max_iter=500,
    random_state=1,
).fit(X_train, y_train)

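Finally, we calculate the accuracy, which is rather low. Note that for multi-label targets like ours, `accuracy_score` reports subset accuracy: a prediction only counts as correct if all eight labels match.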

In [38]:
# Score the held-out predictions against the true label rows.
y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.4161454261141517