In [62]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt

import plotly
import plotly.express as px

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

import stanza



In [9]:
stanza.download('de')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-07-10 13:26:24 INFO: Downloaded file to C:\Users\sarah\stanza_resources\resources.json
2025-07-10 13:26:24 INFO: Downloading default packages for language: de (German) ...


Downloading https://huggingface.co/stanfordnlp/stanza-de/resolve/v1.10.0/models/default.zip:   0%|          | …

2025-07-10 13:28:07 INFO: Downloaded file to C:\Users\sarah\stanza_resources\de\default.zip
2025-07-10 13:28:19 INFO: Finished downloading models and saved to C:\Users\sarah\stanza_resources


In [7]:
#!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 1.1/1.1 MB 6.8 MB/s eta 0:00:00
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ---------------------------------------- 590.6/590.6 kB 7.1 MB/s eta 0:00:00
Installing collected packages: emoji, stanza
Successfully installed emoji-2.14.1 stanza-1.10.1


In [1]:
data_file= 'data/dataset.pkl'

In [2]:
#dictionary to map parties to num for tensors
party_dict={'SPÖ':0,'ÖVP':1,'FPÖ':2,'Grüne':3,'LIF':4,'BZÖ':5,'NEOS':6,'STRONACH':7,'PILZ':8,'independent':9}

In [48]:
'''
takes data set and filters out independet speeches to a seperate list,
takes other speeches as list and
according party-labels are mapped to number as a list
'''
def load_data(data_file):
    df = pd.read_pickle(data_file)
    independent_df= df[df['party']=='independent']
    df = df[~(df['party']=='independent')]
    #for testing I am using only part of data
    df= df[2000:3000]
    texts = df['text'].tolist()
    df['party'] = df['party'].map(party_dict)
    labels = df['party'].tolist()
    test_texts= independent_df['text'].tolist()
    mps = df['speaker'].tolist()
    print(df.value_counts('party'))
    return texts, labels, test_texts,mps

In [49]:
texts,labels,test_texts,mps = load_data(data_file)

party
1    329
0    281
2    194
3    188
4      8
dtype: int64


In [50]:
# lemmatize data for tf-idf

nlp = stanza.Pipeline('de')

In [51]:
class TextLemmatizer:
    def __init__(self, max_tokens):
        self.max_tokens=max_tokens
        #initialize pipeline
        self.nlp = stanza.Pipeline('de', processors='tokenize,mwt,pos,lemma',verbose=False)
    def lemmatize(self,text):
        "Tokenizes texts and returns a list of tokens"
        tokens = text.split()
        truncated_text = " ".join(tokens[self.max_tokens:])
        "lemmatize truncated text"
        doc = self.nlp(truncated_text)
        lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
        return " ".join(lemmas)
    def lemmatize_batch(self,texts):
        return [self.lemmatize(text) for text in texts]


#truncated_texts = [" ".join(text.split()[:max_length]) for text in texts]

#lemmatized = [
#    " ".join(word.lemma for sent in nlp(truncated_text).sentences for word in sent.words)
#    for text in truncated_texts
#]

In [52]:
max_length=128

In [53]:
lemmatized_texts = [lemmatizer.lemmatize(text) for text in texts]

In [56]:
df = pd.DataFrame()
df['texts'] = texts
df['party'] = labels
df['lemmatized_texts']= lemmatized_texts

In [93]:
df.to_pickle('data\lemmatized.texts.pkl')

In [90]:
print(lemmatized_texts)

['Dame und Herr ! wir sichern mit der Konjunkturprogramm der Arbeitsplatz , wir sichern darüber hinaus der richtig Wirtschaftsstruktur , der es wir ermöglichen , dann , wenn der Aufschwung kommen , an der Aufschwung zu partizipieren , und laufen nicht Gefahr , der tun zu müssen , was sie , Herr Dr. Wittmann , in solcher Fall|Fällen immer vorschlagen : da müssen wir eigentlich ein Verschuldungspaket und ein Infrastrukturpaket schnüren – mit viel ausgeliehen Geld ! – der wollen wir nicht , kein Rückfall in der alt Verschuldungspolitik ! da unterscheiden wir wir ganz , ganz gravierend von Sie . – zu der " Wert schaffen " kommen wir noch ! Schau sie sich einmal ihr Vorgangsweise in Sache Statistik an , und dann reden sie bitten weiter ! sie kommen sicherlich noch zu Wort . Zweite Aspekt : wir haben in der Rahmen der Konjunkturprogramm auch ein Forschung - und Technologiepaket schnüren . dieser Paket sein aus zweierlei Grund|Gründen so positiv : erstens werden mit der Prämie auch der Invest

In [79]:
X,y= lemmatized_texts, labels

In [80]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2,random_state=42)

In [82]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_features=max_length, norm='l2')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [64]:
model = LogisticRegression(random_state=42, max_iter=1000)

In [83]:
#train
model.fit(X_train, y_train)

In [84]:
y_pred = model.predict(X_test)

In [88]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.45      0.50        64
           1       0.46      0.81      0.58        63
           2       0.70      0.17      0.27        41
           3       0.50      0.46      0.48        28
           4       0.00      0.00      0.00         4

    accuracy                           0.50       200
   macro avg       0.44      0.38      0.37       200
weighted avg       0.54      0.50      0.47       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [89]:
from sklearn.metrics import log_loss

y_pred_proba = model.predict_proba(X_test)
loss = log_loss(y_test, y_pred_proba)
print("Log Loss:", loss)


Log Loss: 1.2724017057398596
