Import libraries

In [63]:
import importlib
import pickle
import re
import ssl
from datetime import datetime

import datasets
from datasets import load_dataset
from datasets import Dataset
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import requests
import seaborn as sns
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from umap import UMAP

import functions





In [66]:
# Reload the local helper module BEFORE star-importing from it. With the opposite
# order the names bound into the notebook keep pointing at the pre-reload objects,
# so edits to functions.py would not take effect.
importlib.reload(functions)
from functions import *

# Fetch the NLTK resources used for tokenising / lemmatising (same order as before).
for _resource in (
    'punkt_tab',
    'wordnet',
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
):
    nltk.download(_resource)

parties = ["CDU", "SPD", "FDP", "AFD", "LEFT", "GREENS"]
# NOTE(review): absolute, machine-specific path; it is later used as the target of
# topic_model.save(). Consider making it relative or configurable.
file_path = "C:/Users/Jacob/OneDrive/uni/MA WiSoz/Semester III/Computational Social Sciences/foPra/data/"

# Read the DeepL API key from a local file. Surrounding whitespace (typically a
# trailing newline added by editors) is stripped so it cannot leak into requests.
with open("API Keys/DeepL.txt", 'r', encoding='utf-8') as file:
    api_key_deepl = file.read().strip()

url = 'https://api-free.deepl.com/v2/translate'

# Load the manifesto DataFrame and previously saved topics from pickle files.
# A forward slash replaces the original backslash: '\d' is an invalid escape
# sequence, and '/' works as a path separator on Windows as well as POSIX.
# NOTE: unpickling executes code from the file; only load pickles you created.
df_manifesto = pd.read_pickle('data/df_manifesto.pkl')
# `topics` is reassigned later by topic_model.fit_transform(...).
topics = pd.read_pickle('topics.pkl')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Preprocess text documents

In [67]:
# WordNet lemmatizer instance; it is not referenced by the other cells shown in
# this notebook, so it is presumably used interactively — TODO confirm.
lemmatizer = WordNetLemmatizer()

Define the embedding, representation, clustering, and dimensionality-reduction models
    

In [74]:
# Candidate embedding-model identifiers, kept as plain strings.
# Only `embedding_model_multilingual2` is passed to BERTopic (constructor and save)
# in the cells shown here; the others are alternatives that are currently unused.
embedding_model_en = "all-MiniLM-L6-v2"
embedding_model_multilingual1 = "paraphrase-multilingual-mpnet-base-v2"
embedding_model_multilingual2 = "distiluse-base-multilingual-cased-v1"
embedding_model_manifestoberta = "manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2024-1-1"

In [70]:
# Topic-representation model; passed to BERTopic as `representation_model`.
representation_model = KeyBERTInspired()

# Fixed-size (30 clusters) alternative clusterer. It is not passed to BERTopic in
# the cells shown here. Seeded so that reruns produce the same partition, in line
# with the fixed random_state=42 used for UMAP.
cluster_model = KMeans(n_clusters=30, random_state=42)

In [79]:
# UMAP configuration handed to BERTopic as `umap_model`; the fixed random_state
# keeps the projection reproducible across runs.
umap_model = UMAP(
    n_components=5,
    n_neighbors=40,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
)

In [80]:
# HDBSCAN configuration. In the cells shown here it is not passed to BERTopic
# (the corresponding constructor argument is commented out).
# `min_samples` is deliberately left unset; a value of 10 was tried previously.
hdbscan_model = HDBSCAN(
    prediction_data=True,
    metric="euclidean",
    min_cluster_size=35,
)



In [73]:
# Seed words passed to BERTopic as `seed_topic_list`: a single seed topic made of
# German terms around public spending, budgets and government debt.
seed_topic_list = [
    [
        "steuergeld", "ausgaben", "staatsausgaben", "staatshaushalt",
        "schulden", "schuldenbremse", "staatsschulden",
        "defizit", "haushalt", "haushaltsdefizit", "haushaltsausgleich",
        "schwarze null",
        "schuldenabbau", "schuldenpolitik", "schuldenkrise", "schuldenlast",
        "schuldenstand", "schuldenquote", "schuldenpolitisch",
        "ausgaben erhöhen", "ausgaben kürzen",
    ],
]

Initialize and train the BERTopic model


In [81]:

# Make sure the 'text' column holds only strings. Missing values are replaced
# FIRST and the cast happens afterwards: casting first turns NaN into the literal
# string "nan", which fillna can no longer detect, so "nan" would be modelled as
# a real document.
df_manifesto["text"] = df_manifesto["text"].fillna("").astype(str)

# Configure BERTopic with the multilingual embedding model, the custom UMAP
# reducer, the KeyBERT-inspired representation and the seed topic list.
# No `hdbscan_model` is passed, so BERTopic's own clustering step is used together
# with min_topic_size=20. `top_n_words` is left at the library default.
topic_model = BERTopic(
    language="multilingual",
    n_gram_range=(1, 2),
    min_topic_size=20,
    representation_model=representation_model,
    embedding_model=embedding_model_multilingual2,
    umap_model=umap_model,
    seed_topic_list=seed_topic_list,
)

# Fit the model and get one topic id (and probability output) per document.
# This overwrites any `topics` value loaded earlier from topics.pkl.
topics, probs = topic_model.fit_transform(df_manifesto["text"])



In [11]:
# Store the topic id of each document as a new 'topic' column.
# Assumes `topics` has one entry per row of df_manifesto, in the same order —
# TODO confirm; `topics` is either loaded from topics.pkl or returned by
# fit_transform, depending on which cell ran last.
df_manifesto["topic"] = topics

In [12]:
# Persist the current df_manifesto DataFrame to 'Manifesto_final.pkl' in the working directory.
df_manifesto.to_pickle('Manifesto_final.pkl')

In [88]:
# Show the (term, score) pairs that represent topic 83.
# NOTE(review): the recorded output consists of number bigrams such as '171 172',
# which suggests that numeric tokens (possibly page or line numbers) are still
# present in the text — TODO confirm and consider filtering them in preprocessing.
topic_model.get_topic(83)

[('171 172', 0.47052616),
 ('172 173', 0.46464276),
 ('177 178', 0.46311623),
 ('181 182', 0.45728236),
 ('173 174', 0.45439306),
 ('176 177', 0.45394817),
 ('182 183', 0.4422307),
 ('178 179', 0.44135106),
 ('188 189', 0.43885046),
 ('186 187', 0.43734956)]

In [86]:
# Search the fitted topics for the query string; the result is a pair
# (topic ids, similarity scores) with ten entries each.
# NOTE(review): the query "related" looks like a placeholder — TODO confirm.
related_terms = topic_model.find_topics("related", top_n=10)
print(related_terms)

# Kept as the final expression so the notebook actually renders topic 80. As a
# bare statement at the top of the cell its result was silently discarded.
topic_model.get_topic(80)

([83, 131, 156, 128, 102, 160, 152, 94, 142, 96], [0.18113098, 0.13463318, 0.10732683, 0.08718705, 0.086810865, 0.076846965, 0.07013552, 0.059989497, 0.03488005, 0.031079182])


In [None]:
# Number of (term, score) entries stored for topic 25.
# Assumes topic 25 exists in the fitted model — TODO confirm.
len(topic_model.get_topic(25))

In [None]:

# Build the `topics_over_time` object from the manifesto texts and their dates.
topics_over_time = topic_model.topics_over_time(
    df_manifesto["text"],
    df_manifesto["date"],
)


In [None]:
# Persist the topics_over_time result to disk as a pickle file.
with open("topics_over_time.pkl", mode="wb") as f:
    f.write(pickle.dumps(topics_over_time))

In [38]:

# Save the fitted model using safetensors serialization, including the c-TF-IDF
# data and the name of the embedding model used for fitting. The target is
# `file_path`, i.e. the data directory set in the configuration cell.
# Alternatives that were considered: serialization="pytorch", or a plain pickle
# via topic_model.save("bertopic_model.pkl").
topic_model.save(
    file_path,
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=embedding_model_multilingual2,
)



In [None]:
# NOTE(review): `model` and `logits` are not defined in any cell shown in this
# notebook, so this cell raises NameError on a fresh kernel unless they are
# created elsewhere — TODO confirm. An identical cell follows directly below.
# Look up the label name for the index of the largest logit and print it.
predicted_class = model.config.id2label[logits.argmax().item()]
print(predicted_class)
# Example label: 201 - Freedom and Human Rights
# (presumably the expected output for a sample input — TODO confirm)


In [14]:
# NOTE(review): `model` and `logits` are not defined in any cell shown in this
# notebook; this cell depends on state created elsewhere — TODO confirm.
# Look up the label name for the index of the largest logit and print it.
predicted_class = model.config.id2label[logits.argmax().item()]
print(predicted_class)
# Example label: 501 - Environmental Protection: Positive
# NOTE(review): the recorded output of this cell is "414 - Economic Orthodoxy",
# so the label above does not describe the last run — presumably a copied example.

414 - Economic Orthodoxy


In [39]:
# Persist the topic assignments and the probabilities, one pickle file each
# (topics first, then probabilities).
for _filename, _payload in (('topics.pkl', topics), ('probs.pkl', probs)):
    with open(_filename, 'wb') as f:
        pickle.dump(_payload, f)