In [9]:
import ssl
import re
import pandas as pd

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from umap import UMAP

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split


import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob
from textblob_de import TextBlobDE

import pickle
import requests

import importlib
import functions



import torch
import datasets
from datasets import load_dataset
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [15]:
# Load the two DataFrames from pickle files (paths are relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code - only load .pkl files from a trusted source.
df_manifesto = pd.read_pickle('data/df_manifesto_final.pkl') 
df_spendings = pd.read_pickle('data/df_spendings_final.pkl')
# df_no_label = pd.read_csv('data/Unlabeled.csv', sep=';')

In [10]:
# Score every entry of the "text" column with TextBlobDE's polarity and store the
# result in a new "sentiment" column.
df_spendings["sentiment"] = df_spendings["text"].apply(
    lambda text: TextBlobDE(text).sentiment.polarity
)

In [12]:
# Bind the sentiment column to its own name; the following cell assigns it back onto df_spendings.
sentiment = df_spendings["sentiment"]

In [22]:
# Re-attach the sentiment scores kept in `sentiment` (pandas aligns them on the index).
df_spendings['sentiment'] = sentiment

# Keep only rows whose sentiment score is non-zero and whose topic is not 57.
# NOTE(review): what topic 57 stands for is not documented here - confirm and give it a name.
df_spendings = df_spendings[(df_spendings['sentiment'] != 0) & (df_spendings['topic'] != 57)]

# errors='ignore' keeps this cell re-runnable: on a second run the column is already
# gone and a plain drop() would raise KeyError.
df_spendings = df_spendings.drop(columns=['description_md'], errors='ignore')

# Export the filtered frame as a semicolon-separated UTF-8 CSV, index included.
df_spendings.to_csv('data/df_spendings_final.csv', index=True, encoding='utf-8', sep=';')

In [8]:
# Draw 100 random rows for manual inspection (change the 100 to draw a different number);
# the fixed seed makes the draw reproducible across kernel restarts.
sample = df_spendings.sample(100, random_state=42)

# Move the key columns to the front and keep every other column in its existing order.
front_cols = ['text', 'cmp_code', 'label']
sample = sample[front_cols + [col for col in sample.columns if col not in front_cols]]

In [None]:
# Display the label column.
df_spendings['label']

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_spendings["text"].tolist(),
    df_spendings["label"].tolist(),
    random_state=42,
    test_size=0.2,
)

# Wrap both partitions as Hugging Face Datasets with "text" and "label" columns.
dataset = Dataset.from_dict(
    {
        "text": train_texts,
        "label": train_labels,
    }
)
val_dataset = Dataset.from_dict(
    {
        "text": val_texts,
        "label": val_labels,
    }
)


In [None]:
# Reduce the size of the training and validation datasets.
# NOTE(review): read top to bottom, `dataset` / `val_dataset` are still untokenized here, and a
# later cell assigns these same two names again from the tokenized datasets.
small_train_dataset = dataset.select(range(100))  # Use the first 100 samples
small_val_dataset = val_dataset.select(range(50))  # Use the first 50 samples

In [5]:
# Checkpoint that fine-tuning starts from. The tokenizer is loaded from the
# xlm-roberta-large checkpoint rather than from model_name.
model_name = "manifesto-project/manifestoberta-xlm-roberta-56policy-topics-context-2024-1-1"
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large", use_fast=True)

# The checkpoint's classifier has 56 outputs; asking for num_labels=2 therefore needs
# ignore_mismatched_sizes=True, and the 2-way classifier weights are newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    ignore_mismatched_sizes=True,
    num_labels=2,
)

# Record the class names and the language on the model config.
model.config.id2label = {0: "expansion", 1: "austerity"}
model.config.label2id = {"expansion": 0, "austerity": 1}
model.config.language = "german"

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize the "text" entries of a batch with truncation and padding enabled.

    Args:
        examples: Mapping with a "text" entry, as handed over by a batched
            ``Dataset.map`` call.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Convert the pandas DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df_spendings)

# Tokenize the whole dataset in batches.
# NOTE(review): the Trainer cells further down train on train_dataset / val_dataset;
# tokenized_dataset is only displayed later - confirm this full-frame tokenization is still needed.
tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at manifesto-project/manifestoberta-xlm-roberta-56policy-topics-context-2024-1-1 and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([56]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([56, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/16590 [00:00<?, ? examples/s]

In [None]:

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch, truncating over-long inputs.

    No padding is applied here. Padding every example to the tokenizer's maximum
    length makes each training step process mostly pad tokens, which is very slow
    on CPU. The Trainer cells in this notebook pass ``tokenizer=tokenizer``, so the
    Trainer pads each batch to its longest member instead.

    Args:
        examples: Mapping with a "text" entry, as handed over by a batched
            ``Dataset.map`` call.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts
        (unpadded, truncated encodings).
    """
    return tokenizer(examples["text"], truncation=True)

# Tokenize both splits in batches; val_dataset is rebound to its tokenized version.
train_dataset = dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

In [None]:
# Reduce the size of the (tokenized) training and validation datasets; these two
# subsets are what trainer_small is given further down.
small_train_dataset = train_dataset.select(range(100))  # Use the first 100 samples
small_val_dataset = val_dataset.select(range(50))  # Use the first 50 samples

In [None]:
# Training arguments used by both Trainer cells in this notebook.
training_args = TrainingArguments(
    # --- output / checkpoints ---
    # NOTE(review): absolute, machine-specific path - consider a project-relative directory.
    output_dir="C:/Users/Jacob/Desktop/Code/fopra/output_dir",
    overwrite_output_dir=True,  # reuse the output directory instead of piling up runs
    save_steps=1000,  # checkpoint every 1000 steps
    save_total_limit=2,  # keep only the 2 most recent checkpoints
    # --- evaluation ---
    evaluation_strategy="epoch",
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,  # small batches for CPU
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step per device
)

In [None]:
# Report whether CUDA is usable and which device that implies.
cuda_ok = torch.cuda.is_available()
print("GPU available:", cuda_ok)
print("Using device:", torch.device("cuda" if cuda_ok else "cpu"))

In [None]:
# Alternative training arguments that evaluate and checkpoint once per epoch.
# NOTE(review): no Trainer cell visible in this notebook uses training_args_split.
training_args_split = TrainingArguments(
    # --- output / logging ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # --- evaluation / checkpoints ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # --- optimisation ---
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # no_cuda=True,  # Force training on CPU
)


Optimal configuration: the model works best with the training arguments below.

In [None]:
# Training arguments with step-based evaluation and best-model selection.
# NOTE(review): no Trainer cell visible in this notebook uses training_args_opt.
training_args_opt = TrainingArguments(
    # --- output / checkpoints ---
    # NOTE(review): absolute, machine-specific path - consider a project-relative directory.
    output_dir="C:/Users/Jacob/Desktop/Code/fopra/output_dir",
    overwrite_output_dir=True,
    save_steps=500,  # checkpoint every 500 steps (same cadence as evaluation)
    save_total_limit=2,  # keep only the 2 most recent checkpoints
    # --- evaluation ---
    evaluation_strategy="steps",
    eval_steps=500,  # evaluate every 500 steps
    # --- best-model selection ---
    # NOTE(review): selecting on "accuracy" presumably requires the Trainer to be given a
    # compute_metrics function that reports it; the Trainer cells here pass none - confirm.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    # --- hardware ---
    no_cuda=True,  # force training on CPU
)

In [None]:
# Trainer for a quick run on the 100-sample / 50-sample subsets.
# NOTE(review): this trains the shared `model` object in place, so the full Trainer
# further down starts from whatever weights this run leaves behind - confirm intended.
trainer_small = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
)

# Run the fine-tuning on the small subsets.
trainer_small.train()

In [None]:
# Display the tokenized full dataset built from df_spendings.
tokenized_dataset

In [None]:
# Trainer for the full tokenized train / validation split.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the fine-tuning on the full training set.
trainer.train()


In [None]:
# Evaluate with the full Trainer (its eval_dataset is val_dataset) and display the result.
trainer.evaluate()


In [None]:
def predict_policy_stance(text):
    """Classify a single text as "expansion" or "austerity" with the notebook's model.

    Uses the notebook-level ``tokenizer`` and ``model``. The model is switched to
    evaluation mode (and left in it) so dropout cannot make predictions vary between
    calls, and the forward pass runs without gradient tracking.

    Args:
        text: One string to classify.

    Returns:
        "expansion" if the highest-scoring class index is 0, otherwise "austerity".
    """
    # After training the model may still be in train mode; inference needs eval mode.
    model.eval()

    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # The model may sit on a GPU after training; inputs must be on the same device.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # No gradients are needed for prediction.
    with torch.no_grad():
        logits = model(**encoded).logits

    predicted_class = torch.argmax(logits, dim=-1).item()
    # Same mapping as the id2label set on the model config: 0 -> expansion, 1 -> austerity.
    return "expansion" if predicted_class == 0 else "austerity"

# Example predictions on two German sentences:
#   1. "we want to abolish the debt brake."
#   2. "we want to spend more money on the welfare state"
new_sentences = [
    "wir wollen die Schuldenbremse abschaffen.",
    "wir wollen mehr geld für den sozialstaat ausgeben"
]

# Print each sentence next to the stance predicted for it.
for sentence in new_sentences:
    print(f"'{sentence}' → {predict_policy_stance(sentence)}")
