In [None]:
!pip install -q transformers datasets optuna scikit-learn matplotlib torch

In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, random_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    Trainer,
    TrainingArguments,
    TrainerCallback,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)
import optuna

In [None]:
class SentimentDataset(Dataset):
    """
    Map-style PyTorch dataset that pairs tokenizer output with class labels.

    Attributes:
        encodings (dict): Dict-like mapping from feature name to a per-sample
            sequence of values (indexed by sample position).
        labels (list): One label per sample, aligned with ``encodings`` by index.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """
        Reports how many samples the dataset holds.

        Returns:
            int: Length of the label list.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Builds the tensor dictionary for one sample.

        Args:
            idx (int): Position of the requested sample.

        Returns:
            dict: One tensor per encoding feature plus a ``'labels'`` tensor.
        """
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        # The label is added last under the key the Trainer passes to the model.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


class EarlyStoppingCallbackCustom(TrainerCallback):
    """
    Trainer callback that halts training once the evaluation loss stops improving.

    Attributes:
        patience (int): Number of consecutive non-improving evaluations tolerated.
        best_loss (float | None): Lowest ``eval_loss`` seen so far; ``None`` until
            the first evaluation that reports one.
        epochs_no_improve (int): Evaluations since ``best_loss`` was last lowered.
    """
    def __init__(self, patience=2):
        self.epochs_no_improve = 0
        self.best_loss = None
        self.patience = patience

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """
        Compares the latest ``eval_loss`` with the best one and updates ``control``.

        Args:
            args: Training arguments (unused).
            state: Trainer state (unused).
            control: Trainer control object whose flags are set here.
            metrics (dict): Evaluation metrics; ignored when missing or when it
                carries no ``eval_loss`` entry.
        """
        current_loss = metrics.get("eval_loss") if metrics is not None else None
        if current_loss is None:
            return

        is_improvement = self.best_loss is None or current_loss < self.best_loss
        if is_improvement:
            # New best: remember it, reset the counter and request a checkpoint.
            self.best_loss, self.epochs_no_improve = current_loss, 0
            control.should_save = True
            return

        self.epochs_no_improve += 1
        if self.epochs_no_improve < self.patience:
            return

        print(f"Validation loss has not improved for {self.patience} evaluations. Stopping training.")
        control.should_training_stop = True


class CustomTrainingArguments(TrainingArguments):
    """
    ``TrainingArguments`` extended with the name of the optimizer to build.

    Attributes:
        optimizer (str | None): Optimizer name stored on the instance;
            ``CustomTrainer.create_optimizer`` reads it.
    """
    def __init__(self, *args, optimizer=None, **kwargs):
        # Everything except ``optimizer`` goes to the stock constructor untouched.
        super().__init__(*args, **kwargs)
        # Set after the parent initialiser has run, as a plain instance attribute.
        self.optimizer = optimizer


class CustomTrainer(Trainer):
    """
    Trainer that builds its optimizer from ``args.optimizer``.

    Supported names are ``"adamw"`` (``torch.optim.AdamW``) and ``"adafactor"``
    (``transformers.optimization.Adafactor`` with a fixed learning rate). When the
    arguments carry no ``optimizer`` attribute, or it is ``None``, the stock
    ``Trainer.create_optimizer`` is used instead.
    """
    def create_optimizer(self):
        """
        Creates the optimizer selected by ``self.args.optimizer``.

        Returns:
            torch.optim.Optimizer: The optimizer stored in ``self.optimizer``.

        Raises:
            ValueError: If an optimizer name other than ``"adamw"`` or
                ``"adafactor"`` is configured.
        """
        # Keep an optimizer that already exists (e.g. passed via ``optimizers=``),
        # as the stock implementation does.
        if self.optimizer is not None:
            return self.optimizer

        # getattr: plain TrainingArguments have no ``optimizer`` attribute.
        optimizer_name = getattr(self.args, "optimizer", None)
        if optimizer_name is None:
            super().create_optimizer()
            return self.optimizer

        optimizer_kwargs = {
            "lr": self.args.learning_rate,
            "weight_decay": self.args.weight_decay,
        }
        if optimizer_name == "adamw":
            optimizer_cls = torch.optim.AdamW
        elif optimizer_name == "adafactor":
            from transformers.optimization import Adafactor
            optimizer_cls = Adafactor
            # Adafactor only accepts an explicit ``lr`` when its own relative
            # step-size schedule and parameter scaling are switched off.
            optimizer_kwargs.update(scale_parameter=False, relative_step=False)
        else:
            raise ValueError(f"Unknown optimizer: {optimizer_name}")

        self.optimizer = optimizer_cls(self.model.parameters(), **optimizer_kwargs)
        return self.optimizer


class Maissen:
    """
    A class to handle model training with k-fold cross-validation and hyperparameter optimization.

    Attributes:
        model_names (list): List of model names to train.
        data_path (str): Path to the training data CSV file.
        save_base_dir (str): Directory to save models and results.
        use_drive (bool): Whether to use Google Drive for storage.
        hpo_n_trials (int): Number of trials for hyperparameter optimization.
        k_folds (int): Number of folds for cross-validation.
    """
    def __init__(self, model_names, use_drive=False, hpo_n_trials=17, k_folds=5):
        self.model_names = model_names
        self.use_drive = use_drive
        self.hpo_n_trials = hpo_n_trials
        self.k_folds = k_folds
        self.tokenizer = None
        self.model = None
        self.training_args = None
        self.trainer = None

        # Adjust paths if using Google Drive
        if self.use_drive:
            from google.colab import drive
            drive.mount('/content/drive')
            self.data_path = "/content/drive/MyDrive/MAS DataScience/CAS_ML/training_data.csv"
            self.save_base_dir = "/content/drive/MyDrive/MAS DataScience/CAS_ML/saved_models"
        else:
            self.data_path = "training_data.csv"
            self.save_base_dir = "./saved_models"

        os.makedirs(self.save_base_dir, exist_ok=True)

    def load_data(self):
        """
        Loads data from CSV and processes it into texts and labels.

        Returns:
            tuple: A tuple containing lists of texts and labels.
        """
        df = pd.read_csv(self.data_path)
        df = df[['relevant_sentence', 'label']]
        label_map = {'negativ': 0, 'neutral': 1, 'positiv': 2}
        df['label'] = df['label'].map(label_map)
        texts = df['relevant_sentence'].tolist()
        labels = df['label'].tolist()
        
        return texts, labels

    def prepare_dataset(self, texts, labels):
        """
        Tokenizes texts and creates a SentimentDataset.

        Args:
            texts (list): List of input texts.
            labels (list): List of labels.

        Returns:
            SentimentDataset: A dataset containing tokenized inputs and labels.
        """
        encodings = self.tokenizer(texts, truncation=True, padding=True, max_length=512)
        return SentimentDataset(encodings, labels)

    def initialize_model(self, model_name):
        """
        Loads the tokenizer and a three-class classification model by name.

        The name is remembered in ``self.model_name``; the tokenizer and model are
        stored in ``self.tokenizer`` and ``self.model``.

        Args:
            model_name (str): Name of the pretrained model.
        """
        # Remembered so later steps can re-create the same model.
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # The configuration requests three labels; ignore_mismatched_sizes lets a
        # checkpoint with a differently sized classification head load anyway.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=AutoConfig.from_pretrained(model_name, num_labels=3),
            ignore_mismatched_sizes=True,
        )

    def set_training_arguments(self, output_dir='./results', **kwargs):
        """
        Sets up training arguments for the Trainer and stores them on the instance.

        ``logging_dir='./logs'``, ``logging_steps=10`` and ``disable_tqdm=True``
        are applied as defaults; a caller may override any of them via ``kwargs``.

        Args:
            output_dir (str): Directory to save training outputs.
            **kwargs: Additional keyword arguments for CustomTrainingArguments.
        """
        defaults = {
            'logging_dir': './logs',
            'logging_steps': 10,
            'disable_tqdm': True,
        }
        # Merge instead of passing both: a caller-supplied 'logging_steps' etc.
        # would otherwise fail with a duplicate-keyword TypeError.
        merged = {**defaults, **kwargs}
        self.training_args = CustomTrainingArguments(output_dir=output_dir, **merged)

    def perform_hyperparameter_search(self, texts, labels):
        """
        Performs hyperparameter optimization using k-fold cross-validation.

        Each Optuna trial samples one hyperparameter set and trains a freshly
        loaded model on every fold. The trial score is the mean validation loss
        over the folds, which Optuna minimises.

        Args:
            texts (list): List of input texts.
            labels (list): List of labels.

        Returns:
            dict: The best hyperparameters found during optimization.

        Raises:
            ValueError: If ``initialize_model`` has not been called yet.
        """
        import gc

        model_name = getattr(self, "model_name", None)
        if model_name is None or getattr(self, "tokenizer", None) is None:
            raise ValueError("Model has not been initialized. Please call initialize_model first.")

        # The fold assignment is fixed by random_state, so the folds are split and
        # tokenized once here instead of once per trial.
        skf = StratifiedKFold(n_splits=self.k_folds, shuffle=True, random_state=42)
        fold_datasets = []
        for train_index, val_index in skf.split(texts, labels):
            train_dataset = self.prepare_dataset(
                [texts[i] for i in train_index], [labels[i] for i in train_index]
            )
            val_dataset = self.prepare_dataset(
                [texts[i] for i in val_index], [labels[i] for i in val_index]
            )
            fold_datasets.append((train_dataset, val_dataset))

        def objective(trial):
            # Suggest hyperparameters
            learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
            per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
            num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 5, 10])
            weight_decay = trial.suggest_categorical("weight_decay", [0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
            warmup_steps = trial.suggest_int("warmup_steps", 0, 300)
            optimizer_name = trial.suggest_categorical("optimizer", ["adamw", "adafactor"])

            val_losses = []

            for fold, (train_dataset, val_dataset) in enumerate(fold_datasets, start=1):
                print(f"\n--- Hyperparameter Tuning Fold {fold}/{self.k_folds} ---")

                # A new model per fold: reusing one instance would carry weights
                # trained on earlier folds into later ones, so the validation
                # samples of a fold would already have been seen during training.
                model_config = AutoConfig.from_pretrained(model_name, num_labels=3)
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_name, config=model_config, ignore_mismatched_sizes=True
                )

                training_args = CustomTrainingArguments(
                    output_dir=f'./results/trial_{trial.number}_fold_{fold}',
                    eval_strategy='epoch',
                    save_strategy='no',
                    per_device_train_batch_size=per_device_train_batch_size,
                    num_train_epochs=num_train_epochs,
                    learning_rate=learning_rate,
                    weight_decay=weight_decay,
                    warmup_steps=warmup_steps,
                    optimizer=optimizer_name,
                    logging_steps=10,
                    disable_tqdm=True
                )

                trainer = CustomTrainer(
                    model=model,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=val_dataset,
                    compute_metrics=self.compute_metrics
                )

                try:
                    trainer.train()

                    # Evaluate on validation set
                    eval_metrics = trainer.evaluate(eval_dataset=val_dataset)
                    val_losses.append(eval_metrics["eval_loss"])
                finally:
                    # Release the fold's model and trainer even when training fails.
                    del trainer
                    del model
                    gc.collect()
                    torch.cuda.empty_cache()

            return float(np.mean(val_losses))

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=self.hpo_n_trials)

        # Get best hyperparameters
        return study.best_trial.params


    def compute_metrics(self, eval_pred):
        """
        Turns logits and reference labels into classification metrics.

        Args:
            eval_pred (tuple): Pair of ``(logits, labels)``.

        Returns:
            dict: ``accuracy``, ``f1``, ``precision`` and ``recall``; the last
            three are weighted averages over the classes.
        """
        logits, references = eval_pred
        # The predicted class is the index of the largest logit on the last axis.
        predicted = np.argmax(logits, axis=-1)

        prf_support = precision_recall_fscore_support(
            references, predicted, average='weighted', zero_division=1
        )
        metrics = {'accuracy': accuracy_score(references, predicted)}
        metrics['f1'] = prf_support[2]
        metrics['precision'] = prf_support[0]
        metrics['recall'] = prf_support[1]
        return metrics

    def train_model(self, train_dataset, val_dataset, best_hyperparameters):
        """
        Fine-tunes ``self.model`` with the tuned hyperparameters.

        Evaluation and checkpointing happen once per epoch, the checkpoint with
        the lowest ``eval_loss`` is restored at the end, and the custom
        early-stopping callback is attached. The trainer is kept in
        ``self.trainer``.

        Args:
            train_dataset (Dataset): The training dataset.
            val_dataset (Dataset): The validation dataset.
            best_hyperparameters (dict): Must contain the six tuned keys read below.
        """
        tuned_keys = (
            'per_device_train_batch_size',
            'num_train_epochs',
            'learning_rate',
            'weight_decay',
            'warmup_steps',
            'optimizer',
        )
        # Direct indexing: a missing key raises KeyError, extra keys are ignored.
        tuned = {key: best_hyperparameters[key] for key in tuned_keys}

        checkpointing = dict(
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            save_total_limit=1,
            metric_for_best_model='eval_loss',
            greater_is_better=False,
        )
        self.set_training_arguments(output_dir='./temp_trainer', **tuned, **checkpointing)

        early_stopping = EarlyStoppingCallbackCustom()
        self.trainer = CustomTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics,
            callbacks=[early_stopping],
        )

        self.trainer.train()

    def evaluate_model(self, eval_dataset):
        """
        Evaluates the model on the provided dataset.

        Args:
            eval_dataset (Dataset): The dataset to evaluate on.

        Returns:
            dict: Evaluation metrics.
        """
        if self.trainer is None:
            raise ValueError("Trainer has not been initialized. Please train the model first.")

        eval_metrics = self.trainer.evaluate(eval_dataset=eval_dataset)
        return eval_metrics

    def save_model(self, save_path, best_hyperparameters):
        """
        Saves the model, tokenizer, and hyperparameters to the specified path.

        Args:
            save_path (str): The path to save the model.
            best_hyperparameters (dict): The best hyperparameters used during training.
        """
        os.makedirs(save_path, exist_ok=True)
        self.trainer.save_model(save_path)
        self.tokenizer.save_pretrained(save_path)
        hyperparams_path = os.path.join(save_path, 'best_hyperparams.json')
        with open(hyperparams_path, 'w') as f:
            json.dump({'best_params': best_hyperparameters}, f, indent=4)
        print(f"Model and hyperparameters saved to: {save_path}")

    def generate_learning_curves(self, texts, labels):
        """
        Trains every saved model on growing amounts of data and plots accuracy.

        For each sub-directory of ``self.save_base_dir`` the stored hyperparameters
        and tokenizer are loaded. 20% of the samples are held out once (seeded) as
        a fixed validation set, so all points of a curve are measured on the same
        data. A freshly loaded model is then trained on 10%..100% of the remaining
        samples; it is never reused between sizes, so the points are independent.

        Side effects: replaces ``self.tokenizer``, resets ``self.model`` to
        ``None``, deletes ``./temp_trainer`` and ``./logs`` after every run and
        shows one figure per model.

        Args:
            texts (list): List of input texts.
            labels (list): List of labels.

        Returns:
            dict: ``{model_name: {'train_sizes': [...], 'train_scores': [...],
            'val_scores': [...]}}``. A failed evaluation is recorded as NaN.
        """
        import gc
        import shutil

        saved_models_dir = self.save_base_dir  # Use the directory where models are saved
        train_fractions = np.linspace(0.1, 1.0, 5)  # Training sizes from 10% to 100%
        split_seed = 42
        learning_curves = {}

        # The training cell names each save directory after the model name with
        # '/' replaced by '_'. Inverting that recovers the pretrained checkpoint.
        base_checkpoints = {
            name.replace('/', '_'): name
            for name in (getattr(self, "model_names", None) or [])
        }

        # List of saved model directories (sorted for a stable processing order)
        model_dirs = sorted(
            os.path.join(saved_models_dir, d)
            for d in os.listdir(saved_models_dir)
            if os.path.isdir(os.path.join(saved_models_dir, d))
        )

        for model_dir in model_dirs:
            model_name = os.path.basename(model_dir)
            print(f"\n===== Processing Model: {model_name} =====")

            # Load the best hyperparameters
            hyperparams_path = os.path.join(model_dir, 'best_hyperparams.json')
            try:
                with open(hyperparams_path, 'r') as f:
                    best_params = json.load(f)['best_params']
            except Exception as e:
                print(f"Error loading hyperparameters for {model_name}: {e}")
                continue

            # Load tokenizer; models are loaded fresh for every training size below
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
            except Exception as e:
                print(f"Error loading model or tokenizer for {model_name}: {e}")
                continue
            self.model = None

            base_checkpoint = base_checkpoints.get(model_name)
            if base_checkpoint is None:
                print(
                    f"  No pretrained checkpoint known for {model_name}; "
                    "starting each run from the saved fine-tuned weights."
                )

            # Prepare full dataset
            try:
                full_dataset = self.prepare_dataset(texts, labels)
            except Exception as e:
                print(f"Error preparing dataset for {model_name}: {e}")
                continue

            # Fixed hold-out split shared by all training sizes of this model.
            n_total = len(full_dataset)
            pool_size = int(n_total * 0.8)
            holdout_size = n_total - pool_size
            if pool_size < 1 or holdout_size < 1:
                print(f"Not enough samples ({n_total}) to build a learning curve for {model_name}.")
                continue
            train_pool, val_dataset = random_split(
                full_dataset,
                [pool_size, holdout_size],
                generator=torch.Generator().manual_seed(split_seed),
            )

            # Initialize learning curve data for the model
            curve = {'train_sizes': [], 'train_scores': [], 'val_scores': []}
            learning_curves[model_name] = curve

            for size_fraction in train_fractions:
                # Ensure at least one example is used
                subset_size = max(1, int(pool_size * size_fraction))
                print(f"  Training size: {subset_size} ({size_fraction*100:.0f}%)")

                train_dataset, _ = random_split(
                    train_pool,
                    [subset_size, pool_size - subset_size],
                    generator=torch.Generator().manual_seed(split_seed),
                )

                model = None
                trainer = None
                try:
                    try:
                        model = self._load_learning_curve_model(model_dir, base_checkpoint)
                        training_args = CustomTrainingArguments(
                            output_dir='./temp_trainer',
                            eval_strategy='epoch',
                            save_strategy='no',
                            logging_dir='./logs',
                            per_device_train_batch_size=best_params['per_device_train_batch_size'],
                            learning_rate=best_params['learning_rate'],
                            num_train_epochs=best_params['num_train_epochs'],
                            weight_decay=best_params['weight_decay'],
                            warmup_steps=best_params['warmup_steps'],
                            disable_tqdm=True,  # Avoid excessive output
                            optimizer=best_params['optimizer'],
                        )
                        trainer = CustomTrainer(
                            model=model,
                            args=training_args,
                            train_dataset=train_dataset,
                            eval_dataset=val_dataset,
                            compute_metrics=self.compute_metrics
                        )
                        trainer.train()
                    except Exception as e:
                        print(f"Error training {model_name} with size {subset_size}: {e}")
                        continue

                    train_acc = self._evaluate_accuracy(trainer, train_dataset, "training", model_name)
                    val_acc = self._evaluate_accuracy(trainer, val_dataset, "validation", model_name)

                    # Store results; the x value is the number of training examples
                    curve['train_sizes'].append(subset_size)
                    curve['train_scores'].append(train_acc)
                    curve['val_scores'].append(val_acc)
                finally:
                    # Runs on success, on failure and on ``continue``: free the
                    # model and remove the temporary training directories.
                    trainer = None
                    model = None
                    gc.collect()
                    torch.cuda.empty_cache()
                    shutil.rmtree('./temp_trainer', ignore_errors=True)
                    shutil.rmtree('./logs', ignore_errors=True)

            self._plot_learning_curve(model_name, curve)

        return learning_curves

    def _load_learning_curve_model(self, model_dir, base_checkpoint):
        """Loads an untrained-on-this-data model: the pretrained checkpoint if known, else the saved one."""
        if base_checkpoint is None:
            return AutoModelForSequenceClassification.from_pretrained(model_dir)
        model_config = AutoConfig.from_pretrained(base_checkpoint, num_labels=3)
        return AutoModelForSequenceClassification.from_pretrained(
            base_checkpoint, config=model_config, ignore_mismatched_sizes=True
        )

    def _evaluate_accuracy(self, trainer, dataset, split_label, model_name):
        """Returns ``eval_accuracy`` of ``trainer`` on ``dataset``, or NaN when evaluation fails."""
        try:
            results = trainer.evaluate(eval_dataset=dataset)
        except Exception as e:
            print(f"Error evaluating {split_label} set for {model_name}: {e}")
            return float('nan')
        return results.get('eval_accuracy', float('nan'))

    @staticmethod
    def _plot_learning_curve(model_name, curve):
        """Plots training and validation accuracy against training size for one model."""
        if not curve['train_sizes']:
            print(f"No learning-curve points available for {model_name}; skipping plot.")
            return

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(curve['train_sizes'], curve['train_scores'], 'o-', label='Training Score')
        ax.plot(curve['train_sizes'], curve['val_scores'], 's-', label='Validation Score')
        ax.set_xlabel('Training Size')
        ax.set_ylabel('Accuracy')
        ax.set_title(f'Learning Curve for {model_name}')
        ax.legend(loc='best')
        ax.grid(True)
        fig.tight_layout()
        plt.show()
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


In [None]:
# List of pretrained model names to process
model_names = [
    'deepset/gbert-base',
    'aari1995/German_Sentiment',
    'oliverguhr/german-sentiment-bert',
    'lxyuan/distilbert-base-multilingual-cased-sentiments-student',
    'nlptown/bert-base-multilingual-uncased-sentiment',
    'distilbert-base-german-cased',
    'xlm-roberta-base',
    'ssary/XLM-RoBERTa-German-sentiment',
]

In [None]:
# Initialize the Maissen class
maissen = Maissen(model_names=model_names, hpo_n_trials=20, use_drive=False, k_folds=5)

# Load the data
texts, labels = maissen.load_data()

# Fixed seed so every model is trained and evaluated on the same 80/20 split.
SPLIT_SEED = 42

# Iterate over the models
for model_name in model_names:
    print(f"\n===== Processing Model: {model_name} =====")

    # Initialize the model and tokenizer
    maissen.initialize_model(model_name)

    # Hyperparameter optimization
    best_hyperparameters = maissen.perform_hyperparameter_search(texts, labels)
    print(f"Best hyperparameters for {model_name}: {best_hyperparameters}")

    # Prepare the dataset (tokenized with this model's tokenizer)
    full_dataset = maissen.prepare_dataset(texts, labels)

    # Split into training and validation sets
    train_size = int(len(full_dataset) * 0.8)
    val_size = len(full_dataset) - train_size
    train_dataset, val_dataset = random_split(
        full_dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )

    # Train the model with the best hyperparameters
    maissen.train_model(train_dataset, val_dataset, best_hyperparameters)

    # Evaluate the model
    eval_metrics = maissen.evaluate_model(val_dataset)
    print(f"Evaluation metrics for {model_name}: {eval_metrics}")

    # Save the model
    save_path = os.path.join(maissen.save_base_dir, model_name.replace('/', '_'))
    maissen.save_model(save_path, best_hyperparameters)

    # Clean up for the next model. Reset to None instead of ``del`` so the
    # attributes keep existing and the class's ``is None`` checks still work.
    maissen.model = None
    maissen.trainer = None
    torch.cuda.empty_cache()

# Generate learning curves once all requested models have been optimized
learning_curves = maissen.generate_learning_curves(texts, labels)
