# Ein Hybridmodell mit LSTM-AE, DBSCAN und IF

In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from matplotlib import pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import logging
from tensorflow.keras.layers import TimeDistributed, Dense, LSTM, Dropout, LayerNormalization, Input, Add, Bidirectional, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import balanced_accuracy_score
import sys
from keras.layers import LSTM, Dropout, LayerNormalization, Add, TimeDistributed, Dense
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from tensorflow.keras import backend as K

2025-08-28 12:22:07.583985: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Vorbereitung

In [2]:
# import torch
# print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0))

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # import os
# # import tensorflow as tf

# # num_threads = os.cpu_count()
# # tf.config.threading.set_inter_op_parallelism_threads(num_threads)
# # tf.config.threading.set_intra_op_parallelism_threads(num_threads)
# # print(f"Verwendete CPU Threads: {num_threads}")

In [3]:
from sklearn.model_selection import StratifiedKFold

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

#### Daten laden

In [6]:
data_train = pd.read_json("train_logs.jsonl", lines=True)
data_test = pd.read_json("test_logs.jsonl", lines=True)
data_val = pd.read_json("val_logs.jsonl", lines=True)

In [7]:
X_train_full = pd.DataFrame(data_train)
X_test_full = pd.DataFrame(data_test)
X_val_full = pd.DataFrame(data_val)

# data_test['label'] = data_test['details'].apply(lambda x: x.get('label', None))
# data_val['label'] = data_val['details'].apply(lambda x: x.get('label', None))
y_test_full = data_test["label"]
y_val_full = data_val["label"]

#### Automatisch alles in Numerische oder Kategorische Daten einteilen

In [8]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def is_missing(val):
    return val is None or (isinstance(val, float) and np.isnan(val))

def flatten_log_entry(log, parent_key='', sep='.'):
    items = []
    for k, v in log.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_log_entry(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

def auto_encode_features(logs, one_hot_numeric=False, label_encoders=None, onehot_encoders=None, fit=True):
    # Logs flach machen
    flat_logs = [flatten_log_entry(log) for log in logs]

    if label_encoders is None:
        label_encoders = {}
    if onehot_encoders is None:
        onehot_encoders = {}

    # Encoder vorbereiten
    for key in flat_logs[0].keys():
        values = []
        for log in flat_logs:
            val = log.get(key)

            if is_missing(val):
                val = "__MISSING__"
            elif isinstance(val, (list, dict)):
                print(f"Feature '{key}' enthält nicht-skalaren Wert – wird als String gespeichert: {val}")
                val = str(val)

            values.append(val)

        if fit:
            try:
                le = LabelEncoder()
                le.fit(values)
                label_encoders[key] = le
            except Exception as e:
                print(f"Fehler beim Enkodieren von Feature '{key}': {e}")
                continue

    # Feature-Werte transformieren
    encoded_logs = []
    for log in flat_logs:
        encoded_log = {}
        for key, le in label_encoders.items():
            val = log.get(key, "__MISSING__")
            if is_missing(val):
                val = "__MISSING__"
            elif isinstance(val, (list, dict)):
                val = str(val)
            try:
                encoded_log[key] = le.transform([val])[0]
            except ValueError:
                encoded_log[key] = -1  # unbekannte Kategorie
        encoded_logs.append(encoded_log)

    return encoded_logs, label_encoders, onehot_encoders

#### Werte aus vorheriger Funktion vereinigen

In [9]:
def dicts_to_feature_matrix(encoded_logs):
    feature_names = sorted({key for d in encoded_logs for key in d.keys()})

    X = np.zeros((len(encoded_logs), len(feature_names)), dtype=np.float32)

    for i, d in enumerate(encoded_logs):
        for j, feat in enumerate(feature_names):
            if feat in d:
                X[i, j] = d[feat]

    return X, feature_names

In [10]:
logs_train = X_train_full.to_dict(orient='records')
#logs_train = events
logs_test = X_test_full.to_dict(orient='records')
logs_val = X_val_full.to_dict(orient='records')

encoded_logs, label_encoders, onehot_encoders = auto_encode_features(
    logs_train, one_hot_numeric=False, fit=True
)
X_train, feature_names = dicts_to_feature_matrix(encoded_logs)

encoded_test_logs, _, _ = auto_encode_features(
    logs_test, one_hot_numeric=False,
    label_encoders=label_encoders, onehot_encoders=onehot_encoders, fit=False
)
X_test, _ = dicts_to_feature_matrix(encoded_test_logs)

encoded_val_logs, _, _ = auto_encode_features(
    logs_val, one_hot_numeric=False,
    label_encoders=label_encoders, onehot_encoders=onehot_encoders, fit=False
)
X_val, _ = dicts_to_feature_matrix(encoded_val_logs)

#### Skalierung

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

### Parameter

In [12]:
seq_length = 45
batch_size = 32

encoder_layers = [128, 64]
decoder_layers = [64, 128]

#dropout_rate = 0.1
learning_rate = 0.001

### Sequenzen

In [13]:
class SequenceToSequenceGenerator(tf.keras.utils.Sequence):
    def __init__(self, data, seq_length, batch_size, pad_last=True):
        self.data = data
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.pad_last = pad_last

        self.indices = np.arange(len(data) - seq_length)
        
        # Wenn Padding aktiv ist und etwas übrig bleibt
        if pad_last and (len(data) % seq_length != 0):
            self.include_last = True
        else:
            self.include_last = False

    def __len__(self):
        base = (len(self.indices) + self.batch_size - 1) // self.batch_size
        return base + (1 if self.include_last else 0)

    def __getitem__(self, idx):
        if idx < len(self) - 1 or not self.include_last:
            batch_idx = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
            X_batch = np.array([self.data[i:i + self.seq_length] for i in batch_idx])
        else:
            # letzte Sequenz mit Padding
            last_seq = self.data[-self.seq_length:]
            if len(last_seq) < self.seq_length:
                padding_len = self.seq_length - len(last_seq)
                padding = np.zeros((padding_len, self.data.shape[1]))
                last_seq = np.vstack((last_seq, padding))
            X_batch = np.expand_dims(last_seq, axis=0)

        return X_batch, X_batch

train_gen = SequenceToSequenceGenerator(X_train_scaled, seq_length, batch_size)
val_gen = SequenceToSequenceGenerator(X_val_scaled, seq_length, batch_size)
test_gen = SequenceToSequenceGenerator(X_test_scaled, seq_length, batch_size)

## 2. LSTM-AE implementieren

In [14]:
from tensorflow.keras.layers import Input, Bidirectional, LSTM, RepeatVector, TimeDistributed, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

n_features = X_train_scaled.shape[1]

inputs = Input(shape=(seq_length, n_features))
x = inputs

# Tieferer Encoder mit bidirektionalen LSTMs
for units in encoder_layers:
    x = Bidirectional(LSTM(units, return_sequences=True))(x)
    #x = Dropout(dropout_rate)(x)

# Letzte Encoder-Schicht ohne return_sequences, nur den letzten Output
encoded = Bidirectional(LSTM(encoder_layers[-1]))(x)

# Tieferer Decoder
x = RepeatVector(seq_length)(encoded)
for units in decoder_layers:
    x = Bidirectional(LSTM(units, return_sequences=True))(x)
    #x = Dropout(dropout_rate)(x)

outputs = TimeDistributed(Dense(n_features, activation='linear'))(x)

model = Model(inputs, outputs)
model.compile(optimizer=Adam(learning_rate), loss='mse')
model.summary()

2025-08-28 12:45:29.326616: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [15]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(train_gen, epochs=50, validation_data=val_gen, callbacks=[early_stop])

Epoch 1/50


  self._warn_if_super_not_called()


[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 169ms/step - loss: 0.7904 - val_loss: 0.7206
Epoch 2/50
[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 170ms/step - loss: 0.8050 - val_loss: 0.7143
Epoch 3/50
[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 172ms/step - loss: 0.7703 - val_loss: 0.7043
Epoch 4/50
[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 168ms/step - loss: 0.7044 - val_loss: 0.7678
Epoch 5/50
[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 168ms/step - loss: 0.7493 - val_loss: 0.6905
Epoch 6/50
[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 167ms/step - loss: 0.7336 - val_loss: 0.6981
Epoch 7/50
[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 170ms/step - loss: 0.7191 - val_loss: 0.7850


<keras.src.callbacks.history.History at 0x72068ce4b920>

### Umwandlung der Daten für nächste Modelle: Rekonstruktionsfehler berechnen

In [16]:
def get_reconstruction_errors(gen, model):
    errors = []
    for i in range(len(gen)):
        X_batch, _ = gen[i]
        pred = model.predict(X_batch, verbose=0)
        batch_errors = np.mean(np.square(X_batch - pred), axis=(1, 2))
        errors.extend(batch_errors)
    return np.array(errors)

train_errors = get_reconstruction_errors(train_gen, model)
val_errors = get_reconstruction_errors(val_gen, model)
test_errors = get_reconstruction_errors(test_gen, model)
#reconstruction_errors = get_reconstruction_errors(test_gen, model)

In [17]:
# from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support, roc_auc_score
# import numpy as np

y_test_seq = np.array([
    int(np.any(y_test_full[i : i + seq_length] == 1))
    for i in range(len(test_errors))
])

#true_labels = y_test_seq.astype(int)

# precisions, recalls, thresholds = precision_recall_curve(true_labels, test_errors)
# f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)

# best_idx = np.argmax(f1_scores)
# optimal_threshold = thresholds[best_idx]

# y_pred = (test_errors > optimal_threshold).astype(int)

# precision, recall, f1, _ = precision_recall_fscore_support(true_labels, y_pred, average='binary')
# roc_auc = roc_auc_score(true_labels, test_errors)

# print(f"Best Threshold by F1: {optimal_threshold:.5f}")
# print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, ROC-AUC: {roc_auc:.3f}")

In [18]:
# scaler_rec = StandardScaler()
# scaler_rec.fit(train_errors.reshape(-1, 1))
# test_errors_scaled = scaler_rec.transform(test_errors.reshape(-1, 1))

# Modell 1

## Isolation Forest

In [19]:
from sklearn.metrics import accuracy_score

iforest = IsolationForest(random_state=42, contamination=0.0008, max_samples = 'auto')
iforest.fit(train_errors.reshape(-1, 1))

scores_if = iforest.decision_function(test_errors.reshape(-1, 1))
precisions_if, recalls_if, thresholds_if = precision_recall_curve(y_test_seq, -scores_if)
best_f1_scores_if = 2 * (precisions_if * recalls_if) / (precisions_if + recalls_if + 1e-10)
best_idx_if = np.argmax(best_f1_scores_if)
optimal_threshold_if = thresholds_if[best_idx_if]
y_pred_if = ((-scores_if) > optimal_threshold_if).astype(int)

precision_if, recall_if, f1_if, _ = precision_recall_fscore_support(y_test_seq, y_pred_if, average='binary')
roc_auc_if = roc_auc_score(y_test_seq, -scores_if)
pr_auc_if = average_precision_score(y_test_seq, y_pred_if)
fpr, tpr, _ = roc_curve(y_test_seq, y_pred_if)
accuracy_if = accuracy_score(y_test_seq, y_pred_if)

mcc_if = matthews_corrcoef(y_test_seq, y_pred_if)
balanced_acc_if = balanced_accuracy_score(y_test_seq, y_pred_if)

print(f"IsolationForest Best Threshold by F1: {optimal_threshold_if:.4f}")
print(f"Precision: {precision_if:.4f}, Recall: {recall_if:.4f}, F1: {f1_if:.4f}, ROC-AUC: {roc_auc_if:.4f}, AUC-PR: {pr_auc_if:.4f}")
print(f"Matthews Correlation Coefficient: {mcc_if:.4f}, Balanced Accuracy: {balanced_acc_if:.4f}, Accuracy: {accuracy_if:.4f}")

IsolationForest Best Threshold by F1: -0.2633
Precision: 0.6533, Recall: 0.5861, F1: 0.6179, ROC-AUC: 0.7442, AUC-PR: 0.4105
Matthews Correlation Coefficient: 0.5932, Balanced Accuracy: 0.7820, Accuracy: 0.9517


# Modell 2

## One-Class SVM

In [20]:
ocsvm = OneClassSVM(nu = 0.0008, gamma='scale', kernel = 'rbf')
ocsvm.fit(train_errors.reshape(-1,1))

scores_ocsvm = ocsvm.decision_function(test_errors.reshape(-1, 1))  # kleiner = anomal
precisions_ocsvm, recalls_ocsvm, thresholds_ocsvm = precision_recall_curve(y_test_seq, -scores_ocsvm)
f1_scores_ocsvm = 2 * (precisions_ocsvm * recalls_ocsvm) / (precisions_ocsvm + recalls_ocsvm + 1e-10)
best_idx_ocsvm = np.argmax(f1_scores_ocsvm)
optimal_threshold_ocsvm = thresholds_ocsvm[best_idx_ocsvm]

# Vorhersagen auf Testset
y_pred_ocsvm = ((-scores_ocsvm) > optimal_threshold_ocsvm).astype(int)

# Metriken
precision_ocsvm, recall_ocsvm, f1_ocsvm, _ = precision_recall_fscore_support(y_test_seq, y_pred_ocsvm, average='binary')
roc_auc_ocsvm = roc_auc_score(y_test_seq, -scores_ocsvm)
fpr, tpr, _ = roc_curve(y_test_seq, y_pred_ocsvm)
pr_auc_ocsvm = average_precision_score(y_test_seq, y_pred_ocsvm)
accuracy_ocsvm = accuracy_score(y_test_seq, y_pred_ocsvm)

mcc_ocsvm = matthews_corrcoef(y_test_seq, y_pred_ocsvm)
balanced_acc_ocsvm = balanced_accuracy_score(y_test_seq, y_pred_ocsvm)

print(f"One-Class SVM Best Threshold by F1: {optimal_threshold_ocsvm:.4f}")
print(f"Precision: {precision_ocsvm:.4f}, Recall: {recall_ocsvm:.4f}, F1: {f1_ocsvm:.4f}, ROC-AUC: {roc_auc_ocsvm:.4f}, AUC-PR: {pr_auc_ocsvm:.4f},MCC: {mcc_ocsvm:.4f}, Balanced Accuracy: {balanced_acc_ocsvm:.4f}, Accuracy: {accuracy_ocsvm:.4f}")

One-Class SVM Best Threshold by F1: -0.0181
Precision: 0.6568, Recall: 0.6247, F1: 0.6403, ROC-AUC: 0.6866, AUC-PR: 0.4353,MCC: 0.6155, Balanced Accuracy: 0.8007, Accuracy: 0.9532


# Modell 3

## DBSCAN

In [21]:
for eps in np.linspace(0.01, 1.0, 100):
    dbscan = DBSCAN(min_samples=3, metric='euclidean', eps=eps)
    labels = dbscan.fit_predict(test_errors.reshape(-1, 1))
    n_anomalies = np.sum(labels == -1)
    if abs(n_anomalies - 31) <= 2:
        print(f'Passender eps-Wert: {eps}, Anomalien: {n_anomalies}')
        break

y_pred_dbscan = (labels == -1).astype(int)

# Metriken berechnen
precision_dbscan, recall_dbscan, f1_dbscan, _ = precision_recall_fscore_support(y_test_seq, y_pred_dbscan, average='binary')
roc_auc_dbscan = roc_auc_score(y_test_seq, y_pred_dbscan)
pr_auc_dbscan = average_precision_score(y_test_seq, y_pred_dbscan)
accuracy_dbscan = accuracy_score(y_test_seq, y_pred_dbscan)
mcc_dbscan = matthews_corrcoef(y_test_seq, y_pred_dbscan)
balanced_acc_dbscan = balanced_accuracy_score(y_test_seq, y_pred_dbscan)

print(f"Precision: {precision_dbscan:.4f}, Recall: {recall_dbscan:.4f}, F1: {f1_dbscan:.4f}")
print(f"MCC: {mcc_dbscan:.4f}, Balanced Accuracy: {balanced_acc_dbscan:.4f}")
print(f"ROC-AUC: {roc_auc_dbscan:.4f}, AUC-PR: {pr_auc_dbscan:.4f}, Accuracy: {accuracy_dbscan:.4f}")

Passender eps-Wert: 0.05, Anomalien: 29
Precision: 1.0000, Recall: 0.0746, F1: 0.1388
MCC: 0.2644, Balanced Accuracy: 0.5373
ROC-AUC: 0.5373, AUC-PR: 0.1362, Accuracy: 0.9383


### Standardabweichung und Varianz

#### Isolation Forest

In [32]:
if_25_f1 = [0.7596, 0.7610, 0.7494, 0.7368, 0.7639]
if_25_mcc = [0.7647, 0.7550, 0.7346, 0.7720, 0.7673]
if_25_ba = [0.8165, 0.8128, 0.8209, 0.8141, 0.8297]

if_45_f1 = [0.6094, 0.6379, 0.6289, 0.5918, 0.6179]
if_45_mcc = [0.5819, 0.6126, 0.6024, 0.5619, 0.5932]
if_45_ba = [0.7875, 0.8015, 0.8005, 0.7971, 0.7820]

# Varianz (Population)
if_25_f1_varianz = np.var(if_25_f1)
if_25_mcc_varianz = np.var(if_25_mcc)
if_25_ba_varianz = np.var(if_25_ba)

if_45_f1_varianz = np.var(if_45_f1)
if_45_mcc_varianz = np.var(if_45_mcc)
if_45_ba_varianz = np.var(if_45_ba)

# Varianz (Stichprobe, mit n-1 im Nenner)
if_25_f1_varianz_stichprobe = np.var(if_25_f1, ddof=1)
if_25_mcc_varianz_stichprobe = np.var(if_25_mcc, ddof=1)
if_25_ba_varianz_stichprobe = np.var(if_25_ba, ddof=1)

if_45_f1_varianz_stichprobe = np.var(if_45_f1, ddof=1)
if_45_mcc_varianz_stichprobe = np.var(if_45_mcc, ddof=1)
if_45_ba_varianz_stichprobe = np.var(if_45_ba, ddof=1)

# Standardabweichung (Population)
if_25_f1_std_pop = np.std(if_25_f1)
if_25_mcc_std_pop = np.std(if_25_mcc)
if_25_ba_std_pop = np.std(if_25_ba)

if_45_f1_std_pop = np.std(if_45_f1)
if_45_mcc_std_pop = np.std(if_45_mcc)
if_45_ba_std_pop = np.std(if_45_ba)

# Standardabweichung (Stichprobe)
if_25_f1_std_stichprobe = np.std(if_25_f1, ddof=1)
if_25_mcc_std_stichprobe = np.std(if_25_mcc, ddof=1)
if_25_ba_std_stichprobe = np.std(if_25_ba, ddof=1)

if_45_f1_std_stichprobe = np.std(if_45_f1, ddof=1)
if_45_mcc_std_stichprobe = np.std(if_45_mcc, ddof=1)
if_45_ba_std_stichprobe = np.std(if_45_ba, ddof=1)

print("IF, Seq 25, F1-Score, Varianz (Stichprobe):", if_25_f1_varianz_stichprobe)
print("IF, Seq 25, MCC, Varianz (Stichprobe):", if_25_mcc_varianz_stichprobe)
print("IF, Seq 25, BA, Varianz (Stichprobe):", if_25_ba_varianz_stichprobe)
print("IF, Seq 25, IF, Standardabweichung (Stichprobe):", if_25_f1_std_stichprobe)
print("IF, Seq 25, MCC, Standardabweichung (Stichprobe):", if_25_mcc_std_stichprobe)
print("IF, Seq 25, BA, Standardabweichung (Stichprobe):", if_25_ba_std_stichprobe)

print("IF, Seq 45, F1-Score, Varianz (Stichprobe):", if_45_f1_varianz_stichprobe)
print("IF, Seq 45, MCC, Varianz (Stichprobe):", if_45_mcc_varianz_stichprobe)
print("IF, Seq 45, BA, Varianz (Stichprobe):", if_45_ba_varianz_stichprobe)
print("IF, Seq 45, F1-Score, Standardabweichung (Stichprobe):", if_45_f1_std_stichprobe)
print("IF, Seq 45, MCC, Standardabweichung (Stichprobe):", if_45_mcc_std_stichprobe)
print("IF, Seq 45, BA, Standardabweichung (Stichprobe):", if_45_ba_std_stichprobe)

IF, Seq 25, F1-Score, Varianz (Stichprobe): 0.0001238180000000003
IF, Seq 25, MCC, Varianz (Stichprobe): 0.0002203369999999998
IF, Seq 25, BA, Varianz (Stichprobe): 4.6649999999999914e-05
IF, Seq 25, IF, Standardabweichung (Stichprobe): 0.011127353683603318
IF, Seq 25, MCC, Standardabweichung (Stichprobe): 0.014843752894736554
IF, Seq 25, BA, Standardabweichung (Stichprobe): 0.006830080526611668
IF, Seq 45, F1-Score, Varianz (Stichprobe): 0.00031796700000000016
IF, Seq 45, MCC, Varianz (Stichprobe): 0.0003822950000000013
IF, Seq 45, BA, Varianz (Stichprobe): 7.349199999999987e-05
IF, Seq 45, F1-Score, Standardabweichung (Stichprobe): 0.017831629202066764
IF, Seq 45, MCC, Standardabweichung (Stichprobe): 0.019552365585780183
IF, Seq 45, BA, Standardabweichung (Stichprobe): 0.008572747517569842


#### OCSVM

In [30]:
ocsvm_25_f1 = [0.7564, 0.7603, 0.7364, 0.7039, 0.7578]
ocsvm_25_mcc = [0.7572, 0.7696, 0.7332, 0.6916, 0.7549]
ocsvm_25_ba = [0.8256, 0.8146, 0.8227, 0.8357, 0.8369]

ocsvm_45_f1 = [0.6094, 0.6379, 0.6289, 0.5918, 0.6179]
ocsvm_45_mcc = [0.5819, 0.6126, 0.6024, 0.5619, 0.5932]
ocsvm_45_ba = [0.7875, 0.8015, 0.8005, 0.7971, 0.7820]

# Varianz (Population)
ocsvm_25_f1_varianz = np.var(ocsvm_25_f1)
ocsvm_25_mcc_varianz = np.var(ocsvm_25_mcc)
ocsvm_25_ba_varianz = np.var(ocsvm_25_ba)

ocsvm_45_f1_varianz = np.var(ocsvm_45_f1)
ocsvm_45_mcc_varianz = np.var(ocsvm_45_mcc)
ocsvm_45_ba_varianz = np.var(ocsvm_45_ba)

# Varianz (Stichprobe, mit n-1 im Nenner)
ocsvm_25_f1_varianz_stichprobe = np.var(ocsvm_25_f1, ddof=1)
ocsvm_25_mcc_varianz_stichprobe = np.var(ocsvm_25_mcc, ddof=1)
ocsvm_25_ba_varianz_stichprobe = np.var(ocsvm_25_ba, ddof=1)

ocsvm_45_f1_varianz_stichprobe = np.var(ocsvm_45_f1, ddof=1)
ocsvm_45_mcc_varianz_stichprobe = np.var(ocsvm_45_mcc, ddof=1)
ocsvm_45_ba_varianz_stichprobe = np.var(ocsvm_45_ba, ddof=1)

# Standardabweichung (Population)
ocsvm_25_f1_std_pop = np.std(ocsvm_25_f1)
ocsvm_25_mcc_std_pop = np.std(ocsvm_25_mcc)
ocsvm_25_ba_std_pop = np.std(ocsvm_25_ba)

ocsvm_45_f1_std_pop = np.std(ocsvm_45_f1)
ocsvm_45_mcc_std_pop = np.std(ocsvm_45_mcc)
ocsvm_45_ba_std_pop = np.std(ocsvm_45_ba)

# Standardabweichung (Stichprobe)
ocsvm_25_f1_std_stichprobe = np.std(ocsvm_25_f1, ddof=1)
ocsvm_25_mcc_std_stichprobe = np.std(ocsvm_25_mcc, ddof=1)
ocsvm_25_ba_std_stichprobe = np.std(ocsvm_25_ba, ddof=1)

ocsvm_45_f1_std_stichprobe = np.std(ocsvm_45_f1, ddof=1)
ocsvm_45_mcc_std_stichprobe = np.std(ocsvm_45_mcc, ddof=1)
ocsvm_45_ba_std_stichprobe = np.std(ocsvm_45_ba, ddof=1)

print("OCSVM, Seq 25, F1-Score, Varianz (Stichprobe):", ocsvm_25_f1_varianz_stichprobe)
print("OCSVM, Seq 25, MCC,Varianz (Stichprobe):", ocsvm_25_mcc_varianz_stichprobe)
print("OCSVM, Seq 25, BA,Varianz (Stichprobe):", ocsvm_25_ba_varianz_stichprobe)
print("OCSVM, Seq 25, F1-Score, Standardabweichung (Stichprobe):", ocsvm_25_f1_std_stichprobe)
print("OCSVM, Seq 25, MCC, Standardabweichung (Stichprobe):", ocsvm_25_mcc_std_stichprobe)
print("OCSVM, Seq 25, BA, Standardabweichung (Stichprobe):", ocsvm_25_ba_std_stichprobe)

print("OCSVM, Seq 45, F1-Score,Varianz (Stichprobe):", ocsvm_45_f1_varianz_stichprobe)
print("OCSVM, Seq 45, MCC, Varianz (Stichprobe):", ocsvm_45_mcc_varianz_stichprobe)
print("OCSVM, Seq 45, BA, Varianz (Stichprobe):", ocsvm_45_ba_varianz_stichprobe)
print("OCSVM, Seq 45, F1-Score, Standardabweichung (Stichprobe):", ocsvm_45_f1_std_stichprobe)
print("OCSVM, Seq 45, MCC, Standardabweichung (Stichprobe):", ocsvm_45_mcc_std_stichprobe)
print("OCSVM, Seq 45, BA, Standardabweichung (Stichprobe):", ocsvm_45_ba_std_stichprobe)

OCSVM, Seq 25, F1-Score, Varianz (Stichprobe): 0.0005675630000000001
OCSVM, Seq 25, MCC,Varianz (Stichprobe): 0.0009435899999999996
OCSVM, Seq 25, BA,Varianz (Stichprobe): 8.696499999999996e-05
OCSVM, Seq 25, F1-Score, Standardabweichung (Stichprobe): 0.023823580755209746
OCSVM, Seq 25, MCC, Standardabweichung (Stichprobe): 0.030717910085160408
OCSVM, Seq 25, BA, Standardabweichung (Stichprobe): 0.009325502667416913
OCSVM, Seq 45, F1-Score,Varianz (Stichprobe): 0.00031796700000000016
OCSVM, Seq 45, MCC, Varianz (Stichprobe): 0.0003822950000000013
OCSVM, Seq 45, BA, Varianz (Stichprobe): 7.349199999999987e-05
OCSVM, Seq 45, F1-Score, Standardabweichung (Stichprobe): 0.017831629202066764
OCSVM, Seq 45, MCC, Standardabweichung (Stichprobe): 0.019552365585780183
OCSVM, Seq 45, BA, Standardabweichung (Stichprobe): 0.008572747517569842


#### DBSCAN

In [31]:
dbscan_25_f1 = [0.2278, 0.2214, 0.0846, 0.2214, 0.2086]
dbscan_25_mcc = [0.3517, 0.3462, 0.2059, 0.3462, 0.3348]
dbscan_25_ba = [0.5643, 0.5622, 0.5221, 0.5622, 0.5582]

dbscan_45_f1 = [0.1388, 0.0501, 0.1476, 0.1432, 0.1388]
dbscan_45_mcc = [0.2644, 0.1550, 0.2735, 0.2690, 0.2644]
dbscan_45_ba = [0.5373, 0.5129, 0.5398, 0.5386, 0.5373]

# Varianz (Population)
dbscan_25_f1_varianz = np.var(dbscan_25_f1)
dbscan_25_mcc_varianz = np.var(dbscan_25_mcc)
dbscan_25_ba_varianz = np.var(dbscan_25_ba)

dbscan_45_f1_varianz = np.var(dbscan_45_f1)
dbscan_45_mcc_varianz = np.var(dbscan_45_mcc)
dbscan_45_ba_varianz = np.var(dbscan_45_ba)

# Varianz (Stichprobe, mit n-1 im Nenner)
dbscan_25_f1_varianz_stichprobe = np.var(dbscan_25_f1, ddof=1)
dbscan_25_mcc_varianz_stichprobe = np.var(dbscan_25_mcc, ddof=1)
dbscan_25_ba_varianz_stichprobe = np.var(dbscan_25_ba, ddof=1)

dbscan_45_f1_varianz_stichprobe = np.var(dbscan_45_f1, ddof=1)
dbscan_45_mcc_varianz_stichprobe = np.var(dbscan_45_mcc, ddof=1)
dbscan_45_ba_varianz_stichprobe = np.var(dbscan_45_ba, ddof=1)

# Standardabweichung (Population)
dbscan_25_f1_std_pop = np.std(dbscan_25_f1)
dbscan_25_mcc_std_pop = np.std(dbscan_25_mcc)
dbscan_25_ba_std_pop = np.std(dbscan_25_ba)

dbscan_45_f1_std_pop = np.std(dbscan_45_f1)
dbscan_45_mcc_std_pop = np.std(dbscan_45_mcc)
dbscan_45_ba_std_pop = np.std(dbscan_45_ba)

# Standardabweichung (Stichprobe)
dbscan_25_f1_std_stichprobe = np.std(dbscan_25_f1, ddof=1)
dbscan_25_mcc_std_stichprobe = np.std(dbscan_25_mcc, ddof=1)
dbscan_25_ba_std_stichprobe = np.std(dbscan_25_ba, ddof=1)

dbscan_45_f1_std_stichprobe = np.std(dbscan_45_f1, ddof=1)
dbscan_45_mcc_std_stichprobe = np.std(dbscan_45_mcc, ddof=1)
dbscan_45_ba_std_stichprobe = np.std(dbscan_45_ba, ddof=1)

print("Varianz (Stichprobe):", dbscan_25_f1_varianz_stichprobe)
print("Varianz (Stichprobe):", dbscan_25_mcc_varianz_stichprobe)
print("Varianz (Stichprobe):", dbscan_25_ba_varianz_stichprobe)
print("Standardabweichung (Stichprobe):", dbscan_25_f1_std_stichprobe)
print("Standardabweichung (Stichprobe):", dbscan_25_mcc_std_stichprobe)
print("Standardabweichung (Stichprobe):", dbscan_25_ba_std_stichprobe)


print("Varianz (Stichprobe):", dbscan_45_f1_varianz_stichprobe)
print("Varianz (Stichprobe):", dbscan_45_mcc_varianz_stichprobe)
print("Varianz (Stichprobe):", dbscan_45_ba_varianz_stichprobe)
print("Standardabweichung (Stichprobe):", dbscan_45_f1_std_stichprobe)
print("Standardabweichung (Stichprobe):", dbscan_45_mcc_std_stichprobe)
print("Standardabweichung (Stichprobe):", dbscan_45_ba_std_stichprobe)


Varianz (Stichprobe): 0.003704448000000001
Varianz (Stichprobe): 0.0038923530000000003
Varianz (Stichprobe): 0.0003189050000000004
Standardabweichung (Stichprobe): 0.06086417665589506
Standardabweichung (Stichprobe): 0.06238872494289333
Standardabweichung (Stichprobe): 0.017857911412032496
Varianz (Stichprobe): 0.0017061099999999998
Varianz (Stichprobe): 0.0025601580000000007
Varianz (Stichprobe): 0.00012960699999999952
Standardabweichung (Stichprobe): 0.04130508443279108
Standardabweichung (Stichprobe): 0.050598003913198006
Standardabweichung (Stichprobe): 0.011384507016116223


## Friedmann-Test

In [22]:
import sys
print(sys.executable)

/home/ueay/bachelorarbeit-projekt/env/bin/python


In [2]:
import scipy as sp
from scipy.stats import friedmanchisquare

# ------------------------------
# Ergebnisse pro Modell (in der Reihenfolge Seq-Längen 25, 35, 45)
# ------------------------------

# F1-Score
IF_f1_25 = [0.7596, 0.7610, 0.7494, 0.7368, 0.7639]
OCSVM_f1_25 = [0.7564, 0.7603, 0.7364, 0.7039, 0.7578]
DBSCAN_f1_25 = [0.2278, 0.2214, 0.0846, 0.2214, 0.2086]

IF_f1_45 = [0.6094, 0.6379, 0.6289, 0.5918, 0.6179]
OCSVM_f1_45 = [0.6094, 0.6379, 0.6289, 0.5918, 0.6179]
DBSCAN_f1_45 = [0.1388, 0.0501, 0.1476, 0.1432, 0.1388]

# Matthews Correlation Coefficient (MCC)
IF_mcc_25 = [0.7647, 0.7550, 0.7346, 0.7720, 0.7673]
OCSVM_mcc_25 = [0.7572, 0.7696, 0.7332, 0.6916, 0.7549]
DBSCAN_mcc_25 = [0.3517, 0.3462, 0.2059, 0.3462, 0.3348]

IF_mcc_45 = [0.5819, 0.6126, 0.6024, 0.5619, 0.5932]
OCSVM_mcc_45 = [0.5819, 0.6126, 0.6024, 0.5619, 0.5932]
DBSCAN_mcc_45 = [0.2644, 0.1550, 0.2735, 0.2690, 0.2644]

# Balanced Accuracy
IF_bal_25 = [0.8165, 0.8128, 0.8209, 0.8141, 0.8297]
OCSVM_bal_25 = [0.8256, 0.8146, 0.8227, 0.8357, 0.8369]
DBSCAN_bal_25 = [0.5643, 0.5622, 0.5221, 0.5622, 0.5582]

IF_bal_45 = [0.7875, 0.8015, 0.8005, 0.7971, 0.7820]
OCSVM_bal_45 = [0.7875, 0.8015, 0.8005, 0.7971, 0.7820]
DBSCAN_bal_45 = [0.5373, 0.5129, 0.5398, 0.5386, 0.5373]

# ------------------------------
# Funktion für Friedman-Test
# ------------------------------
def run_friedman(model1, model2, model3, metric_name):
    stat, p = friedmanchisquare(model1, model2, model3)
    print(f"\n=== Friedman-Test für {metric_name} ===")
    print(f"Chi² = {stat:.3f}, p = {p:.4f}")
    if p < 0.05:
        print("Signifikante Unterschiede zwischen den Modellen")
    else:
        print("Keine signifikanten Unterschiede")

# ------------------------------
# Tests durchführen
# ------------------------------
run_friedman(IF_f1_25, OCSVM_f1_25, DBSCAN_f1_25, "F1-Score")
run_friedman(IF_mcc_25, OCSVM_mcc_25, DBSCAN_mcc_25, "MCC")
run_friedman(IF_bal_25, OCSVM_bal_25, DBSCAN_bal_25, "Balanced Accuracy")

run_friedman(IF_f1_45, OCSVM_f1_45, DBSCAN_f1_45, "F1-Score")
run_friedman(IF_mcc_45, OCSVM_mcc_45, DBSCAN_mcc_45, "MCC")
run_friedman(IF_bal_45, OCSVM_bal_45, DBSCAN_bal_45, "Balanced Accuracy")


=== Friedman-Test für F1-Score ===
Chi² = 10.000, p = 0.0067
Signifikante Unterschiede zwischen den Modellen

=== Friedman-Test für MCC ===
Chi² = 8.400, p = 0.0150
Signifikante Unterschiede zwischen den Modellen

=== Friedman-Test für Balanced Accuracy ===
Chi² = 10.000, p = 0.0067
Signifikante Unterschiede zwischen den Modellen

=== Friedman-Test für F1-Score ===
Chi² = 10.000, p = 0.0067
Signifikante Unterschiede zwischen den Modellen

=== Friedman-Test für MCC ===
Chi² = 10.000, p = 0.0067
Signifikante Unterschiede zwischen den Modellen

=== Friedman-Test für Balanced Accuracy ===
Chi² = 10.000, p = 0.0067
Signifikante Unterschiede zwischen den Modellen


In [4]:
import scikit_posthocs as sp
import numpy as np
import scikit_posthocs as sp

# ==========================
# Seq 25
# ==========================

# F1-Score
f1_seq25 = np.array([
    (0.7596, 0.7564, 0.2278),  # Datensatz 1: IF, OCSVM, DBSCAN
    (0.7610, 0.7603, 0.2214),
    (0.7494, 0.7364, 0.0846),
    (0.7368, 0.7039, 0.2214),
    (0.7639, 0.7578, 0.2086)
])
nemenyi_f1_seq25 = sp.posthoc_nemenyi_friedman(f1_seq25)
print("Seq 25 - F1:")
print(nemenyi_f1_seq25)

# MCC
mcc_seq25 = np.array([
    (0.7647, 0.7572, 0.3517),
    (0.7550, 0.7696, 0.3462),
    (0.7346, 0.7332, 0.2059),
    (0.7720, 0.6916, 0.3462),
    (0.7673, 0.7549, 0.3348)
])
nemenyi_mcc_seq25 = sp.posthoc_nemenyi_friedman(mcc_seq25)
print("Seq 25 - MCC:")
print(nemenyi_mcc_seq25)

# Balanced Accuracy
bal_seq25 = np.array([
    (0.8165, 0.8256, 0.5643),
    (0.8128, 0.8146, 0.5622),
    (0.8209, 0.8227, 0.5221),
    (0.8141, 0.8357, 0.5622),
    (0.8297, 0.8369, 0.5582)
])
nemenyi_bal_seq25 = sp.posthoc_nemenyi_friedman(bal_seq25)
print("Seq 25 - Balanced Accuracy:")
print(nemenyi_bal_seq25)

# ==========================
# Seq 45
# ==========================

# F1-Score
f1_seq45 = np.array([
    (0.6094, 0.6094, 0.1388),
    (0.6379, 0.6379, 0.0501),
    (0.6289, 0.6289, 0.1476),
    (0.5918, 0.5918, 0.1432),
    (0.6179, 0.6179, 0.1388)
])
nemenyi_f1_seq45 = sp.posthoc_nemenyi_friedman(f1_seq45)
print("Seq 45 - F1:")
print(nemenyi_f1_seq45)

# MCC
mcc_seq45 = np.array([
    (0.5819, 0.5819, 0.2644),
    (0.6126, 0.6126, 0.1550),
    (0.6024, 0.6024, 0.2735),
    (0.5619, 0.5619, 0.2690),
    (0.5932, 0.5932, 0.2644)
])
nemenyi_mcc_seq45 = sp.posthoc_nemenyi_friedman(mcc_seq45)
print("Seq 45 - MCC:")
print(nemenyi_mcc_seq45)

# Balanced Accuracy
bal_seq45 = np.array([
    (0.7875, 0.7875, 0.5373),
    (0.8015, 0.8015, 0.5129),
    (0.8005, 0.8005, 0.5398),
    (0.7971, 0.7971, 0.5386),
    (0.7820, 0.7820, 0.5373)
])
nemenyi_bal_seq45 = sp.posthoc_nemenyi_friedman(bal_seq45)
print("Seq 45 - Balanced Accuracy:")
print(nemenyi_bal_seq45)


Seq 25 - F1:
          0         1         2
0  1.000000  0.253784  0.004464
1  0.253784  1.000000  0.253784
2  0.004464  0.253784  1.000000
Seq 25 - MCC:
          0         1         2
0  1.000000  0.609411  0.012310
1  0.609411  1.000000  0.139405
2  0.012310  0.139405  1.000000
Seq 25 - Balanced Accuracy:
          0         1         2
0  1.000000  0.253784  0.253784
1  0.253784  1.000000  0.004464
2  0.253784  0.004464  1.000000
Seq 45 - F1:
         0        1        2
0  1.00000  1.00000  0.04656
1  1.00000  1.00000  0.04656
2  0.04656  0.04656  1.00000
Seq 45 - MCC:
         0        1        2
0  1.00000  1.00000  0.04656
1  1.00000  1.00000  0.04656
2  0.04656  0.04656  1.00000
Seq 45 - Balanced Accuracy:
         0        1        2
0  1.00000  1.00000  0.04656
1  1.00000  1.00000  0.04656
2  0.04656  0.04656  1.00000


In [24]:
import pandas as pd
from statsmodels.multivariate.manova import MANOVA

# Daten für alle drei Algorithmen
data = {
    'algorithm': ['IsolationForest']*9 + ['OCSVM']*9 + ['DBSCAN']*9,
    'seq_len': [10]*3 + [25]*3 + [50]*3 + [10]*3 + [25]*3 + [50]*3 + [10]*3 + [25]*3 + [50]*3,
    'f1': [
        0.8534,0.8831,0.8583, 0.7512,0.7647,0.7651, 0.5944,0.5410,0.6009,
        0.8655,0.8831,0.8560, 0.7548,0.7651,0.7656, 0.6014,0.5901,0.6009,
        0.2585,0.2222,0.3590, 0.0846,0.0846,0.0920, 0.0506,0.0506,0.0506
    ],
    'mcc': [
        0.8553,0.8862,0.8574, 0.7574,0.7771,0.7747, 0.6069,0.5044,0.6030,
        0.8654,0.8862,0.8542, 0.7622,0.7747,0.7725, 0.5692,0.5570,0.6030,
        0.3817,0.3502,0.4637, 0.2059,0.2059,0.2150, 0.1553,0.1553,0.1553
    ],
    'balanced_acc': [
        0.8863,0.8984,0.9016, 0.8142,0.8130,0.8167, 0.7236,0.7568,0.7330,
        0.9017,0.8984,0.9053, 0.8144,0.8167,0.8205, 0.7966,0.7951,0.7330,
        0.5742,0.5625,0.6094, 0.5221,0.5221,0.5241, 0.5130,0.5130,0.5130
    ]
}

df = pd.DataFrame(data)

def manova_to_df(manova_results, algorithm_name):
    results = manova_results.mv_test().results
    rows = []
    for effect, val in results.items():
        if effect == 'Intercept':
            continue
        stats = val['stat']
        for test_name in ['Wilks\' lambda', 'Pillai\'s trace', 'Hotelling-Lawley trace', 'Roy\'s greatest root']:
            row = {
                'Algorithm': algorithm_name,
                'Effect': effect,
                'Test': test_name,
                'Value': round(stats.loc[test_name, 'Value'], 3),
                'F': round(stats.loc[test_name, 'F Value'], 3),
                'p': round(stats.loc[test_name, 'Pr > F'], 4)
            }
            rows.append(row)
    return pd.DataFrame(rows)


# Isolation Forest
manova_if = MANOVA.from_formula('f1 + mcc + balanced_acc ~ C(seq_len)', data=df[df.algorithm=='IsolationForest'])
df_if = manova_to_df(manova_if, 'IsolationForest')

# OCSVM
manova_ocsvm = MANOVA.from_formula('f1 + mcc + balanced_acc ~ C(seq_len)', data=df[df.algorithm=='OCSVM'])
df_ocsvm = manova_to_df(manova_ocsvm, 'OCSVM')

# DBSCAN
manova_dbscan = MANOVA.from_formula('f1 + mcc + balanced_acc ~ C(seq_len)', data=df[df.algorithm=='DBSCAN'])
df_dbscan = manova_to_df(manova_dbscan, 'DBSCAN')

# Alles zusammen
df_all = pd.concat([df_if, df_ocsvm, df_dbscan], ignore_index=True)
df_all

  b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
  b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
  b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
  b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
  b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
  b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)


Unnamed: 0,Algorithm,Effect,Test,Value,F,p
0,IsolationForest,C(seq_len),Wilks' lambda,0.002,32.981,0.0
1,IsolationForest,C(seq_len),Pillai's trace,1.529,5.403,0.0099
2,IsolationForest,C(seq_len),Hotelling-Lawley trace,310.272,206.848,0.0001
3,IsolationForest,C(seq_len),Roy's greatest root,309.137,515.228,0.0
4,OCSVM,C(seq_len),Wilks' lambda,0.0,72.133,0.0
5,OCSVM,C(seq_len),Pillai's trace,1.479,4.729,0.0155
6,OCSVM,C(seq_len),Hotelling-Lawley trace,1580.386,1053.591,0.0
7,OCSVM,C(seq_len),Roy's greatest root,1579.465,2632.441,0.0
8,DBSCAN,C(seq_len),Wilks' lambda,0.0,211.409,0.0
9,DBSCAN,C(seq_len),Pillai's trace,1.97,109.829,0.0


## Cross Validation Test

### Für LSTM-AE und IF

In [25]:
from sklearn.model_selection import KFold
from sklearn.ensemble import IsolationForest
import numpy as np

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

all_precisions, all_recalls, all_f1s, all_roc_aucs = [], [], [], []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_test_full)):
    print(f"Fold {fold + 1}/{n_splits}")

    model.fit(train_gen, validation_data=val_gen, epochs=50, callbacks=[early_stop], verbose=0)

    iforest = IsolationForest()
    iforest.fit(train_errors.reshape(-1, 1))

    test_preds_if = iforest.predict(test_errors.reshape(-1, 1))
    val_anomaly_if = (test_preds_if == -1).astype(int)

    y_test_seq_if = np.array([y_test_full[i + seq_length- 1] for i in range(len(val_anomaly_if))])
    true_labels_if = y_test_seq_if.astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, y_pred, average='binary')
    roc_auc = roc_auc_score(true_labels, iforest.decision_function(test_errors.reshape(-1, 1)) * -1)

    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1s.append(f1)
    all_roc_aucs.append(roc_auc)

    print(f"Mean Precision: {np.mean(all_precisions):.3f} ± {np.std(all_precisions):.3f}")
    print(f"Mean Recall: {np.mean(all_recalls):.3f} ± {np.std(all_recalls):.3f}")
    print(f"Mean F1-Score: {np.mean(all_f1s):.3f} ± {np.std(all_f1s):.3f}")
    print(f"Mean ROC-AUC: {np.mean(all_roc_aucs):.3f} ± {np.std(all_roc_aucs):.3f}")

Fold 1/5


NameError: name 'true_labels' is not defined

### Für LSTM-AE und OCSVM

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import IsolationForest
import numpy as np

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

all_precisions, all_recalls, all_f1s, all_roc_aucs = [], [], [], []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_test_full)):
    print(f"Fold {fold + 1}/{n_splits}")

    model.fit(train_gen, validation_data=val_gen, epochs=50, callbacks=[early_stop], verbose=0)
    ocsvm = OneClassSVM(nu=0.005, gamma=50)
    ocsvm.fit(train_errors.reshape(-1, 1))

    test_preds_ocsvm = ocsvm.predict(test_errors.reshape(-1, 1))
    test_anomaly_ocsvm = (test_preds_ocsvm == -1).astype(int)

    y_test_seq_ocsvm = np.array([y_test_full[i + seq_length - 1] for i in range(len(test_anomaly_ocsvm))])
    true_labels_ocsvm = y_test_seq.astype(int)

    report_ocsvm = classification_report(true_labels_ocsvm, test_anomaly_ocsvm)

    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1s.append(f1)
    all_roc_aucs.append(roc_auc)

    print(f"Mean Precision: {np.mean(all_precisions):.3f} ± {np.std(all_precisions):.3f}")
    print(f"Mean Recall: {np.mean(all_recalls):.3f} ± {np.std(all_recalls):.3f}")
    print(f"Mean F1-Score: {np.mean(all_f1s):.3f} ± {np.std(all_f1s):.3f}")
    print(f"Mean ROC-AUC: {np.mean(all_roc_aucs):.3f} ± {np.std(all_roc_aucs):.3f}")

### Für LSTM-AE und DBSCAN

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import IsolationForest
import numpy as np

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

all_precisions, all_recalls, all_f1s, all_roc_aucs = [], [], [], []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_test_full)):
    print(f"Fold {fold + 1}/{n_splits}")

    model.fit(train_gen, validation_data=val_gen, epochs=50, callbacks=[early_stop], verbose=0)
    dbscan = DBSCAN(eps = 0.05, min_samples = 40)

    dbscan_labels = dbscan.fit_predict(test_errors.reshape(-1, 1))
    dbscan_anomaly = (dbscan_labels == -1).astype(int)

    y_test_seq_dbscan = np.array([y_test_full[i + seq_length - 1] for i in range(len(dbscan_anomaly))])
    true_labels_dbscan = y_test_seq_dbscan.astype(int)
    
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1s.append(f1)
    all_roc_aucs.append(roc_auc)

    print(f"Mean Precision: {np.mean(all_precisions):.3f} ± {np.std(all_precisions):.3f}")
    print(f"Mean Recall: {np.mean(all_recalls):.3f} ± {np.std(all_recalls):.3f}")
    print(f"Mean F1-Score: {np.mean(all_f1s):.3f} ± {np.std(all_f1s):.3f}")
    print(f"Mean ROC-AUC: {np.mean(all_roc_aucs):.3f} ± {np.std(all_roc_aucs):.3f}")