# Ein Hybridmodell mit LSTM-AE, Isolation Forest, One-Class SVM und DBSCAN

In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from matplotlib import pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import logging
from tensorflow.keras.layers import TimeDistributed, Dense, LSTM, Dropout, LayerNormalization, Input, Add, Bidirectional, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import balanced_accuracy_score
import sys
from keras.layers import LSTM, Dropout, LayerNormalization, Add, TimeDistributed, Dense
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from tensorflow.keras import backend as K

2025-07-14 16:53:19.397293: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752504799.416896  239140 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752504799.422886  239140 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752504799.440796  239140 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752504799.440833  239140 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752504799.440835  239140 computation_placer.cc:177] computation placer alr

## 1. Vorbereitung

In [2]:
import tensorflow as tf

# Enumerate the GPUs that TensorFlow can see
gpus = tf.config.list_physical_devices('GPU')
print("GPUs:", gpus)

if not gpus:
    print("Keine GPU verfÃ¼gbar, lÃ¤uft auf CPU")
else:
    # Smoke test: pin a tiny matrix product to GPU 0 to confirm the device is usable
    with tf.device('/GPU:0'):
        a = tf.constant([[1.0, 2.0]])
        b = tf.constant([[3.0], [4.0]])
        c = tf.matmul(a, b)
        print("Result auf GPU:", c.numpy())

GPUs: []
Keine GPU verfÃ¼gbar, lÃ¤uft auf CPU


W0000 00:00:1752504801.939432  239140 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


#### Daten laden

In [3]:
# Read the three pre-split log files in the order train, test, validation.
# lines=False: each file is parsed as one JSON document, not as JSON Lines.
data_train, data_test, data_val = (
    pd.read_json(path, lines=False)
    for path in ("train_logs.json", "test_logs.json", "val_logs.json")
)
print(data_test.head())

                  timestamp log_level             category realmId realmName  \
0 2025-07-12 01:20:55+02:00      info  org.keycloak.events   bosch     bosch   
1 2025-07-12 01:48:00+02:00      warn  org.keycloak.events  master    master   
2 2025-07-12 02:05:02+02:00      warn  org.keycloak.events  master    master   
3 2025-07-12 02:08:17+02:00      warn  org.keycloak.events  master    master   
4 2025-07-12 02:31:44+02:00      warn  org.keycloak.events  master    master   

    clientId userId     sessionId        ipAddress      authMethod  \
0     broker   Yara  session_3504   198.143.36.129             otp   
1  login-app    NaN  session_7099   198.112.119.68        password   
2  login-app    NaN  session_7806  203.207.106.189  openid-connect   
3  login-app    NaN  session_1824     198.17.0.136            saml   
4  login-app    NaN  session_7726    198.2.114.248            code   

  resourceType operationType  label         type                error details  
0       groups    

#### NaN-Werte bereinigen

In [4]:
# Feature frames: every logged field EXCEPT the ground-truth label.
# Keeping "label" here would feed the target into the encoder and therefore
# into the autoencoder input (target leakage). errors="ignore" keeps this
# working for a split that carries no label column.
X_train_full = data_train.drop(columns=["label"], errors="ignore")
X_test_full = data_test.drop(columns=["label"], errors="ignore")
X_val_full = data_val.drop(columns=["label"], errors="ignore")

# Ground-truth labels, used for evaluation only
y_test_full = data_test["label"]
y_val_full = data_val["label"]

#### Logging

In [5]:
# Send log records, stdout and stderr to one file.
# encoding="utf-8": without it the platform default encoding is used, which can
# raise UnicodeEncodeError on non-ASCII output written to the redirected streams.
logfile = open("output_log.txt", "w", encoding="utf-8")

# force=True replaces handlers installed by an earlier run of this cell;
# otherwise basicConfig() is a no-op and records keep going to the stale handle.
logging.basicConfig(stream=logfile, level=logging.INFO, force=True)

# From here on, print() output and tracebacks land in the log file, not in the notebook
sys.stdout = logfile
sys.stderr = logfile

logging.info("Modelle wurden gestartet.")

#### Automatisch alle Felder in numerische oder kategorische Daten einteilen

In [6]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def is_missing(val):
    """Tell whether ``val`` is a missing value: ``None`` or a float NaN."""
    if val is None:
        return True
    # Only floats can be NaN; every other type counts as present
    return isinstance(val, float) and np.isnan(val)

def auto_encode_features(logs, one_hot_numeric=False,
                         label_encoders=None, onehot_encoders=None, fit=True):
    """Encode raw log records into flat ``{feature_name: number}`` dicts.

    The set of fields is taken from the first record. Each value is encoded
    by its type:

    * ``"roles"`` field: one 0/1 indicator ``role_<name>`` per known role.
    * numbers (int/float, not bool): kept as-is, or one-hot encoded into
      ``<key>_<i>`` columns when ``one_hot_numeric`` is true.
    * strings and lists/tuples: label-encoded; values unknown to the encoder
      map to ``-1``. Lists/tuples are encoded via the string form of their
      tuple representation.
    * missing values (see ``is_missing``): ``-9999`` in a numeric column,
      the category ``"__MISSING__"`` otherwise. A column counts as numeric
      when its first non-missing value in ``logs`` is a number.
    * any other type (e.g. bool) produces no feature.

    Args:
        logs: list of dict records, or an object with ``to_dict`` (DataFrame).
        one_hot_numeric: one-hot encode numeric fields instead of passing
            them through.
        label_encoders: ``{key: LabelEncoder}`` to reuse; created when None.
        onehot_encoders: ``{key: OneHotEncoder}`` to reuse; created when None.
            The reserved key ``"__roles__"`` holds the role vocabulary learned
            with ``fit=True`` so that ``fit=False`` calls emit the same
            ``role_*`` columns.
        fit: when true, fit encoders for keys that have none yet and relearn
            the role vocabulary; when false, only apply what is passed in.

    Returns:
        ``(encoded_logs, label_encoders, onehot_encoders)``. For empty input
        ``encoded_logs`` is ``[]`` and the encoder dicts are returned unchanged.
    """
    MISSING_TOKEN = "__MISSING__"
    MISSING_NUMBER = -9999
    ROLES_KEY = "__roles__"

    if label_encoders is None:
        label_encoders = {}
    if onehot_encoders is None:
        onehot_encoders = {}

    if logs is None or len(logs) == 0:
        return [], label_encoders, onehot_encoders

    if hasattr(logs, "to_dict"):
        logs = logs.to_dict(orient='records')

    def is_number(value):
        # bool is a subclass of int but is not treated as a numeric feature
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def as_role_set(raw):
        # Normalise the "roles" field: scalar -> one-element list, missing -> empty
        if not isinstance(raw, (list, tuple)):
            raw = [] if is_missing(raw) else [raw]
        return {MISSING_TOKEN if is_missing(role) else role for role in raw}

    def make_hashable(value):
        if isinstance(value, (list, tuple)):
            return tuple(make_hashable(item) for item in value)
        return value

    def categorical_token(value):
        # Single representation used both for fitting and for lookup
        if is_missing(value):
            return MISSING_TOKEN
        if isinstance(value, (list, tuple)):
            return str(make_hashable(value))
        return value

    all_keys = list(logs[0].keys())

    # Role vocabulary: learned when fitting, otherwise taken from the encoders
    if fit:
        learned_roles = set()
        for log in logs:
            learned_roles |= as_role_set(log.get("roles", []))
        if "roles" in all_keys:
            onehot_encoders[ROLES_KEY] = sorted(learned_roles, key=str)
    all_possible_roles = onehot_encoders.get(ROLES_KEY, [])

    numeric_columns = {}

    def column_is_numeric(key):
        # Decided once per key from the first non-missing value in this data set
        if key not in numeric_columns:
            probe = next(
                (l.get(key) for l in logs if not is_missing(l.get(key))), None
            )
            if probe is None:
                # Column is entirely missing: rely on an existing numeric encoder
                numeric_columns[key] = key in onehot_encoders
            else:
                numeric_columns[key] = is_number(probe)
        return numeric_columns[key]

    class_index = {}

    def label_code(key, token):
        # Dict lookup instead of one LabelEncoder.transform call per row;
        # the code is the position in classes_, as transform would return
        table = class_index.get(key)
        if table is None:
            table = {cls: idx for idx, cls in enumerate(label_encoders[key].classes_)}
            class_index[key] = table
        return table.get(token, -1)

    onehot_rows = {}

    def onehot_row(key, value):
        # Each distinct value is transformed once and then reused
        cache_key = (key, value)
        if cache_key not in onehot_rows:
            onehot_rows[cache_key] = onehot_encoders[key].transform([[value]])[0]
        return onehot_rows[cache_key]

    encoded_logs = []
    for log in logs:
        encoded = {}
        for key in all_keys:
            val = log.get(key)

            # Special case: multi-hot encoding of the role list
            if key == "roles":
                present_roles = as_role_set(val)
                for role in all_possible_roles:
                    encoded[f"role_{role}"] = 1 if role in present_roles else 0
                continue

            # Missing values get the sentinel that matches the column's type
            if is_missing(val):
                val = MISSING_NUMBER if column_is_numeric(key) else MISSING_TOKEN

            # Numeric
            if is_number(val):
                if one_hot_numeric:
                    if fit and key not in onehot_encoders:
                        column = np.array([
                            [MISSING_NUMBER if is_missing(l.get(key)) else l.get(key)]
                            for l in logs
                        ])
                        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                        encoder.fit(column)
                        onehot_encoders[key] = encoder

                    if key in onehot_encoders:
                        for i, v in enumerate(onehot_row(key, val)):
                            encoded[f"{key}_{i}"] = v
                else:
                    encoded[key] = val

            # Categorical: strings and (tuple-ised) lists
            elif isinstance(val, (str, list, tuple)):
                if fit and key not in label_encoders:
                    values = list({categorical_token(l.get(key)) for l in logs})
                    le = LabelEncoder()
                    le.fit(values)
                    label_encoders[key] = le

                if key in label_encoders:
                    encoded[key] = label_code(key, categorical_token(val))

        encoded_logs.append(encoded)

    return encoded_logs, label_encoders, onehot_encoders


#### Werte aus vorheriger Funktion vereinigen

In [7]:
def dicts_to_feature_matrix(encoded_logs, feature_names=None):
    """Stack encoded records into a dense float32 matrix.

    Args:
        encoded_logs: list of ``{feature_name: number}`` dicts.
        feature_names: optional column order to use. Pass the names returned
            for the training split so that other splits get exactly the same
            columns. Features of a record that are not in this list are
            dropped. When None, the sorted union of all keys in
            ``encoded_logs`` is used.

    Returns:
        ``(X, feature_names)`` where ``X`` has shape
        ``(len(encoded_logs), len(feature_names))``. A feature absent from a
        record is left at 0.
    """
    if feature_names is None:
        feature_names = sorted({key for d in encoded_logs for key in d.keys()})
    else:
        feature_names = list(feature_names)

    column_of = {feat: j for j, feat in enumerate(feature_names)}
    X = np.zeros((len(encoded_logs), len(feature_names)), dtype=np.float32)

    # Visit only the features present in each record instead of scanning every column
    for i, d in enumerate(encoded_logs):
        for feat, value in d.items():
            j = column_of.get(feat)
            if j is not None:
                X[i, j] = value

    return X, feature_names

In [8]:
logs_train = X_train_full.to_dict(orient='records')
logs_test = X_test_full.to_dict(orient='records')
logs_val = X_val_full.to_dict(orient='records')


def _align_columns(X_split, split_feature_names, target_feature_names):
    """Return X_split with its columns reordered to ``target_feature_names``.

    Target columns that the split does not have stay 0; split columns that
    are not in the target are dropped.
    """
    aligned = np.zeros((X_split.shape[0], len(target_feature_names)), dtype=X_split.dtype)
    source_column = {name: j for j, name in enumerate(split_feature_names)}
    for j, name in enumerate(target_feature_names):
        if name in source_column:
            aligned[:, j] = X_split[:, source_column[name]]
    return aligned


# Fit all encoders on the training split only
encoded_logs, label_encoders, onehot_encoders = auto_encode_features(
    logs_train, one_hot_numeric=True, fit=True
)
X_train, feature_names = dicts_to_feature_matrix(encoded_logs)

# Test and validation reuse the fitted encoders. Their feature matrices are then
# aligned to the training columns, because the scaler and the model require
# the same column layout for every split.
encoded_test_logs, _, _ = auto_encode_features(
    logs_test, one_hot_numeric=True,
    label_encoders=label_encoders, onehot_encoders=onehot_encoders, fit=False
)
X_test_unaligned, test_feature_names = dicts_to_feature_matrix(encoded_test_logs)
X_test = _align_columns(X_test_unaligned, test_feature_names, feature_names)

encoded_val_logs, _, _ = auto_encode_features(
    logs_val, one_hot_numeric=True,
    label_encoders=label_encoders, onehot_encoders=onehot_encoders, fit=False
)
X_val_unaligned, val_feature_names = dicts_to_feature_matrix(encoded_val_logs)
X_val = _align_columns(X_val_unaligned, val_feature_names, feature_names)

assert X_test.shape[1] == X_train.shape[1] == X_val.shape[1]

KeyboardInterrupt: 

#### Skalierung

In [None]:
# Fit the standardiser on the training split only, then apply it unchanged to every split
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled, X_val_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_val)
)

#### PCA zur Dimensionsreduktion

### Parameter

In [None]:
# Sliding-window length (time steps per sequence) and mini-batch size
seq_length = 15
batch_size = 128

# LSTM widths of the encoder stack; the decoder uses the same widths in reverse order
encoder_layers = [1024, 512, 256]
decoder_layers = encoder_layers[::-1]

# Regularisation and optimiser settings
dropout_rate = 0.2
learning_rate = 1e-4

### Sequenzen

In [None]:
class SequenceToSequenceGenerator(tf.keras.utils.Sequence):
    """Batches of stride-1 sliding windows for autoencoder training.

    Window ``i`` covers rows ``i .. i + seq_length - 1`` of ``data``. Every
    batch is returned as ``(X, X)`` because the model reconstructs its input.

    Args:
        data: 2-D array of shape ``(n_rows, n_features)``.
        seq_length: number of rows per window.
        batch_size: number of windows per batch.
        pad_last: when ``data`` has fewer than ``seq_length`` rows, emit one
            window padded with zero rows instead of emitting nothing.
        **kwargs: forwarded to the Keras base class.
    """

    def __init__(self, data, seq_length, batch_size, pad_last=True, **kwargs):
        super().__init__(**kwargs)
        self.data = data
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.pad_last = pad_last

        # n_rows - seq_length + 1 full windows fit into the data (none if it is too short)
        self.indices = np.arange(max(len(data) - seq_length + 1, 0))

        # Padding is only needed when not even one full window fits
        self.include_last = bool(pad_last) and 0 < len(data) < seq_length

    def __len__(self):
        base = (len(self.indices) + self.batch_size - 1) // self.batch_size
        return base + (1 if self.include_last else 0)

    def __getitem__(self, idx):
        if not self.include_last:
            batch_idx = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
            X_batch = np.array([self.data[i:i + self.seq_length] for i in batch_idx])
        else:
            # Data shorter than one window: pad with zero rows up to seq_length
            padding_len = self.seq_length - len(self.data)
            padding = np.zeros((padding_len, self.data.shape[1]), dtype=self.data.dtype)
            X_batch = np.expand_dims(np.vstack((self.data, padding)), axis=0)

        return X_batch, X_batch


# One window generator per split, all sharing the same window length and batch size
train_gen, val_gen, test_gen = (
    SequenceToSequenceGenerator(split, seq_length, batch_size)
    for split in (X_train_scaled, X_val_scaled, X_test_scaled)
)

## 2. LSTM-AE implementieren

In [None]:
#  LSTM-Block
def residual_lstm_block(inputs, units, dropout_rate=dropout_rate):
    """Apply LSTM -> Dropout -> LayerNormalization and add a skip connection.

    When the last dimension of ``inputs`` differs from ``units``, the skip
    path is projected to ``units`` with a per-time-step Dense layer so that
    both branches can be added.
    """
    lstm_out = LSTM(units, return_sequences=True)(inputs)
    regularised = Dropout(dropout_rate)(lstm_out)
    normalised = LayerNormalization()(regularised)

    if K.int_shape(inputs)[-1] == units:
        skip = inputs
    else:
        # Widths differ: project the shortcut so the Add layer gets matching shapes
        skip = TimeDistributed(Dense(units))(inputs)

    return Add()([normalised, skip])

n_features = X_train_scaled.shape[1]

# Encoder: residual LSTM blocks stacked over the input window
inputs = Input(shape=(seq_length, n_features))
x = inputs
for units in encoder_layers:
    x = residual_lstm_block(x, units, dropout_rate=dropout_rate)

# Bottleneck: a bidirectional LSTM reduces the window to a single vector
encoded = Bidirectional(LSTM(encoder_layers[-1]))(x)

# Decoder: repeat the bottleneck vector for every time step, then decode
decoded = RepeatVector(seq_length)(encoded)
x = decoded
for units in decoder_layers:
    x = residual_lstm_block(x, units, dropout_rate=dropout_rate)

# Map every time step back to the original feature dimension
outputs = TimeDistributed(Dense(n_features, activation='linear'))(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
model.summary()

In [None]:
# NOTE(review): patience equals the number of epochs, so training can never stop
# early here — confirm whether a larger epoch budget was intended.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
model.fit(
    train_gen,
    epochs=5,
    validation_data=val_gen,
    callbacks=[early_stop],
)

### Umwandlung der Daten für nächste Modelle: Rekonstruktionsfehler berechnen

In [None]:
def get_reconstruction_errors(gen, model):
    """Compute one mean squared reconstruction error per window.

    Args:
        gen: indexable batch source supporting ``len(gen)`` and ``gen[i]``,
            where ``gen[i]`` returns ``(X_batch, target)`` and ``X_batch``
            has three axes (windows, time steps, features).
        model: Keras model whose output has the same shape as ``X_batch``.

    Returns:
        1-D array with one error per window, in generator order. The error
        is the mean of the squared differences over time steps and features.
    """
    errors = []
    for i in range(len(gen)):
        X_batch, _ = gen[i]
        # predict_on_batch runs a single forward pass; Model.predict is not
        # meant to be called inside a loop over small batches
        pred = model.predict_on_batch(X_batch)
        errors.extend(np.mean(np.square(X_batch - pred), axis=(1, 2)))
    return np.array(errors)

# Reconstruction error per window for every split
train_errors = get_reconstruction_errors(train_gen, model)
val_errors = get_reconstruction_errors(val_gen, model)
test_errors = get_reconstruction_errors(test_gen, model)

# Same generator and model as test_errors: copy the result instead of
# running a second inference pass over the test set
reconstruction_errors = test_errors.copy()

# 95th percentile of the training errors
threshold = np.percentile(train_errors, 95)

In [None]:
# A window is labelled with the label of its last time step
y_test_seq = np.array([y_test_full[i + seq_length - 1] for i in range(len(reconstruction_errors))])
true_labels = y_test_seq.astype(int)

# Choose the decision threshold on the VALIDATION split (maximum of Youden's J
# = TPR - FPR). Tuning it on the test labels and then scoring the same test
# set would report optimistically biased precision/recall/F1.
val_labels_for_threshold = np.array(
    [y_val_full[i + seq_length - 1] for i in range(len(val_errors))]
).astype(int)
fpr, tpr, thresholds = roc_curve(val_labels_for_threshold, val_errors)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]

# roc_curve counts a sample as positive when score >= threshold, so use >= here as well
y_pred = (reconstruction_errors >= optimal_threshold).astype(int)

precision, recall, f1, _ = precision_recall_fscore_support(true_labels, y_pred, average='binary')
roc_auc = roc_auc_score(true_labels, reconstruction_errors)
logging.info(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}, ROC-AUC: {roc_auc:.3f}")

In [None]:
# Standardise the reconstruction errors using statistics of the training errors only
scaler_rec = StandardScaler().fit(train_errors.reshape(-1, 1))
val_errors_scaled, test_errors_scaled = (
    scaler_rec.transform(split_errors.reshape(-1, 1))
    for split_errors in (val_errors, test_errors)
)

# Modell 1

## Isolation Forest

In [None]:
# Fixed seed: IsolationForest builds randomised trees, so without it the
# reported metrics change from run to run
iforest = IsolationForest(random_state=42)
iforest.fit(train_errors.reshape(-1, 1))

# predict() returns -1 for outliers and 1 for inliers
test_preds_if = iforest.predict(test_errors.reshape(-1, 1))
# Holds TEST-set anomaly flags despite the "val_" prefix; the name is kept
# because later cells refer to it
val_anomaly_if = (test_preds_if == -1).astype(int)

# A window is labelled with the label of its last time step
y_test_seq_if = np.array([y_test_full[i + seq_length - 1] for i in range(len(val_anomaly_if))])
true_labels_if = y_test_seq_if.astype(int)

report_iforest_if = classification_report(true_labels_if, val_anomaly_if)
logging.info("Isolation Forest - Test Report:\n" + report_iforest_if)

### IF MCC

In [None]:
# Matthews correlation coefficient of the Isolation Forest flags against the window labels
mcc_if = matthews_corrcoef(y_true=true_labels_if, y_pred=val_anomaly_if)
logging.info("Matthews Correlation Coefficient: %f", mcc_if)

### IF Balanced Accuracy

In [None]:
# Balanced accuracy of the Isolation Forest flags against the window labels
balanced_acc = balanced_accuracy_score(y_true=true_labels_if, y_pred=val_anomaly_if)
logging.info("Balanced Accuracy: %f", balanced_acc)

### IF AUC-Kurven

In [None]:
# decision_function is negated so that a larger score means "more anomalous"
val_scores = -iforest.decision_function(val_errors.reshape(-1, 1))

# Label of each validation window = label of its last time step
y_val_seq = np.array([
    y_val_full[last_step]
    for last_step in range(seq_length - 1, seq_length - 1 + len(val_scores))
])
true_labels = y_val_seq.astype(int)

# ROC curve and its area
fpr, tpr, _ = roc_curve(true_labels, val_scores)
roc_auc = roc_auc_score(true_labels, val_scores)

# Precision-recall curve and average precision
precision, recall, _ = precision_recall_curve(true_labels, val_scores)
pr_auc = average_precision_score(true_labels, val_scores)

In [None]:
# ROC plot on its own single-axes figure. The previous 1x2 subplot layout
# only ever filled the left panel, so half of the saved image was empty.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve (Validation)")
ax.legend()
fig.tight_layout()
fig.savefig('roc_curve_if.png', dpi=300)
plt.close(fig)

In [None]:
# PR plot on its own figure. The previous figure was already closed, so
# plt.subplot(1, 2, 2) drew into the right half of a new, otherwise empty figure.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, label=f"AP = {pr_auc:.3f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (Validation)")
ax.legend()
fig.tight_layout()
fig.savefig('pr_curve_if.png', dpi=300)
plt.close(fig)

# Modell 2

## One-Class SVM

In [None]:
ocsvm = OneClassSVM(nu=0.005, gamma=50)
ocsvm.fit(train_errors.reshape(-1, 1))

# predict() returns -1 for outliers and 1 for inliers
test_preds_ocsvm = ocsvm.predict(test_errors.reshape(-1, 1))
test_anomaly_ocsvm = (test_preds_ocsvm == -1).astype(int)

# A window is labelled with the label of its last time step
y_test_seq_ocsvm = np.array([y_test_full[i + seq_length - 1] for i in range(len(test_anomaly_ocsvm))])
# Use the labels built in this cell; the original read y_test_seq, which is
# created by a different cell
true_labels_ocsvm = y_test_seq_ocsvm.astype(int)

report_ocsvm = classification_report(true_labels_ocsvm, test_anomaly_ocsvm)
logging.info("One-Class SVM - Test Report:\n" + report_ocsvm)

### OCSVM MCC

In [None]:
# Matthews correlation coefficient of the One-Class SVM flags against the window labels
mcc = matthews_corrcoef(y_true=true_labels_ocsvm, y_pred=test_anomaly_ocsvm)
logging.info("Matthews Correlation Coefficient: %f", mcc)

### OCSVM Balanced Accuracy

In [None]:
# Balanced accuracy of the One-Class SVM flags against the window labels
balanced_acc = balanced_accuracy_score(y_true=true_labels_ocsvm, y_pred=test_anomaly_ocsvm)
logging.info("Balanced Accuracy: %f", balanced_acc)

### OCSVM AUC-Kurven

In [None]:
# decision_function is negated so that a larger score means "more anomalous"
val_scores_ocsvm = -ocsvm.decision_function(val_errors.reshape(-1,1))
y_val_seq_ocsvm = np.array([y_val_full[i + seq_length - 1] for i in range(len(val_scores_ocsvm))])
# Separate name for the validation labels: reusing true_labels_ocsvm would
# overwrite the TEST labels that the MCC / balanced-accuracy cells read
val_labels_ocsvm = y_val_seq_ocsvm.astype(int)

fpr, tpr, _ = roc_curve(val_labels_ocsvm, val_scores_ocsvm)
roc_auc = roc_auc_score(val_labels_ocsvm, val_scores_ocsvm)

precision, recall, _ = precision_recall_curve(val_labels_ocsvm, val_scores_ocsvm)
pr_auc = average_precision_score(val_labels_ocsvm, val_scores_ocsvm)

# Each curve gets its own figure. Previously the ROC figure was closed before
# the PR subplot was drawn, so the PR image was half empty.

# ROC OCSVM
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve (Validation)")
ax.legend()
fig.tight_layout()
fig.savefig('roc_curve_ocsvm.png', dpi=300)
plt.close(fig)

# PR OCSVM
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, label=f"AP = {pr_auc:.3f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (Validation)")
ax.legend()
fig.tight_layout()
fig.savefig('pr_curve_ocsvm.png', dpi=300)
plt.close(fig)

# Modell 3

## DBSCAN

In [None]:
# Cluster the test reconstruction errors; DBSCAN gives noise points the label -1
dbscan = DBSCAN(eps=0.05, min_samples=40)
dbscan_labels = dbscan.fit_predict(test_errors.reshape(-1, 1))

# Noise points are treated as anomalies
dbscan_anomaly = (dbscan_labels == -1).astype(int)

# Label of each test window = label of its last time step
y_test_seq_dbscan = np.array([
    y_test_full[last_step]
    for last_step in range(seq_length - 1, seq_length - 1 + len(dbscan_anomaly))
])
true_labels_dbscan = y_test_seq_dbscan.astype(int)
logging.info("\n" + classification_report(true_labels_dbscan, dbscan_anomaly))

### DBSCAN MCC

In [None]:
# Matthews correlation coefficient of the DBSCAN noise flags against the window labels
mcc = matthews_corrcoef(y_true=true_labels_dbscan, y_pred=dbscan_anomaly)
logging.info("Matthews Correlation Coefficient: %f", mcc)

### DBSCAN Balanced Accuracy

In [None]:
# Balanced accuracy of the DBSCAN noise flags against the window labels
balanced_acc = balanced_accuracy_score(y_true=true_labels_dbscan, y_pred=dbscan_anomaly)
logging.info("Balanced Accuracy: %f", balanced_acc)

### DBSCAN AUC-Kurven

In [None]:
# DBSCAN provides no continuous anomaly score, so the binary noise flag is
# used as the score; the resulting curves have a single operating point.
scores = dbscan_anomaly

fpr, tpr, _ = roc_curve(true_labels_dbscan, scores)
roc_auc = roc_auc_score(true_labels_dbscan, scores)

precision, recall, _ = precision_recall_curve(true_labels_dbscan, scores)
pr_auc = average_precision_score(true_labels_dbscan, scores)
# The plt.figure(...) call that used to end this cell was removed: it opened
# an empty figure here, in a cell that draws nothing.

In [None]:
# ROC curve on its own single-axes figure, so the saved image does not
# contain an empty second panel
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve (DBSCAN)")
ax.legend()
fig.savefig('roc_curve_dbscan.png', dpi=300)
plt.close(fig)

In [None]:
# PR curve on its own figure. The previous figure was already closed, so
# plt.subplot(1, 2, 2) drew into the right half of a new, otherwise empty figure.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, label=f"AP = {pr_auc:.3f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (DBSCAN)")
ax.legend()
fig.tight_layout()
fig.savefig('pr_curve_dbscan.png', dpi=300)
plt.close(fig)