# Ein Hybridmodell mit LSTM-AE, DBSCAN und IF

In [1]:
import numpy as np
from keras.models import Model
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import (
    adjusted_rand_score,
    homogeneity_score,
    fowlkes_mallows_score
)
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.metrics import auc, roc_curve
from sklearn.metrics import classification_report, roc_curve, auc
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Bidirectional, Dropout, LayerNormalization, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

2025-05-27 12:09:46.506858: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748340586.668343   41779 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748340586.702447   41779 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748340586.938668   41779 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748340586.938692   41779 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748340586.938694   41779 computation_placer.cc:177] computation placer alr

## 1. Daten laden

In [2]:
data_train = pd.read_json("train_logs.json", lines=False)
data_test = pd.read_json("test_logs.json", lines=False)
print(data_test.head())

                  timestamp log_type   userId       ipAddress location  \
0 2025-05-21 10:12:03+02:00    event  user_48    87.97.198.68       DE   
1 2025-05-19 00:43:39+02:00   access  user_25   206.243.239.8       VE   
2 2025-05-22 01:59:57+02:00    event   user_5  251.100.154.74       RU   
3 2025-05-17 10:36:45+02:00    event   user_9   79.119.192.41       DE   
4 2025-05-22 11:36:28+02:00    event  user_37    94.177.65.27       DE   

         city                                roles             type  \
0   Stuttgart          [uma_authorization, member]           LOGOUT   
1     Karachi                     [auditor, admin]              NaN   
2      Algier  [manage-users, user, impersonation]  FORGOT_PASSWORD   
3  Reutlingen               [member, user, viewer]           LOGOUT   
4   Esslingen             [viewer, offline_access]           LOGOUT   

      realmId      clientId  ... authType  \
0   app-users  frontend-app  ...     code   
1         NaN           NaN  ...      

### Kategorische in numerische Spalten umwandeln

In [3]:
# kategorische Spalten
candidate_cat_cols = ["timestamp", "type", "realmId", "clientId", "authType"]
# kategorische in numerische Spalten umwandeln
data_train_enc = pd.get_dummies(data_train, columns=candidate_cat_cols)
data_test_enc = pd.get_dummies(data_test, columns=candidate_cat_cols)

### Beide Datensets haben gleiche Spaltenanzahl

In [4]:
# Beide Datensets haben gleiche Spaltenanzahl
data_train_enc, data_test_enc = data_train_enc.align(data_test_enc, join='left', axis=1, fill_value=0)
y_test_full = data_test["label"]
X_train_full = data_train_enc.drop(columns=["label"], errors='ignore')
X_test_full = data_test_enc.drop(columns=["label"], errors='ignore')

### Unklare Werte einfüllen (NaN-Werte bereinigen)

In [5]:
# unklare Werte in NaN-Werte umwandeln
X_train_full = X_train_full.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test_full = X_test_full.apply(pd.to_numeric, errors='coerce').fillna(0)

### Daten skalieren, standardisierte Werte

In [6]:
# Daten skalieren, standardisierte Werte
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full).astype(np.float32)
X_test_scaled = scaler.transform(X_test_full).astype(np.float32)

### Sequenzen

In [7]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

latent_dim = 16
seq_length = 50
batch_size = 16

# Generatoren bleiben gleich
class SequenceToSequenceGenerator(tf.keras.utils.Sequence):
    def __init__(self, data, seq_length, batch_size):
        self.data = data
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.indices = np.arange(len(data) - seq_length)

    def __len__(self):
        return (len(self.indices) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        batch_idx = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        if len(batch_idx) == 0:
            batch_idx = self.indices[-self.batch_size:]
        X_batch = np.array([self.data[i:i + self.seq_length] for i in batch_idx])
        return X_batch, X_batch

train_gen = SequenceToSequenceGenerator(X_train_scaled, seq_length, batch_size)
val_split = int(len(X_train_scaled) * 0.1)
val_gen = SequenceToSequenceGenerator(X_train_scaled[-val_split:], seq_length, batch_size)
test_gen = SequenceToSequenceGenerator(X_test_scaled, seq_length, batch_size)

## 2. LSTM-AE implementieren

In [8]:
timesteps = seq_length
n_features = X_train_scaled.shape[1]

def residual_lstm_block(x, units, dropout_rate=0.02):
    shortcut = x
    x = LSTM(units, return_sequences=True)(x)
    x = Dropout(dropout_rate)(x)
    x = LayerNormalization()(x)
    if shortcut.shape[-1] != x.shape[-1]:
        shortcut = Dense(units)(shortcut)
    x = Add()([x, shortcut])
    return x

inputs = Input(shape=(timesteps, n_features))
x = inputs
for _ in range(4):
    x = residual_lstm_block(x, latent_dim)

encoded = Bidirectional(LSTM(latent_dim))(x)
decoded = RepeatVector(timesteps)(encoded)

x = decoded
for _ in range(4):
    x = LSTM(latent_dim, return_sequences=True)(x)
    x = Dropout(0.02)(x)
    x = LayerNormalization()(x)

outputs = TimeDistributed(Dense(n_features))(x)

model = Model(inputs, outputs)
model.compile(optimizer=Adam(0.05), loss='mse')

2025-05-27 12:10:00.026803: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [9]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(train_gen, validation_data=val_gen, epochs=5, callbacks=[early_stop])

Epoch 1/5


  self._warn_if_super_not_called()


[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 325ms/step - loss: 1.0029 - val_loss: 1.0038
Epoch 2/5
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 251ms/step - loss: 1.0028 - val_loss: 1.0039
Epoch 3/5
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 209ms/step - loss: 1.0026 - val_loss: 1.0048
Epoch 4/5
[1m121/560[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m2:06[0m 288ms/step - loss: 1.0022

KeyboardInterrupt: 

### Umwandlung der Daten für nächste Modelle: Rekonstruktionsfehler berechnen

In [None]:
# --- Rekonstruktionsfehler batchweise berechnen ---
def get_reconstruction_errors(gen, model):
    errors = []
    for i in range(len(gen)):
        X_batch, _ = gen[i]
        # predict ohne batch_size (damit TensorFlow das selbst handhabt)
        pred = model.predict(X_batch, verbose=0)
        batch_errors = np.mean(np.square(X_batch - pred), axis=(1, 2))
        errors.extend(batch_errors)
    return np.array(errors)

# Reconstruction Errors berechnen
reconstruction_errors = get_reconstruction_errors(test_gen, model)
#train_errors = get_reconstruction_errors(train_gen, model)
test_errors = get_reconstruction_errors(test_gen, model)

# Labels passend zur Anzahl der Sequenzen extrahieren
y_test_seq = np.array([y_test_full[i + seq_length - 1] for i in range(len(y_test_full) - seq_length)])

# Threshold auf 75. Perzentil der Trainingsfehler setzen
#threshold = np.percentile(train_errors, 99)
threshold_test = np.percentile(test_errors, 75)

# Anomalien vorhersagen
y_pred = (reconstruction_errors > threshold_test).astype(int)
true_labels = y_test_seq.astype(int)

precision, recall, f1, _ = precision_recall_fscore_support(true_labels, y_pred, average='binary')
roc_auc = roc_auc_score(true_labels, reconstruction_errors)

print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}, ROC-AUC: {roc_auc:.3f}")

In [None]:
# --- Encoder-Model bauen ---
encoder = Model(inputs, encoded)

# --- Batchweise encodieren, um RAM zu sparen ---
X_encoded_batches = []
for i in range(len(test_gen)):   # len(test_gen) gibt Anzahl der Batches an
    X_batch, _ = test_gen[i]
    encoded_batch = encoder.predict(X_batch, batch_size=len(X_batch), verbose=0)
    X_encoded_batches.append(encoded_batch)
X_encoded_new = np.vstack(X_encoded_batches)

# --- Feature-Skalierung ---
scaler_enc = StandardScaler()
X_encoded_scaled = scaler_enc.fit_transform(X_encoded_new)

reconstruction_errors_reshaped = reconstruction_errors.reshape(-1, 1)
hybrid_features = np.hstack((X_encoded_scaled, reconstruction_errors_reshaped))

print("Shape Hybrid-Features:", hybrid_features.shape)

# Modell 1

## 3.1 Hybridmodell mit DBSCAN und IF

In [None]:
dbscan_labels_hybrid = DBSCAN().fit_predict(hybrid_features)
dbscan_anomaly_hybrid = (dbscan_labels_hybrid == -1).astype(int)

iforest_hybrid = IsolationForest()
iforest_labels_hybrid = iforest_hybrid.fit_predict(hybrid_features)
iso_anomaly_hybrid = (iforest_labels_hybrid == -1).astype(int)

hybrid_pred = (dbscan_labels_hybrid & iso_anomaly_hybrid).astype(int)
hybrid_features_scaled = scaler.fit_transform(hybrid_features)

print(classification_report(true_labels, hybrid_pred))

In [None]:
fpr, tpr, thresholds = roc_curve(true_labels, hybrid_pred)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label="Chance (random)")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-Kurve – Anomalieerkennung')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

## 3.2 Oder nur mit IF

In [None]:
iforest = IsolationForest()
iforest_labels = iforest.fit_predict(X_encoded_scaled)
iso_anomaly = (iforest_labels == -1).astype(int)
print(classification_report(true_labels, iso_anomaly))

### Anomaly Scores berechnen

In [None]:
scores = iforest.decision_function(X_encoded_scaled)
print("Scores:", scores[:10])

In [None]:
fpr, tpr, thresholds = roc_curve(true_labels, iso_anomaly)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label="Chance (random)")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-Kurve – Anomalieerkennung')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

# Modell 2

## 4.1 Hybrid-Modell mit DBSCAN und One-Class SVM

In [None]:
dbscan_hybrid = DBSCAN()
dbscan_labels_hybrid = dbscan_hybrid.fit_predict(hybrid_features)
dbscan_anomaly_hybrid = (dbscan_labels_hybrid == -1).astype(int)

ocsvm_hybrid = OneClassSVM()
ocsvm_labels_hybrid = ocsvm_hybrid.fit_predict(hybrid_features)
ocsvm_anomaly_hybrid = (ocsvm_labels_hybrid == -1).astype(int)

hybrid_pred_2 = np.maximum(dbscan_anomaly_hybrid, ocsvm_anomaly_hybrid)
print(classification_report(true_labels, hybrid_pred_2))

In [None]:
fpr, tpr, thresholds = roc_curve(true_labels, hybrid_pred_2)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label="Chance (random)")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-Kurve – Anomalieerkennung')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

## 4.2 Nur mit One-Class SVM

In [None]:
ocsvm = OneClassSVM()
y_pred = ocsvm.fit_predict(X_encoded_scaled)
anomaly = (y_pred == -1).astype(int)
print(classification_report(true_labels, anomaly))

### Anomaly-Scores berechnen

In [None]:
scores_ocsvm = ocsvm.decision_function(X_encoded_scaled)
print("OCSVM Scores:", scores_ocsvm[:10])

In [None]:
fpr, tpr, thresholds = roc_curve(true_labels, anomaly)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label="Chance (random)")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-Kurve – Anomalieerkennung')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

# Modell 3

## 5. Nur DBSCAN

In [None]:
dbscan_labels = DBSCAN().fit_predict(X_encoded_scaled)
dbscan_anomaly = (dbscan_labels == -1).astype(int)

# Werden Cluster zufällig verteilt?
ari  = adjusted_rand_score(true_labels, dbscan_anomaly)
# Jeder Cluster hat nur ein Label
homo = homogeneity_score(true_labels, dbscan_anomaly)
# Balance zwischen Precision und Recall
fmi  = fowlkes_mallows_score(true_labels, dbscan_anomaly)

# Ausgabe
print(classification_report(true_labels, dbscan_anomaly))
print(f"Adjusted Rand Index (ARI):     {ari:.4f}")
print(f"Homogeneity:                   {homo:.4f}")
print(f"Fowlkes-Mallows Index (FMI):   {fmi:.4f}")

In [None]:
noise_ratio = np.sum(dbscan_labels == -1) / len(dbscan_labels)
print(f"Noise Ratio: {noise_ratio:.2f}")

# MANOVA Test und Tukey Test

In [None]:
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.multitest import multipletests
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.stats.multicomp as mc


df = pd.DataFrame(hybrid_features_scaled, columns=[f"Feature_{i}" for i in range(hybrid_features.shape[1])])
df["label"] = hybrid_pred  # oder: true_labels, wenn du Ground Truth vergleichen willst

formula = ' + '.join(df.columns[:-1]) + ' ~ label'
maov = MANOVA.from_formula(formula, data=df)
print("MANOVA Ergebnisse:")
print(maov.mv_test())

np.random.seed(42)
anova_df = pd.DataFrame({
    "Feature": [f"Feature_{i}" for i in range(hybrid_features.shape[1])],
    "p-value": np.random.rand(hybrid_features.shape[1])
})

# FDR-Korrektur
pvals = anova_df["p-value"].values
_, pvals_corr, _, _ = multipletests(pvals, method='fdr_bh')
anova_df["p-corrected"] = pvals_corr

print("ANOVA (simuliert) mit FDR-Korrektur:")
print(anova_df.sort_values("p-corrected"))


In [None]:
df = pd.DataFrame(hybrid_features_scaled, columns=[f"Feature_{i}" for i in range(hybrid_features.shape[1])])
df["label"] = hybrid_pred  # oder true_labels, wenn du Ground Truth analysieren willst
results = []
for col in df.columns[:-1]:  # alle außer 'label'
    model = ols(f"{col} ~ C(label)", data=df).fit()
    anova = sm.stats.anova_lm(model, typ=2)
    p = anova["PR(>F)"].values[0]
    results.append((col, p))

# Sortieren nach p-Wert
sorted_results = sorted(results, key=lambda x: x[1])
for feature, p in sorted_results:
    print(f"{feature}: p = {p:.4f}")

for f in ["Feature_2", "Feature_0", "Feature_1"]:
    sns.boxplot(x="label", y=f, data=df)
    plt.title(f"{f} vs Label")
    plt.show()