# Test mit den Modellen einzeln

### Imports

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, roc_curve, auc
import logging
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import balanced_accuracy_score
import sys
import logging

### Logging

In [None]:
import atexit

# Send everything written to sys.stdout / sys.stderr, plus all logging
# output, into one text file.
# UTF-8 is requested explicitly so the file does not depend on the platform's
# default encoding, and buffering=1 (line buffering) makes every finished line
# appear in the file immediately instead of only when the buffer fills up.
logfile_two = open("output_log.txt", "w", encoding="utf-8", buffering=1)


def _restore_streams_and_close(log=logfile_two, stdout=sys.stdout, stderr=sys.stderr):
    """Undo the stream redirection and close the log file at interpreter exit."""
    if sys.stdout is log:
        sys.stdout = stdout
    if sys.stderr is log:
        sys.stderr = stderr
    log.close()


# The file handle lives for the whole session; make sure it is released.
atexit.register(_restore_streams_and_close)

sys.stdout = logfile_two
sys.stderr = logfile_two

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=logfile_two,
    force=True,  # replace any logging configuration that was set up earlier
)

# Smoke test: both lines must end up in output_log.txt.
print("Das ist eine Print-Ausgabe.")
logging.info("Das ist eine Log-Nachricht.")

### Logs laden

In [None]:
# Read the train and test log files; lines=False means each file is parsed
# as one JSON document rather than as JSON Lines.
data_train, data_test = (
    pd.read_json(path, lines=False)
    for path in ("train_logs.json", "test_logs.json")
)

### Daten in numerisch/kategorisch unterteilen

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

def _is_numeric(value):
    """Return True for int/float values; bool is excluded although it subclasses int."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def auto_encode_features(logs, one_hot_numeric=False):
    """Encode log records into dicts that hold only numeric feature values.

    Every value is handled according to its Python type:

    * int/float (not bool): passed through unchanged, or - when
      ``one_hot_numeric`` is true - expanded into ``"<key>_<i>"`` one-hot
      columns.
    * str: replaced by its ``LabelEncoder`` code under the same key.
    * list: each item is label-encoded into a positional ``"<key>_<i>"`` entry.
    * ``None`` and any other type (bool, dict, ...): skipped.

    Encoders are fitted lazily on the values found in ``logs`` itself, the
    first time a value of the matching type is seen for a key.

    Args:
        logs: A sequence of dicts, or any object offering
            ``to_dict(orient='records')`` (such as a pandas DataFrame).
            ``None`` or an empty input yields empty results.
        one_hot_numeric: If true, numeric values are one-hot encoded instead
            of being passed through.

    Returns:
        A tuple ``(encoded_logs, label_encoders, onehot_encoders)``:
        ``encoded_logs`` is a list with one dict per input record,
        ``label_encoders`` maps feature names to fitted ``LabelEncoder``
        objects (list fields are registered once per position, all positions
        sharing the same fitted encoder), and ``onehot_encoders`` maps keys to
        fitted ``OneHotEncoder`` objects.
    """
    if logs is None or len(logs) == 0:
        return [], {}, {}

    # Accept a DataFrame by converting it to a list of record dicts.
    if hasattr(logs, "to_dict"):
        logs = logs.to_dict(orient='records')

    label_encoders = {}
    onehot_encoders = {}
    list_encoders = {}  # one fitted encoder per list-valued key, shared by all positions

    # Use the union of keys over all records (first-seen order) so that fields
    # missing from the first record are not silently dropped.
    all_keys = list(dict.fromkeys(key for log in logs for key in log))

    encoded_logs = []
    for log in logs:
        encoded = {}
        for key in all_keys:
            val = log.get(key)
            if val is None:
                continue

            # Numeric values
            if _is_numeric(val):
                if not one_hot_numeric:
                    encoded[key] = val  # numeric passthrough
                    continue
                if key not in onehot_encoders:
                    # `val` itself is numeric, so this column is never empty.
                    column = [
                        [v] for v in (other.get(key) for other in logs)
                        if _is_numeric(v)
                    ]
                    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                    encoder.fit(column)
                    onehot_encoders[key] = encoder
                one_hot_row = onehot_encoders[key].transform([[val]])[0]
                for i, v in enumerate(one_hot_row):
                    encoded[f"{key}_{i}"] = v

            # Categorical values (strings)
            elif isinstance(val, str):
                if key not in label_encoders:
                    # Fit on string values only; other types stored under the
                    # same key are never sent through this encoder.
                    strings = [
                        v for v in (other.get(key) for other in logs)
                        if isinstance(v, str)
                    ]
                    label_encoders[key] = LabelEncoder().fit(strings)
                encoded[key] = label_encoders[key].transform([val])[0]

            # Lists (e.g. "roles")
            elif isinstance(val, list):
                if not val:
                    continue
                if key not in list_encoders:
                    # Collect items only from records where the field really
                    # is a list; a None or scalar value must not break the fit.
                    items = [
                        item
                        for other in logs
                        if isinstance(other.get(key), list)
                        for item in other[key]
                    ]
                    list_encoders[key] = LabelEncoder().fit(items)
                shared = list_encoders[key]
                for i, item in enumerate(val):
                    label = f"{key}_{i}"
                    encoder = label_encoders.setdefault(label, shared)
                    encoded[label] = encoder.transform([item])[0]

            # Every other data type is ignored.

        encoded_logs.append(encoded)

    return encoded_logs, label_encoders, onehot_encoders


### Zusammenfügen

In [None]:
def dicts_to_feature_matrix(encoded_logs, feature_names=None):
    """Turn a list of feature dicts into a dense float32 matrix.

    Args:
        encoded_logs: Sequence of dicts mapping feature name -> numeric value.
        feature_names: Optional column layout to use. When omitted, the
            columns are the sorted union of all keys in ``encoded_logs``.
            Passing the names returned for another matrix produces a matrix
            with the same column order; keys that are not listed are dropped.

    Returns:
        A tuple ``(X, feature_names)`` where ``X`` has shape
        ``(len(encoded_logs), len(feature_names))`` and dtype float32.
        Features that a record does not contain stay 0.
    """
    if feature_names is None:
        feature_names = sorted({key for d in encoded_logs for key in d})
    else:
        feature_names = list(feature_names)

    column_of = {feat: j for j, feat in enumerate(feature_names)}
    X = np.zeros((len(encoded_logs), len(feature_names)), dtype=np.float32)

    # Visit only the keys each record actually has instead of probing every
    # feature name for every row.
    for i, d in enumerate(encoded_logs):
        for feat, value in d.items():
            j = column_of.get(feat)
            if j is not None:
                X[i, j] = value

    return X, feature_names

### Daten unterteilen

In [None]:
# The ground-truth column is used as the evaluation target further down, so
# it must never be part of the feature set (that would leak the answer into
# the models). Removing it is a no-op for a split that has no such column.
logs_train = [
    {k: v for k, v in record.items() if k != "label"}
    for record in data_train.to_dict(orient='records')
]
logs_test = [
    {k: v for k, v in record.items() if k != "label"}
    for record in data_test.to_dict(orient='records')
]

# Fit all encoders on the training logs only.
encoded_logs, label_encoders, onehot_encoders = auto_encode_features(logs_train, one_hot_numeric=True)
X_train, feature_names = dicts_to_feature_matrix(encoded_logs)
print(f"Feature Matrix Shape: {X_train.shape}")


def _encode_with_fitted(logs, label_encoders, onehot_encoders):
    """Encode ``logs`` with encoders that were already fitted on other data.

    Follows the same per-type rules as ``auto_encode_features`` but never
    refits: categories unseen during fitting become -1 (label encoding) or an
    all-zero row (one-hot with handle_unknown='ignore'). String/list values
    without a fitted encoder are skipped.
    """
    def label_code(encoder, value):
        try:
            return encoder.transform([value])[0]
        except ValueError:  # LabelEncoder rejects previously unseen labels
            return -1

    encoded_records = []
    for log in logs:
        encoded = {}
        for key, val in log.items():
            if val is None:
                continue
            if isinstance(val, (int, float)) and not isinstance(val, bool):
                if key in onehot_encoders:
                    one_hot_row = onehot_encoders[key].transform([[val]])[0]
                    for i, v in enumerate(one_hot_row):
                        encoded[f"{key}_{i}"] = v
                else:
                    encoded[key] = val  # numeric passthrough
            elif isinstance(val, str):
                if key in label_encoders:
                    encoded[key] = label_code(label_encoders[key], val)
            elif isinstance(val, list):
                for i, item in enumerate(val):
                    name = f"{key}_{i}"
                    if name in label_encoders:
                        encoded[name] = label_code(label_encoders[name], item)
        encoded_records.append(encoded)
    return encoded_records


# Re-fitting encoders on the test logs would assign different codes and
# different one-hot columns than the ones the models are trained on, so the
# test logs are transformed with the encoders fitted above.
encoded_test_logs = _encode_with_fitted(logs_test, label_encoders, onehot_encoders)

# Build X_test with exactly the training column layout: features missing in a
# test record stay 0, features unknown to the training data are dropped.
_column_of = {name: j for j, name in enumerate(feature_names)}
X_test = np.zeros((len(encoded_test_logs), len(feature_names)), dtype=np.float32)
for _i, _record in enumerate(encoded_test_logs):
    for _name, _value in _record.items():
        _j = _column_of.get(_name)
        if _j is not None:
            X_test[_i, _j] = _value

### In Dataframe umwandeln

In [None]:
# Wrap both feature matrices in DataFrames, coerce every column to a numeric
# dtype and replace missing values with 0.
X_train_full, X_test_full = (
    pd.DataFrame(matrix).apply(pd.to_numeric).fillna(0)
    for matrix in (X_train, X_test)
)

# Ground truth for the evaluation comes straight from the raw test data.
y_test_full = data_test["label"]

### Skalieren

In [None]:
 # Daten skalieren, standardisierte Werte
# Learn the scaling statistics from the training data only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train_full)
X_train_scaled = scaler.transform(X_train_full).astype(np.float32)
X_test_scaled = scaler.transform(X_test_full).astype(np.float32)

### Test mit IF

In [None]:
# Fit the Isolation Forest on the (unscaled) training frame and score the test frame.
iforest = IsolationForest(contamination=0.2, random_state=42).fit(X_train_full)
scores = iforest.decision_function(X_test_full)
preds = iforest.predict(X_test_full)

# Map the model's -1 predictions to 1 and everything else to 0 so they can be
# compared with y_test_full (assumes 1 marks an anomaly there - confirm).
y_pred = np.where(preds == -1, 1, 0)
logging.info(classification_report(y_test_full, y_pred))

### Test IF MCC

In [None]:
# Matthews correlation coefficient of the Isolation Forest predictions.
mcc = matthews_corrcoef(y_true=y_test_full, y_pred=y_pred)
logging.info("Matthews Correlation Coefficient: %f", mcc)

### Test IF Balanced Accuracy

In [None]:
# Balanced accuracy of the Isolation Forest predictions.
balanced_acc = balanced_accuracy_score(y_true=y_test_full, y_pred=y_pred)
logging.info("Balanced Accuracy: %f", balanced_acc)

### Test IF AUC-Kurven

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

# Flip the sign of the decision scores so that a higher value means
# "more anomalous", which is the orientation the ROC/PR functions expect.
anomaly_scores = -scores

# ROC curve with its AUC, and precision-recall curve with average precision.
fpr, tpr, _ = roc_curve(y_test_full, anomaly_scores)
roc_auc = roc_auc_score(y_test_full, anomaly_scores)
precision, recall, _ = precision_recall_curve(y_test_full, anomaly_scores)
pr_auc = average_precision_score(y_test_full, anomaly_scores)

# Two panels side by side: ROC on the left, precision-recall on the right.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

ax_roc.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
ax_roc.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance diagonal
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve")
ax_roc.legend()

ax_pr.plot(recall, precision, label=f"AP = {pr_auc:.3f}")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.legend()

fig.tight_layout()
plt.show()


### Test DBSCAN 

In [None]:
# Cluster the scaled test data directly; DBSCAN assigns the label -1 to
# points it leaves outside every cluster, and those are treated as anomalies.
dbscan = DBSCAN(eps=1.5, min_samples=3, metric='euclidean', n_jobs=-1)
dbscan_labels = dbscan.fit(X_test_scaled).labels_
dbscan_anomaly = np.where(dbscan_labels == -1, 1, 0)
logging.info(classification_report(y_test_full, dbscan_anomaly))

### Test DBSCAN MCC

In [None]:
# Matthews correlation coefficient of the DBSCAN noise flags.
mcc = matthews_corrcoef(y_true=y_test_full, y_pred=dbscan_anomaly)
logging.info("Matthews Correlation Coefficient: %f", mcc)

### Test DBSCAN Balanced Accuracy

In [None]:
# Balanced accuracy of the DBSCAN noise flags.
balanced_acc = balanced_accuracy_score(y_true=y_test_full, y_pred=dbscan_anomaly)
logging.info("Balanced Accuracy: %f", balanced_acc)

### Test DBSCAN AUC-Kurven

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# "Fake score": the binary flag itself (1 = anomaly, 0 = normal) is used as
# the score, purely so that a curve can be drawn at all.
anomaly_scores = dbscan_anomaly

# ROC and precision-recall statistics
fpr, tpr, _ = roc_curve(y_test_full, anomaly_scores)
roc_auc = roc_auc_score(y_test_full, anomaly_scores)
precision, recall, _ = precision_recall_curve(y_test_full, anomaly_scores)
pr_auc = average_precision_score(y_test_full, anomaly_scores)

# Plot both curves next to each other.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

ax_roc.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
ax_roc.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance diagonal
ax_roc.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
)
ax_roc.legend()

ax_pr.plot(recall, precision, label=f"AP = {pr_auc:.3f}")
ax_pr.set(
    xlabel="Recall",
    ylabel="Precision",
    title="Precision-Recall Curve",
)
ax_pr.legend()

fig.tight_layout()
plt.show()


### Test OCSVM

In [None]:
# Train the one-class SVM on the scaled training data and classify the
# scaled test data; -1 predictions are mapped to 1, everything else to 0.
ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.05).fit(X_train_scaled)
y_pred_ocsvm = ocsvm.predict(X_test_scaled)
anomaly = np.where(y_pred_ocsvm == -1, 1, 0)
logging.info(classification_report(y_test_full, anomaly))

### OCSVM MCC

In [None]:
# Matthews correlation coefficient of the one-class SVM predictions.
mcc = matthews_corrcoef(y_true=y_test_full, y_pred=anomaly)
logging.info("Matthews Correlation Coefficient: %f", mcc)

### OCSVM Balanced Accuracy

In [None]:
# Balanced accuracy of the one-class SVM predictions.
balanced_acc = balanced_accuracy_score(y_true=y_test_full, y_pred=anomaly)
logging.info("Balanced Accuracy: %f", balanced_acc)

### OCSVM AUC-Kurven

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Anomaly score: the negated decision function, so that a higher value means
# "more likely an anomaly". Note that this rebinds the name `scores`.
scores = -ocsvm.decision_function(X_test_scaled)

# ROC statistics
fpr, tpr, _ = roc_curve(y_test_full, scores)
roc_auc = roc_auc_score(y_test_full, scores)

# Precision-recall statistics
precision, recall, _ = precision_recall_curve(y_test_full, scores)
pr_auc = average_precision_score(y_test_full, scores)

# One figure, two panels: ROC on the left, precision-recall on the right.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

ax_roc.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
ax_roc.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance diagonal
ax_pr.plot(recall, precision, label=f"AP = {pr_auc:.3f}")

# Label both panels in one pass.
for ax, x_text, y_text, heading in (
    (ax_roc, "False Positive Rate", "True Positive Rate", "ROC Curve"),
    (ax_pr, "Recall", "Precision", "Precision-Recall Curve"),
):
    ax.set_xlabel(x_text)
    ax.set_ylabel(y_text)
    ax.set_title(heading)
    ax.legend()

fig.tight_layout()
plt.show()
