# Code zu der Bachelorarbeit:
# "Comparitve Study von Machine Learning Modellen zur Erkennung von Web Schwachstellen"
## von Nils Pudenz, 2735230

## Imports

In [1]:

#%pip install kaggle scikit-learn xgboost catboost tabpfn pandas numpy matplotlib seaborn -q
#%pip install --quiet scikit-learn xgboost catboost tabpfn chardet
#%pip install -U scikit-learn
## in deiner (Conda/venv) Umgebung
#%pip install --upgrade "torch==2.*" --index-url https://download.pytorch.org/whl/cu121
#%pip install --upgrade xgboost catboost scikit-learn pandas scipy tabpfn
##wenn es komplikationen mit torch gibt, deiinstallieren und neu installieren
#%pip uninstall torch
#%pip install torch --index-url https://download.pytorch.org/whl/cu121 --upgrade
#%pip install openpyxl


In [None]:
import os, math, time
import numpy as np
import pandas as pd
import torch

import zipfile
import random
from pathlib import Path
import matplotlib.pyplot as plt
import joblib
import sys

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from tabpfn import TabPFNClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline


from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample
from kaggle.api.kaggle_api_extended import KaggleApi

from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

## Check ob GPU verwendet werden kann

In [3]:
print(sys.version)
print("CUDA verfügbar:", torch.cuda.is_available())

3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
CUDA verfügbar: True


In [4]:
print("Python:", sys.executable)
print("Torch:", torch.__version__, "| CUDA build:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

Python: c:\Users\nilsp\Github_Desktop\Comparative_Study_ML_WebVuln\.venv\Scripts\python.exe
Torch: 2.5.1+cu121 | CUDA build: 12.1
CUDA available: True


## mögliche Modeloptimization

In [5]:
'''
xgb = XGBClassifier(
    tree_method="hist",        # CPU-Optimierung
    n_estimators=400,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42
)

from catboost import CatBoostClassifier
cat = CatBoostClassifier(
    iterations=400,
    depth=8,
    learning_rate=0.1,
    random_seed=42,
    loss_function="Logloss",
    task_type="CPU",
    thread_count=8,
    od_type="Iter",            # early stopping
    od_wait=30,
    verbose=False
)
#'''

'\nxgb = XGBClassifier(\n    tree_method="hist",        # CPU-Optimierung\n    n_estimators=400,\n    max_depth=6,\n    learning_rate=0.1,\n    subsample=0.9,\n    colsample_bytree=0.8,\n    n_jobs=-1,\n    random_state=42\n)\n\nfrom catboost import CatBoostClassifier\ncat = CatBoostClassifier(\n    iterations=400,\n    depth=8,\n    learning_rate=0.1,\n    random_seed=42,\n    loss_function="Logloss",\n    task_type="CPU",\n    thread_count=8,\n    od_type="Iter",            # early stopping\n    od_wait=30,\n    verbose=False\n)\n#'

In [6]:
#Speicher optimieren
'''
vec = TfidfVectorizer(
    ngram_range=(3,5),
    max_features=50_000,
    sublinear_tf=True,
    lowercase=False
).fit(X_train_raw)           # nur einmal fitten
'''

'\nvec = TfidfVectorizer(\n    ngram_range=(3,5),\n    max_features=50_000,\n    sublinear_tf=True,\n    lowercase=False\n).fit(X_train_raw)           # nur einmal fitten\n'

In [7]:
#kleine Hyperparamsuche statt Grid-Overkill
'''
search_space = {
    "max_depth": [4, 6, 8],
    "learning_rate": [0.05, 0.1, 0.2],
    "n_estimators": [200, 400, 600]
}
randcv = RandomizedSearchCV(
    xgb,
    search_space,
    n_iter=10,            # statt 3×3×3 = 27
    scoring="f1",
    cv=3,
    n_jobs=-1
)
randcv.fit(X_train_vec, y_train)
'''

'\nsearch_space = {\n    "max_depth": [4, 6, 8],\n    "learning_rate": [0.05, 0.1, 0.2],\n    "n_estimators": [200, 400, 600]\n}\nrandcv = RandomizedSearchCV(\n    xgb,\n    search_space,\n    n_iter=10,            # statt 3×3×3 = 27\n    scoring="f1",\n    cv=3,\n    n_jobs=-1\n)\nrandcv.fit(X_train_vec, y_train)\n'

In [8]:
def predict_in_batches(model, X, batch_size=512): #um Überlastung zu vermeiden, evtl. 256 oder 128
    """Make predictions on input data in batches."""
    preds = []
    for i in range(0, X.shape[0], batch_size):
        batch = X[i:i + batch_size]
        preds.append(model.predict(batch))
    return np.concatenate(preds)


## Dowload Kaggle Datasets
Requires Kaggle API credentials ('~/.kaggle/kaggle.json') für API-Token, um zugriff auf die Datenbanken über das Kaggle Konto zu bekommen

In [9]:
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

In [10]:
#Dowload der Datasets von Kaggle, Output =1 erfoglgreich, Output = 0 fehlerhaft
os.system("kaggle datasets download -d syedsaqlainhussain/sql-injection-dataset -p data --unzip --quiet")
os.system("kaggle datasets download -d syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning -p data --unzip --quiet")
#KAGGLE_DATASETS = { #gleich wie oben nur renaming auf sql & xss
#    "sql": "syedsaqlainhussain/sql-injection-dataset",
#    "xss": "syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning"
#}

0

In [11]:
def kaggle_download(dataset, path="data", unzip=True):
    api = KaggleApi()
    api.authenticate() #nutzt ~/.kaggle/kaggle.json für Authentifizierung oder Environment-Variablen
    api.dataset_download_files(dataset, path=path, unzip=unzip)
    print(f"Downloaded {dataset}")

kaggle_download("syedsaqlainhussain/sql-injection-dataset")#, path=DATA_DIR, unzip=True)
kaggle_download("syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning")#, path=DATA_DIR, unzip=True)
    

Dataset URL: https://www.kaggle.com/datasets/syedsaqlainhussain/sql-injection-dataset
Downloaded syedsaqlainhussain/sql-injection-dataset
Dataset URL: https://www.kaggle.com/datasets/syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning
Downloaded syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning


In [12]:

sql_df = pd.read_csv("data/SQLiV3.csv", encoding="utf-8", low_memory=False)
xss_df = pd.read_csv("data/XSS_dataset.csv", encoding="utf-8", low_memory=False)

In [13]:
# Spalten ansehen
print(sql_df.columns.tolist())

# Typische Index-/Hilfsspalten loswerden
sql_df = sql_df.loc[:, ~sql_df.columns.str.contains(r"^Unnamed|^index$", case=False)]

# Auf die Kernspalten reduzieren (falls etwas anderes drin ist)
sql_df = sql_df[["Sentence", "Label"]].copy()

# Optional: Duplikate auf Satzebene entfernen (falls noch nicht passiert)
sql_df = sql_df.drop_duplicates(subset=["Sentence"]).reset_index(drop=True)

print("clean shape:", sql_df.shape)  # Erwartung: (30873, 2)
print(sql_df["Label"].value_counts(normalize=True))


['Sentence', 'Label', 'Unnamed: 2', 'Unnamed: 3']
clean shape: (30873, 2)
Label
0                                                                         0.628891
1                                                                         0.370162
 --                                                                       0.000359
waitfor delay '0:0:__TIME__'--                                            0.000131
 DROP TABLE Suppliers                                                     0.000065
 desc users                                                               0.000033
SELECT *                                                                  0.000033
 OR                                                                       0.000033
 if not  (  select system_user  )   <> 'sa' waitfor delay '0:0:2' --      0.000033
 drop table temp --                                                       0.000033
 grant resource to name                                                   0.000033
 /*Sele

In [14]:

for name, df in {"SQL": sql_df, "XSS": xss_df}.items():
    print(f"{name} dataset shape: {df.shape}")
    display(df.head())
    display(df.describe())
    display(df.info())

SQL dataset shape: (30873, 2)


Unnamed: 0,Sentence,Label
0,""" or pg_sleep ( __TIME__ ) --",1.0
1,create user name identified by pass123 tempora...,
2,AND 1 = utl_inaddr.get_host_address ( ...,1.0
3,select * from users where id = '1' or @ @1 ...,1.0
4,"select * from users where id = 1 or 1#"" ( ...",1.0


Unnamed: 0,Sentence,Label
count,30872,30619
unique,30872,17
top,""" or pg_sleep ( __TIME__ ) --",0
freq,1,19256


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30873 entries, 0 to 30872
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  30872 non-null  object
 1   Label     30619 non-null  object
dtypes: object(2)
memory usage: 482.5+ KB


None

XSS dataset shape: (13686, 3)


Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


Unnamed: 0.1,Unnamed: 0,Label
count,13686.0,13686.0
mean,6842.5,0.538726
std,3950.952227,0.498516
min,0.0,0.0
25%,3421.25,0.0
50%,6842.5,1.0
75%,10263.75,1.0
max,13685.0,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13686 entries, 0 to 13685
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13686 non-null  int64 
 1   Sentence    13686 non-null  object
 2   Label       13686 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 320.9+ KB


None

## Basic Cleaning
* Drop Duplicate rows
* Handle missing values (simple fill-na)

In [15]:
for df in (sql_df, xss_df):
    df.drop_duplicates(inplace=True)
    df.fillna(0, inplace=True)

In [16]:
def preprocess_xy(df: pd.DataFrame,
                  label_candidates=("label", "class", "target"),
                  label_map=None):
    if label_map is None:
        label_map = {
            "0": "0", "1": "1",
            "benign": "0", "normal": "0", "legitimate": "0", "safe": "0",
            "attack": "1", "malicious": "1", "sql injection": "1",
            "sql-injection": "1", "xss": "1"
        }

    # Zielspalte finden (im *übergebenen* df!)
    cols_lower = {c.lower(): c for c in df.columns}
    target_col = next((cols_lower[c] for c in label_candidates if c in cols_lower), None)
    if target_col is None:
        raise ValueError(f"Keine Label-Spalte gefunden. Kandidaten: {label_candidates}")

    # Labels normieren -> nur 0/1 behalten
    y_str = df[target_col].astype(str).str.strip().str.lower()
    y_map = y_str.map(label_map)
    mask = y_map.notna()
    y = pd.to_numeric(y_map[mask]).astype(int).to_numpy()

    # Rohtext aus allen Nicht-Label-Spalten zusammenbauen
    feat_cols = [c for c in df.columns if c != target_col]
    X_raw = df.loc[mask, feat_cols].astype(str).agg(" ".join, axis=1)

    return X_raw, y, target_col


# Globale Settings

In [17]:

#Deterministische Ausgabe generieren, um die Reproduzierbarkeit zu gewährleisten
RANDOM_STATE = 42
#np.random.seed(RANDOM_STATE) wofür das?
#random.seed(RANDOM_STATE)
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["OPENBLAS_NUM_THREADS"] = "8"
torch.set_num_threads(8)

USE_CUDA = torch.cuda.is_available()  # für TabPFN & XGBoost
print(f"[ENV] CUDA avail: {USE_CUDA} | Torch CUDA build: {torch.version.cuda}")


[ENV] CUDA avail: True | Torch CUDA build: 12.1


# Hilfsfunktionen 

## Evaluationsmetriken

In [18]:
def binary_metrics(y_true, y_pred) -> dict:
    """Präzision/Recall/F1 & Raten (FPR/FNR) für binäre Klassifikation."""
    y_pred = np.asarray(y_pred).ravel()
    p  = precision_score(y_true, y_pred)
    r  = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    fpr = float(fp / (fp + tn)) if (fp + tn) else 0.0
    fnr = float(fn / (fn + tp)) if (fn + tp) else 0.0
    return dict(Precision=p, Recall=r, F1=f1, FPR=fpr, FNR=fnr)

def evaluate_model(model, X_test, y_test, name, use_batches=False, batch_size=256) -> dict: #Dictionary für Evaluierungsmetriken
    """Zeitmessung + Vorhersage (optional in Batches) + Metriken."""
    print(f"→ Evaluate {name} on X_test={getattr(X_test,'shape',None)}")
    t0 = time.perf_counter()
    if use_batches:
        y_pred = predict_in_batches(model, X_test, batch_size=batch_size, verbose=True)
    else:
        y_pred = model.predict(X_test)
    pred_s = time.perf_counter() - t0
    m = binary_metrics(y_test, y_pred)
    res = dict(Model=name, Pred_s=pred_s, **m)
    return res

## Preprocessing

### Batchweise Vorhersage für TabPFN

In [19]:
def predict_in_batches(model, X, batch_size=256, verbose=False):
    """Vorhersage in Batches (schont RAM/VRAM; wichtig für TabPFN)."""
    n = X.shape[0]
    out = []
    total = math.ceil(n / batch_size)
    for b, i in enumerate(range(0, n, batch_size), start=1):
        j = min(i + batch_size, n)
        t1 = time.perf_counter()
        with torch.inference_mode():
            out.append(model.predict(X[i:j]))
        dt = time.perf_counter() - t1
        if verbose:
            print(f"   [predict] batch {b:>3}/{total} ({j-i} rows) in {dt:.2f}s")
        if USE_CUDA:
            torch.cuda.synchronize()
    return np.concatenate(out)

### Label bereinigung der Datensets

In [20]:
def clean_labels(df, label_col="Label", text_cols=("Sentence",)):
    """Bringt Labels robust auf {0,1} und gibt (X_raw, y) zurück."""
    df = df.copy()
    y_raw = df[label_col].astype(str).str.strip().str.lower()
    map01 = {"0": "0", "1": "1", "benign": "0", "normal": "0", "legitimate": "0", "safe": "0",
             "attack": "1", "malicious": "1", "sql injection": "1", "sql-injection": "1", "xss": "1"}
    y_map = y_raw.map(map01)
    mask = y_map.notna()
    y = pd.to_numeric(y_map[mask]).astype(int).to_numpy()
    feat_cols = [c for c in df.columns if c != label_col]
    X_raw = df.loc[mask, feat_cols].astype(str).agg(" ".join, axis=1)
    return X_raw, y

# Modelle

In [21]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE),
    "MLP": MLPClassifier(hidden_layer_sizes=(256,128), activation="relu",
                         early_stopping=True, n_iter_no_change=5, max_iter=200,
                         random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(
        n_estimators=500, max_depth=6, learning_rate=0.1,
        subsample=0.9, colsample_bytree=0.8,
        tree_method="hist", device=("cuda" if USE_CUDA else "cpu"),
        random_state=RANDOM_STATE
    ),
    "CatBoost": CatBoostClassifier(
        iterations=400, depth=8, learning_rate=0.1,
        loss_function="Logloss", random_seed=RANDOM_STATE, verbose=False,
        task_type="CPU"   # stabil über Pool + Sparse; bei GPU: task_type="GPU" und ggf. Dense verwenden
    ),
    "TabPFN": TabPFNClassifier(
        device=("cuda" if USE_CUDA else "cpu"),
        ignore_pretraining_limits=True
    )
}

# Pipeline

In [22]:
VEC_ARGS = dict(ngram_range=(3,5), max_features=50_000, sublinear_tf=True, lowercase=False)

TABPFN_MAX_SAMPLES = 4000     # starte konservativ; später 6000/8000 testen
TABPFN_N_COMPONENTS = 150     # <=500, kleiner = schneller/ram-sparender
TABPFN_BATCH = 128            # Batch für Predict; 128/64 bei RAM-Engpässen

results = []

## Pipeline je Datensatz (TF-IDF; TabPFN: SVD + batched predict)

In [23]:

for df, ds_name in [(sql_df, "SQL"), (xss_df, "XSS")]:
    print("\n" + "="*70)
    print(f"DATASET: {ds_name}")

    # Labels/Text
    X_raw, y = clean_labels(df, label_col="Label", text_cols=("Sentence",))
    print(f"[DATA] rows={len(y)} | pos={int(y.sum())} | neg={len(y)-int(y.sum())}")

    # Split 
    # Erst den Datensatz splitten, um Data Leakage vorzubeugen, Wujek et al. (2016)
    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    
    # #Gemeinsmaer TF-IDF-Vectorizer (fit nur auf Train, wegen Oversampling), verhinderung von Fold Leakage
    vec = TfidfVectorizer(**VEC_ARGS)
    t0 = time.perf_counter()
    X_train_vec = vec.fit_transform(X_train_txt)
    X_test_vec  = vec.transform(X_test_txt)
    print(f"[VEC] TF-IDF train={X_train_vec.shape}, test={X_test_vec.shape} in {time.perf_counter()-t0:.2f}s")

    # Modelle trainieren/evaluieren
    for name, model in models.items():
        print("-"*50 + f"\nMODEL: {name} on {ds_name}")

        if name == "CatBoost":
            # CatBoost stabil via Pool (sparse)
            pool_train = Pool(X_train_vec, y_train)
            t0 = time.perf_counter()
            model.fit(pool_train)
            print(f"[{name}] fit in {time.perf_counter()-t0:.2f}s")
            res = evaluate_model(model, X_test_vec, y_test, f"{name}-{ds_name}")
            results.append(res)
            print(res)

        elif name == "TabPFN":
            # ggf. Trainingsmenge stratifiziert klein halten
            X_tab, y_tab = X_train_vec, y_train
            if X_train_vec.shape[0] > TABPFN_MAX_SAMPLES:
                sss = StratifiedShuffleSplit(n_splits=1, train_size=TABPFN_MAX_SAMPLES, random_state=RANDOM_STATE)
                idx, _ = next(sss.split(X_train_vec, y_train))
                X_tab = X_train_vec[idx]
                y_tab = np.asarray(y_train)[idx]
            print(f"[{name}] train subset: {X_tab.shape[0]} rows")

            # SVD -> float32
            t0 = time.perf_counter()
            svd = TruncatedSVD(n_components=TABPFN_N_COMPONENTS, random_state=RANDOM_STATE)
            X_train_svd = svd.fit_transform(X_tab).astype("float32", copy=False)
            X_test_svd  = svd.transform(X_test_vec).astype("float32", copy=False)
            print(f"[{name}] SVD train={X_train_svd.shape}, test={X_test_svd.shape} in {time.perf_counter()-t0:.2f}s")

            # Fit
            t0 = time.perf_counter()
            model.fit(X_train_svd, y_tab)
            if USE_CUDA:
                torch.cuda.synchronize()
            print(f"[{name}] fit in {time.perf_counter()-t0:.2f}s (device={'cuda' if USE_CUDA else 'cpu'})")

            # Evaluate (batched predict; kein zweites predict in evaluate_model)
            res = evaluate_model(model, X_test_svd, y_test, f"{name}-{ds_name}",
                                 use_batches=True, batch_size=TABPFN_BATCH)
            results.append(res)
            print(res)

        else:
            # RF, MLP, XGB: direkt auf Sparse
            t0 = time.perf_counter()
            model.fit(X_train_vec, y_train)
            print(f"[{name}] fit in {time.perf_counter()-t0:.2f}s")
            res = evaluate_model(model, X_test_vec, y_test, f"{name}-{ds_name}")
            results.append(res)
            print(res)



DATASET: SQL
[DATA] rows=30844 | pos=11334 | neg=19510
[VEC] TF-IDF train=(24675, 50000), test=(6169, 50000) in 1.01s
--------------------------------------------------
MODEL: RandomForest on SQL
[RandomForest] fit in 13.52s
→ Evaluate RandomForest-SQL on X_test=(6169, 50000)
{'Model': 'RandomForest-SQL', 'Pred_s': 0.8905150000937283, 'Precision': 0.9917172832689122, 'Recall': 0.792236435818262, 'F1': 0.8808239333006376, 'FPR': 0.0038441824705279346, 'FNR': 0.20776356418173797}
--------------------------------------------------
MODEL: MLP on SQL
[MLP] fit in 796.96s
→ Evaluate MLP-SQL on X_test=(6169, 50000)
{'Model': 'MLP-SQL', 'Pred_s': 0.020672500133514404, 'Precision': 0.9911894273127754, 'Recall': 0.7940008822232024, 'F1': 0.881704628949302, 'FPR': 0.004100461301896463, 'FNR': 0.20599911777679752}
--------------------------------------------------
MODEL: XGBoost on SQL
[XGBoost] fit in 9.93s
→ Evaluate XGBoost-SQL on X_test=(6169, 50000)
{'Model': 'XGBoost-SQL', 'Pred_s': 0.02780

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CatBoost] fit in 133.82s
→ Evaluate CatBoost-SQL on X_test=(6169, 50000)
{'Model': 'CatBoost-SQL', 'Pred_s': 0.127218599896878, 'Precision': 1.0, 'Recall': 0.7878253198059109, 'F1': 0.8813224771773994, 'FPR': 0.0, 'FNR': 0.2121746801940891}
--------------------------------------------------
MODEL: TabPFN on SQL
[TabPFN] train subset: 4000 rows
[TabPFN] SVD train=(4000, 150), test=(6169, 150) in 2.64s
[TabPFN] fit in 68.66s (device=cuda)
→ Evaluate TabPFN-SQL on X_test=(6169, 150)
   [predict] batch   1/49 (128 rows) in 31.92s
   [predict] batch   2/49 (128 rows) in 36.72s
   [predict] batch   3/49 (128 rows) in 30.10s
   [predict] batch   4/49 (128 rows) in 29.47s
   [predict] batch   5/49 (128 rows) in 29.71s
   [predict] batch   6/49 (128 rows) in 29.67s
   [predict] batch   7/49 (128 rows) in 29.93s
   [predict] batch   8/49 (128 rows) in 29.66s
   [predict] batch   9/49 (128 rows) in 29.78s
   [predict] batch  10/49 (128 rows) in 29.84s
   [predict] batch  11/49 (128 rows) in 29.6

# Ergebnisse

## Ergebnistabelle ausgeben

In [24]:
# Ergebnis-Tabelle ausgeben
df_results = pd.DataFrame(results)
display(df_results.sort_values(["Model"]).reset_index(drop=True))
df_results.to_csv("results_all.csv", index=False)
with pd.ExcelWriter("results_all.xlsx") as w:
    df_results.to_excel(w, sheet_name="All", index=False)
print(" Ergebnisse gespeichert: results_all.csv / results_all.xlsx")

Unnamed: 0,Model,Pred_s,Precision,Recall,F1,FPR,FNR
0,CatBoost-SQL,0.127219,1.0,0.787825,0.881322,0.0,0.212175
1,CatBoost-XSS,0.078337,1.0,0.946441,0.972483,0.0,0.053559
2,MLP-SQL,0.020673,0.991189,0.794001,0.881705,0.0041,0.205999
3,MLP-XSS,0.009859,0.999288,0.951864,0.975,0.000792,0.048136
4,RandomForest-SQL,0.890515,0.991717,0.792236,0.880824,0.003844,0.207764
5,RandomForest-XSS,0.086699,1.0,0.945085,0.971767,0.0,0.054915
6,TabPFN-SQL,1528.796063,1.0,0.783855,0.878833,0.0,0.216145
7,TabPFN-XSS,670.212918,1.0,0.942373,0.970332,0.0,0.057627
8,XGBoost-SQL,0.027806,1.0,0.423908,0.595415,0.0,0.576092
9,XGBoost-XSS,0.020781,0.646363,1.0,0.785201,0.638955,0.0


 Ergebnisse gespeichert: results_all.csv / results_all.xlsx


## Ergebnisse speichern

## Ergebnistabelle

In [None]:
results_df = pd.DataFrame(results)
results_df.sort_values(["Model"], inplace=True)
print("\n Gesamt Ergebnisse: \n", results_df)

results_df.to_csv("results.csv", index=False)

NameError: name 'DataFrame' is not defined

In [None]:

'''
# Variante A – wenn df_results bereits eine 'Dataset'-Spalte hat
if "Dataset" in df_results.columns:
    df_sql = df_results[df_results["Dataset"] == "SQL"].copy()
    df_xss = df_results[df_results["Dataset"] == "XSS"].copy()
# Variante B – Spalte fehlt -> am Modell-Namen aufteilen
else:
    df_sql = df_results[df_results["Model"].str.contains("-SQL")].copy()
    df_xss = df_results[df_results["Model"].str.contains("-XSS")].copy()

# ------------------------------------------------------------
# 2) zwei CSV-Dateien 
# ------------------------------------------------------------
df_sql.to_csv("results_sql.csv", index=False, sep=";")
df_xss.to_csv("results_xss.csv", index=False, sep=";")

# ------------------------------------------------------------
# Excel-Workbook mit zwei Sheets
# ------------------------------------------------------------
with pd.ExcelWriter("results_by_dataset.xlsx") as writer:
    df_sql.to_excel(writer, sheet_name="SQL", index=False)
    df_xss.to_excel(writer, sheet_name="XSS", index=False)

print( "Dateien wurden exportiert: results_sql.csv, results_xss.csv, results_by_dataset.xlsx")
'''

# Hyperparameter-Tuning

## Helper Functions

In [26]:
#Schwellenwert-Tuning (max F1) für Modelle mit predict_proba
def tune_threshold(estimator, X_val, y_val, grid=None):
    if not hasattr(estimator, "predict_proba"):
        return 0.5, None  # kein Proba -> Standard-Threshold
    if grid is None:
        grid = np.linspace(0.1, 0.9, 81)
    p = estimator.predict_proba(X_val)[:, 1]
    best_t, best_f1 = 0.5, -1
    for t in grid:
        y_hat = (p >= t).astype(int)
        f1 = f1_score(y_val, y_hat)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_t, best_f1


In [27]:
# Fit und Threshold und Testmetriken
def eval_with_threshold(name, est, X_train, y_train, X_test, y_test, do_threshold=True):
    # kleine Validierung aus dem Train für Thresholdsuche abspalten (leak-frei)
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15,
                                                stratify=y_train, random_state=RANDOM_STATE)
    t0 = time.perf_counter()
    est.fit(X_tr, y_tr)
    fit_s = time.perf_counter() - t0

    thr, f1_val = (0.5, None)
    if do_threshold:
        thr, f1_val = tune_threshold(est, X_val, y_val)

    # Test
    t0 = time.perf_counter()
    if hasattr(est, "predict_proba"):
        y_pred = (est.predict_proba(X_test)[:, 1] >= thr).astype(int)
    else:
        y_pred = est.predict(X_test)
    pred_s = time.perf_counter() - t0

    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", zero_division=0)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fpr = float(fp / (fp + tn)) if (fp + tn) else 0.0
    fnr = float(fn / (fn + tp)) if (fn + tp) else 0.0

    return dict(Model=name, Fit_s=fit_s, Pred_s=pred_s, Thr=thr, ValF1=f1_val,
                Precision=p, Recall=r, F1=f1, FPR=fpr, FNR=fnr)


## Pipelines

In [None]:
from sklearn.pipeline import Pipeline
# Einheitlicher TF-IDF
def make_tfidf():
    return TfidfVectorizer(analyzer="char", ngram_range=(3,5),
                           max_features=50_000, sublinear_tf=True, lowercase=False)

def pipe_rf():
    return Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE))
    ])
def pipe_mlp():
    return Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", MLPClassifier(early_stopping=True, n_iter_no_change=5, max_iter=200,
                              random_state=RANDOM_STATE))
    ])

def pipe_xgb():
    return Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", XGBClassifier(
            tree_method="hist", device=("cuda" if USE_CUDA else "cpu"),
            random_state=RANDOM_STATE, n_jobs=1  # parallelism über CV, nicht im Estimator
        ))
    ])

def pipe_cat():
    return Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", CatBoostClassifier(
            loss_function="Logloss", verbose=False, random_seed=RANDOM_STATE
            # task_type="GPU" geht auch, aber dann Dense-Arrays besser
        ))
    ])

# TabPFN: TF-IDF -> SVD -> TabPFN (nur SVD-Dim tunen; TabPFN selbst kaum Hyperparams)
def pipe_tabpfn(n_components=150):
    return Pipeline([
        ("tfidf", make_tfidf()),
        ("svd", TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)),
        ("clf", TabPFNClassifier(device=("cuda" if USE_CUDA else "cpu"),
                                 ignore_pretraining_limits=True))
    ])



In [29]:
def pos_weight(y):
    # für XGB: scale_pos_weight ~ Neg/Pos
    pos = max(1, int(np.sum(y)))
    neg = max(1, int(len(y)-pos))
    return neg / pos

param_grids = {
    "RandomForest": {
        "tfidf__max_features": [30_000, 50_000, 80_000],
        "clf__n_estimators": [300, 500, 800],
        "clf__max_depth": [None, 20, 40, 80],
        "clf__max_features": ["sqrt", 0.5, None],
        "clf__min_samples_split": [2, 5, 10],
        "clf__class_weight": [None, "balanced"]
    },
    "MLP": {
        "tfidf__max_features": [30_000, 50_000],
        "clf__hidden_layer_sizes": [(256,128), (512,256), (256,)],
        "clf__alpha": np.logspace(-5, -2, 4),
        "clf__learning_rate_init": np.logspace(-4, -2, 3),
        "clf__batch_size": [64, 128, 256]
    },
    "XGBoost": lambda y: {
        "tfidf__max_features": [30_000, 50_000],
        "clf__n_estimators": [300, 500, 800],
        "clf__max_depth": [4, 6, 8],
        "clf__learning_rate": [0.05, 0.1, 0.2],
        "clf__subsample": [0.7, 0.9, 1.0],
        "clf__colsample_bytree": [0.6, 0.8, 1.0],
        "clf__reg_lambda": [0, 1, 5, 10],
        "clf__gamma": [0, 1, 2],
        "clf__scale_pos_weight": [pos_weight(y)]
    },
    "CatBoost": {
        "tfidf__max_features": [30_000, 50_000],
        "clf__iterations": [400, 800, 1000],
        "clf__depth": [6, 8, 10],
        "clf__learning_rate": [0.05, 0.1, 0.2],
        "clf__l2_leaf_reg": [1, 3, 5, 10]
    },
# TabPFN: lieber nur SVD-Dimension suchen (sonst sehr langsam)
    "TabPFN": {
        "tfidf__max_features": [30_000, 50_000],
        "svd__n_components": [100, 150, 200]
    }
}


## Tuning Functions

In [None]:



def tune_model(name, X_text, y, cv_splits=3, n_iter=20, random_state=RANDOM_STATE, run_tabpfn=False):
    print(f"\n=== TUNING: {name} ===")
    if name == "RandomForest":
        pipe = pipe_rf(); space = param_grids["RandomForest"]
    elif name == "MLP":
        pipe = pipe_mlp(); space = param_grids["MLP"]
    elif name == "XGBoost":
        pipe = pipe_xgb(); space = param_grids["XGBoost"](y)
    elif name == "CatBoost":
        pipe = pipe_cat(); space = param_grids["CatBoost"]
    elif name == "TabPFN":
        if not run_tabpfn:
            print("Skipping TabPFN tuning (set run_tabpfn=True to enable).")
            return None
        pipe = pipe_tabpfn()
        space = param_grids["TabPFN"]
        # TabPFN: reduziere CV-Last
        cv_splits = 2
        n_iter = min(n_iter, 6)
    else:
        raise ValueError(name)

    scorer = make_scorer(f1_score)
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=random_state)
    # XGB (GPU) parallelisiert schlecht über Prozesse -> n_jobs=1; sonst gerne -1
    search_n_jobs = 1 if name == "XGBoost" else -1

    rs = RandomizedSearchCV(
        pipe, space, n_iter=n_iter, scoring=scorer, cv=cv,
        random_state=random_state, n_jobs=search_n_jobs, verbose=1
    )
    t0 = time.perf_counter()
    rs.fit(X_text, y)
    fit_s = time.perf_counter() - t0
    print(f"[{name}] best F1 (CV): {rs.best_score_:.4f} | best params: {rs.best_params_} | time: {fit_s/60:.1f} min")
    return rs.best_estimator_


In [37]:
def run_tuning_for_dataset(df, ds_name, run_tabpfn=False):
    # Labels + Rohtext (deine Clean-Funktion)
    X_raw, y = clean_labels(df, label_col="Label", text_cols=("Sentence",))
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    results = []
    best_models = {}

    for name in ["RandomForest", "MLP", "XGBoost", "CatBoost"]:  # TabPFN separat
        best = tune_model(name, X_train, y_train, n_iter=20)
        best_models[name] = best
        res = eval_with_threshold(f"{name}-{ds_name}", best, X_train, y_train, X_test, y_test, do_threshold=True)
        print(res); results.append(res)

    # Optional: TabPFN sehr vorsichtig
    if run_tabpfn:
        best = tune_model("TabPFN", X_train, y_train, run_tabpfn=True)
        if best is not None:
            # Für TabPFN am Ende keine großen Batches nötig, Pipeline macht SVD intern.
            res = eval_with_threshold(f"TabPFN-{ds_name}", best, X_train, y_train, X_test, y_test, do_threshold=False)
            print(res); results.append(res)

    return pd.DataFrame(results), best_models


## Run & Export

In [38]:
df_sql_res, sql_models = run_tuning_for_dataset(sql_df, "SQL", run_tabpfn=False)   # TabPFN erstmal aus
df_xss_res, xss_models = run_tuning_for_dataset(xss_df, "XSS", run_tabpfn=False)

df_all = pd.concat([df_sql_res, df_xss_res], ignore_index=True)
display(df_all)

# Excel-Export
out = Path("reports") / "tuning_results.xlsx"
out.parent.mkdir(exist_ok=True, parents=True)
with pd.ExcelWriter(out) as w:
    df_all.to_excel(w, sheet_name="All", index=False)
    df_sql_res.to_excel(w, sheet_name="SQL", index=False)
    df_xss_res.to_excel(w, sheet_name="XSS", index=False)
print(f"Saved:{out}")


=== TUNING: RandomForest ===
Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

## K-Fold-Cross Validation