# Code zu der Bachelorarbeit:
# "Comparitve Study von Machine Learning Modellen zur Erkennung von Web Schwachstellen"
## von Nils Pudenz, 2735230

# Importe

In [136]:

#%pip install kaggle scikit-learn xgboost catboost tabpfn pandas numpy matplotlib seaborn -q
#!pip install --quiet scikit-learn xgboost catboost tabpfn chardet

In [137]:
import os
import zipfile
import random
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             confusion_matrix)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tabpfn import TabPFNClassifier
import matplotlib.pyplot as plt

In [138]:
#Deterministische Ausgabe generieren, um die Reproduzierbarkeit zu gewährleisten
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

## Dowload Kaggle Datasets
Requires Kaggle API credentials ('~/.kaggle/kaggle.json') für API-Token, um zugriff auf die Datenbanken über das Kaggle Konto zu bekommen

In [139]:
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

In [140]:
#Dowload der Datasets von Kaggle
os.system("kaggle datasets download -d syedsaqlainhussain/sql-injection-dataset -p data --unzip --quiet")
os.system("kaggle datasets download -d syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning -p data --unzip --quiet")
#KAGGLE_DATASETS = { #gleich wie oben nur renaming auf sql & xss
#    "sql": "syedsaqlainhussain/sql-injection-dataset",
#    "xss": "syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning"
#}

0

## Load and Inspect Data

In [141]:
SQL_CSV = next(DATA_DIR.glob("**/sql*/*.csv"), None) or next(DATA_DIR.glob("**/*SQL*.csv"), None)
XSS_CSV = next(DATA_DIR.glob("**/xss*/*.csv"), None) or next(DATA_DIR.glob("**/*XSS*.csv"), None)


In [142]:
#csv to dataframe
sql_df = pd.read_csv(SQL_CSV, encoding="utf-16", sep=",", low_memory=False) #utf-8 Fehler
xss_df = pd.read_csv(XSS_CSV)
##

In [143]:
for name, df in {"SQL": sql_df, "XSS": xss_df}.items():
    print(f"{name} dataset shape: {df.shape}")
    display(df.head())

SQL dataset shape: (4200, 2)


Unnamed: 0,Sentence,Label
0,a,1
1,a',1
2,a' --,1
3,a' or 1 = 1; --,1
4,@,1


XSS dataset shape: (13686, 3)


Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


## Basic Cleaning
* Drop Duplicate rows
* Handle missing values (simple fill-na)

In [144]:
for df in (sql_df, xss_df):
    df.drop_duplicates(inplace=True)
    df.fillna(0, inplace=True)

In [145]:
def preprocess_xy(df: pd.DataFrame):
    # Zielspalte finden und Features extrahieren
    #Split in Features (roher Text) und Label-Vector.
    target_col = next(c for c in sql_df.columns if c.lower() in {"label", "class", "target"})
    X_raw = df.drop(columns=[target_col]).astype(str).agg(" ".join, axis=1)
    y = df[target_col].values
    print("Target column assumed:", target_col)
    FEATURES = [c for c in sql_df.columns if c != target_col]
    return X_raw, y, target_col

    

In [146]:
vectorizer = TfidfVectorizer(
    analyzer="char", ngram_range=(3,5), min_df=2, max_features=50000
)

## Splitting & Measure-Metrics

In [147]:
def split(df):
    X = df[FEATURES].values
    y = df[target_col].astype(int).values
    return train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)

Erst den Datensatz splitten, um Data Leakage vorzubeugen, Wujek et al. (2016)

In [148]:
def evaluate_model(model, X_test, y_test, name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    p = precision_score(y_test, y_pred, average="binary")
    r = recall_score(y_test, y_pred, average="binary")
    f1 = f1_score(y_test, y_pred, average="binary")
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)
    return dict(Model=name, Precision=p, Recall=r, F1=f1, FPR=fpr, FNR=fnr)

Dictionary für die Evaulierungsmetriken

## Modeldefinition

In [149]:
models = {

"RandomForest": RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1, random_state=RANDOM_STATE),
"MLP": MLPClassifier(hidden_layer_sizes=(512, 256), activation="relu", alpha= 1e-4, learning_rate_init=1e-3, early_stopping=True, random_state=RANDOM_STATE, max_iter=30),
"XGBoost": XGBClassifier(n_estimator=500, max_depth=10, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, objective="binary:logistic", eval_metric="logloss", tree_method="hist", random_state=RANDOM_STATE, n_jobs=1),
"CatBoost": CatBoostClassifier(iterations=400, depth=8, learning_rate=0.1, loss_function="Logsloss", random_seed=RANDOM_STATE, verbose=False),
"TabPFN": TabPFNClassifier(device="cpu")

}

## Training & Evaluation

In [150]:
results = []
for df, ds_name in [(sql_df, "SQL"), (xss_df, "XSS")]:
    X_raw, y, target_col = preprocess_xy(df) 
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw,
        y,
        test_size=0.2,
        stratify=y,
        random_state=RANDOM_STATE
    )

Target column assumed: Label
Target column assumed: Label


In [151]:
#Gemeinsmaer TF-IDF-Vectorizer (fit nur auf Train, wegen Oversampling)
vec = vectorizer.fit(X_train)
X_train_vec = vec.transform(X_train)
X_test_vec = vec.transform(X_test)

In [152]:
for name, model in models.items():
    if name in {"CatBoost", "TabPFN"}:
        # Dichte der Matrix erforderlich
        model.fit(X_train_vec.toarray(), y_test, f"{name}-{ds_name}")
    else:
        model.fit(X_train_vec, y_train)
        res = evaluate_model(model, X_test_vec, y_test, f"{name}-{ds_name}")
    results.append(res)
    print(res)

ValueError: could not convert string to float: '1658 <a href="/wiki/Optimization_(mathematics)" class="mw-redirect" title="Optimization (mathematics)">Optimization </a> searches:'