In [None]:
# Datensätze laden

import pickle

# Locations of the pickled train/test/validation splits, relative to the
# notebook's working directory.
file_path_X_train = "splits/train/X_pandas.pck"
file_path_y_train = "splits/train/y_pandas.pck"
file_path_X_test = "splits/test/X_pandas.pck"
file_path_y_test = "splits/test/y_pandas.pck"
file_path_X_val = "splits/val/X_pandas.pck"
file_path_y_val = "splits/val/y_pandas.pck"


def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE(review): unpickling runs code embedded in the file; presumably these
    # splits were written by this project itself -- confirm before loading
    # files from any other origin.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load every split from its own file (features first, then labels).
X_train = _read_pickle(file_path_X_train)
y_train = _read_pickle(file_path_y_train)

X_test = _read_pickle(file_path_X_test)
y_test = _read_pickle(file_path_y_test)

X_val = _read_pickle(file_path_X_val)
y_val = _read_pickle(file_path_y_val)

# print(os.listdir('/content/drive/MyDrive/splits/test'))

In [None]:
# Code um verschiedene Modellkonfigurationen/ Hyperparameter für XGBoost Modelle zu testen. Hieraus hat sich ein bestes Modell ergeben, das im nachfolgenden Code separat nochmal trainiert wurde.
import optuna
import xgboost as xgb
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

# Sparse (CSR) views of the feature splits that are handed to XGBoost.
X_train_csr = csr_matrix(X_train)
X_test_csr = csr_matrix(X_test)
# Hyperparameters are selected on the validation split so that the test split
# stays unseen until the final evaluation; scoring trials on the test split
# would make every later test metric optimistically biased.
X_val_csr = csr_matrix(X_val)


def objective(trial):
    """Fit one XGBoost candidate and score it on the validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on (X_val, y_val); the study maximizes
        this value.

    Side effects:
        Prints accuracy, weighted F1, weighted recall and weighted precision
        of the candidate on the validation split.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 100),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**params, random_state=42)
    model.fit(X_train_csr, y_train)

    val_preds = model.predict(X_val_csr)

    val_accuracy = accuracy_score(y_val, val_preds)
    val_f1 = f1_score(y_val, val_preds, average="weighted")
    val_recall = recall_score(y_val, val_preds, average="weighted")
    val_precision = precision_score(y_val, val_preds, average="weighted")

    print(f"Durchlauf {trial.number}: Acc = {val_accuracy:.4f}, F1 = {val_f1:.4f}, Rec = {val_recall:.4f}, Pre = {val_precision:.4f}")
    return val_accuracy


# A seeded sampler makes the sequence of sampled configurations repeatable
# across kernel restarts (the models themselves are already seeded).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Beste Parameter: ", study.best_params)
print("Beste Score (Val Accuracy): ", study.best_value)

In [None]:
# Training der besten Konfig. mit Angabe der Params und Speicherung
import xgboost as xgb
from sklearn.metrics import accuracy_score
import sklearn as skl
from scipy.sparse import csr_matrix

# Sparse (CSR) views of the train and test features.
X_train_csr, X_test_csr = csr_matrix(X_train), csr_matrix(X_test)

# Hand-entered hyperparameters of the configuration that is trained and saved.
param = dict(
    max_depth=7,
    learning_rate=0.08,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.7,
    gamma=1.0,
    reg_alpha=5,
    reg_lambda=10,
    min_child_weight=8,
)

# Fixed seed, all available cores.
model = xgb.XGBClassifier(random_state=42, n_jobs=-1, **param)
model.fit(X_train_csr, y_train)

# Persist the fitted model; later cells load it again under this file name.
model.save_model("2.model")

y_pred_test = model.predict(X_test_csr)
y_pred_train = model.predict(X_train_csr)

# Report accuracy on the test split first, then on the training split.
for split_name, y_true, y_hat in (
    ("Test", y_test, y_pred_test),
    ("Train", y_train, y_pred_train),
):
    accuracy = accuracy_score(y_true, y_hat)
    print(f"{split_name} Accuracy: {accuracy}")

In [None]:
# Bestes XGBoost Modell (2.model) Evaluation (Metriken)
from scipy.sparse import csr_matrix
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

def metrics_test(models):
    """Evaluate saved XGBoost models on the test and training splits.

    Reads the notebook-level ``X_test``/``y_test`` and ``X_train``/``y_train``.

    Args:
        models: Iterable of file paths, each loadable with
            ``XGBClassifier.load_model``.

    Returns:
        pandas.DataFrame with one row per model and the columns ``Model``,
        then ``Accuracy``, ``Precision``, ``Recall``, ``F1 Score`` and
        ``AUC Score`` for ``(Test)`` followed by the same five for ``(Train)``.
    """
    # This cell's import line does not bring in roc_auc_score; importing it
    # here keeps the function usable on a fresh kernel.
    from sklearn.metrics import roc_auc_score

    # Only the features are converted to CSR. The labels are passed to the
    # metric functions as they are, because scikit-learn's classification
    # metrics expect plain label arrays rather than sparse matrices.
    x_test_csr = csr_matrix(X_test)
    x_train_csr = csr_matrix(X_train)

    # Hard predictions and probabilities both use the CSR features: XGBoost
    # treats entries absent from a sparse matrix as missing but zeros in a
    # dense input as real values, so mixing the two can yield AUC values that
    # belong to different predictions than the other metrics.
    splits = (
        ("Test", x_test_csr, y_test),
        ("Train", x_train_csr, y_train),
    )

    model = xgb.XGBClassifier()
    results = []

    for m in models:
        model.load_model(m)

        row = {"Model": m}
        for split, features, labels in splits:
            preds = model.predict(features)
            probs = model.predict_proba(features)

            # NOTE(review): on a single-label multiclass task, micro-averaged
            # precision, recall and F1 all equal accuracy -- confirm that
            # micro averaging is really what should be reported here.
            row[f"Accuracy ({split})"] = accuracy_score(labels, preds)
            row[f"Precision ({split})"] = precision_score(labels, preds, average="micro", zero_division=0)
            row[f"Recall ({split})"] = recall_score(labels, preds, average="micro")
            row[f"F1 Score ({split})"] = f1_score(labels, preds, average="micro")
            row[f"AUC Score ({split})"] = roc_auc_score(labels, probs, average="macro", multi_class="ovr")

        results.append(row)

    return pd.DataFrame(results)

models = ["2.model"]
results_df = metrics_test(models)

print(results_df)

In [None]:
# Training und Eval vom besten XGBoost Modell, mit PCA (50, 100 und 200 Features)
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from scipy.sparse import csr_matrix
import xgboost as xgb
import pandas as pd

# Sparse (CSR) views of the train and test features.
X_train_csr = csr_matrix(X_train)
X_test_csr = csr_matrix(X_test)

# Numbers of principal components to compare.
n_components_list = [200, 100, 50]

# One dict of metrics per component count; turned into a DataFrame at the end.
results = []

# Same hand-entered hyperparameters as the model trained without PCA.
param = {
    'max_depth': 7,
    'learning_rate': 0.08,
    'n_estimators': 300,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'gamma': 1.0,
    'reg_alpha': 5,
    'reg_lambda': 10,
    'min_child_weight': 8,
}

for n_components in n_components_list:
    print(f"Komponentenanzahl {n_components}")

    # PCA is fitted on the training features only and then applied to the
    # test features. random_state pins the solver's randomness so repeated
    # runs give the same projection (the classifier below is seeded too).
    # The dense copies are not bound to names, so they do not stay alive
    # while the model is being fitted.
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train_csr.toarray())
    X_test_pca = pca.transform(X_test_csr.toarray())
    print("pca done")

    model = xgb.XGBClassifier(**param, random_state=42, n_jobs=-1)
    model.fit(X_train_pca, y_train)
    print("modell fitted")

    y_pred_test = model.predict(X_test_pca)
    y_pred_train = model.predict(X_train_pca)
    y_pred_prob_test = model.predict_proba(X_test_pca)
    y_pred_prob_train = model.predict_proba(X_train_pca)
    print("preds done")

    # Same five metrics for both splits; test columns come first.
    row = {'PCA Components': n_components}
    for split, y_true, y_hat, y_prob in (
        ("Test", y_test, y_pred_test, y_pred_prob_test),
        ("Train", y_train, y_pred_train, y_pred_prob_train),
    ):
        row[f'Accuracy ({split})'] = accuracy_score(y_true, y_hat)
        row[f'Precision ({split})'] = precision_score(y_true, y_hat, average="micro", zero_division=0)
        row[f'Recall ({split})'] = recall_score(y_true, y_hat, average="micro")
        row[f'F1 ({split})'] = f1_score(y_true, y_hat, average="macro")
        row[f'AUC ({split})'] = roc_auc_score(y_true, y_prob, average="macro", multi_class="ovr")

    results.append(row)

results_df = pd.DataFrame(results)

print(results_df)

results_df.to_csv("xgboost_pca_ergebnisse.csv", index=False)

In [None]:
#Code für verschiedene Konfigurationen von Random Forest Modellen mit anschließender Eval für das beste Modell
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import joblib
import os

# Directory that receives one serialized model per trial.
os.makedirs("saved_models", exist_ok=True)

def objective(trial):
    """Fit one random-forest candidate and score it on the validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted forest on (X_val, y_val); the study maximizes
        this value.

    Side effects:
        Dumps the fitted model to ``saved_models/model_trial_<n>.model`` with
        joblib and prints the file name.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 100)
    max_depth = trial.suggest_categorical("max_depth", [None] + list(range(5, 51)))
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1,
        verbose=2
    )

    model.fit(X_train, y_train)

    # Candidates are ranked on the validation split so that the test split
    # stays unseen until the final evaluation; ranking on the test split
    # would bias every test metric reported afterwards.
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)

    model_filename = f"saved_models/model_trial_{trial.number}.model"
    joblib.dump(model, model_filename)
    print(f"Modell für Durchlauf {trial.number} gespeichert: {model_filename}")

    return accuracy

# This cell's import line does not bring in roc_auc_score; import it here so
# the cell does not depend on another cell having been run first.
from sklearn.metrics import roc_auc_score

# A seeded sampler makes the sequence of sampled configurations repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

print("Beste Hyperparams:", study.best_params)
print("Beste Acc:", study.best_value)

# Refit a fresh forest with the best sampled hyperparameters.
best_params = study.best_params
best_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1, verbose=2)
best_model.fit(X_train, y_train)

# Evaluation on the test split.
y_pred_test = best_model.predict(X_test)
# One-vs-rest AUC is computed from per-class probabilities, not hard labels.
y_pred_prob_test = best_model.predict_proba(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test, average="micro", zero_division=0)
recall_test = recall_score(y_test, y_pred_test, average="micro")
f1_test = f1_score(y_test, y_pred_test, average="macro")
auc_test = roc_auc_score(y_test, y_pred_prob_test, average="macro", multi_class="ovr")

# Print the metrics computed just above (the *_test variables).
print("\nTest:")
print(f"Accuracy (Test): {accuracy_test:.2f}")
print(f"Precision (Test): {precision_test:.2f}")
print(f"Recall (Test): {recall_test:.2f}")
print(f"F1 (Test): {f1_test:.2f}")
print(f"AUC (Test): {auc_test:.2f}")

# Evaluation on the training split.
y_pred_train = best_model.predict(X_train)
y_pred_prob_train = best_model.predict_proba(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train, average="micro", zero_division=0)
recall_train = recall_score(y_train, y_pred_train, average="micro")
f1_train = f1_score(y_train, y_pred_train, average="macro")
auc_train = roc_auc_score(y_train, y_pred_prob_train, average="macro", multi_class="ovr")

print("\nTrain:")
print(f"Accuracy (Train): {accuracy_train:.2f}")
print(f"Precision (Train): {precision_train:.2f}")
print(f"Recall (Train): {recall_train:.2f}")
print(f"F1 (Train): {f1_train:.2f}")
print(f"AUC(Train): {auc_train:.2f}")

In [None]:
# Training und Eval des besten Random Forest Modells mit PCA und 200 Features
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib
import os

# This cell's import line does not bring in roc_auc_score; import it here so
# the cell does not depend on another cell having been run first.
from sklearn.metrics import roc_auc_score

# random_state pins the solver's randomness so repeated runs give the same
# projection.
pca = PCA(n_components=200, random_state=42)

# Standardize with statistics learned from the training split only, then
# apply the same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The projection is likewise fitted on the training split only.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Hand-entered hyperparameters; seeded like the forests in the search cell so
# the reported metrics are repeatable.
model = RandomForestClassifier(
    n_estimators=60,
    max_depth=None,
    criterion='log_loss',
    min_samples_split=2,
    min_samples_leaf=7,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

model.fit(X_train_pca, y_train)

# Evaluation on the test split.
y_pred_test = model.predict(X_test_pca)
# One-vs-rest AUC is computed from per-class probabilities, not hard labels.
y_pred_prob_test = model.predict_proba(X_test_pca)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test, average="micro", zero_division=0)
recall_test = recall_score(y_test, y_pred_test, average="micro")
f1_test = f1_score(y_test, y_pred_test, average="macro")
auc_test = roc_auc_score(y_test, y_pred_prob_test, average="macro", multi_class="ovr")

print("Test:")
print(f"Accuracy (Test): {accuracy_test:.2f}")
print(f"Precision (Test): {precision_test:.2f}")
print(f"Recall (Test): {recall_test:.2f}")
print(f"F1 (Test): {f1_test:.2f}")
print(f"AUC (Test): {auc_test:.2f}")

# Evaluation on the training split.
y_pred_train = model.predict(X_train_pca)
y_pred_prob_train = model.predict_proba(X_train_pca)
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train, average="micro", zero_division=0)
recall_train = recall_score(y_train, y_pred_train, average="micro")
f1_train = f1_score(y_train, y_pred_train, average="macro")
auc_train = roc_auc_score(y_train, y_pred_prob_train, average="macro", multi_class="ovr")

print("Train:")
print(f"Accuracy (Train): {accuracy_train:.2f}")
print(f"Precision (Train): {precision_train:.2f}")
print(f"Recall (Train): {recall_train:.2f}")
print(f"F1 (Train): {f1_train:.2f}")
print(f"AUC (Train): {auc_train:.2f}")

In [None]:
# Plot des Baumgraphen des besten XGBoost Modells ohne PCA
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt

# Restore the saved XGBoost model from disk.
model = xgb.XGBClassifier()
model.load_model("2.model")

# Draw onto an explicitly created figure/axes pair instead of relying on
# pyplot's implicit current figure.
# NOTE(review): num_trees=-1 presumably selects the last tree of the
# ensemble -- confirm against the installed xgboost version.
fig, ax = plt.subplots(1, 1)
xgb.plot_tree(model, num_trees=-1, ax=ax)

# Write the rendering to disk, then display it.
fig.savefig("tree_plot.png", dpi=300, bbox_inches='tight')

plt.show()