## 1. Load Dataset and Fit Data for Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV file
df = pd.read_csv('data/MA_BDO_cleaned.csv')  # Replace 'your_dataset.csv' with dataset name











## 2. Split original dataset into Three:
- Training Dataset
- Validation Dataset (Testdataset for Hyperparameter Optimization process)
- Test Dataset (for final Evaluation)

In [None]:
# Separate features and target variable
X = df.drop('Austritt', axis=1)   # Replace 'Label_Column' with target column
y = df['Austritt']               # Replace 'Label_Column' with target column

# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

## (Simple Test Dataset to Check Code - Should be Deleted Later)

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Lade den Iris-Datensatz
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')
# Entferne alle Einträge mit dem Label 2
X = X[y != 2]
y = y[y != 2]
# Prüfen der neuen Labels
print("Einzigartige Labels nach Umbenennung:", y.unique())
# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

Einzigartige Labels nach Umbenennung: [0 1]


## 3. Perform Hyperband Hyperparameter-Optimization

In [9]:
#Because optuna and xgboost libraries gets deactivated all the time within this environment
try:
    import optuna
    print("Optuna ist bereits installiert.")
except ImportError:
    import os
    os.system('pip install optuna')

try:
    import xgboost
    print("Optuna ist bereits installiert.")
except ImportError:
    import os
    os.system('pip install xgboost')

Optuna ist bereits installiert.
Optuna ist bereits installiert.


In [10]:
import optuna
from optuna.pruners import HyperbandPruner
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Function for Hyperparameter-Optimization
def objective(trial):
    # Hyperparameter search space
    n_estimators = trial.suggest_int("n_estimators", 10, 500)
    max_depth = trial.suggest_int("max_depth", 1, 50)
    learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 1.0)
    
    # XGBoost-Modell code with full parameter search space
    model = xgb.XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, 
                               learning_rate=learning_rate, use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(XX_train, yy_train)
    
    # Accuracy on test-set
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

# Activate Hyperband-Pruner
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner())

# objective: value to optimize (maximize or minimize e.g. direction="maximize" like defined above)
# n_trials: How many Hyperparameter combinations will be tried
study.optimize(objective, n_trials=50)

[I 2025-01-12 15:37:58,951] A new study created in memory with name: no-name-07eddf95-4221-49fd-9b3b-e2998f11dee9
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 1.0)
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-12 15:37:59,018] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 298, 'max_depth': 41, 'learning_rate': 0.01638703532182828}. Best is trial 0 with value: 1.0.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 1.0)
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-12 15:37:59,033] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 10, 'max_depth': 19, 'learning_rate': 0.3314041361108037}. Best is trial 0 with value: 1.0.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 1.0)
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-12 15:37:59,093] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 298, 'max_depth': 2, 'learning_rate': 0.015350690346

In [11]:
# Print best parameter and test results
print("Best Parameter: ---", study.best_params)
print("Best Accuracy: ----", study.best_value)

Best Parameter: --- {'n_estimators': 298, 'max_depth': 41, 'learning_rate': 0.01638703532182828}
Best Accuracy: ---- 1.0


## 4. Train Model With Best Parameters

In [12]:
# Create model with best hyperparameter
best_xgb_model = xgb.XGBClassifier(**study.best_params, eval_metric='logloss', random_state=42)

# Train final model with training- and validation dataset combined
# "_ =" because xgb library can't handle output and throws error without "_ ="
_ = best_xgb_model.fit(X_train, y_train)

## 5. Evaluate Final Model

In [13]:
# Predict test data
y_pred = best_xgb_model.predict(X_test)  

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Show results
print(f"Model: {best_xgb_model.__class__.__name__}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("Combined Confusion Matrix:")
print(cm)
print("-" * 40)

Model: XGBClassifier
Accuracy: 0.6142
Precision: 0.6154
Recall: 0.5987
F1-Score: 0.6070
Combined Confusion Matrix:
[[4394 2585]
 [2773 4137]]
----------------------------------------


## 6. Resignations-Probability Outputs

In [14]:
# XGBoost
y_proba_xgb = best_xgb_model.predict_proba(X_test)

# Probability that prediction is 1 (Resignation/Austritt)
y_proba_xgb[:, 1]

array([0.5049081 , 0.8702685 , 0.5402979 , ..., 0.52945745, 0.45286858,
       0.6508555 ], dtype=float32)