## 1. Load Dataset and Fit Data for Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV file
df = pd.read_csv('../data/updated_master_data_combined.csv')  # Replace 'your_dataset.csv' with dataset name










## 2. Split original dataset into Three:
- Training Dataset
- Validation Dataset (Testdataset for Hyperparameter Optimization process)
- Test Dataset (for final Evaluation)

In [None]:
# Separate features and target variable
X = df.drop('Austritt', axis=1)   # Replace 'Label_Column' with target column
y = df['Austritt']               # Replace 'Label_Column' with target column

# Check the first few rows of the data
print(df.head())

# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from xgboost import cv, DMatrix
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Daten laden
data_path = 'data/MA_BDO_cleaned.csv'
df = pd.read_csv(data_path)

# Zielvariable definieren und Daten vorbereiten
target = 'Austritt'
if target not in df.columns:
    raise ValueError(f"Die Zielvariable '{target}' ist nicht in den Daten vorhanden.")

# Feature Engineering: Beispiel für Verweildauer
if 'Eintrittsdatum' in df.columns and 'Austrittsdatum' in df.columns:
    df['Verweildauer'] = (pd.to_datetime(df['Austrittsdatum']) - pd.to_datetime(df['Eintrittsdatum'])).dt.days

# Features und Zielvariable trennen
X = df.drop(columns=[target])
y = df[target]

# Kategorische Variablen in numerische Werte umwandeln
label_encoders = {}
for column in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column].astype(str))
    label_encoders[column] = le

# Zielvariable kodieren (falls nötig)
if y.dtype == 'object' or y.dtype.name == 'category':
    y = LabelEncoder().fit_transform(y)

# Daten skalieren
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Datenbalance mit SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Trainings- und Testdaten aufteilen
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


## (Simple Test Dataset to Check Code - Should be Deleted Later)

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Lade den Iris-Datensatz
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')
# Entferne alle Einträge mit dem Label 2
X = X[y != 2]
y = y[y != 2]
# Prüfen der neuen Labels
print("Einzigartige Labels nach Umbenennung:", y.unique())
# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

Einzigartige Labels nach Umbenennung: [0 1]


## 3. Perform Hyperband Hyperparameter-Optimization

In [2]:
try:
    import optuna
    print("Optuna ist bereits installiert.")
except ImportError:
    import os
    os.system('pip install optuna')

Optuna ist bereits installiert.


In [19]:
import optuna
from optuna.pruners import HyperbandPruner
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Function for Hyperparameter-Optimization
def objective(trial):
    # Hyperparameter search space
    max_depth = trial.suggest_int("max_depth", 1, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    
    # DecisionTree-Model code with full parameter search space
    model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    model.fit(XX_train, yy_train)
    
    # Accuracy on test-set
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

# Activate Hyperband-Pruner
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner())

# objective: value to optimize (maximize or minimize e.g. direction="maximize" like defined above)
# n_trials: How many Hyperparameter combinations will be tried
study.optimize(objective, n_trials=50)

[I 2025-01-12 15:36:35,598] A new study created in memory with name: no-name-811ad298-eec9-46fd-a4eb-b823cab54d89
[I 2025-01-12 15:36:35,608] Trial 0 finished with value: 1.0 and parameters: {'max_depth': 3, 'min_samples_split': 8}. Best is trial 0 with value: 1.0.
[I 2025-01-12 15:36:35,615] Trial 1 finished with value: 1.0 and parameters: {'max_depth': 33, 'min_samples_split': 2}. Best is trial 0 with value: 1.0.
[I 2025-01-12 15:36:35,621] Trial 2 finished with value: 1.0 and parameters: {'max_depth': 46, 'min_samples_split': 20}. Best is trial 0 with value: 1.0.
[I 2025-01-12 15:36:35,626] Trial 3 finished with value: 1.0 and parameters: {'max_depth': 1, 'min_samples_split': 5}. Best is trial 0 with value: 1.0.
[I 2025-01-12 15:36:35,630] Trial 4 finished with value: 1.0 and parameters: {'max_depth': 23, 'min_samples_split': 19}. Best is trial 0 with value: 1.0.
[I 2025-01-12 15:36:35,635] Trial 5 finished with value: 1.0 and parameters: {'max_depth': 2, 'min_samples_split': 19}. B

In [20]:
# Print best parameter and test results
print("Best Parameter: ---", study.best_params)
print("Best Accuracy: ----", study.best_value)

Best Parameter: --- {'max_depth': 3, 'min_samples_split': 8}
Best Accuracy: ---- 1.0


## 4. Train Model With Best Parameters

In [21]:
# Create model with best hyperparameter
best_dt_model = DecisionTreeClassifier(**study.best_params, random_state=42)

# Train final model with training- and validation dataset combined
best_dt_model.fit(X_train, y_train)

## 5. Evaluate Final Model

In [22]:
# Predict test data
y_pred = best_dt_model.predict(X_test)  

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Show results
print(f"Model: {best_dt_model.__class__.__name__}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("Combined Confusion Matrix:")
print(cm)
print("-" * 40)

Model: DecisionTreeClassifier
Accuracy: 0.5588
Precision: 0.5502
Recall: 0.6197
F1-Score: 0.5829
Combined Confusion Matrix:
[[3479 3500]
 [2628 4282]]
----------------------------------------


## 6. Resignations-Probability Outputs

In [15]:
# Decision Tree
y_proba_dt = best_dt_model.predict_proba(X_test)

# Probability that prediction is 1 (Resignation/Austritt)
y_proba_dt[:, 1]

array([0.4       , 0.85714286, 0.        , ..., 0.4       , 1.        ,
       0.18181818])