## 1. Load Dataset and Fit Data for Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV file
df = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with dataset name









## 2. Split original dataset into Three:
- Training Dataset
- Validation Dataset (Testdataset for Hyperparameter Optimization process)
- Test Dataset (for final Evaluation)

In [None]:
# Separate features and target variable
X = df.drop('Label_Column', axis=1)   # Replace 'Label_Column' with target column
y = df['Label_Column']               # Replace 'Label_Column' with target column

# Check the first few rows of the data
print(df.head())

# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

## (Simple Test Dataset to Check Code - Should be Deleted Later)

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Lade den Iris-Datensatz
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')
# Entferne alle Einträge mit dem Label 2
X = X[y != 2]
y = y[y != 2]
# Prüfen der neuen Labels
print("Einzigartige Labels nach Umbenennung:", y.unique())
# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

Einzigartige Labels nach Umbenennung: [0 1]


## 3. Perform Hyperband Hyperparameter-Optimization

In [2]:
#Because optuna library gets deactivated all the time within this environment
try:
    import optuna
    print("Optuna ist bereits installiert.")
except ImportError:
    import os
    os.system('pip install optuna')

Optuna ist bereits installiert.


In [3]:
import optuna
from optuna.pruners import HyperbandPruner
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Function for Hyperparameter-Optimization
def objective(trial):
    # Hyperparameter search space
    n_estimators = trial.suggest_int("n_estimators", 10, 500)
    max_depth = trial.suggest_int("max_depth", 1, 50)
    
    # RandomForest-Model code with full parameter search space
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(XX_train, yy_train)
    
    # Accuracy on test-set
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

# Activate Hyperband-Pruner
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner())

# objective: value to optimize (maximize or minimize e.g. direction="maximize" like defined above)
# n_trials: How many Hyperparameter combinations will be tried
study.optimize(objective, n_trials=50)

[I 2025-01-12 14:47:36,011] A new study created in memory with name: no-name-4baff3ce-c3c7-4f5d-b60d-aa2dd42074ae
[I 2025-01-12 14:47:36,083] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 54, 'max_depth': 15}. Best is trial 0 with value: 1.0.
[I 2025-01-12 14:47:36,485] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 329, 'max_depth': 29}. Best is trial 0 with value: 1.0.
[I 2025-01-12 14:47:36,515] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 20, 'max_depth': 34}. Best is trial 0 with value: 1.0.
[I 2025-01-12 14:47:36,648] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 108, 'max_depth': 38}. Best is trial 0 with value: 1.0.
[I 2025-01-12 14:47:36,892] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 166, 'max_depth': 26}. Best is trial 0 with value: 1.0.
[I 2025-01-12 14:47:37,122] Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 159, 'max_depth': 26}. Best is trial 0 with 

In [4]:
# Print best parameter and test results
print("Best Parameter: ---", study.best_params)
print("Best Accuracy: ----", study.best_value)

Best Parameter: --- {'n_estimators': 54, 'max_depth': 15}
Best Accuracy: ---- 1.0


## 4. Train Model With Best Parameters

In [5]:
# Create model with best hyperparameter
best_rf_model = RandomForestClassifier(**study.best_params, random_state=42)

# Train final model with training- and validation dataset combined
best_rf_model.fit(X_train, y_train)

## 5. Evaluate Final Model

In [6]:
# Predict test data
y_pred = best_rf_model.predict(X_test)  

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Show results
print(f"Model: {best_rf_model.__class__.__name__}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("Combined Confusion Matrix:")
print(cm)
print("-" * 40)

Model: RandomForestClassifier
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Combined Confusion Matrix:
[[12  0]
 [ 0  8]]
----------------------------------------


## 6. Resignations-Probability Outputs

In [7]:
# Random Forest
y_proba_rf = best_rf_model.predict_proba(X_test)

# Probability that prediction is 1 (Resignation/Austritt)
y_proba_rf[:, 1]

array([1.        , 1.        , 0.98148148, 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.01851852, 0.        , 1.        , 0.        , 1.        ,
       0.        , 1.        , 1.        , 0.        , 0.        ])