## 1. Load Dataset and Fit Data for Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV file
df = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with dataset name









## 2. Split original dataset into Three:
- Training Dataset
- Validation Dataset (Testdataset for Hyperparameter Optimization process)
- Test Dataset (for final Evaluation)

In [None]:
# Separate features and target variable
X = df.drop('Label_Column', axis=1)   # Replace 'Label_Column' with target column
y = df['Label_Column']               # Replace 'Label_Column' with target column

# Check the first few rows of the data
print(df.head())

# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

## (Simple Test Dataset to Check Code - Should be Deleted Later)

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Lade den Iris-Datensatz
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')
# Entferne alle Einträge mit dem Label 2
X = X[y != 2]
y = y[y != 2]
# Prüfen der neuen Labels
print("Einzigartige Labels nach Umbenennung:", y.unique())
# Split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Testdata: 0.2 - Traindata: 0.6 (0.8*0.75) - Validationdata: 0.2 (0.8*0.25)
XX_train, X_val, yy_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

Einzigartige Labels nach Umbenennung: [0 1]


## 3. Perform Hyperband Hyperparameter-Optimization

In [2]:
#Because optuna library gets deactivated all the time within this environment
try:
    import optuna
    print("Optuna ist bereits installiert.")
except ImportError:
    import os
    os.system('pip install optuna')

Optuna ist bereits installiert.


In [3]:
import optuna
from optuna.pruners import HyperbandPruner
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Function for Hyperparameter-Optimization
def objective(trial):
    # Hyperparameter search space
    C = trial.suggest_loguniform("C", 0.01, 10.0)
    solver = trial.suggest_categorical("solver", ["liblinear", "saga"])
    
    # LogisticRegression-Model code with full parameter search space
    model = LogisticRegression(C=C, solver=solver, max_iter=1000, random_state=42)
    model.fit(XX_train, yy_train)
    
    # Accuracy on test-set
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

# Activate Hyperband-Pruner
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner())

# objective: value to optimize (maximize or minimize e.g. direction="maximize" like defined above)
# n_trials: How many Hyperparameter combinations will be tried
study.optimize(objective, n_trials=50)

[I 2025-01-12 14:47:06,931] A new study created in memory with name: no-name-e5bf7f12-b541-4b45-925e-bfcb8966522c
  C = trial.suggest_loguniform("C", 0.01, 10.0)
[I 2025-01-12 14:47:06,937] Trial 0 finished with value: 1.0 and parameters: {'C': 0.2627129735852165, 'solver': 'liblinear'}. Best is trial 0 with value: 1.0.
  C = trial.suggest_loguniform("C", 0.01, 10.0)
[I 2025-01-12 14:47:06,946] Trial 1 finished with value: 1.0 and parameters: {'C': 0.18472764014109638, 'solver': 'saga'}. Best is trial 0 with value: 1.0.
  C = trial.suggest_loguniform("C", 0.01, 10.0)
[I 2025-01-12 14:47:06,955] Trial 2 finished with value: 1.0 and parameters: {'C': 0.19830680037214024, 'solver': 'saga'}. Best is trial 0 with value: 1.0.
  C = trial.suggest_loguniform("C", 0.01, 10.0)
[I 2025-01-12 14:47:06,960] Trial 3 finished with value: 1.0 and parameters: {'C': 0.23516067299526175, 'solver': 'liblinear'}. Best is trial 0 with value: 1.0.
  C = trial.suggest_loguniform("C", 0.01, 10.0)
[I 2025-01-12

In [4]:
# Print best parameter and test results
print("Best Parameter: ---", study.best_params)
print("Best Accuracy: ----", study.best_value)

Best Parameter: --- {'C': 0.2627129735852165, 'solver': 'liblinear'}
Best Accuracy: ---- 1.0


## 4. Train Model With Best Parameters

In [5]:
# Create model with best hyperparameter
best_lr_model = LogisticRegression(**study.best_params, max_iter=1000, random_state=42)

# Train final model with training- and validation dataset combined
best_lr_model.fit(X_train, y_train)

## 5. Evaluate Final Model

In [6]:
# Predict test data
y_pred = best_lr_model.predict(X_test)  

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Show results
print(f"Model: {best_lr_model.__class__.__name__}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("Combined Confusion Matrix:")
print(cm)
print("-" * 40)

Model: LogisticRegression
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Combined Confusion Matrix:
[[12  0]
 [ 0  8]]
----------------------------------------


## 6. Resignations-Probability Outputs

In [7]:
# Logistic Regression
y_proba_lr = best_lr_model.predict_proba(X_test)

# Probability that prediction is 1 (Resignation/Austritt)
y_proba_lr[:, 1]

array([0.98807786, 0.95587535, 0.97423975, 0.11705316, 0.11323876,
       0.08291529, 0.03815557, 0.9271212 , 0.05879102, 0.065849  ,
       0.07050208, 0.13209963, 0.96848023, 0.03110392, 0.96511513,
       0.06153337, 0.971543  , 0.97938576, 0.10405107, 0.0872788 ])