In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

# Für XGBoost und LightGBM
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import RandomForestClassifier

# Daten laden und Feature-Definition wie gehabt
df = pd.read_csv('../data/processed/test.csv')
group_col = 'c_serial_number'
time_col = 'msg_timestamp'
target_col = 'failure_after_7_days'
excluded_cols = [group_col, time_col, target_col, 'c_van17', 'first_valid_time','time_diff_days']
feature_cols = [
    c for c in df.select_dtypes(include=[np.number]).columns
    if c not in excluded_cols
]

# Zeitbasierter Split pro Gruppe (wie oben)
df_sorted = df.sort_values([group_col, time_col]).copy()
train_frac = 0.8
train_indices, test_indices = [], []

for g, grp in df_sorted.groupby(group_col):
    n = len(grp)
    k_train = max(1, int(n * train_frac))
    train_indices.extend(grp.index[:k_train].tolist())
    test_indices.extend(grp.index[k_train:].tolist())

df_train = df_sorted.loc[train_indices]
df_test = df_sorted.loc[test_indices]

X_train = df_train[feature_cols]
y_train = df_train[target_col]
X_test = df_test[feature_cols]
y_test = df_test[target_col]

# Balancing: Oversampling der True-Klasse im Training
from sklearn.utils import resample

train_df = pd.concat([X_train, y_train.rename(target_col)], axis=1)
df_false = train_df[train_df[target_col] == 0]
df_true = train_df[train_df[target_col] == 1]
n_target = max(len(df_false), len(df_true))
df_true_oversampled = resample(df_true, replace=True, n_samples=n_target, random_state=42)
train_balanced = pd.concat([df_false, df_true_oversampled])
X_train_bal = train_balanced[feature_cols]
y_train_bal = train_balanced[target_col]


  df = pd.read_csv('../data/processed/test.csv')


In [3]:
# Zeitbasierten Split pro Gruppe (wie oben erarbeitet)
df_sorted = df.sort_values([group_col, time_col]).copy()
train_frac = 0.8  # 80% Training, 20% Test

train_indices = []
test_indices = []

for g, grp in df_sorted.groupby(group_col):
    n = len(grp)
    if n == 0:
        continue
    k_train = max(1, int(n * train_frac))
    train_idx_grp = grp.index[:k_train]
    test_idx_grp = grp.index[k_train:]
    train_indices.extend(train_idx_grp.tolist())
    test_indices.extend(test_idx_grp.tolist())

df_train = df_sorted.loc[train_indices]
df_test = df_sorted.loc[test_indices]

X_train = df_train[feature_cols]
y_train = df_train[target_col]
X_test = df_test[feature_cols]
y_test = df_test[target_col]

In [4]:
# Balancing des Trainingsdatensatzes (Oversampling der True-Klasse)
from sklearn.utils import resample

train_df = pd.concat([X_train, y_train.rename(target_col)], axis=1)
df_false = train_df[train_df[target_col] == 0]
df_true = train_df[train_df[target_col] == 1]
n_target = max(len(df_false), len(df_true))
df_true_oversampled = resample(df_true, replace=True, n_samples=n_target, random_state=42)
train_balanced = pd.concat([df_false, df_true_oversampled])
X_train_bal = train_balanced[feature_cols]
y_train_bal = train_balanced[target_col]

In [5]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

models = {
    'RandomForest': (
        RandomForestClassifier(random_state=42, n_jobs=-1),
        {
            'n_estimators': [200],
            'max_depth': [15],
            'class_weight': ['balanced']
        }
    ),
    'XGBoost': (
        XGBClassifier(random_state=42, n_jobs=-1, use_label_encoder=False, eval_metric='logloss'),
        {
            'n_estimators': [200],
            'max_depth': [15],
            'scale_pos_weight': [1, len(df_false)/len(df_true)]  # Handling Imbalance
        }
    ),
    'LightGBM': (
        LGBMClassifier(random_state=42, n_jobs=-1),
        {
            'n_estimators': [200],
            'max_depth': [15],
            'class_weight': ['balanced']
        }
    ),
}

results = {}

for name, (model, params) in models.items():
    print(f"\nTrainiere & tune {name} ...")
    gs = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='roc_auc',
        n_jobs=1,
        cv=skf,
        verbose=1
    )
    gs.fit(X_train_bal, y_train_bal)
    best_model = gs.best_estimator_
    print(f"Best Score (AUC): {gs.best_score_:.4f}")
    print(f"Best Params: {gs.best_params_}")
    results[name] = {
        'model': best_model,
        'cv_score': gs.best_score_,
        'params': gs.best_params_
    }



Trainiere & tune RandomForest ...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Score (AUC): 0.9498
Best Params: {'class_weight': 'balanced', 'max_depth': 15, 'n_estimators': 200}

Trainiere & tune XGBoost ...
Fitting 3 folds for each of 2 candidates, totalling 6 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Score (AUC): 1.0000
Best Params: {'max_depth': 15, 'n_estimators': 200, 'scale_pos_weight': 22.981502117227546}

Trainiere & tune LightGBM ...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[LightGBM] [Info] Number of positive: 343727, number of negative: 343726
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14138
[LightGBM] [Info] Number of data points in the train set: 687453, number of used features: 110
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 343726, number of negative: 343727
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083590 seconds.
You can set `force_row_wise=true` to remove the overhea

In [6]:
print("\n--- Evaluation auf Testdaten ---\n")
for name, res in results.items():
    model = res['model']
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    print(f"{name}:")
    print(f"  ROC-AUC:    {auc:.4f}")
    print(f"  F1-Score:   {f1:.4f}")
    print(f"  Precision:  {prec:.4f}")
    print(f"  Recall:     {rec:.4f}\n")
    results[name].update({'auc': auc, 'f1': f1, 'precision': prec, 'recall': rec})



--- Evaluation auf Testdaten ---

RandomForest:
  ROC-AUC:    0.6432
  F1-Score:   0.3346
  Precision:  0.6428
  Recall:     0.2261

XGBoost:
  ROC-AUC:    0.6989
  F1-Score:   0.3706
  Precision:  0.9482
  Recall:     0.2303

LightGBM:
  ROC-AUC:    0.6978
  F1-Score:   0.4300
  Precision:  0.7796
  Recall:     0.2968



In [7]:
# Wähle das Modell mit dem höchsten ROC-AUC
best_name = max(results, key=lambda x: results[x]['auc'])
best_model = results[best_name]['model']
print(f"Bestes Modell: {best_name} mit ROC-AUC {results[best_name]['auc']:.4f}")


Bestes Modell: XGBoost mit ROC-AUC 0.6989


In [9]:
# Auswahl Komponenten mit bestem Modell

# Beispiel: new_logs = pd.read_csv('dein_neuer_logs_file.csv')
# new_logs['predicted_failure_prob'] = best_model.predict_proba(new_logs[feature_cols])[:,1]

# Hier als Beispiel X_test:
new_logs = X_test.copy()
new_logs['predicted_failure_prob'] = best_model.predict_proba(new_logs)[:,1]

# Schwelle so wählen, dass der Mittelwert möglichst nahe an 0.10 liegt
possible_thresholds = np.unique(new_logs['predicted_failure_prob'])
best_diff = float('inf')
best_thresh = None

for thresh in possible_thresholds[::-1]:  # von hoch nach niedrig
    subset = new_logs[new_logs['predicted_failure_prob'] >= thresh]
    if len(subset) == 0:
        continue
    mean_prob = subset['predicted_failure_prob'].mean()
    diff = abs(mean_prob - 0.10)
    if diff < best_diff:
        best_diff = diff
        best_thresh = thresh
        best_selected = subset

    if diff <= 0.0025:
        break

selected = best_selected
print(f"Gewählte Schwelle: {best_thresh}")
print(f"Durchschnittliche Wahrscheinlichkeit: {selected['predicted_failure_prob'].mean():.4f}")
print(f"Anzahl ausgewählter Komponenten: {len(selected)}")



KeyboardInterrupt: 