In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer  
from sklearn.impute import SimpleImputer 

In [2]:

# 1. Daten laden
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [8]:
# 1. Data Cleaning
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    cleaned = df.copy()
    
    # Zieltabelle erstellen
    if 'efs' in cleaned.columns:
        cleaned['efs'] = cleaned['efs'].replace({'Event': 1, 'Censoring': 0})
        cleaned = cleaned[cleaned['efs'].isin([0, 1])]
    
    return cleaned

# 2. Preprocessing-Pipeline (Kernel für Train/Test-Konsistenz)
cat_cols = ["dri_score", "cyto_score", "graft_type", 
           "conditioning_intensity", "cmv_status", "prim_disease_hct"]
num_cols = ["comorbidity_score", "age_at_hct", "donor_age", 
           "hla_high_res_8", "karnofsky_score"]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', Pipeline([
        ('impute', SimpleImputer(strategy='median')),
    ]), num_cols)
], remainder='drop')

# 3. Daten laden und vorverarbeiten
train_raw = train_df
test_raw = test_df

train_df = clean_data(train_raw)
test_df = clean_data(test_raw)

# WICHTIG: Preprocessing einmal auf alle Daten anwenden
X_train = preprocessor.fit_transform(train_df)
y_train = train_df['efs'].values
X_test = preprocessor.transform(test_df)  # Einmalige Transformation [[1]]

# 4. Modellkonfiguration
model = LGBMClassifier(
    objective="binary",
    class_weight="balanced",
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42
)

# 5. Cross-Validation
cv = StratifiedKFold(n_splits=5)
test_preds = []
progress_bar = tqdm(cv.split(X_train, y_train), total=5, desc="Training Folds")

for fold, (train_idx, val_idx) in enumerate(progress_bar):
    X_tr, y_tr = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=50)],
        eval_metric="auc",
    )
    
    # PRE-verarbeiteter Test-Datensatz [[1]]
    test_preds.append(model.predict_proba(X_test)[:, 1])  

# 6. Ensembling
final_preds = np.mean(test_preds, axis=0)

test_df.head().T
#test_df['predicted_efs_prob'] = final_preds
test_df[['ID', 'predicted_efs_prob']].to_csv("predictions_model_3.csv", index=False)


Training Folds:   0%|                                     | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 12426, number of negative: 10614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 636
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 50 rounds


Training Folds:  20%|█████▊                       | 1/5 [00:00<00:03,  1.05it/s]

Early stopping, best iteration is:
[73]	valid_0's auc: 0.721179	valid_0's binary_logloss: 0.612338
[LightGBM] [Info] Number of positive: 12426, number of negative: 10614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 636
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 50 rounds


Training Folds:  40%|███████████▌                 | 2/5 [00:02<00:03,  1.18s/it]

Early stopping, best iteration is:
[134]	valid_0's auc: 0.700997	valid_0's binary_logloss: 0.623178
[LightGBM] [Info] Number of positive: 12426, number of negative: 10614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001608 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 636
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 50 rounds


Training Folds:  60%|█████████████████▍           | 3/5 [00:03<00:02,  1.22s/it]

Early stopping, best iteration is:
[116]	valid_0's auc: 0.717004	valid_0's binary_logloss: 0.614395
[LightGBM] [Info] Number of positive: 12425, number of negative: 10615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 636
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 50 rounds


Training Folds:  80%|███████████████████████▏     | 4/5 [00:04<00:01,  1.21s/it]

Early stopping, best iteration is:
[105]	valid_0's auc: 0.713506	valid_0's binary_logloss: 0.616414
[LightGBM] [Info] Number of positive: 12425, number of negative: 10615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 636
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 50 rounds


Training Folds: 100%|█████████████████████████████| 5/5 [00:06<00:00,  1.21s/it]

Early stopping, best iteration is:
[136]	valid_0's auc: 0.711721	valid_0's binary_logloss: 0.619048





In [4]:
# Finale Vorhersagen in Datei speichern
submission_df = pd.DataFrame({
    "id": test_df["id"],  # ID-Spalte aus der Originaldatei
    "predicted_efs_prob": final_preds  # Vorhersage-Scores
})

# Ohne Index speichern [[1]][[3]][[10]]
submission_df.to_csv("predictions_model_3.csv", index=False)
print("Vorhersagen gespeichert in test_predictions.csv")

print("done")

KeyError: 'id'