In [4]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from tqdm import tqdm

# Columns selected from the train/test frames in main(). The list also
# contains the 'ID' column, which main() reads back when it assembles the
# results frame.
top_feature_cols = [
    'conditioning_intensity',
    'year_hct',
    'age_at_hct',
    'sex_match',
    'donor_age',
    'prim_disease_hct',
    'gvhd_proph',
    'comorbidity_score',
    'karnofsky_score',
    'cyto_score_detail',
    'dri_score',
    'cmv_status',
    'race_group',
    'in_vivo_tcd',
    'hla_match_drb1_high',
    'tbi_status',
    'cardiac',
    'cyto_score',
    'hla_nmdp_6',
    'mrd_hct',
    'hla_match_dqb1_high',
    'hla_match_a_low',
    'pulm_severe',
    'psych_disturb',
    'hla_match_c_high',
    'ID',
]

def clean_data(df, is_train=True):
    """Clean a DataFrame in place and return it.

    Steps applied to ``df``:

    - Every character in a column name that is not ``[a-zA-Z0-9_]`` is
      replaced by ``_``.
    - Missing values are replaced by ``0``.
    - Text-like columns (object, pandas string dtype, categorical) are
      converted to integer category codes. Values are stringified first, so
      a missing value becomes the category ``"0"``; codes follow the sorted
      order of the distinct strings found in *this* frame.

    Args:
        df: DataFrame to clean. It is modified in place.
        is_train: Unused; kept so existing callers keep working.

    Returns:
        The same ``df`` object, cleaned.

    Note:
        Codes are derived from the values present in ``df`` only, so two
        frames cleaned separately may map the same string to different
        codes.
    """
    df.columns = df.columns.str.replace(r"[^a-zA-Z0-9_]", "_", regex=True)

    for col in df.columns:
        series = df[col]

        # Dedicated string and categorical dtypes reject an integer fill
        # value, and they do not compare equal to ``object``. Cast them to
        # object so they go through the same path as classic text columns.
        if isinstance(series.dtype, (pd.StringDtype, pd.CategoricalDtype)):
            series = series.astype(object)

        series = series.fillna(0)

        if series.dtype == object:
            series = series.astype(str).astype('category').cat.codes

        df[col] = series

    return df

def main():
    """Train an XGBRegressor on ``data/train.csv`` and score ``data/test.csv``.

    Both CSV files are read relative to the current working directory,
    cleaned with ``clean_data`` and restricted to ``top_feature_cols``. The
    model is fitted on the ``efs`` column and its predictions for the test
    rows are paired with the test ``ID`` values.

    Returns:
        DataFrame with the columns ``ID`` and ``prediction``, one row per
        test row. Nothing is written to disk.
    """
    current_dir = os.getcwd()

    # Read training and test data.
    train_path = os.path.join(current_dir, "data", "train.csv")
    test_path = os.path.join(current_dir, "data", "test.csv")

    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    # Clean both frames together: clean_data derives category codes from the
    # values present in the frame it is given, so cleaning train and test
    # separately can assign different codes to the same category string.
    n_train = len(df_train)
    df_all = pd.concat([df_train, df_test], ignore_index=True, sort=False)
    df_all = clean_data(df_all, is_train=True)

    df_train = df_all.iloc[:n_train][top_feature_cols + ['efs', 'efs_time']]
    df_test = df_all.iloc[n_train:][top_feature_cols].reset_index(drop=True)

    # Keep the identifiers for the output, but do not train on them: 'ID' is
    # a row identifier, not a predictor.
    test_ids = df_test['ID']
    feature_cols = [col for col in top_feature_cols if col != 'ID']

    # Features and target; only numeric columns are passed to the model.
    X_train = df_train[feature_cols].select_dtypes(include=[np.number])
    y_train = df_train['efs']
    X_test = df_test[feature_cols].select_dtypes(include=[np.number])

    xgb_model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        random_state=42
    )

    # The bar has a single step because fit() is one blocking call.
    print("Training des Modells...")
    with tqdm(total=1, desc="Training") as pbar:
        xgb_model.fit(X_train, y_train)
        pbar.update(1)

    print("Vorhersagen für Testdaten...")
    risk_scores = xgb_model.predict(X_test)

    results = pd.DataFrame({'ID': test_ids, 'prediction': risk_scores})
    # Saving is intentionally left disabled; enable to persist the results.
    #results.to_csv(os.path.join(current_dir, "results.csv"), index=False)
    return results

if __name__ == "__main__":
    main()


Training des Modells...


Training: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.40s/it]

Vorhersagen für Testdaten...
✅ Ergebnisse wurden erfolgreich gespeichert: results.csv



