Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: tensorflow-serving-api 2.2.0 has a non-standard dependency specifier grpcio>=1.0<2. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of tensorflow-serving-api or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mDefaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: tensorflow-serving-api 2.2.0 has a non-standard dependency specifier grpcio>=1.0<2. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of tensorflow-serving-api or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mDefaulting to user i

In [13]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv

def clean_data(df):
    """Clean a data frame: normalise column names and handle missing values.

    The frame is modified in place and the same object is returned.

    Steps:
      1. Column names are stripped, lower-cased, and every run of characters
         other than ``a-z``, ``0-9`` and ``_`` is replaced by one underscore.
      2. Missing values in the survival targets ``efs`` (event indicator) and
         ``efs_time`` (time) are replaced by 0, if those columns exist.
      3. Every other column is handled by dtype:
         * object-dtype columns are skipped and left untouched,
         * numeric columns get their missing values filled with the
           column median,
         * any remaining dtype is replaced by its integer category codes.

    Args:
        df: The pandas DataFrame to clean.

    Returns:
        The same DataFrame instance, cleaned.
    """
    # 1. Standardise the column names.
    df.columns = df.columns.str.strip().str.lower().str.replace(r"[^a-z0-9_]+", "_", regex=True)

    # 2. Survival targets: 'efs' (event indicator) and 'efs_time' (time).
    survival_cols = ['efs', 'efs_time']
    for col in survival_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # 3. Remaining columns: median imputation for numeric columns, category
    #    codes for other non-object dtypes. Object columns are skipped.
    for col in df.columns:
        if col in survival_cols or df[col].dtype == object:
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            # Assign the result back instead of calling fillna(inplace=True)
            # on the column selection: the in-place form is chained
            # assignment, which pandas deprecates and which does not update
            # the frame once Copy-on-Write is active.
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].astype('category').cat.codes

    return df

def main():
    """Train a random survival forest and write test-set predictions to CSV.

    Reads ``data/train.csv`` and ``data/test.csv`` below the current working
    directory, cleans both with ``clean_data``, fits a
    ``RandomSurvivalForest`` on the numeric training features, evaluates each
    test row's predicted survival function at the median training
    ``efs_time``, clips the values to [0, 1] and saves them together with the
    test ids to ``modell_1.csv``.

    Raises:
        KeyError: If the cleaned training data lacks 'efs', 'efs_time'
            or 'id'.
    """
    # Resolve the input files relative to the current working directory.
    base_dir = os.getcwd().replace("\\", "/")
    raw_train = pd.read_csv(os.path.join(base_dir, "data/train.csv"))
    raw_test = pd.read_csv(os.path.join(base_dir, "data/test.csv"))

    # Clean both data sets.
    train_df = clean_data(raw_train)
    test_df = clean_data(raw_test)

    # The targets and the id must exist in the training data; the same
    # columns are later excluded from the feature matrices.
    target_and_id = ['efs', 'efs_time', 'id']
    absent = next((name for name in target_and_id if name not in train_df.columns), None)
    if absent is not None:
        raise KeyError(f"Fehlende erforderliche Spalte '{absent}' im Trainingsdatensatz")

    # Build the structured survival target and the feature matrices.
    y_train = Surv.from_dataframe('efs', 'efs_time', train_df)
    train_features = train_df.drop(columns=target_and_id, errors='ignore')
    test_features = test_df.drop(columns=target_and_id, errors='ignore')

    # Only numeric features are used; the test matrix gets the same columns.
    train_features = train_features.select_dtypes(include=np.number)
    test_features = test_features[train_features.columns]

    # Configure the random survival forest.
    forest_params = dict(
        n_estimators=500,
        max_depth=3,
        n_jobs=100,
        random_state=42,
        verbose=2,
    )
    model = RandomSurvivalForest(**forest_params)

    print("Starte Training...")
    model.fit(train_features, y_train)

    # Instead of model.event_times_, the median of the training efs_time
    # values is used as the single evaluation time point.
    median_time = np.median(df_times := train_df['efs_time'].values) if False else np.median(train_df['efs_time'].values)
    print(f"Ausgewählter Zeitpunkt für Vorhersage: {median_time}")

    # predict_survival_function yields one callable per test row; calling it
    # with a time returns the survival probability at that time.
    survival_functions = model.predict_survival_function(test_features)
    at_median = [step_fn(median_time) for step_fn in survival_functions]

    # Keep the predictions inside the interval [0, 1].
    predictions = np.clip(np.array(at_median), 0.0, 1.0)

    # Persist the results.
    results = pd.DataFrame({'ID': test_df['id'].values, 'prediction': predictions})
    results.to_csv("modell_1.csv", index=False)
    print("✅ Vorhersagen erfolgreich gespeichert")

# Run the pipeline only when this file is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Starte Training...


[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
b

[Parallel(n_jobs=100)]: Done 165 tasks      | elapsed:   18.0s


building tree 267 of 500
building tree 268 of 500
building tree 269 of 500
building tree 270 of 500
building tree 271 of 500
building tree 272 of 500
building tree 273 of 500
building tree 274 of 500
building tree 275 of 500
building tree 276 of 500
building tree 277 of 500
building tree 278 of 500
building tree 279 of 500
building tree 280 of 500
building tree 281 of 500
building tree 282 of 500
building tree 283 of 500
building tree 284 of 500
building tree 285 of 500
building tree 286 of 500
building tree 287 of 500
building tree 288 of 500
building tree 289 of 500
building tree 290 of 500
building tree 291 of 500
building tree 292 of 500
building tree 293 of 500
building tree 294 of 500
building tree 295 of 500
building tree 296 of 500
building tree 297 of 500
building tree 298 of 500
building tree 299 of 500
building tree 300 of 500
building tree 301 of 500
building tree 302 of 500
building tree 303 of 500
building tree 304 of 500
building tree 305 of 500
building tree 306 of 500


[Parallel(n_jobs=100)]: Done 500 out of 500 | elapsed:   47.3s finished
[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done 165 tasks      | elapsed:    0.1s


Ausgewählter Zeitpunkt für Vorhersage: 9.7965
✅ Vorhersagen erfolgreich gespeichert


[Parallel(n_jobs=100)]: Done 500 out of 500 | elapsed:    0.2s finished
