In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv
import main_module as md
%run -i ../examples/concordance_index.ipynb

# Experiment: 5-fold cross-validation of a RandomSurvivalForest with one-hot
# encoded categoricals and median-imputed numeric columns.

# Load the training data through the project wrapper.
# NOTE(review): presumably `clean(method="replace", ...)` maps the listed
# placeholder strings to the single label 'missing' and returns a pandas
# DataFrame with "ID", "efs" and "efs_time" columns — confirm in main_module.
hct_df = md.hct("../data/train_set.csv")
df = hct_df.clean(
    method="replace",
    params=[
        ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."],
        "missing",
    ],
)

# Outcome columns: event indicator and time-to-event. They are removed from the
# predictors and turned into a structured survival target with `Surv`.
TARGET_COLS = ["efs", "efs_time"]

feat_df = df.drop(columns=["ID"])
# Take the categorical columns from the predictors only, so the list can never
# contain "ID" or a target column that the transformer is not given.
cat_features = list(feat_df.drop(columns=TARGET_COLS).select_dtypes(object).columns)

# Cross-validation (shuffled with a fixed seed so the folds are reproducible)
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = np.zeros(n_splits)

# `np.trapz` is deprecated since NumPy 2.0 in favour of `np.trapezoid`; use the
# new name when it exists and fall back to the old one on earlier NumPy.
integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

for fold, (train_idx, test_idx) in enumerate(kfold.split(feat_df)):
    train = feat_df.iloc[train_idx]
    test = feat_df.iloc[test_idx]

    # Preprocessing pipeline: one-hot encode the categoricals, median-impute
    # every remaining column. It is re-created and fitted on the training fold
    # only, so nothing is learned from the held-out rows.
    preproc = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), cat_features),
        ],
        remainder=SimpleImputer(strategy='median'),
        verbose_feature_names_out=False
    ).set_output(transform='pandas')

    X_train_proc = preproc.fit_transform(train.drop(columns=TARGET_COLS))
    X_test_proc = preproc.transform(test.drop(columns=TARGET_COLS))

    y_train_proc = Surv.from_dataframe("efs", "efs_time", train)
    # Not consumed by the scoring below; kept so the held-out target stays
    # available for inspection after the loop.
    y_test_proc = Surv.from_dataframe("efs", "efs_time", test)

    # Initialize model
    rsf = RandomSurvivalForest(
        n_estimators=30,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        n_jobs=4,
        verbose=1,
        random_state=42
    )
    rsf.fit(X_train_proc, y_train_proc)

    # Predict survival functions and collapse each curve to a single number:
    # the negated area under the curve (trapezoidal rule over fn.x / fn.y), so
    # a larger area under the survival curve yields a smaller prediction.
    surv_funcs = rsf.predict_survival_function(X_test_proc, return_array=False)
    preds = np.array([-integrate(fn.y, fn.x) for fn in surv_funcs])

    # `feat_df` is `df` minus the "ID" column, so positional indices line up
    # and `df.iloc[test_idx]` recovers the IDs and targets of the held-out rows.
    solution = df.iloc[test_idx]
    prediction = pd.DataFrame({"ID": solution["ID"], "prediction": preds})
    # NOTE(review): `score` is expected to come from the concordance_index
    # notebook executed via %run in the import section — confirm.
    scores[fold] = score(solution.copy(deep=True), prediction.copy(deep=True), "ID")

print(scores)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   12.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   10.1s finished
  preds = np.array([-np.trapz(fn.y, fn.x) for fn in surv_funcs])
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   13.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   12.2s finished
  preds = np.array([-np.trapz(fn.y, fn.x) for fn in surv_funcs])
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   13.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  3

[0.6253624  0.61649557 0.63992952 0.61937063 0.63115719]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv
import main_module as md
%run -i ../examples/concordance_index.ipynb

# Experiment: 5-fold cross-validation of a RandomSurvivalForest with one-hot
# encoded categoricals followed by KNN imputation of the whole feature matrix.

# Load training data through the project wrapper.
# NOTE(review): presumably `clean(method="replace", ...)` maps the listed
# placeholder strings to the single label 'missing' and returns a pandas
# DataFrame with "ID", "efs" and "efs_time" columns — confirm in main_module.
hct_df = md.hct("../data/train_set.csv")
df = hct_df.clean(
    method="replace",
    params=[
        ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."],
        "missing",
    ],
)

# Outcome columns: event indicator and time-to-event. They are removed from the
# predictors and turned into a structured survival target with `Surv`.
TARGET_COLS = ["efs", "efs_time"]

feat_df = df.drop(columns=["ID"])
# Take the categorical columns from the predictors only, so the list can never
# contain "ID" or a target column that the transformer is not given.
cat_features = list(feat_df.drop(columns=TARGET_COLS).select_dtypes(object).columns)

# Cross-validation (shuffled with a fixed seed so the folds are reproducible)
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = np.zeros(n_splits)

# `np.trapz` is deprecated since NumPy 2.0 in favour of `np.trapezoid`; use the
# new name when it exists and fall back to the old one on earlier NumPy.
integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

for fold, (train_idx, test_idx) in enumerate(kfold.split(feat_df)):
    train = feat_df.iloc[train_idx]
    test = feat_df.iloc[test_idx]

    X_train_raw = train.drop(columns=TARGET_COLS)
    X_test_raw = test.drop(columns=TARGET_COLS)

    # ColumnTransformer for categorical features; all other columns pass
    # through untouched so the imputer sees them with their missing values.
    cat_transformer = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), cat_features),
        ],
        remainder='passthrough',
        verbose_feature_names_out=False
    )

    # Full preprocessing pipeline, re-created and fitted on the training fold
    # only. Pandas output keeps the row index and the feature names instead of
    # returning a bare array.
    # NOTE(review): KNNImputer computes neighbour distances on the unscaled
    # encoded matrix; consider whether a scaling step should precede it.
    pipeline = Pipeline([
        ('encode', cat_transformer),
        ('impute', KNNImputer())
    ]).set_output(transform='pandas')

    X_train_proc = pipeline.fit_transform(X_train_raw)
    X_test_proc = pipeline.transform(X_test_raw)

    y_train_proc = Surv.from_dataframe("efs", "efs_time", train)
    # Not consumed by the scoring below; kept so the held-out target stays
    # available for inspection after the loop.
    y_test_proc = Surv.from_dataframe("efs", "efs_time", test)

    # Initialize model
    rsf = RandomSurvivalForest(
        n_estimators=30,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        n_jobs=4,
        verbose=1,
        random_state=42
    )
    rsf.fit(X_train_proc, y_train_proc)

    # Predict survival functions and collapse each curve to a single number:
    # the negated area under the curve (trapezoidal rule over fn.x / fn.y), so
    # a larger area under the survival curve yields a smaller prediction.
    surv_funcs = rsf.predict_survival_function(X_test_proc, return_array=False)
    preds = np.array([-integrate(fn.y, fn.x) for fn in surv_funcs])

    # `feat_df` is `df` minus the "ID" column, so positional indices line up
    # and `df.iloc[test_idx]` recovers the IDs and targets of the held-out rows.
    solution = df.iloc[test_idx]
    prediction = pd.DataFrame({"ID": solution["ID"], "prediction": preds})
    # NOTE(review): `score` is expected to come from the concordance_index
    # notebook executed via %run in the import section — confirm.
    scores[fold] = score(solution.copy(deep=True), prediction.copy(deep=True), "ID")

print(scores)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   15.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   10.1s finished
  preds = np.array([-np.trapz(fn.y, fn.x) for fn in surv_funcs])
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   15.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    8.3s finished
  preds = np.array([-np.trapz(fn.y, fn.x) for fn in surv_funcs])
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   14.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  3

[0.62200798 0.61569278 0.63584239 0.61646458 0.62702264]
