In [16]:
# make_csv.py
# Create csv file from the different json files

import json
import pandas as pd
import glob
from pathlib import Path

# Collect every per-patient JSON file. glob returns matches in arbitrary,
# filesystem-dependent order, so sort to make the CSV row order reproducible.
files = sorted(glob.glob("clinical_data/*.json"))
if not files:
    # Fail with a clear message instead of a KeyError on "patient_id" below.
    raise FileNotFoundError("No JSON files found matching clinical_data/*.json")

rows = []

for f in files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(f, "r", encoding="utf-8") as fp:
        data = json.load(fp)

    # Add filename without extension as the patient identifier.
    # Assumes each file holds a single JSON object (dict) -- TODO confirm.
    data["patient_id"] = Path(f).stem
    rows.append(data)

# One row per JSON file; keys missing from a file become NaN in that row.
df = pd.DataFrame(rows)

# Put patient_id first and keep the remaining columns in their original order.
other_cols = [c for c in df.columns if c != "patient_id"]
df = df[["patient_id"] + other_cols]

df.to_csv("clinical_data.csv", index=False)

In [7]:
# preprocess_clinical_data.py
# Preprocesses clinical_data.csv to ensure proper training

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd

df = pd.read_csv('clinical_data.csv')

# Handle missing values in binary columns, where a missing value does not carry
# meaning of its own, by replacing the missing marker "x" with 0.
df["capsular_penetration"] = df["capsular_penetration"].replace("x", 0)

# Missing BCR_PSA / tertiary_gleason become 0: BCR_PSA < 0.1 means undetectable
# (i.e. 0), and a tertiary Gleason pattern that was not detected is encoded as 0
# on the ordinal scale (1-5).
# Blank or whitespace-only BCR_PSA cells are treated as missing as well, and the
# column is converted to a real numeric dtype here so that a stray non-numeric
# entry fails at this line rather than inside the scaler further down.
bcr_psa = df["BCR_PSA"].replace(r"^\s*$", float("nan"), regex=True)
df["BCR_PSA"] = pd.to_numeric(bcr_psa).fillna(0)
df["tertiary_gleason"] = df["tertiary_gleason"].fillna(0)

# Ordinal encoding of the pathological T stage. "4" and "4b" intentionally
# share the highest rank.
pT_mapping = {
    "1": 1,
    "1a": 2,
    "1b": 3,
    "1c": 4,
    "2": 5,
    "2a": 6,
    "2b": 7,
    "2c": 8,
    "3": 9,
    "3a": 10,
    "3b": 11,
    "4": 12,
    "4b": 12
}

# Normalise the raw stage to a stripped, lower-case string before the lookup.
# read_csv yields ints/floats when no letter-suffixed stage is present, and the
# string keys above would then match nothing; padded or upper-case values would
# likewise be turned into NaN without any notice.
pT_raw = df["pT_stage"]
pT_key = (
    pT_raw.astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\.0$", "", regex=True)  # "2.0" (float-parsed column) -> "2"
)
df["pT_stage"] = pT_key.map(pT_mapping)

# Report values that were present in the data but are missing from the mapping;
# they remain NaN, exactly as before, but no longer silently.
unmapped = sorted(pT_key[df["pT_stage"].isna() & pT_raw.notna()].unique())
if unmapped:
    print(f"Warning: unmapped pT_stage values left as NaN: {unmapped}")

# OneHotEncode categorical columns
# NOTE(review): every level is kept, so the dummy columns of each variable sum
# to 1. Intercept-free, unpenalised models need one level per variable dropped
# downstream to avoid exact collinearity.
categorical_cols = ["positive_lymph_nodes", "earlier_therapy"]

encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore"
)

encoded = encoder.fit_transform(df[categorical_cols])

# Keep the original row index so the concat below aligns row by row.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index
)

df = df.drop(columns=categorical_cols)
df = pd.concat([df, encoded_df], axis=1)

# Scale numerical columns to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before any
# cross-validation split is applied downstream.
scaler = StandardScaler()

numeric_cols = ["age_at_prostatectomy", "pre_operative_PSA", "BCR_PSA"]

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df.to_csv("clinical_data_preprocessed.csv", index=False)

In [11]:
import pandas as pd
from sksurv.util import Surv
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

df = pd.read_csv("clinical_data_preprocessed.csv")
splits = pd.read_csv("data_split_5fold.csv")

# The one-hot encoded earlier_therapy columns are not used as covariates.
df = df.drop(columns=[c for c in df.columns if "earlier_therapy" in c.lower()])

# Attach the fold assignment. Only patient_id and fold are taken from the split
# table so that no other column of that file can end up in the feature matrix.
# validate= raises if a patient appears twice on either side (a duplicated
# patient could land in both the training and the test fold).
n_patients = len(df)
df = df.merge(splits[["patient_id", "fold"]], on="patient_id", validate="one_to_one")
if len(df) != n_patients:
    print(f"Warning: {n_patients - len(df)} patients without a fold assignment were dropped")

EVENT_COL = "BCR"
TIME_COL = "time_to_follow-up/BCR"

# The Cox model has no intercept, so keeping every level of a one-hot encoded
# variable makes those columns exactly collinear (they sum to 1) and the
# Hessian singular. Use the first level as the reference and leave it out.
lymph_node_cols = [c for c in df.columns if c.startswith("positive_lymph_nodes_")]
reference_cols = lymph_node_cols[:1] if len(lymph_node_cols) > 1 else []

# NOTE(review): BCR_PSA is used as a covariate. If it is the PSA value measured
# at recurrence, it is only known once the outcome is known (target leakage) --
# confirm before reporting these C-indices.
non_feature_cols = [EVENT_COL, TIME_COL, "fold", "patient_id"] + reference_cols

c_indices = []

for fold in range(0, 5):

    # training set = all except current fold
    train_df = df[df["fold"] != fold]
    test_df = df[df["fold"] == fold]

    # survival labels (structured arrays of event indicator and time)
    event_train = train_df[EVENT_COL].astype(bool).values
    time_train = train_df[TIME_COL].values
    event_test = test_df[EVENT_COL].astype(bool).values
    time_test = test_df[TIME_COL].values

    y_train = Surv.from_arrays(event=event_train, time=time_train)
    y_test = Surv.from_arrays(event=event_test, time=time_test)

    # feature matrices: everything except labels, bookkeeping and reference level
    X_train = train_df.drop(columns=non_feature_cols)
    X_test = test_df.drop(columns=non_feature_cols)

    # fit Cox model
    model = CoxPHSurvivalAnalysis()
    model.fit(X_train, y_train)

    # predicted risk scores on the held-out fold (higher = higher risk)
    risk_val = model.predict(X_test)

    # compute C-index
    c_index = concordance_index_censored(event_test, time_test, risk_val)[0]

    c_indices.append(c_index)
    print(f"Fold {fold} C-index: {c_index:.4f}")

# Summarise the cross-validation result instead of leaving c_indices unused.
c_index_summary = pd.Series(c_indices)
print(f"Mean C-index: {c_index_summary.mean():.4f} (std {c_index_summary.std():.4f})")

Fold 0 C-index: 0.9405
Fold 1 C-index: 0.8764
Fold 2 C-index: 0.9221
Fold 3 C-index: 0.7308
Fold 4 C-index: 0.7797


  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the
# follow-up / time-to-BCR column.
df.loc[:, "time_to_follow-up/BCR"].describe()

count     95.000000
mean      37.354737
std       23.708894
min        1.300000
25%       18.650000
50%       38.000000
75%       58.400000
max      110.000000
Name: time_to_follow-up/BCR, dtype: float64

In [2]:
# Display the training feature matrix left over from the most recently executed
# cross-validation loop.
# NOTE(review): X_train only exists after a cross-validation cell has run, so
# this cell depends on kernel state and shows whichever fold was processed last.
X_train

Unnamed: 0,age_at_prostatectomy,primary_gleason,secondary_gleason,ISUP,pre_operative_PSA,pT_stage,capsular_penetration,positive_surgical_margins,invasion_seminal_vesicles,lymphovascular_invasion,BCR_PSA,tertiary_gleason,positive_lymph_nodes_0,positive_lymph_nodes_1,positive_lymph_nodes_x,earlier_therapy_none,earlier_therapy_radiotherapy + cryotherapy,earlier_therapy_radiotherapy + hormones,earlier_therapy_unknown
0,-0.265864,3,4,2,-0.072240,5.0,0,1,0,0.0,-0.403919,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.687233,3,4,2,-0.110344,5.0,0,0,0,0.0,-0.403919,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.528384,4,5,5,-0.361827,8.0,0,1,0,0.0,0.219387,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.687233,4,3,3,-0.197982,5.0,0,1,0,0.0,-0.403919,5.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
6,0.846083,3,4,2,-0.175120,5.0,0,1,0,0.0,-0.403919,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,0.369534,4,3,3,-0.354206,8.0,0,1,0,0.0,-0.403919,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
90,0.210685,3,4,2,-0.293241,10.0,1,0,0,0.0,-0.403919,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
91,-1.060112,3,4,2,-0.194172,8.0,0,1,0,1.0,-0.403919,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
92,0.687233,4,5,5,-0.110344,8.0,0,0,0,0.0,-0.403919,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [18]:
# Load the table that assigns each patient_id to a cross-validation fold.
splits = pd.read_csv(filepath_or_buffer="data_split_5fold.csv")

In [21]:
from sksurv.util import Surv
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# Label columns: the preprocessed table names them "BCR" and
# "time_to_follow-up/BCR"; "event" / "time" are used when a frame with those
# names is loaded instead, so the cell works in either state.
EVENT_COL = "event" if "event" in df.columns else "BCR"
TIME_COL = "time" if "time" in df.columns else "time_to_follow-up/BCR"

# Columns that must never be used as covariates: the labels, the fold
# bookkeeping and the patient identifier (an ID is not a predictor).
NON_FEATURE_COLS = [c for c in (EVENT_COL, TIME_COL, "fold", "patient_id") if c in df.columns]

c_indices = []

for fold in range(0, 5):

    # training set = all except current fold
    train_df = df[df["fold"] != fold]
    val_df = df[df["fold"] == fold]

    # survival labels
    event_val = val_df[EVENT_COL].astype(bool).values
    time_val = val_df[TIME_COL].values

    y_train = Surv.from_arrays(
        event=train_df[EVENT_COL].astype(bool).values,
        time=train_df[TIME_COL].values
    )
    y_val = Surv.from_arrays(event=event_val, time=time_val)

    # feature matrix
    X_train = train_df.drop(columns=NON_FEATURE_COLS)
    X_val = val_df.drop(columns=NON_FEATURE_COLS)

    # fit Cox model
    model = CoxPHSurvivalAnalysis()
    model.fit(X_train, y_train)

    # predict on validation: risk scores, higher = higher risk
    risk_val = model.predict(X_val)

    # compute C-index
    # concordance_index_censored expects a *risk* estimate (higher = earlier
    # event), which is what predict() returns. Negating it reports 1 - C.
    c_index = concordance_index_censored(event_val, time_val, risk_val)[0]

    c_indices.append(c_index)
    print(f"Fold {fold} C-index: {c_index:.4f}")

# Summarise the cross-validation result instead of leaving c_indices unused.
print(f"Mean C-index: {sum(c_indices) / len(c_indices):.4f}")

Unnamed: 0,patient_id,fold
0,1003,3
1,1010,0
2,1011,0
3,1021,0
4,1025,1
...,...,...
90,1298,4
91,1299,0
92,1301,2
93,1303,3
