In [1]:
import numpy as np
import pandas as pd

In [2]:
# Name of the dataset; it selects which CSV file is read from the train and test folders.
dataset_name = "METABRIC"
# Load the train and test splits from their separate folders
# (the paths are relative, so they resolve against the current working directory).
train = pd.read_csv(f"../../datasets/train/{dataset_name}.csv")
test = pd.read_csv(f"../../datasets/test/{dataset_name}.csv")

In [3]:
# Delete every instance with duration = 0.0.
#
# A boolean mask replaces the previous "first matching index + `if idx:`" logic,
# which had two defects:
#   * a match at index label 0 was never dropped, because 0 is falsy;
#   * only the first zero-duration row was removed, even if several existed.
# The mask keeps the original index labels and makes the cell idempotent
# (re-running it yields the same frames).
train = train.loc[train["duration"] != 0].copy()
test = test.loc[test["duration"] != 0].copy()

In [4]:
# Sanity check: show the (rows, columns) shape of the train and test frames.
print(f"TRAIN SHAPE {train.shape}, TEST SHAPE {test.shape}")

TRAIN SHAPE (1712, 11), TEST SHAPE (191, 11)


In [5]:
# The survival target is the (event, duration) pair; every remaining column is a feature.
target_columns = ["event", "duration"]

X = train.drop(columns=target_columns)
y = train[target_columns]

X_test = test.drop(columns=target_columns)
y_test = test[target_columns]

In [6]:
# Preview the first two rows of the feature frame.
X.head(2)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8
0,7.166604,5.434787,14.091119,5.868354,0.0,1.0,1.0,0.0,39.53
1,6.00669,6.515598,9.696476,5.975583,0.0,0.0,0.0,1.0,48.07


# Hyperparameter Tuning
### The `grid_params` dict describes all the hyperparameter values I chose to test

In [7]:
from auton_survival.models.dsm import DeepSurvivalMachines
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import KFold
from pycox.evaluation import EvalSurv
import torch
import torchtuples as tt
import itertools
import os
import pickle
from lifelines.utils import concordance_index

# Seed both random number generators with the same value so runs are repeatable.
SEED = 42
np.random.seed(SEED)
_ = torch.manual_seed(SEED)  # bind the returned value so the cell shows no output

In [10]:
%%time

model_name = "dsm"

grid_params = {
    # net params
    "num_nodes": [[41], [32, 32], [28,28,100,28,28], [32,128,128], [16,32,64,64]],
    "k": [3,4],
    "distribution" : ["Weibull"],        
    # fit params
    "batch_size": [256, 512],
    "epochs": [256, 512],
    # optimizer params
    "lr": [1e-2, 1e-3],
    "optimizer": ["Adam", "RMSProp", "SGD"],
    "iters": [100,1000]        
}

keys, values = zip(*grid_params.items())
experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]
for experiment in experiments:
    Statistics = {'concordance_td':[], 
                  'ibs': [],
                  'c_index': [],
                  'avg_concordance_td':0.5, 
                  'avg_ibs': 0,
                  'avg_c_index': 0.5,
                  'std_concordance_td': 0,
                  'std_ibs': 0,
                  'std_c_index': 0                  
                 }

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    for i, (train_index, val_index) in enumerate(cv.split(X)):

        # split to train and test 
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[val_index].copy()

        # preprocess for y
        lower, upper = np.min(y_train["duration"]), np.max(y_train["duration"])
        idx = np.where((y_val["duration"] > lower) & (y_val["duration"] < upper))[0]
        X_val = X_val.iloc[idx].copy()
        y_val = y_val.iloc[idx].copy()

        y_train = y_train.astype('float32')
        y_val = y_val.astype('float32')
        
        original_y_train = y_train.copy()
        original_y_val = y_val.copy()
        
        # preprocess step
        cols_standardize = ['x0', 'x1', 'x2', 'x3', 'x8']
        cols_leave = ['x4', 'x5', 'x6', 'x7']

        standardize = [([col], StandardScaler()) for col in cols_standardize]
        leave = [(col, None) for col in cols_leave]

        standard_scaler = DataFrameMapper(standardize + leave)
        X_train = standard_scaler.fit_transform(X_train).astype('float32')
        y_train = y_train["duration"].values, y_train["event"].values

        X_val = standard_scaler.transform(X_val).astype('float32')
        y_val = y_val["duration"].values, y_val["event"].values
        val = X_val, y_val[0], y_val[1]


        
        # build the network
        num_nodes = experiment["num_nodes"]
        k = experiment["k"]
        distribution = experiment["distribution"]
        use_activation = True
        
        # fit params
        lr = experiment["lr"]        
        optimizer = experiment["optimizer"]        
        batch_size = experiment["batch_size"]
        iters = experiment["iters"]
        

        # train the model
        model = DeepSurvivalMachines(k=k, layers=num_nodes, 
                                     distribution=distribution, random_seed=42)
        
        model.fit(x = X_train, t = y_train[0], e = y_train[1], 
                  val_data = val, iters=iters, learning_rate=lr, 
                  batch_size=batch_size, optimizer=optimizer)

        # model evaluation
        
        ## pycox measures
        tmp_times = np.sort(y_train[0])
        times = [tmp_times[i] for i in range(0, len(tmp_times), 10)]
        
        estimate_surv = model.predict_survival(X_val, times)
        estimate_surv = pd.DataFrame(estimate_surv.T)
        ev = EvalSurv(estimate_surv, y_val[0], y_val[1], censor_surv='km')
        
        concordance_td = ev.concordance_td('antolini')
        ibs = ev.integrated_brier_score(np.array(times))

        ## lifelines measures
        estimate = np.mean(1-estimate_surv.to_numpy(), axis=0)
        c_index = 1 - concordance_index(event_times=original_y_val["duration"], 
                          predicted_scores= estimate, 
                          event_observed=original_y_val["event"])

        # store statistics in Statistics dict
        Statistics["c_index"].append(c_index)
        Statistics["ibs"].append(ibs)
        Statistics["concordance_td"].append(concordance_td)

        
    # summarise cross validation scores
    Statistics["avg_concordance_td"] = np.mean(Statistics["concordance_td"])
    Statistics["avg_ibs"] = np.mean(Statistics["ibs"])
    Statistics["avg_c_index"] = np.mean(Statistics["c_index"])    
    Statistics["std_concordance_td"] = np.std(Statistics["concordance_td"])
    Statistics["std_ibs"] = np.std(Statistics["ibs"])    
    Statistics["std_c_index"] = np.std(Statistics["c_index"])
    
    # save the model for later
    try:
        os.mkdir(f"statistics/{model_name}/models")

    except OSError as error: 
        
        file_name = f"{model_name}_"
        for k,v in experiment.items():
            file_name += f"{str(k)}_{str(v)}_"
        
        # dump model to files
        path_sc = os.path.join(f"statistics/{model_name}/models", f"sc_{file_name}.pkl")            
        path_model = os.path.join(f"statistics/{model_name}/models", f"model_{file_name}.pt")
        
        with open(path_sc, 'wb') as f:
            pickle.dump(standard_scaler, f)
            
        torch.save(model.torch_model.state_dict(), path_model)
    
        # dump statistics to pickle    
        file_name += ".pkl"
        path = os.path.join(f"statistics/{model_name}", file_name)

        with open(path, 'wb') as f:
            pickle.dump(Statistics, f)

print("DONE")

 10%|██████████▊                                                                                               | 1025/10000 [00:00<00:07, 1216.37it/s]
 13%|██████████████▌                                                                                                 | 13/100 [00:00<00:04, 20.73it/s]
 10%|██████████▉                                                                                               | 1027/10000 [00:00<00:07, 1220.23it/s]
 49%|██████████████████████████████████████████████████████▉                                                         | 49/100 [00:01<00:02, 25.28it/s]
 11%|███████████▌                                                                                               | 1080/10000 [00:01<00:09, 982.63it/s]
  9%|██████████▏                                                                                                      | 9/100 [00:00<00:04, 20.37it/s]
 10%|██████████▍                                                                              

RuntimeError: value cannot be converted to type float without overflow

# Extract the best model for each of the measurements
`c_index`, `concordance_td`, `integrated brier score`

In [9]:
# Scan the statistics saved by the grid search and keep, for each metric, the experiment
# with the best cross-validated average (higher is better for the two concordance
# measures, lower is better for the integrated Brier score).
best_statistics = {'concordance_td': -1,
                   'ibs': 1,
                   'c_index': -1
                  }

# (metric name, True when a larger value is better)
metrics = (("c_index", True), ("concordance_td", True), ("ibs", False))

# sorted() makes tie-breaking deterministic; os.listdir returns entries in arbitrary order
experiments_list = sorted(os.listdir(f"statistics/{model_name}"))
for experiment in experiments_list:
    # only per-experiment statistics files start with the model name
    if not experiment.startswith(model_name):
        continue

    path = os.path.join(f"statistics/{model_name}", experiment)
    # NOTE: pickle.load executes arbitrary code - only load files produced by this notebook.
    with open(path, "rb") as f:
        stats = pickle.load(f)

    for metric, higher_is_better in metrics:
        avg = stats[f"avg_{metric}"]
        std = stats[f"std_{metric}"]

        if higher_is_better:
            improved = avg > best_statistics[metric]
        else:
            improved = avg < best_statistics[metric]

        if improved:
            best_statistics[metric] = avg
            best_statistics[f"{metric}_std"] = std
            # for c_index this used to be written to "c_index_std", which overwrote the
            # std and never recorded the winning experiment
            best_statistics[f"{metric}_params"] = experiment

path = os.path.join(f"statistics/{model_name}", "best_model.pkl")
with open(path, 'wb') as f:
    pickle.dump(best_statistics, f)

print("DONE")

DONE
