In [1]:
import pandas as pd
import numpy as np
import h5py
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../theano/DeepSurv/deepsurv/')
import utils

# Helper functions

In [2]:
def save_dataset(datasets, file, mode='w'):
    """Write a nested ``{group: {name: array}}`` mapping to an HDF5 file.

    Every top-level key of ``datasets`` becomes an HDF5 group, and every entry
    of its inner mapping becomes a dataset inside that group. A progress line
    is printed for each dataset written.

    Parameters
    ----------
    datasets : dict
        Mapping of group name -> mapping of dataset name -> array-like.
    file : str
        Path of the HDF5 file to write.
    mode : str, optional
        h5py file mode. Defaults to ``'w'`` (create the file, truncating any
        existing one) so that re-running a save cell does not fail because
        the groups already exist. Pass ``'a'`` to add groups to an existing
        file instead.
    """
    # The mode is passed explicitly because h5py >= 3.0 opens files read-only
    # by default. The context manager closes the file even if a write raises.
    with h5py.File(file, mode) as f:
        for (grp, ds) in datasets.items():
            grp_f = f.create_group(grp)
            for (key, value) in ds.items():
                print('Saving: (%s, %s)' % (grp, key))
                grp_f.create_dataset(key, data=value)

def split_dataset(dataset, p = .2, random_state = None):
    """Randomly split a survival dataset into two aligned parts.

    Parameters
    ----------
    dataset : dict
        Mapping with the keys ``'x'``, ``'e'`` and ``'t'``. Only these three
        entries are split; any other key (for example ``'hr'``) is not
        carried over to the result.
    p : float, optional
        Passed to ``train_test_split`` as ``test_size``. Defaults to 0.2.
    random_state : int or None, optional
        Passed to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an integer for a reproducible
        split.

    Returns
    -------
    (train, test) : tuple of dict
        Two dicts with the keys ``'x'``, ``'e'`` and ``'t'``.
    """
    # Splitting the three arrays in one call keeps their rows aligned.
    x_train, x_test, e_train, e_test, t_train, t_test = train_test_split(
        dataset['x'], dataset['e'], dataset['t'],
        test_size = p, random_state = random_state)
    train = {
        'x' : x_train,
        'e' : e_train,
        't' : t_train
    }
    test = {
        'x' : x_test,
        'e' : e_test,
        't' : t_test
    }
    return (train, test)

# Save Simulated datasets

In [30]:
import datasets

# Choose which simulator method to use; the alternative is kept commented out.
# method = 'linear'
method = 'gaussian'
treatment_group = True

print('Method:', method)
# One simulator instance is shared by all four splits below.
# NOTE(review): the meaning of hr_ratio / average_death / end_time / num_var is
# defined in the project's `datasets` module, which is not visible here.
ds = datasets.SimulatedData(hr_ratio=10,
            average_death=5, end_time=15, 
            num_features=10, num_var=2, treatment_group=treatment_group)
# Draw the splits with separate generate_data calls: 4000 train, 1000 valid,
# 1000 test and 10000 "viz" samples. No random seed is set in this cell, so
# the statement order and any earlier kernel state may affect the draws.
train_data = ds.generate_data(4000, method=method)
valid_data = ds.generate_data(1000,method=method)
test_data = ds.generate_data(1000, method=method)
viz_data = ds.generate_data(10000, method=method)
# Group name -> generated split, in the layout save_dataset expects.
sim_dataset = {'train': train_data, 'valid': valid_data, 'test': test_data, 'viz' : viz_data}

Method: gaussian
[ 1.  1.  1. ...,  0.  1.  1.]
[ 0.  0.  0.  0.  0.  1.  1.  0.  1.  0.  0.  1.  0.  1.  1.  0.  0.  0.
  0.  1.  0.  0.  1.  0.  1.  0.  0.  1.  1.  0.  0.  1.  0.  1.  1.  1.
  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  1.  1.  1.  0.
  0.  0.  1.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  1.
  0.  0.  0.  1.  1.  1.  1.  1.  0.  1.  1.  0.  1.  1.  0.  0.  0.  1.
  0.  1.  1.  1.  1.  1.  1.  0.  1.  0.  0.  1.  0.  1.  0.  0.  0.  1.
  1.  0.  0.  0.  1.  1.  0.  1.  0.  1.  1.  1.  0.  0.  1.  0.  0.  1.
  0.  1.  1.  1.  1.  0.  1.  1.  0.  1.  0.  0.  1.  1.  0.  1.  0.  1.
  0.  0.  0.  0.  1.  1.  0.  1.  1.  0.  1.  1.  1.  0.  0.  1.  0.  1.
  0.  0.  1.  1.  1.  0.  0.  0.  0.  1.  1.  1.  0.  0.  1.  1.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  1.  1.  0.  1.  0.  0.  0.  1.  0.  1.  1.
  0.  0.  1.  0.  1.  0.  1.  1.  1.  1.  0.  0.  0.  0.  1.  0.  0.  1.
  0.  0.  1.  1.  1.  1.  0.  0.  0.  0.  1.  1.  1.  0.  1.  0.  0.  0.
  1

In [31]:
# Persist the simulated splits; each split is written as its own HDF5 group.
save_dataset(sim_dataset, '../theano/experiments/sim_treatment/data/sim_treatment_dataset.h5')

Saving: (train, x)
Saving: (train, e)
Saving: (train, t)
Saving: (train, hr)
Saving: (valid, x)
Saving: (valid, e)
Saving: (valid, t)
Saving: (valid, hr)
Saving: (test, x)
Saving: (test, e)
Saving: (test, t)
Saving: (test, hr)
Saving: (viz, x)
Saving: (viz, e)
Saving: (viz, t)
Saving: (viz, hr)


# Worcester Heart Attack Study (WHAS)

In [16]:
# Read the whitespace-delimited WHAS file. Column names are supplied
# explicitly, so read_csv treats the first line of the file as data.
whas_fp = './datasets/whasncc2.dat'
whas_names = ['SET', 'CASE', 'T', 'LENFOL', 'FSTAT', 'AGE', 'SEX', 'BMI', 'CHF',
          'MIORD', 'NR']
# Raw string: '\s' in a plain string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
whas_df = pd.read_csv(whas_fp, sep=r'\s+', names = whas_names)

# Feature matrix (column order matters: later cells drop SEX by position),
# event indicator and time, cast to the dtypes used in the saved files.
whas_x = whas_df[['CASE', 'AGE', 'SEX', 'BMI', 'CHF', 'MIORD']].values.astype(np.float32)
whas_e = np.squeeze(whas_df[['FSTAT']].values).astype(np.int32)
whas_t = np.squeeze(whas_df[['T']].values).astype(np.float32)

# 80/20 train/test split. The seed is fixed so that re-running the notebook
# reproduces the same split.
whas_x_train, whas_x_test, whas_e_train, whas_e_test, whas_t_train, whas_t_test = \
train_test_split(whas_x, whas_e, whas_t, test_size = 0.2, random_state = 0)

In [18]:
# Bundle the WHAS arrays into the {split: {'x', 'e', 't'}} layout and write
# them to the experiment's data directory.
whas_train = { 'x': whas_x_train, 'e': whas_e_train, 't': whas_t_train}
whas_test = { 'x': whas_x_test, 'e': whas_e_test, 't': whas_t_test}
whas_ds = { 'train' : whas_train, 'test': whas_test}
save_dataset(whas_ds, '../theano/experiments/whas/data/whas_train_test.h5')

Saving: (train, x)
Saving: (train, e)
Saving: (train, t)
Saving: (test, x)
Saving: (test, e)
Saving: (test, t)


In another part of the experiment, we remove the SEX feature and rerun the models.

In [11]:
# Reload the saved WHAS train/test split from disk.
# NOTE(review): this rebinds `whas_df`, which held the raw DataFrame in the
# WHAS loading cell, to the mapping of splits returned by utils.load_datasets.
whas_df = utils.load_datasets('../theano/experiments/whas/data/whas_train_test.h5')

# Remove column index 2 (axis=1) from each feature matrix. With the column
# order used when the file was built -- CASE, AGE, SEX, BMI, CHF, MIORD --
# index 2 is SEX.
whas_df['train']['x'] = np.delete(whas_df['train']['x'], 2, 1)
# whas_df['valid']['x'] = np.delete(whas_df['valid']['x'], 2, 1)
whas_df['test']['x'] = np.delete(whas_df['test']['x'], 2, 1)

In [12]:
# Write the splits to a separate file so the original train/test file is left untouched.
save_dataset(whas_df, '../theano/experiments/whas/data/whas_train_test_no_sex.h5')

Saving: (test, e)
Saving: (test, t)
Saving: (test, x)
Saving: (train, e)
Saving: (train, t)
Saving: (train, x)


# METABRIC

In [22]:
# Load the tab-separated clinical table and keep only the survival outcome
# columns, indexed by patient.
metabric_clinical = pd.read_csv('./datasets/brca_metabric/data_clinical_supp_patient.txt', sep='\t')
metabric_clinical = metabric_clinical.set_index('PATIENT_ID')[['OS_MONTHS', 'OS_STATUS']]
# Encode the event indicator: 1 where OS_STATUS equals 'DECEASED', 0 for every
# other value (a missing status would also become 0).
metabric_clinical['OS_STATUS'] = (metabric_clinical['OS_STATUS'] == 'DECEASED').astype(np.int32)

Unnamed: 0_level_0,OS_MONTHS,OS_STATUS
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
MB-0000,140.500000,0
MB-0002,84.633333,0
MB-0005,163.100000,1
MB-0006,164.933333,0
MB-0008,41.366667,1
MB-0010,7.800000,1
MB-0014,164.333333,0
MB-0020,22.400000,1
MB-0022,99.533333,1
MB-0028,36.566667,1


In [14]:
# Gene-expression table: index the rows by gene symbol, transpose so that the
# former columns become rows, then drop the 'Entrez_Gene_Id' row that the
# transpose produces.
metabric_expression = (
    pd.read_csv('./datasets/brca_metabric/data_expression.txt', sep='\t')
    .set_index('Hugo_Symbol')
    .transpose()
    .drop('Entrez_Gene_Id')
)

In [23]:
# Attach the survival outcome to each expression row. DataFrame.join is a
# left join on the index, so every expression row is kept.
# NOTE(review): rows without a matching clinical record would get NaN
# outcomes -- confirm that none exist.
metabric_df = metabric_expression.join(metabric_clinical)
# 80/20 train/test split. The seed is fixed so that re-running the notebook
# reproduces the same split.
metabric_train_df, metabric_test_df = train_test_split(metabric_df, test_size = 0.2, random_state = 0)

In [33]:
def _metabric_arrays(frame):
    """Return (features, events, durations) arrays for a joined METABRIC frame.

    Features are every column except the two outcome columns, as float32;
    events come from OS_STATUS as int32; durations from OS_MONTHS as float32.
    """
    features = frame.drop(['OS_MONTHS', 'OS_STATUS'], axis = 1).values.astype(np.float32)
    events = frame['OS_STATUS'].values.astype(np.int32)
    durations = frame['OS_MONTHS'].values.astype(np.float32)
    return features, events, durations


# Train split first, then test, each bundled under the keys 'x', 'e', 't'.
mb_x_train, mb_e_train, mb_t_train = _metabric_arrays(metabric_train_df)
mb_train = dict(x=mb_x_train, e=mb_e_train, t=mb_t_train)

mb_x_test, mb_e_test, mb_t_test = _metabric_arrays(metabric_test_df)
mb_test = dict(x=mb_x_test, e=mb_e_test, t=mb_t_test)

metabric_ds = dict(train=mb_train, test=mb_test)

In [35]:
# Persist the METABRIC train/test arrays (all gene-expression columns as features).
save_dataset(metabric_ds, file='./datasets/metabric_all_gene_expression_train_test.h5')

Saving: (train, x)
Saving: (train, e)
Saving: (train, t)
Saving: (test, x)
Saving: (test, e)
Saving: (test, t)


# Cancer Treatment

In [13]:
# The cancer-treatment data ships already split into train and test CSVs.
# The first column of each file is used as the row index.
ct_train_df = pd.read_csv('./datasets/cancer_treatment_train.csv',index_col=0)
ct_test_df = pd.read_csv('./datasets/cancer_treatment_test.csv', index_col=0)

In [14]:
# Feature columns, in the order they appear in the saved 'x' matrix.
names = ['treat', 'size', 'meno', 'age', 'nodes', 'pr', 'er']


def _ct_arrays(frame, feature_cols):
    """Return (features, events, durations) arrays for a cancer-treatment frame.

    Features are the ``feature_cols`` columns as float32; events come from the
    'event' column as int32; durations from the 'time' column as float32.
    """
    features = frame[feature_cols].values.astype(np.float32)
    events = frame['event'].values.astype(np.int32)
    durations = frame['time'].values.astype(np.float32)
    return features, events, durations


# Train split first, then test, each bundled under the keys 'x', 'e', 't'.
ct_x_train, ct_e_train, ct_t_train = _ct_arrays(ct_train_df, names)
ct_train = dict(x=ct_x_train, e=ct_e_train, t=ct_t_train)

ct_x_test, ct_e_test, ct_t_test = _ct_arrays(ct_test_df, names)
ct_test = dict(x=ct_x_test, e=ct_e_test, t=ct_t_test)

ct_ds = dict(train=ct_train, test=ct_test)

In [15]:
# Persist the cancer-treatment train/test arrays next to the source CSVs.
save_dataset(ct_ds, './datasets/cancer_treatment_train_test.h5')

Saving: (train, x)
Saving: (train, e)
Saving: (train, t)
Saving: (test, x)
Saving: (test, e)
Saving: (test, t)


In [73]:
# Eyeball the in-memory dict. This prints the full arrays; printing the
# shapes instead would keep the output short.
ct_ds

{'test': {'e': array([1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
         0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
         1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
         0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
         0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
         0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
         1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
         1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
         0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
         0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
         1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 

In [4]:
# Round-trip check: reload the file that was just written and display it.
# NOTE(review): this rebinds `ds`, which held the SimulatedData instance in
# the simulated-data cell.
ds = utils.load_datasets('./datasets/cancer_treatment_train_test.h5'); ds

defaultdict(dict,
            {'test': {'e': array([1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
                     0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
                     0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
                     1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
                     0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
                     0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
                     0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
                     1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
                     1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
                     0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
                     0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 

In [9]:
# Sanity check: e, t and x of one split should all have the same number of rows.
# NOTE(review): the recorded output shows 686 rows for e and t but 1546 rows
# for x, so `ds` did not hold a consistent split when this cell last ran
# (likely stale kernel state) -- re-run from a fresh kernel and verify.
print(ds['train']['e'].shape)
print(ds['train']['t'].shape)
print(ds['train']['x'].shape)

(686,)
(686,)
(1546, 7)


# Convert WHAS + Treatment from train/test to train/valid/test

In [7]:
# Reload the saved WHAS train/test file. Only the train part is split
# further; the test part is kept as is.
whas_ds = utils.load_datasets('../theano/experiments/whas/data/whas_train_test.h5')
whas_train_ds = whas_ds['train']

In [15]:
# Carve a validation set out of the training data (split_dataset's default
# fraction) and reassemble the three splits. The split is random and unseeded.
whas_train, whas_valid = split_dataset(whas_train_ds)
whas = {'train': whas_train, 'valid': whas_valid, 'test': whas_ds['test']}

In [16]:
# Written to a new file name; the two-way train/test file is kept.
save_dataset(whas, '../theano/experiments/whas/data/whas_train_valid_test.h5')

Saving: (train, x)
Saving: (train, e)
Saving: (train, t)
Saving: (valid, x)
Saving: (valid, e)
Saving: (valid, t)
Saving: (test, e)
Saving: (test, t)
Saving: (test, x)


In [17]:
# Same train -> train/valid conversion for the cancer-treatment data.
# NOTE(review): this reads from ../theano/experiments/treatment/data/, whereas
# the cancer-treatment file above was saved under ./datasets/ with a different
# name -- presumably it was copied or renamed outside this notebook; confirm.
treatment_ds = utils.load_datasets('../theano/experiments/treatment/data/treatment_cancer_train_test.h5')
trt_train_ds = treatment_ds['train']
# Random, unseeded split using split_dataset's default fraction.
trt_train, trt_valid = split_dataset(trt_train_ds)
trt_ds = {'train': trt_train, 'valid': trt_valid, 'test': treatment_ds['test']}

In [18]:
# Written to a new file name; the two-way train/test file is kept.
save_dataset(trt_ds, '../theano/experiments/treatment/data/treatment_cancer_train_valid_test.h5')

Saving: (train, x)
Saving: (train, e)
Saving: (train, t)
Saving: (valid, x)
Saving: (valid, e)
Saving: (valid, t)
Saving: (test, e)
Saving: (test, t)
Saving: (test, x)


# Normalize Datasets
- Normalize the train, valid, test, and viz datasets using the train data's mean and standard deviation

In [74]:
import copy
import os

def normalize_dataset(ds_dict):
    """Standardize the ``'x'`` array of every split with train-split statistics.

    The per-feature mean and standard deviation are computed on
    ``ds_dict['train']['x']`` (along axis 0) and applied to the ``'x'`` entry
    of every split in ``ds_dict``. Both statistics are printed.

    Parameters
    ----------
    ds_dict : dict
        Mapping of split name -> mapping that contains an ``'x'`` array.
        A ``'train'`` split is required.

    Returns
    -------
    dict
        Deep copy of ``ds_dict`` with normalized ``'x'`` arrays. The input is
        not modified, and entries other than ``'x'`` are copied unchanged.
    """
    train_x = ds_dict['train']['x']
    train_mean = train_x.mean(axis = 0)
    train_std = train_x.std(axis = 0)

    print('Train mean:', train_mean)
    print('Train std:', train_std)

    # A feature that is constant on the train split has a standard deviation
    # of 0, and dividing by it would produce NaN/inf. Divide by 1 instead, so
    # such features are only centered.
    safe_std = np.where(train_std == 0, 1, train_std)

    normed_ds = copy.deepcopy(ds_dict)
    for split in normed_ds.values():
        split['x'] = (split['x'] - train_mean) / safe_std

    return normed_ds

def append_fp_suffix(ds_fp, suffix):
    """Insert ``suffix`` between a path's stem and its file extension.

    The path is split with ``os.path.splitext``, so only the last extension
    is treated as the extension; a path without one simply gets ``suffix``
    appended.
    """
    stem, extension = os.path.splitext(ds_fp)
    renamed = stem + suffix
    return renamed + extension

In [81]:
# Files to normalize. Only the linear dataset is active; the other paths are
# kept commented out so they can be re-enabled one at a time.
dataset_fp = [
    '../theano/experiments/linear/data/linear_survival_data.h5',
#     '../theano/experiments/gaussian/data/nonlinear_survival_data.h5',
#     '../theano/experiments/whas/data/whas_train_valid_test.h5',
#     '../theano/experiments/treatment/data/treatment_cancer_train_valid_test.h5',
#     '../theano/experiments/sim_treatment/data/sim_treatment_dataset.h5'
             ]
# For each file: load it, normalize with train statistics, and derive the
# output path by inserting "_normalized" before the extension.
# NOTE: with the save call commented out, this loop only prints the target
# path and writes nothing. `ds_dict_normalized` keeps the last processed
# dataset after the loop ends.
for ds_fp in dataset_fp:
    ds_dict = utils.load_datasets(ds_fp)
    ds_dict_normalized = normalize_dataset(ds_dict)
    norm_ds_fp = append_fp_suffix(ds_fp, "_normalized")
    print(norm_ds_fp)
#     save_dataset(ds_dict_normalized, norm_ds_fp)

Train mean: [-0.00668903  0.01541482 -0.0119781   0.00151574 -0.01040963  0.00754932
  0.00183946 -0.00625678 -0.00664587 -0.00326915]
Train std: [ 0.57857269  0.58163404  0.58047062  0.5731315   0.57402116  0.57840085
  0.57474554  0.5802719   0.58213401  0.57194531]


In [92]:
# Shape check on the normalized 'viz' features of the last dataset processed by the loop above.
ds_dict_normalized['viz']['x'].shape

(10000, 10)

In [99]:
# Inspect the 'hr' array of the validation split in the normalized file on disk.
# NOTE(review): the file must already exist; the save call in the
# normalization loop is commented out, so a fresh run does not create it.
utils.load_datasets('../theano/experiments/linear/data/linear_survival_data_normalized.h5')['valid']['hr']

array([  2.96792626e-01,  -1.85532761e+00,  -1.23894727e+00,
         1.69436109e+00,  -2.50337696e+00,  -2.00310573e-01,
        -9.42579925e-01,  -1.40498698e+00,  -3.53913695e-01,
        -5.29858433e-02,  -2.67432123e-01,  -2.92784065e-01,
         1.44410896e+00,  -1.29927039e+00,  -8.44901875e-02,
        -1.29647052e+00,  -2.62248087e+00,  -7.35340044e-02,
         1.24726462e+00,  -1.62776923e+00,  -2.62512279e+00,
        -5.36547005e-01,  -1.67469814e-01,  -6.69448614e-01,
         1.42191529e+00,  -1.89232254e+00,   1.26844275e+00,
        -2.23252511e+00,  -7.44044602e-01,  -5.02446055e-01,
        -5.80432951e-01,  -1.19971800e+00,  -1.40576744e+00,
         2.27743840e+00,  -1.41765594e+00,  -3.37160826e-01,
         1.81143686e-01,  -7.81846166e-01,  -2.51729399e-01,
        -5.08802570e-02,   1.72292531e-01,   8.76074076e-01,
        -8.76820743e-01,  -1.18265069e+00,  -3.52568358e-01,
        -1.66348732e+00,  -1.50867546e+00,   6.59239590e-01,
        -8.59788179e-01,

# SUPPORT2
http://biostat.mc.vanderbilt.edu/wiki/Main/SupportDesc

9k patients.
What are the relevant features? How do we extract them without doing "feature selection"?

- death: death at any time up to NDI date
- hospdead: death in hospital
- slos: days from study entry to discharge
- d.time: days of follow-up

- age
- sex
- num.co: num of comorbidities
- edu
- income
- race
- diabetes
- dementia
- ca (cancer): no, yes, metastatic

Day 3 data

- meanbp: mean arterial blood pressure
- hrt: heart rate
- resp: respiration rate
- temp: temperature
- sod: serum sodium day
- 

In [29]:
# Load the raw SUPPORT2 table.
sdf = pd.read_csv('./datasets/support2.csv')
# NOTE: this selection is neither assigned nor the last expression of the
# cell, so it is not displayed and has no effect.
sdf[['death', 'hospdead', 'slos', 'd.time']]
# Median d.time among rows with death == 1: where() masks all other rows to
# NaN, and median() skips NaN.
print(sdf.where(sdf['death'] == 1)['d.time'].median())
# Mean of the death column (the event rate, assuming death is coded 0/1).
sdf['death'].mean()

58.0


0.68105436573311362

In [67]:
# Keep the covariates used to build the feature matrix plus the survival
# outcome (d.time, death). The original cell kept only the two outcome
# columns, so the factorize step that follows failed with a KeyError on
# 'sex' when the notebook was run top to bottom. 'edu' and 'income' are not
# selected (they were commented out in the original cell).
# .copy() makes the selection an independent frame, so adding or replacing
# columns afterwards does not trigger SettingWithCopyWarning.
sdf = sdf[['age', 'sex', 'num.co', 'race', 'diabetes', 'dementia', 'ca',
           'meanbp', 'hrt', 'resp', 'temp', 'sod', 'wblc', 'crea',
           'd.time', 'death']].copy()

In [69]:
# Replace the categorical columns with integer codes. pd.factorize numbers
# the distinct values in order of first appearance, starting at 0.
# NOTE(review): factorize encodes missing values as -1, so the dropna() in a
# later cell no longer sees missing sex/race/ca entries -- confirm that
# keeping them as a "-1" category is intended.
factor_cols = ['sex','race','ca']
for factor in factor_cols:
    sdf[factor] = pd.factorize(sdf[factor])[0]

# Displays the whole frame; sdf.head() would keep the output short.
sdf

Unnamed: 0,age,sex,num.co,race,diabetes,dementia,ca,meanbp,hrt,resp,temp,sod,wblc,crea,d.time,death
1,62.84998,0,0,0,0,0,0,97.0,69.0,22.0,36.00000,141.0,6.000000,1.199951,2029,0
2,60.33899,1,2,1,0,0,1,43.0,112.0,34.0,34.59375,132.0,17.097656,5.500000,4,1
3,52.74698,1,2,1,0,0,1,70.0,88.0,28.0,37.39844,134.0,8.500000,2.000000,47,1
4,42.38498,1,2,1,0,0,0,75.0,88.0,32.0,35.00000,139.0,9.099609,0.799927,133,1
5,79.88495,1,1,1,0,0,1,59.0,112.0,20.0,37.89844,143.0,13.500000,0.799927,2029,0
6,93.01599,0,1,1,0,0,1,110.0,101.0,44.0,38.39844,140.0,10.398438,0.699951,4,1
7,62.37097,0,1,1,0,0,1,78.0,120.0,28.0,37.39844,132.0,11.699219,1.599854,659,1
8,86.83899,0,3,1,1,0,1,72.0,100.0,26.0,37.59375,139.0,13.599609,2.000000,142,1
9,85.65594,0,2,2,0,1,0,97.0,56.0,20.0,36.59375,143.0,9.699219,1.000000,63,1
10,42.25897,1,0,3,0,0,0,84.0,94.0,20.0,38.19531,139.0,11.298828,0.799927,370,1


In [73]:
# Drop every row that still has a missing value in any remaining column.
# NOTE: this overwrites `sdf` in place, so the unfiltered frame is gone
# after this cell.
sdf = sdf.dropna()
# Double brackets select a one-column frame, so t and e are (n, 1) arrays
# here; they are flattened with np.squeeze when the dict is built below.
t = sdf[['d.time']].values.astype(np.float32)
e = sdf[['death']].values.astype(np.int32)
# Feature matrix: 14 covariates, in this column order.
x = sdf[['age','sex','num.co',
         'race', 'diabetes', 'dementia', 
         'ca','meanbp','hrt','resp','temp','sod',
        'wblc', 'crea']].values.astype(np.float32)
support_ds = { 'x' : x, 't': np.squeeze(t), 'e' : np.squeeze(e)}

In [74]:
# Eyeball the assembled arrays (numpy abbreviates the long ones in the output).
support_ds

{'e': array([0, 1, 1, ..., 0, 1, 1], dtype=int32),
 't': array([ 2029.,     4.,    47., ...,   346.,     7.,   198.], dtype=float32),
 'x': array([[  62.8499794 ,    0.        ,    0.        , ...,  141.        ,
            6.        ,    1.19995117],
        [  60.33898926,    1.        ,    2.        , ...,  132.        ,
           17.09765625,    5.5       ],
        [  52.74697876,    1.        ,    2.        , ...,  134.        ,
            8.5       ,    2.        ],
        ..., 
        [  70.38195801,    0.        ,    1.        , ...,  139.        ,
            8.3984375 ,    2.69970703],
        [  47.01998901,    0.        ,    1.        , ...,  135.        ,
            7.59960938,    3.5       ],
        [  81.53894043,    1.        ,    1.        , ...,  137.        ,
            8.59960938,    1.19995117]], dtype=float32)}

In [75]:
# Two-stage random split with split_dataset's default fraction (p = .2):
# first hold out the test set, then carve the validation set out of the
# remaining training data. The splits are unseeded, and the second call
# reuses and overwrites `train_sds`.
train_sds, test_sds = split_dataset(support_ds)
train_sds, valid_sds = split_dataset(train_sds)
sds = { 'train': train_sds, 'valid': valid_sds, 'test': test_sds}

In [81]:
# Persist the SUPPORT splits.
# NOTE(review): the statistics cell at the end of the notebook reads
# ../theano/experiments/support/data/support_train_valid_test.h5, not this
# path -- presumably the file is copied there outside the notebook; confirm.
save_dataset(sds, './datasets/support_train_valid_test.h5')

Saving: (train, x)
Saving: (train, e)
Saving: (train, t)
Saving: (valid, x)
Saving: (valid, e)
Saving: (valid, t)
Saving: (test, x)
Saving: (test, e)
Saving: (test, t)


# Check the statistics about datasets

In [18]:
# Swap which line is commented out to inspect the linear dataset instead.
# temp = utils.load_datasets('../theano/experiments/linear/data/linear_survival_data.h5')
temp = utils.load_datasets('../theano/experiments/gaussian/data/nonlinear_survival_data.h5')
# Mean of the event indicator 'e' for each split.
for split_name, split in temp.items():
    print(split_name, np.mean(split['e']))

test 0.908
train 0.911
valid 0.91
viz 0.9065


In [24]:
temp = utils.load_datasets('../theano/experiments/support/data/support_train_valid_test.h5')
# Per split: mean of 't', mean of the event indicator 'e', and the shape of
# the feature matrix.
for split_name, split in temp.items():
    print(split_name, np.mean(split['t']))
    print(split_name, np.mean(split['e']))
    print(split_name, split['x'].shape)

test 474.363
test 0.678873239437
test (1775, 14)
train 476.208
train 0.681049665375
train (5678, 14)
valid 493.68
valid 0.678873239437
valid (1420, 14)


1179.0