In [1]:
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import sys
import json
import pickle
import json

from tqdm import tqdm
import itertools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [2]:
seeds = [999, 7, 42]
test_size = 0.3

In [14]:

data_df = pd.read_csv('./../Data/1000_features_survival_3classes.csv',index_col=0).drop(['index', 'y'],axis=1)
data_df_event_time = data_df[['event', 'time']]

# Initialize the LabelEncoder
le = LabelEncoder()

# Loop through each column and apply encoding to object type columns
for col in data_df.columns:
    if data_df[col].dtype == 'object':
        data_df[col] = le.fit_transform(data_df[col])

data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

# data_df = pd.get_dummies(data_df.drop(['event', 'time'], axis=1),dtype='int')
scaler = MinMaxScaler()
data_df = pd.DataFrame(scaler.fit_transform(data_df.drop(['event', 'time'], axis=1)), columns=data_df.drop(['event', 'time'], axis=1).columns)
data_df['event'] = [int(e) for e in data_df_event_time['event']]
data_df['time'] = data_df_event_time['time']

data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

hyperopts_df = {'penalizer': [], 'l1_ratio': [], 'train_ci':[], 'valid_ci':[], 'test_ci':[], 'time_elapsed':[], 
                'train_ci_ls':[], 'valid_ci_ls':[], 'test_ci_ls':[]}

combinations = [i for i in itertools.product([0.01, 1], [0.5, 1])]
ncombinations = len(combinations)

for penalizer, l1_ratio in tqdm():
    hyperopts_df['penalizer'] = hyperopts_df['penalizer'] + [penalizer]
    hyperopts_df['l1_ratio'] = hyperopts_df['l1_ratio'] + [l1_ratio]
    train_ci_ls = []
    valid_ci_ls = []
    test_ci_ls = []
    elapsed_time_ls = []
    for seed in seeds:
        
        
        data_train, data_tmp = train_test_split(data_df, test_size=test_size, random_state=seed)
        data_valid, data_test = train_test_split(data_tmp, test_size=0.5, random_state=seed)
        
        cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
        start = time.time()
        cph.fit(data_train, duration_col='time', event_col='event')
        end = time.time()
        
        elapsed_time_ls = elapsed_time_ls + [end-start]
        train_ci_ls = train_ci_ls + [cph.score(data_train, scoring_method='concordance_index')]
        valid_ci_ls = valid_ci_ls + [cph.score(data_valid, scoring_method='concordance_index')]
        test_ci_ls = test_ci_ls + [cph.score(data_test, scoring_method='concordance_index')]

    print("\nTrain: ",train_ci_ls, "\nValid: ",valid_ci_ls, "\nTest: ",test_ci_ls,
          "\nElapsed time: ", elapsed_time_ls)

    hyperopts_df['train_ci'] = hyperopts_df['train_ci'] + [np.mean(train_ci_ls)]
    hyperopts_df['valid_ci'] = hyperopts_df['valid_ci'] + [np.mean(valid_ci_ls)]
    hyperopts_df['test_ci'] = hyperopts_df['test_ci'] + [np.mean(test_ci_ls)]
    hyperopts_df['time_elapsed'] = hyperopts_df['time_elapsed'] + [np.mean(elapsed_time_ls)]
    hyperopts_df['train_ci_ls'] = hyperopts_df['train_ci_ls'] + [train_ci_ls]
    hyperopts_df['valid_ci_ls'] = hyperopts_df['valid_ci_ls'] + [valid_ci_ls]
    hyperopts_df['test_ci_ls'] = hyperopts_df['test_ci_ls'] + [test_ci_ls]
    # cph.print_summary()

hyperopts_df['seeds'] = [seeds]*ncombinations
pd.DataFrame(hyperopts_df)





>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'ENSG00000022355.18'].var())
>>> print(df.loc[~events, 'ENSG00000022355.18'].var())

A very low variance means that the column ENSG00000022355.18 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.

  result = getattr(ufunc, method)(*inputs, **kwargs)

  result = getattr(ufunc, method)(*inputs, **kwargs)
1it [08:55, 535.80s/it]


Train:  [0.9618255491576029, 0.9641194193371679, 0.9632435117347572] 
Valid:  [0.7118337850045167, 0.616504854368932, 0.7773886943471736] 
Test:  [0.6545842217484008, 0.7289760348583878, 0.7077922077922078] 
Elapsed time:  [240.02991127967834, 31.914050579071045, 263.5219123363495]




  result = getattr(ufunc, method)(*inputs, **kwargs)



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'ENSG00000022355.18'].var())
>>> print(df.loc[~events, 'ENSG00000022355.18'].var())

A very low variance means that the column ENSG00000022355.18 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.

  result = getattr(ufunc, method)(*inputs, **kwargs)
2it [24:24, 766.95s/it]


Train:  [0.9275868085848892, 0.9260897962623518, 0.9247899333581688] 
Valid:  [0.7276422764227642, 0.6954986760812003, 0.6828414207103551] 
Test:  [0.7313432835820896, 0.787363834422658, 0.6432900432900432] 
Elapsed time:  [278.9392783641815, 226.32988619804382, 422.8249480724335]





3it [38:15, 796.32s/it]


Train:  [0.7858236879350123, 0.7755093441206835, 0.7989776066890186] 
Valid:  [0.7005420054200542, 0.7294792586054722, 0.7698849424712356] 
Test:  [0.6652452025586354, 0.6570806100217865, 0.6943722943722944] 
Elapsed time:  [273.5999803543091, 293.90710616111755, 263.40561175346375]





4it [50:35, 758.97s/it]


Train:  [0.7858236879350123, 0.7755304131639382, 0.7988741255846682] 
Valid:  [0.7005420054200542, 0.7294792586054722, 0.7693846923461731] 
Test:  [0.6652452025586354, 0.6570806100217865, 0.6943722943722944] 
Elapsed time:  [177.59113669395447, 309.60726022720337, 252.45375037193298]





ValueError: All arrays must be of the same length

In [21]:
hyperopts_df = pd.DataFrame(hyperopts_df)
hyperopts_df.to_csv('./../results/CPH/hyperopts_results.df')

In [22]:
pd.read_csv('./../results/CPH/hyperopts_results.df',index_col=0).sort_values('valid_ci')

Unnamed: 0,penalizer,l1_ratio,train_ci,valid_ci,test_ci,time_elapsed,train_ci_ls,valid_ci_ls,test_ci_ls,seeds
0,0.01,0.5,0.963063,0.701909,0.697117,178.488625,"[0.9618255491576029, 0.9641194193371679, 0.963...","[0.7118337850045167, 0.616504854368932, 0.7773...","[0.6545842217484008, 0.7289760348583878, 0.707...","[999, 7, 42]"
1,0.01,1.0,0.926156,0.701994,0.720666,309.364704,"[0.9275868085848892, 0.9260897962623518, 0.924...","[0.7276422764227642, 0.6954986760812003, 0.682...","[0.7313432835820896, 0.787363834422658, 0.6432...","[999, 7, 42]"
3,1.0,1.0,0.786743,0.733135,0.672233,246.550716,"[0.7858236879350123, 0.7755304131639382, 0.798...","[0.7005420054200542, 0.7294792586054722, 0.769...","[0.6652452025586354, 0.6570806100217865, 0.694...","[999, 7, 42]"
2,1.0,0.5,0.78677,0.733302,0.672233,276.970899,"[0.7858236879350123, 0.7755093441206835, 0.798...","[0.7005420054200542, 0.7294792586054722, 0.769...","[0.6652452025586354, 0.6570806100217865, 0.694...","[999, 7, 42]"


# Best config

In [25]:
data_df = pd.read_csv('./../data/breast_cancer/1000_features_concat_survival_3classes.csv',index_col=0).drop(['index', 'y'],axis=1)
data_df_event_time = data_df[['event', 'time']]

# Initialize the LabelEncoder
le = LabelEncoder()

# Loop through each column and apply encoding to object type columns
for col in data_df.columns:
    if data_df[col].dtype == 'object':
        data_df[col] = le.fit_transform(data_df[col])

data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

# data_df = pd.get_dummies(data_df.drop(['event', 'time'], axis=1),dtype='int')
scaler = MinMaxScaler()
data_df = pd.DataFrame(scaler.fit_transform(data_df.drop(['event', 'time'], axis=1)), columns=data_df.drop(['event', 'time'], axis=1).columns)
data_df['event'] = [int(e) for e in data_df_event_time['event']]
data_df['time'] = data_df_event_time['time']

data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

seeds = [999, 7, 42, 1995, 1303, 2405, 1996, 200, 0, 777]
penalizer = 1
l1_ratio = 0.5
train_ci_ls = []
valid_ci_ls = []
test_ci_ls = []
elapsed_time_ls = []
for seed in tqdm(seeds):

    data_train, data_tmp = train_test_split(data_df, test_size=test_size, random_state=seed)
    data_valid, data_test = train_test_split(data_tmp, test_size=0.5, random_state=seed)
    
    cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
    start = time.time()
    cph.fit(data_train, duration_col='time', event_col='event')
    end = time.time()
    
    elapsed_time_ls = elapsed_time_ls + [end-start]
    train_ci_ls = train_ci_ls + [cph.score(data_train, scoring_method='concordance_index')]
    valid_ci_ls = valid_ci_ls + [cph.score(data_valid, scoring_method='concordance_index')]
    test_ci_ls = test_ci_ls + [cph.score(data_test, scoring_method='concordance_index')]

print("\nTrain: ", seed,
      "\nTrain: ",train_ci_ls, "\nValid: ",valid_ci_ls, "\nTest: ",test_ci_ls,
      "\nElapsed time: ", elapsed_time_ls)











100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [39:57<00:00, 239.77s/it]


Train:  777 
Train:  [0.7858236879350123, 0.7755093441206835, 0.7989776066890186, 0.7962046392810711, 0.7972950055117443, 0.7649655940495604, 0.7880371290809065, 0.7857633611625825, 0.7867192118226601, 0.7828326533976182] 
Valid:  [0.7005420054200542, 0.7294792586054722, 0.7698849424712356, 0.694300518134715, 0.722707423580786, 0.6735149620366235, 0.7225378787878788, 0.6493679308050565, 0.6555401019935095, 0.7315157116451017] 
Test:  [0.6652452025586354, 0.6570806100217865, 0.6943722943722944, 0.6125076640098099, 0.5959552495697074, 0.6159334126040428, 0.645304312777106, 0.6987504221546774, 0.6603194103194103, 0.5444234404536862] 
Elapsed time:  [177.59113669395447, 309.60726022720337, 252.45375037193298, 335.3821527957916, 322.638436794281, 337.438747882843, 297.6262867450714, 179.87121534347534, 183.6999866962433, 180.09716176986694, 181.84937500953674, 190.54320359230042, 187.36748957633972]





# Feature weights

In [3]:
data_df = pd.read_csv('./../data/breast_cancer/1000_features_concat_survival_3classes.csv',index_col=0).drop(['index', 'y'],axis=1)
data_df_event_time = data_df[['event', 'time']]

# Initialize the LabelEncoder
le = LabelEncoder()

# Loop through each column and apply encoding to object type columns
for col in data_df.columns:
    if data_df[col].dtype == 'object':
        data_df[col] = le.fit_transform(data_df[col])

data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

# data_df = pd.get_dummies(data_df.drop(['event', 'time'], axis=1),dtype='int')
scaler = MinMaxScaler()
data_df = pd.DataFrame(scaler.fit_transform(data_df.drop(['event', 'time'], axis=1)), columns=data_df.drop(['event', 'time'], axis=1).columns)
data_df['event'] = [int(e) for e in data_df_event_time['event']]
data_df['time'] = data_df_event_time['time']

data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

seeds = [999]
penalizer = 1
l1_ratio = 0.5
train_ci_ls = []
valid_ci_ls = []
test_ci_ls = []
elapsed_time_ls = []
for seed in tqdm(seeds):

    data_train, data_tmp = train_test_split(data_df, test_size=test_size, random_state=seed)
    data_valid, data_test = train_test_split(data_tmp, test_size=0.5, random_state=seed)
    
    cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
    start = time.time()
    cph.fit(data_train, duration_col='time', event_col='event')
    end = time.time()
    
    elapsed_time_ls = elapsed_time_ls + [end-start]
    train_ci_ls = train_ci_ls + [cph.score(data_train, scoring_method='concordance_index')]
    valid_ci_ls = valid_ci_ls + [cph.score(data_valid, scoring_method='concordance_index')]
    test_ci_ls = test_ci_ls + [cph.score(data_test, scoring_method='concordance_index')]

print("\nTrain: ", seed,
      "\nTrain: ",train_ci_ls, "\nValid: ",valid_ci_ls, "\nTest: ",test_ci_ls,
      "\nElapsed time: ", elapsed_time_ls)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [05:34<00:00, 334.45s/it]


Train:  999 
Train:  [0.7858236879350123] 
Valid:  [0.7005420054200542] 
Test:  [0.6652452025586354] 
Elapsed time:  [334.2337603569031]





In [41]:
cph.summary['coef'].reset_index().sort_values('coef',ascending=False).to_csv('./../results/CPH/'+str(seed)+'_coef_cph.csv')