In [1]:
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import sys
import json
import pickle
import json

from tqdm import tqdm
import itertools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

# Best config

In [None]:
# Evaluate an elastic-net-penalised Cox PH model over several random
# train/valid/test splits and collect the concordance index of each split.

# --- Configuration ---------------------------------------------------------
DATA_PATH = './../Data/1000_features_survival_3classes.csv'
seeds = [999, 7, 42, 1995, 1303, 2405, 1996, 200, 0, 777]
penalizer = 1
l1_ratio = 0.5
# NOTE(review): `test_size` was referenced below but never defined anywhere in
# the notebook, so the cell failed with NameError on a fresh kernel. 0.2 is a
# placeholder (80% train, then the hold-out is halved into 10% valid / 10%
# test) -- confirm against the split used for the reported results.
test_size = 0.2

# --- Load -------------------------------------------------------------------
data_df = pd.read_csv(DATA_PATH, index_col=0).drop(['index', 'y'], axis=1)
# Keep the untouched outcome columns aside so scaling never affects them.
data_df_event_time = data_df[['event', 'time']].copy()

# --- Encode / impute / filter ----------------------------------------------
# Integer-encode every object-typed column (the encoder is refit per column).
le = LabelEncoder()
for col in data_df.columns:
    if data_df[col].dtype == 'object':
        data_df[col] = le.fit_transform(data_df[col])

# Mean-impute missing values, then drop constant columns.
data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

# --- Scale features ---------------------------------------------------------
# NOTE(review): the scaler and the mean imputation are fit on the full data
# set before splitting, so validation/test statistics leak into training.
# Left as-is to keep results comparable; consider fitting on the train split.
feature_df = data_df.drop(['event', 'time'], axis=1)
scaler = MinMaxScaler()
data_df = pd.DataFrame(scaler.fit_transform(feature_df), columns=feature_df.columns)

# The scaled frame has a fresh 0..n-1 index while `data_df_event_time` still
# carries the index read from the CSV. Assign by position (.to_numpy()) so the
# outcomes cannot be misaligned or turned into NaN by index alignment.
data_df['event'] = data_df_event_time['event'].astype(int).to_numpy()
data_df['time'] = data_df_event_time['time'].to_numpy()

# Second pass: impute anything still missing (e.g. NaN in the raw `time`
# column) and drop columns that are constant.
data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

# --- Fit and score once per seed -------------------------------------------
train_ci_ls = []
valid_ci_ls = []
test_ci_ls = []
elapsed_time_ls = []
for seed in tqdm(seeds):
    # First split off the hold-out part, then halve it into valid / test.
    data_train, data_tmp = train_test_split(data_df, test_size=test_size, random_state=seed)
    data_valid, data_test = train_test_split(data_tmp, test_size=0.5, random_state=seed)

    cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
    start = time.perf_counter()
    cph.fit(data_train, duration_col='time', event_col='event')
    end = time.perf_counter()

    elapsed_time_ls.append(end - start)
    train_ci_ls.append(cph.score(data_train, scoring_method='concordance_index'))
    valid_ci_ls.append(cph.score(data_valid, scoring_method='concordance_index'))
    test_ci_ls.append(cph.score(data_test, scoring_method='concordance_index'))

# Every list below is ordered like `seeds`.
print("\nSeeds: ", seeds,
      "\nTrain: ", train_ci_ls, "\nValid: ", valid_ci_ls, "\nTest: ", test_ci_ls,
      "\nElapsed time: ", elapsed_time_ls)

# Feature weights

In [None]:
# Refit the Cox PH model for a single seed so that its coefficients (feature
# weights) can be inspected; `cph` and `seed` stay in scope after this cell.

# --- Configuration ---------------------------------------------------------
DATA_PATH = './../Data/1000_features_survival_3classes.csv'
seeds = [999]
penalizer = 1
l1_ratio = 0.5
# NOTE(review): `test_size` was referenced below but never defined anywhere in
# the notebook, so the cell failed with NameError on a fresh kernel. 0.2 is a
# placeholder (80% train, then the hold-out is halved into 10% valid / 10%
# test) -- confirm against the split used for the reported results.
test_size = 0.2

# --- Load -------------------------------------------------------------------
data_df = pd.read_csv(DATA_PATH, index_col=0).drop(['index', 'y'], axis=1)
# Keep the untouched outcome columns aside so scaling never affects them.
data_df_event_time = data_df[['event', 'time']].copy()

# --- Encode / impute / filter ----------------------------------------------
# Integer-encode every object-typed column (the encoder is refit per column).
le = LabelEncoder()
for col in data_df.columns:
    if data_df[col].dtype == 'object':
        data_df[col] = le.fit_transform(data_df[col])

# Mean-impute missing values, then drop constant columns.
data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

# --- Scale features ---------------------------------------------------------
# NOTE(review): the scaler and the mean imputation are fit on the full data
# set before splitting, so validation/test statistics leak into training.
# Left as-is to keep results comparable; consider fitting on the train split.
feature_df = data_df.drop(['event', 'time'], axis=1)
scaler = MinMaxScaler()
data_df = pd.DataFrame(scaler.fit_transform(feature_df), columns=feature_df.columns)

# The scaled frame has a fresh 0..n-1 index while `data_df_event_time` still
# carries the index read from the CSV. Assign by position (.to_numpy()) so the
# outcomes cannot be misaligned or turned into NaN by index alignment.
data_df['event'] = data_df_event_time['event'].astype(int).to_numpy()
data_df['time'] = data_df_event_time['time'].to_numpy()

# Second pass: impute anything still missing (e.g. NaN in the raw `time`
# column) and drop columns that are constant.
data_df = data_df.fillna(data_df.mean())
data_df = data_df.loc[:, data_df.nunique() > 1]

# --- Fit and score ----------------------------------------------------------
train_ci_ls = []
valid_ci_ls = []
test_ci_ls = []
elapsed_time_ls = []
for seed in tqdm(seeds):
    # First split off the hold-out part, then halve it into valid / test.
    data_train, data_tmp = train_test_split(data_df, test_size=test_size, random_state=seed)
    data_valid, data_test = train_test_split(data_tmp, test_size=0.5, random_state=seed)

    cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
    start = time.perf_counter()
    cph.fit(data_train, duration_col='time', event_col='event')
    end = time.perf_counter()

    elapsed_time_ls.append(end - start)
    train_ci_ls.append(cph.score(data_train, scoring_method='concordance_index'))
    valid_ci_ls.append(cph.score(data_valid, scoring_method='concordance_index'))
    test_ci_ls.append(cph.score(data_test, scoring_method='concordance_index'))

# Every list below is ordered like `seeds`.
print("\nSeeds: ", seeds,
      "\nTrain: ", train_ci_ls, "\nValid: ", valid_ci_ls, "\nTest: ", test_ci_ls,
      "\nElapsed time: ", elapsed_time_ls)

In [41]:
# Write the 'coef' column of `cph.summary` to CSV, sorted from largest to
# smallest coefficient. The file name is prefixed with the current `seed`.
# NOTE(review): relies on `cph` and `seed` left in scope by an earlier cell,
# and on the ./../results/CPH/ directory already existing.
coef_table = cph.summary['coef'].reset_index()
coef_table = coef_table.sort_values('coef', ascending=False)
coef_path = './../results/CPH/' + str(seed) + '_coef_cph.csv'
coef_table.to_csv(coef_path)