In [1]:
import copy

import numpy as np
import pandas as pd
from itertools import combinations

from sklearn.model_selection import KFold
from skopt.space import Real

import sys
# NOTE(review): machine-specific absolute path; the notebook only runs where this
# folder exists. Presumably it holds Kernel_Function_3.py - confirm, and consider a
# configurable/relative location instead.
sys.path.append('G:/내 드라이브/ensemble kernel')

# Project-local helpers called by the experiment cells in this notebook.
from Kernel_Function_3 import split_data, prepare_response_variable, c_index_kernel_type

## Preprocessing

In [64]:
# Load the raw table and derive the follow-up time OS as stop - start.
data = pd.read_csv("icu_pneu.csv")
data['OS'] = data['stop'] - data['start']

# Drop the index/id/interval columns and incomplete rows, harmonise the column
# names, and recode Sex (F -> 0, M -> 1) in a single chained expression.
df = (
    data
    .drop(columns=['Unnamed: 0','id', 'start', 'stop'])
    .dropna()
    .rename(columns = {'status':'Status', 'time':'OS', 'age':'Age', 'sex':'Sex'})
    .replace({'Sex' : {'F':0, 'M':1}})
)

# Mark the discrete covariates as categorical so get_dummies encodes them.
columns_to_convert = ['event','pneu','Sex']
df = df.astype({name: 'category' for name in columns_to_convert})

# One-hot encode (first level dropped) and restore the short column names.
df_onehot = pd.get_dummies(df, drop_first=True).rename(columns={'Sex_1':'Sex', 'pneu_1':'pneu'})

# The encoded indicator columns are stored as categoricals again.
columns_to_convert = ['event_3','pneu','Sex']
df_onehot = df_onehot.astype({name: 'category' for name in columns_to_convert})
df_onehot

Unnamed: 0,Status,adm.cens.exit,Age,OS,event_3,pneu,Sex
0,1,421.0,62.533071,3.0,1,0,0
1,1,545.0,75.341530,5.0,1,0,0
2,1,372.0,76.103825,88.0,1,0,1
3,1,373.0,39.789617,8.0,1,0,0
4,1,372.0,19.173800,25.0,1,0,1
...,...,...,...,...,...,...,...
1416,1,413.0,61.442600,21.0,1,0,0
1417,1,445.0,69.851059,21.0,1,0,1
1418,1,537.0,60.232570,49.0,1,0,1
1419,1,555.0,59.521858,35.0,1,0,1


### K-fold setting

In [65]:
# Shuffled 5-fold splitter with a fixed seed; passed to every c_index_kernel_type call.
cv = KFold(random_state=36, shuffle=True, n_splits=5)

# Discrete alpha candidates: 2**-12, 2**-10, ..., 2**12.
param_grid = dict(alpha=np.power(2., np.arange(-12, 13, 2)))
# Continuous alpha range with a log-uniform prior.
param_space = dict(alpha=Real(1e-6, 1e+6, 'log-uniform'))

### Load the 100 random-state seeds

In [67]:
import random

# The seed file holds one integer per line; empty lines are skipped.
with open("random_state_100.txt", "r") as file:
    seed_lines = file.read().split("\n")

random_state = [int(line) for line in seed_lines if line]

## Scenario 1

### Check results by Kernel type

Linear Kernel

In [138]:
# Empty frame that collects one (train, test) C-index row per repetition.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 1, linear kernel: one split/fit/score cycle per stored seed.
kernel_type = 'linear'

for run in range(100):
    seed = random_state[run]
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState=seed)

    # Build the response variables for the train and test targets.
    y_train, y_test = (prepare_response_variable(target) for target in (target_train, target_test))

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    train_score, test_score = results[0], results[1]
    linear_results.loc[run] = [train_score, test_score]

linear_results.to_csv("icu.pneu_linear.csv", index=False, encoding='cp949')


Clinical Kernel

In [140]:
# Empty frame that collects one (train, test) C-index row per repetition.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 1, clinical kernel: one split/fit/score cycle per stored seed.
kernel_type = 'clinical'

for run in range(100):
    seed = random_state[run]
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState=seed)

    # Build the response variables for the train and test targets.
    y_train, y_test = (prepare_response_variable(target) for target in (target_train, target_test))

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    train_score, test_score = results[0], results[1]
    clinical_results.loc[run] = [train_score, test_score]

clinical_results.to_csv("icu.pneu_clinical.csv", index=False, encoding='cp949')

Ensemble Cox Kernel

In [142]:
# Empty frame that collects one (train, test) C-index row per repetition.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 1, ensemble Cox kernel: one split/fit/score cycle per stored seed.
kernel_type = 'ensemble_cox'

for run in range(100):
    seed = random_state[run]
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState=seed)

    # Build the response variables for the train and test targets.
    y_train, y_test = (prepare_response_variable(target) for target in (target_train, target_test))

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    train_score, test_score = results[0], results[1]
    ensemble_cox_results.loc[run] = [train_score, test_score]

ensemble_cox_results.to_csv("icu.pneu_cox.csv", index=False, encoding='cp949')

#### Ensemble AFT Kernel

In [144]:
# Empty frame that collects one (train, test) C-index row per repetition.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 1, ensemble AFT kernel: one split/fit/score cycle per stored seed.
kernel_type = 'ensemble_aft'

for run in range(100):
    seed = random_state[run]
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState=seed)

    # Build the response variables for the train and test targets.
    y_train, y_test = (prepare_response_variable(target) for target in (target_train, target_test))

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    train_score, test_score = results[0], results[1]
    ensemble_aft_results.loc[run] = [train_score, test_score]

ensemble_aft_results.to_csv("icu.pneu_aft.csv", index=False, encoding='cp949')

### Result

In [146]:
# Reload the four Scenario 1 result tables from disk, in the order they were written.
linear_results, clinical_results, ensemble_cox_results, ensemble_aft_results = map(
    pd.read_csv,
    ['icu.pneu_linear.csv', 'icu.pneu_clinical.csv', 'icu.pneu_cox.csv', 'icu.pneu_aft.csv'],
)

### The value of C-index

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = linear_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

## Scenario 2

In [71]:
# Holds the selected column list of each of the 100 runs (one row per run).
cox_remaining_variable = pd.DataFrame().assign(variable=[])

### Cross-validation for variable selection

In [None]:
# Scenario 2 variable selection. For each of the 100 stored seeds, every non-empty
# subset of the candidate covariates is scored with a 3-group validation of the
# 'ensemble_cox' kernel, and the best-scoring column set is recorded.
#
# Changes from the earlier version of this cell:
#   * All nested loops reused the name `i`, so the final
#     `cox_remaining_variable.loc[i]` always wrote to row 2 (the last group index).
#     The saved CSV therefore held a single row, and the later `['variable'][i]`
#     lookups failed for i >= 1. Each loop now has its own index name.
#   * The row shuffle is seeded with the run's seed, so the group assignment is
#     repeatable and identical for every candidate subset within a run.
for run in range(100):
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState = random_state[run])

    # Kept from the earlier version; not used by the selection below.
    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    # Columns present in every candidate frame: Sex, Age and the outcome columns.
    always_kept = ['Sex', 'Age', 'OS', 'Status']
    drop = x_train[always_kept]
    x_train_drop = x_train.drop(columns=always_kept)

    # Remaining columns are the candidates for selection.
    columns = x_train_drop.columns

    # One candidate training frame per non-empty subset of the candidate columns.
    column_groups = []
    for subset_size in range(1, len(columns) + 1):
        for column_combination in combinations(columns, subset_size):
            selected = x_train_drop[list(column_combination)]
            column_groups.append(pd.concat([selected, drop], axis=1))

    num_groups = 3
    best_cindex = []

    for candidate in column_groups:
        shuffled = candidate.sample(frac=1, random_state=random_state[run]).reset_index(drop=True)

        censored = shuffled[shuffled['Status'] == 0]    # rows with Status == 0
        uncensored = shuffled[shuffled['Status'] == 1]  # rows with Status == 1

        group_size1 = len(censored) // num_groups
        group_size2 = len(uncensored) // num_groups

        # Split both strata into num_groups slices so each group keeps the
        # censoring ratio; the last group takes the remainder rows.
        x_groups = []
        for group_idx in range(num_groups):
            is_last = group_idx == num_groups - 1
            stop1 = None if is_last else (group_idx + 1) * group_size1
            stop2 = None if is_last else (group_idx + 1) * group_size2
            group1 = censored.iloc[group_idx * group_size1:stop1]
            group2 = uncensored.iloc[group_idx * group_size2:stop2]
            x_groups.append(pd.concat([group1, group2], ignore_index=True))

        cindex = []
        for held_out in range(len(x_groups)):
            # Train on every group except the held-out one; validate on that one.
            train = pd.concat(
                [group for k, group in enumerate(x_groups) if k != held_out],
                ignore_index=True,
            )
            validation = x_groups[held_out]

            train_y = prepare_response_variable(train[['Status','OS']])
            test_y = prepare_response_variable(validation[['Status','OS']])

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_cox')
            # result[1] is taken as the validation C-index.
            cindex.append(result[1])

        # Mean validation C-index of this candidate subset.
        best_cindex.append(np.mean(cindex))

    # Position of the best mean C-index (first one on ties).
    max_num = best_cindex.index(max(best_cindex))

    # Columns of the winning candidate frame (includes Sex, Age, OS, Status).
    train_column = column_groups[max_num].columns

    cox_remaining_variable.loc[run] = [list(train_column)]

cox_remaining_variable.to_csv("icu.pneu_cox_remaining_variables.csv", index = False, encoding = 'cp949')

### C-index by Kernel type

Linear Kernel

In [79]:
# Empty frame that collects one (train, test) C-index row per repetition.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, linear kernel: refit on the columns chosen by the ensemble-Cox
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'linear'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_cox_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    linear_results.loc[i] = [results[0], results[1]]

linear_results.to_csv("icu.pneu_cox_linear.csv", index = False, encoding = 'cp949')


Clinical Kernel

In [90]:
# Empty frame that collects one (train, test) C-index row per repetition.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, clinical kernel: refit on the columns chosen by the ensemble-Cox
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'clinical'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_cox_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    clinical_results.loc[i] = [results[0], results[1]]

clinical_results.to_csv("icu.pneu_cox_clinical.csv", index = False, encoding = 'cp949')

Ensemble Cox Kernel

In [92]:
# Empty frame that collects one (train, test) C-index row per repetition.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, ensemble Cox kernel: refit on the columns chosen by the ensemble-Cox
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'ensemble_cox'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_cox_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    ensemble_cox_results.loc[i] = [results[0], results[1]]

ensemble_cox_results.to_csv("icu.pneu_cox_cox.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [95]:
# Empty frame that collects one (train, test) C-index row per repetition.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, ensemble AFT kernel: refit on the columns chosen by the ensemble-Cox
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'ensemble_aft'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_cox_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    ensemble_aft_results.loc[i] = [results[0], results[1]]

ensemble_aft_results.to_csv("icu.pneu_cox_aft.csv", index = False, encoding = 'cp949')

#### Result

In [97]:
# Reload the four Scenario 2 result tables from disk, in the order they were written.
linear_results, clinical_results, ensemble_cox_results, ensemble_aft_results = map(
    pd.read_csv,
    ['icu.pneu_cox_linear.csv', 'icu.pneu_cox_clinical.csv', 'icu.pneu_cox_cox.csv', 'icu.pneu_cox_aft.csv'],
)

#### Count of selections for remaining variables

In [None]:
# Count how often each column name appears across the 100 selected column lists.
# The CSV is read once up front; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_cox_remaining_variables.csv")['variable']

remaining_variables_all = []
for i in range(0, 100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    remaining_variables_all += eval(selected_variables[i])

# Tally in first-seen order, so the printed dict keeps the original key order.
element_counts = {}
for element in remaining_variables_all:
    element_counts[element] = element_counts.get(element, 0) + 1

print(element_counts)

### The value of C-index

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = linear_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

## Scenario 3

In [104]:
# Holds the selected column list of each of the 100 runs (one row per run).
aft_remaining_variable = pd.DataFrame().assign(variable=[])

### Cross-validation for variable selection

In [None]:
# Scenario 3 variable selection. For each of the 100 stored seeds, every non-empty
# subset of the candidate covariates is scored with a 3-group validation of the
# 'ensemble_aft' kernel, and the best-scoring column set is recorded.
#
# Changes from the earlier version of this cell:
#   * All nested loops reused the name `i`, so the final
#     `aft_remaining_variable.loc[i]` always wrote to row 2 (the last group index).
#     The saved CSV therefore held a single row, and the later `['variable'][i]`
#     lookups failed for i >= 1. Each loop now has its own index name.
#   * The row shuffle is seeded with the run's seed, so the group assignment is
#     repeatable and identical for every candidate subset within a run.
for run in range(100):
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState = random_state[run])

    # Kept from the earlier version; not used by the selection below.
    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    # Columns present in every candidate frame: Sex, Age and the outcome columns.
    always_kept = ['Sex', 'Age', 'OS', 'Status']
    drop = x_train[always_kept]
    x_train_drop = x_train.drop(columns=always_kept)

    # Remaining columns are the candidates for selection.
    columns = x_train_drop.columns

    # One candidate training frame per non-empty subset of the candidate columns.
    column_groups = []
    for subset_size in range(1, len(columns) + 1):
        for column_combination in combinations(columns, subset_size):
            selected = x_train_drop[list(column_combination)]
            column_groups.append(pd.concat([selected, drop], axis=1))

    num_groups = 3
    best_cindex = []

    for candidate in column_groups:
        shuffled = candidate.sample(frac=1, random_state=random_state[run]).reset_index(drop=True)

        censored = shuffled[shuffled['Status'] == 0]    # rows with Status == 0
        uncensored = shuffled[shuffled['Status'] == 1]  # rows with Status == 1

        group_size1 = len(censored) // num_groups
        group_size2 = len(uncensored) // num_groups

        # Split both strata into num_groups slices so each group keeps the
        # censoring ratio; the last group takes the remainder rows.
        x_groups = []
        for group_idx in range(num_groups):
            is_last = group_idx == num_groups - 1
            stop1 = None if is_last else (group_idx + 1) * group_size1
            stop2 = None if is_last else (group_idx + 1) * group_size2
            group1 = censored.iloc[group_idx * group_size1:stop1]
            group2 = uncensored.iloc[group_idx * group_size2:stop2]
            x_groups.append(pd.concat([group1, group2], ignore_index=True))

        cindex = []
        for held_out in range(len(x_groups)):
            # Train on every group except the held-out one; validate on that one.
            train = pd.concat(
                [group for k, group in enumerate(x_groups) if k != held_out],
                ignore_index=True,
            )
            validation = x_groups[held_out]

            train_y = prepare_response_variable(train[['Status','OS']])
            test_y = prepare_response_variable(validation[['Status','OS']])

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_aft')
            # result[1] is taken as the validation C-index.
            cindex.append(result[1])

        # Mean validation C-index of this candidate subset.
        best_cindex.append(np.mean(cindex))

    # Position of the best mean C-index (first one on ties).
    max_num = best_cindex.index(max(best_cindex))

    # Columns of the winning candidate frame (includes Sex, Age, OS, Status).
    train_column = column_groups[max_num].columns

    aft_remaining_variable.loc[run] = [list(train_column)]

aft_remaining_variable.to_csv("icu.pneu_aft_remaining_variables.csv", index = False, encoding = 'cp949')

### C-index by Kernel type

Linear Kernel

In [135]:
# Empty frame that collects one (train, test) C-index row per repetition.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, linear kernel: refit on the columns chosen by the ensemble-AFT
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'linear'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_aft_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    linear_results.loc[i] = [results[0], results[1]]

linear_results.to_csv("icu.pneu_aft_linear.csv", index = False, encoding = 'cp949')

Clinical Kernel

In [122]:
# Empty frame that collects one (train, test) C-index row per repetition.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, clinical kernel: refit on the columns chosen by the ensemble-AFT
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'clinical'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_aft_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    clinical_results.loc[i] = [results[0], results[1]]

clinical_results.to_csv("icu.pneu_aft_clinical.csv", index = False, encoding = 'cp949')

Ensemble Cox Kernel

In [124]:
# Empty frame that collects one (train, test) C-index row per repetition.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, ensemble Cox kernel: refit on the columns chosen by the ensemble-AFT
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'ensemble_cox'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_aft_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    ensemble_cox_results.loc[i] = [results[0], results[1]]

ensemble_cox_results.to_csv("icu.pneu_aft_cox.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [126]:
# Empty frame that collects one (train, test) C-index row per repetition.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, ensemble AFT kernel: refit on the columns chosen by the ensemble-AFT
# selection step, one split/fit/score cycle per stored seed.
kernel_type = 'ensemble_aft'

# Read the selection file once; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_aft_remaining_variables.csv")['variable']

for i in range(100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    variables = eval(selected_variables[i])

    # Restrict the encoded data to the columns selected for this run.
    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    # results[0] / results[1] fill the train_C_index / test_C_index columns.
    ensemble_aft_results.loc[i] = [results[0], results[1]]

ensemble_aft_results.to_csv("icu.pneu_aft_aft.csv", index = False, encoding = 'cp949')

### Result

In [128]:
# Reload the four Scenario 3 result tables from disk, in the order they were written.
linear_results, clinical_results, ensemble_cox_results, ensemble_aft_results = map(
    pd.read_csv,
    ['icu.pneu_aft_linear.csv', 'icu.pneu_aft_clinical.csv', 'icu.pneu_aft_cox.csv', 'icu.pneu_aft_aft.csv'],
)

##### Selection counts for each remaining variable

In [None]:
# Count how often each column name appears across the 100 selected column lists.
# The CSV is read once up front; it was previously re-read on every iteration.
selected_variables = pd.read_csv("icu.pneu_aft_remaining_variables.csv")['variable']

remaining_variables_all = []
for i in range(0, 100):
    # NOTE(review): eval() executes the text stored in the CSV. Acceptable only while
    # this notebook is the sole writer of that file; ast.literal_eval is the safer parser.
    remaining_variables_all += eval(selected_variables[i])

# Tally in first-seen order, so the printed dict keeps the original key order.
element_counts = {}
for element in remaining_variables_all:
    element_counts[element] = element_counts.get(element, 0) + 1

print(element_counts)

### C-index

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = linear_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Mean then standard deviation (np.std, population form) for train, then test.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))