In [1]:
import copy

import numpy as np
import pandas as pd
from itertools import combinations

from sklearn.model_selection import KFold
from skopt.space import Real

import sys
sys.path.append('G:/내 드라이브/ensemble kernel')

from Kernel_Function_3 import split_data, prepare_response_variable, c_index_kernel_type

## Preprocessing

In [2]:
data = pd.read_csv("veteran_data.csv")

df = data.drop(['Unnamed: 0'], axis=1)
df = df.astype({'trt' : 'category', 'celltype' : 'category', 'prior' : 'category'})
df.replace({'celltype' : {'squamous' : 0, 'large' : 1, 'smallcell' : 2, 'adeno' : 3}}, inplace = True)

df = df.rename(columns={'age':'Age',
                        'time':'OS',
                        'status':'Status'})

# onehot encode
df_onehot = pd.get_dummies(df, drop_first=True)

columns_to_convert=['celltype_1', 'celltype_2','celltype_0', 'prior_10', 'trt_2']

df_onehot[columns_to_convert] = df_onehot[columns_to_convert].astype('category')

df_onehot

Unnamed: 0,OS,Status,karno,diagtime,Age,trt_2,celltype_1,celltype_2,celltype_0,prior_10
0,72,1,60,7,69,0,0,0,1,0
1,411,1,70,5,64,0,0,0,1,1
2,228,1,60,3,38,0,0,0,1,0
3,126,1,60,9,63,0,0,0,1,1
4,118,1,70,11,65,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
132,133,1,75,1,65,1,1,0,0,0
133,111,1,60,5,64,1,1,0,0,0
134,231,1,70,18,67,1,1,0,0,1
135,378,1,80,4,65,1,1,0,0,0


KFold Setting

In [3]:
# KFold setting
cv=KFold(n_splits = 5, shuffle=True, random_state=36)

param_space = {
    'alpha': Real(1e-6, 1e+6, 'log-uniform'),
}
param_grid = {'alpha': 2. ** np.arange(-12, 13, 2)}

Selecting 100 random state numbers

In [4]:
import random

with open("random_state_100.txt", "r") as file:
    random_state=file.read()
    
random_state=random_state.split("\n")
random_state=[int(x) for x in random_state if x]

## Scenario 1

### Check results by Kernel type

Linear Kernel

In [None]:
linear_results=pd.DataFrame()

linear_results['train_C_index']=[]
linear_results['test_C_index']=[]

In [None]:
kernel_type = 'linear'

for i in range(100):
    
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    linear_results.loc[i] = [results[0], results[1]]

linear_results.to_csv("veteran_linear.csv", index = False, encoding = 'cp949')

## Scenario 2

In [6]:
# Remaining variable results from 100 runs
cox_remaining_variable=pd.DataFrame()

cox_remaining_variable['variable']=[]

The C-index results

In [None]:
for i in range(100):
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    # Separating OS and Status
    drop = x_train[['Age','Sex','OS','Status']]
    x_train_drop = x_train.drop(columns=['Age','OS','Status'])

    # Save column names of a DataFrame into a list
    columns = x_train_drop.columns

    column_groups = []

    for i in range(len(columns)):
        # Generate all combinations of columns taken i+1 at a time
        all_column_combinations = list(combinations(columns, i+1))

        # Create a DataFrame for each combination
        for column_combination in all_column_combinations:
            selected_columns = list(column_combination)
            sub_train1 = x_train_drop[selected_columns]
            sub_train2 = pd.concat([sub_train1, drop], axis=1)
            column_groups.append(sub_train2)
    
    num_groups = 3
    best_cindex = []

    for i in range(len(column_groups)):
        x_groups = []
        train = column_groups[i].sample(frac=1).reset_index(drop=True)

        # Splitting the train data into groups
        for i in range(num_groups):
            censored = train[train['Status'] == 0]  # Extract rows with a value of 0
            uncensored = train[train['Status'] == 1]  # Extract rows with a value of 1

            group_size1 = len(censored) // num_groups
            group_size2 = len(uncensored) // num_groups

            if i < num_groups - 1:
                # Adjusting the censoring ratio
                group1 = censored.iloc[i * group_size1:(i + 1) * group_size1]
                group2 = uncensored.iloc[i * group_size2:(i + 1) * group_size2]

                group = pd.concat([group1, group2], ignore_index=True)
            else:
                group1 = censored.iloc[i * group_size1:]
                group2 = uncensored.iloc[i * group_size2:]

                group = pd.concat([group1, group2], ignore_index=True)
    
            x_groups.append(group)

        cindex = []

        for i in range(len(x_groups)):

            temp = []
            for j in range(len(x_groups)):
                if i != j:
                    temp.append(x_groups[j])
            #train = train data(One out of the equally divided segments, excluding one)
            train=pd.concat(temp)

            #validation = validation data(One of the equally divided segments)
            validation = x_groups[i]
            
            train_target=train[['Status','OS']]
            test_target=validation[['Status','OS']]

            train_y=prepare_response_variable(train_target)
            test_y=prepare_response_variable(test_target)
            #Define data -> train, train_y, validation,  test_y

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_cox')
            cindex.append(result[1])

        #best_cindex: Store all validation c-index values for each selected variable as a list
        best_cindex.append(np.mean(cindex))

    #max_num: Index number of the best c-index in the list
    max_num = best_cindex.index(max(best_cindex))

    #train_column: Selected variables for the best c-index
    train_column = column_groups[max_num].columns
    
    cox_remaining_variable = cox_remaining_variable.append({"variable": list(train_column)}, ignore_index = True)

cox_remaining_variable.to_csv("veteran_cox_remaining_variables.csv", index = False, encoding = 'cp949')

### Check results by Kernel type

Linear Kernel

In [17]:
linear_results=pd.DataFrame()

linear_results['train_C_index']=[]
linear_results['test_C_index']=[]

In [None]:
kernel_type = 'linear'

for i in range(100):
    variables = eval(pd.read_csv("veteran_cox_remaining_variables.csv")['variable'][i])

    variables.append('Status')
    variables.append('OS')

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    linear_results = linear_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

linear_results.to_csv("veteran_linear_repetition.csv", index = False, encoding = 'cp949')


Clinical Kernel

In [18]:
clinical_results=pd.DataFrame()

clinical_results['train_C_index']=[]
clinical_results['test_C_index']=[]

In [None]:
kernel_type = 'clinical'

for i in range(100):
    #variables = all_results['remaining_variables'][i]
    variables = eval(pd.read_csv("veteran_cox_remaining_variables.csv")['variable'][i])

    variables.append('Status')
    variables.append('OS')

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    clinical_results = clinical_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

clinical_results.to_csv("veteran_clinical_repetition.csv", index = False, encoding = 'cp949')

Ensemble Cox Kernel

In [None]:
ensemble_cox_results=pd.DataFrame()

ensemble_cox_results['train_C_index']=[]
ensemble_cox_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_cox'

for i in range(100):
    #variables = all_results['remaining_variables'][i]
    variables = eval(pd.read_csv("veteran_cox_remaining_variables.csv")['variable'][i])

    variables.append('Status')
    variables.append('OS')

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_aft_results = ensemble_aft_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_aft_results.to_csv("veteran_cox_repetition.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [None]:
ensemble_aft_results=pd.DataFrame()

ensemble_aft_results['train_C_index']=[]
ensemble_aft_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_aft'

for i in range(100):
    #variables = all_results['remaining_variables'][i]
    variables = eval(pd.read_csv("veteran_cox_remaining_variables.csv")['variable'][i])

    variables.append('Status')
    variables.append('OS')

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_aft_results = ensemble_aft_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_aft_results.to_csv("veteran_aft_repetition.csv", index = False, encoding = 'cp949')

### Result

In [21]:
linear_results = pd.read_csv('veteran_linear_repetition.csv')
clinical_results = pd.read_csv('veteran_clinical_repetition.csv')
ensemble_cox_results = pd.read_csv('veteran_cox_repetition.csv')
ensemble_aft_results = pd.read_csv('veteran_aft_repetition.csv')

The count of selections for each remaining variables

In [22]:
remaining_variables_all=[]
for i in range(0,100):
    remaining_variables_all+=eval(pd.read_csv("veteran_cox_repetition.csv")['remaining_variables'][i])

element_counts = {}

for element in remaining_variables_all:
    if element in element_counts:
        element_counts[element] += 1
    else:
        element_counts[element] = 1

print(element_counts)

{'karno': 100, 'celltype_2': 43, 'trt_2': 35, 'Age': 100, 'diagtime': 43, 'celltype_1': 69, 'celltype_0': 59, 'prior_10': 32}


The value of the C-index

In [23]:
print(round(np.mean(linear_results['train_C_index']),4))
print(round(np.std(linear_results['train_C_index']),4))
print(round(np.mean(linear_results['test_C_index']),4))
print(round(np.std(linear_results['test_C_index']),4))

0.7149
0.0169
0.7008
0.035


In [24]:
print(round(np.mean(clinical_results['train_C_index']),4))
print(round(np.std(clinical_results['train_C_index']),4))
print(round(np.mean(clinical_results['test_C_index']),4))
print(round(np.std(clinical_results['test_C_index']),4))

0.7605
0.0284
0.6942
0.0371


In [25]:
print(round(np.mean(ensemble_cox_results['train_C_index']),4))
print(round(np.std(ensemble_cox_results['train_C_index']),4))
print(round(np.mean(ensemble_cox_results['test_C_index']),4))
print(round(np.std(ensemble_cox_results['test_C_index']),4))

0.7588
0.0255
0.6949
0.0371


In [26]:
print(round(np.mean(ensemble_aft_results['train_C_index']),4))
print(round(np.std(ensemble_aft_results['train_C_index']),4))
print(round(np.mean(ensemble_aft_results['test_C_index']),4))
print(round(np.std(ensemble_aft_results['test_C_index']),4))

0.7582
0.0254
0.6945
0.0368


## Scenario 3

In [None]:
# Remaining variable results from 100 runs
aft_remaining_variable=pd.DataFrame()

aft_remaining_variable['variable']=[]

The C-index results

In [None]:
for i in range(100):
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    # Separating OS and Status
    drop = x_train[['Age','OS','Status']]
    x_train_drop = x_train.drop(columns=['Age','OS','Status'])

    # Save column names of a DataFrame into a list
    columns = x_train_drop.columns

    column_groups = []

    for i in range(len(columns)):
        # Generate all combinations of columns taken i+1 at a time
        all_column_combinations = list(combinations(columns, i+1))

        # Create a DataFrame for each combination
        for column_combination in all_column_combinations:
            selected_columns = list(column_combination)
            sub_train1 = x_train_drop[selected_columns]
            sub_train2 = pd.concat([sub_train1, drop], axis=1)
            column_groups.append(sub_train2)
    
    num_groups = 3
    best_cindex = []

    for i in range(len(column_groups)):
        x_groups = []
        train = column_groups[i].sample(frac=1).reset_index(drop=True)

        # Splitting the train data into groups
        for i in range(num_groups):
            censored = train[train['Status'] == 0]  # Extract rows with a value of 0
            uncensored = train[train['Status'] == 1]  # Extract rows with a value of 1

            group_size1 = len(censored) // num_groups
            group_size2 = len(uncensored) // num_groups

            if i < num_groups - 1:
                # Adjusting the censoring ratio
                group1 = censored.iloc[i * group_size1:(i + 1) * group_size1]
                group2 = uncensored.iloc[i * group_size2:(i + 1) * group_size2]

                group = pd.concat([group1, group2], ignore_index=True)
            else:
                group1 = censored.iloc[i * group_size1:]
                group2 = uncensored.iloc[i * group_size2:]

                group = pd.concat([group1, group2], ignore_index=True)
    
            x_groups.append(group)

        cindex = []

        for i in range(len(x_groups)):

            temp = []
            for j in range(len(x_groups)):
                if i != j:
                    temp.append(x_groups[j])
            #train = train data(One out of the equally divided segments, excluding one)
            train=pd.concat(temp)

            #validation = validation data(One of the equally divided segments)
            validation = x_groups[i]
            
            train_target=train[['Status','OS']]
            test_target=validation[['Status','OS']]

            train_y=prepare_response_variable(train_target)
            test_y=prepare_response_variable(test_target)
            #Define data -> train, train_y, validation,  test_y

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_aft')
            cindex.append(result[1])

        #best_cindex: Store all validation c-index values for each selected variable as a list
        best_cindex.append(np.mean(cindex))

    #max_num: Index number of the best c-index in the list
    max_num = best_cindex.index(max(best_cindex))

    #train_column: Selected variables for the best c-index
    train_column = column_groups[max_num].columns
    
    aft_remaining_variable = aft_remaining_variable.append({"variable": list(train_column)}, ignore_index = True)

aft_remaining_variable.to_csv("veteran_aft_remaining_variables.csv", index = False, encoding = 'cp949')

### Check results by Kernel type

Linear Kernel

In [27]:
linear_results=pd.DataFrame()

linear_results['train_C_index']=[]
linear_results['test_C_index']=[]

In [None]:
kernel_type = 'linear'

for i in range(100):
    #variables = all_results['remaining_variables'][i]
    variables = eval(pd.read_csv("veteran_aft_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    linear_results = linear_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

linear_results.to_csv("veteran_aft_linear.csv", index = False, encoding = 'cp949')

Clinical Kernel

In [28]:
clinical_results=pd.DataFrame()

clinical_results['train_C_index']=[]
clinical_results['test_C_index']=[]

In [None]:
kernel_type = 'clinical'

for i in range(100):
    #variables = all_results['remaining_variables'][i]
    variables = eval(pd.read_csv("veteran_aft_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    clinical_results = clinical_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

clinical_results.to_csv("veteran_aft_clinical.csv", index = False, encoding = 'cp949')

Ensemble Cox Kernel

In [29]:
ensemble_cox_results=pd.DataFrame()

ensemble_cox_results['train_C_index']=[]
ensemble_cox_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_cox'

for i in range(100):
    #variables = all_results['remaining_variables'][i]
    variables = eval(pd.read_csv("veteran_aft_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_cox_results = ensemble_cox_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_cox_results.to_csv("veteran_aft_cox.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [31]:
ensemble_aft_results=pd.DataFrame()

ensemble_aft_results['train_C_index']=[]
ensemble_aft_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_aft'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("veteran_aft_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_aft_results = ensemble_aft_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_aft_results.to_csv("veteran_aft_aft.csv", index = False, encoding = 'cp949')

### Result

In [33]:
linear_results = pd.read_csv('veteran_aft_linear.csv')
clinical_results = pd.read_csv('veteran_aft_clinical.csv')
ensemble_cox_results = pd.read_csv('veteran_aft_cox.csv')
ensemble_aft_results = pd.read_csv('veteran_aft_aft.csv')

The count of selections for each remaining variables

In [35]:
remaining_variables_all=[]
for i in range(0,100):
    remaining_variables_all += eval(pd.read_csv("veteran_aft_remaining_variables.csv")['variable'][i])

element_counts = {}

for element in remaining_variables_all:
    if element in element_counts:
        element_counts[element] += 1
    else:
        element_counts[element] = 1

print(element_counts)

{'karno': 100, 'diagtime': 44, 'celltype_1': 78, 'prior_10': 39, 'Age': 100, 'OS': 100, 'Status': 100, 'celltype_2': 52, 'celltype_0': 62, 'trt_2': 46}


The value of the C-index

In [36]:
print(round(np.mean(linear_results['train_C_index']),4))
print(round(np.std(linear_results['train_C_index']),4))
print(round(np.mean(linear_results['test_C_index']),4))
print(round(np.std(linear_results['test_C_index']),4))

0.7146
0.0173
0.7002
0.0342


In [37]:
print(round(np.mean(clinical_results['train_C_index']),4))
print(round(np.std(clinical_results['train_C_index']),4))
print(round(np.mean(clinical_results['test_C_index']),4))
print(round(np.std(clinical_results['test_C_index']),4))

0.7607
0.0245
0.6965
0.037


In [38]:
print(round(np.mean(ensemble_cox_results['train_C_index']),4))
print(round(np.std(ensemble_cox_results['train_C_index']),4))
print(round(np.mean(ensemble_cox_results['test_C_index']),4))
print(round(np.std(ensemble_cox_results['test_C_index']),4))

0.7599
0.0228
0.6961
0.0392


In [39]:
print(round(np.mean(ensemble_aft_results['train_C_index']),4))
print(round(np.std(ensemble_aft_results['train_C_index']),4))
print(round(np.mean(ensemble_aft_results['test_C_index']),4))
print(round(np.std(ensemble_aft_results['test_C_index']),4))

0.7592
0.025
0.6954
0.0388
