In [1]:
import copy

import numpy as np
import pandas as pd
from itertools import combinations

from sklearn.model_selection import KFold
from skopt.space import Real

import sys
sys.path.append('G:/내 드라이브/대학/대외/2023/연구원/연구과제/Ensemble-Kernel')

from Kernel_Function_3 import split_data, prepare_response_variable, c_index_kernel_type

## Preprocessing

In [3]:
data = pd.read_csv("lung(mice).csv")

df = copy.deepcopy(data)

column_mapping = {'age' : 'Age',
                  'sex' : 'Sex',
                  'time' : 'OS',
                  'status' : 'Status'}
df_rename = df.rename(columns = column_mapping)

df_drop = df_rename.drop(['Unnamed: 0','inst'], axis=1).dropna()

df_drop = df_drop.astype({'Sex' : 'category'})

# onehot encode
df_onehot = pd.get_dummies(df_drop, drop_first=True)

df_onehot= df_onehot.astype({'Sex_2' : 'category'})

df_onehot = df_onehot.rename(columns = {'Sex_2' : 'Sex'})

df_onehot

Unnamed: 0,OS,Status,Age,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss,Sex
0,306,1,74,1,90,100,1175,37,False
1,455,1,68,0,90,90,1225,15,False
2,1010,0,56,0,90,90,1500,15,False
3,210,1,57,1,90,60,1150,11,False
4,883,1,60,0,100,90,1025,0,False
...,...,...,...,...,...,...,...,...,...
223,188,0,77,1,80,60,1225,3,False
224,191,0,39,0,90,90,2350,-5,False
225,105,0,75,2,60,70,1025,5,True
226,174,0,66,1,90,100,1075,1,False


Kfold settion

In [22]:
cv = KFold(n_splits=5, shuffle=True, random_state=36)

param_grid = {'alpha': 2. ** np.arange(-12, 13, 2)}
param_space = {'alpha': Real(1e-6, 1e+6, 'log-uniform'),}

Selecting 100 random state numbers

In [23]:
import random

with open("random_state_100.txt", "r") as file:
    random_state=file.read()
    
random_state=random_state.split("\n")
random_state=[int(x) for x in random_state if x]

## Scenario 2

In [24]:
# Remaining variable results from 100 runs
cox_remaining_variable=pd.DataFrame()

cox_remaining_variable['variable']=[]

In [None]:
for i in range(100):
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    # Separating OS and Status
    drop = x_train[['Age','Sex','OS','Status']]
    x_train_drop = x_train.drop(columns=['Age','Sex','OS','Status'])

    # Save column names of a DataFrame into a list
    columns = x_train_drop.columns

    column_groups = []

    for i in range(len(columns)):
        # Generate all combinations of columns taken i+1 at a time
        all_column_combinations = list(combinations(columns, i+1))

        # Create a DataFrame for each combination
        for column_combination in all_column_combinations:
            selected_columns = list(column_combination)
            sub_train1 = x_train_drop[selected_columns]
            sub_train2 = pd.concat([sub_train1, drop], axis=1)
            column_groups.append(sub_train2)
    
    num_groups = 3
    best_cindex = []

    for i in range(len(column_groups)):
        x_groups = []
        train = column_groups[i].sample(frac=1).reset_index(drop=True)

        # Splitting the train data into groups
        for i in range(num_groups):
            censored = train[train['Status'] == 0]  # Extract rows with a value of 0
            uncensored = train[train['Status'] == 1]  # Extract rows with a value of 1

            group_size1 = len(censored) // num_groups
            group_size2 = len(uncensored) // num_groups

            if i < num_groups - 1:
                # Adjusting the censoring ratio
                group1 = censored.iloc[i * group_size1:(i + 1) * group_size1]
                group2 = uncensored.iloc[i * group_size2:(i + 1) * group_size2]

                group = pd.concat([group1, group2], ignore_index=True)
            else:
                group1 = censored.iloc[i * group_size1:]
                group2 = uncensored.iloc[i * group_size2:]

                group = pd.concat([group1, group2], ignore_index=True)
    
            x_groups.append(group)

        cindex = []

        for i in range(len(x_groups)):

            temp = []
            for j in range(len(x_groups)):
                if i != j:
                    temp.append(x_groups[j])
            #train = train data(One out of the equally divided segments, excluding one)
            train=pd.concat(temp)

            #validation = validation data(One of the equally divided segments)
            validation = x_groups[i]
            
            train_target=train[['Status','OS']]
            test_target=validation[['Status','OS']]

            train_y=prepare_response_variable(train_target)
            test_y=prepare_response_variable(test_target)
            #Define data -> train, train_y, validation,  test_y

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_cox')
            cindex.append(result[1])

        #best_cindex: Store all validation c-index values for each selected variable as a list
        best_cindex.append(np.mean(cindex))

    #max_num: Index number of the best c-index in the list
    max_num = best_cindex.index(max(best_cindex))

    #train_column: Selected variables for the best c-index
    train_column = column_groups[max_num].columns
    
    cox_remaining_variable = cox_remaining_variable.append({"variable": list(train_column)}, ignore_index = True)

cox_remaining_variable.to_csv("lung_cox_remaining_variables.csv", index = False, encoding = 'cp949')

### Check results by Kernel type

Linear Kernel

In [27]:
linear_results=pd.DataFrame()

linear_results['train_C_index']=[]
linear_results['test_C_index']=[]

In [None]:
kernel_type = 'linear'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("lung_cox_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    linear_results = linear_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

linear_results.to_csv("lung_cox_linear.csv", index = False, encoding = 'cp949')


Clinical Kernel

In [30]:
clinical_results=pd.DataFrame()

clinical_results['train_C_index']=[]
clinical_results['test_C_index']=[]

In [None]:
kernel_type = 'clinical'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("lung_cox_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    clinical_results = clinical_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

clinical_results.to_csv("lung_cox_clinical.csv", index = False, encoding = 'cp949')

Ensemble Cox Kernel

In [None]:
ensemble_cox_results=pd.DataFrame()

ensemble_cox_results['train_C_index']=[]
ensemble_cox_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_cox'

for i in range(100):
    #variables = all_results['remaining_variables'][i]
    variables = eval(pd.read_csv("lung_cox_remaining_variables.csv")['variable'][i])

    variables.append('Status')
    variables.append('OS')

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_aft_results = ensemble_aft_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_aft_results.to_csv("lung_cox_cox.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [33]:
ensemble_aft_results=pd.DataFrame()

ensemble_aft_results['train_C_index']=[]
ensemble_aft_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_aft'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("lung_cox_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_aft_results = ensemble_aft_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_aft_results.to_csv("lung_cox_aft.csv", index = False, encoding = 'cp949')

### Result

In [25]:
linear_results = pd.read_csv('lung_cox_linear.csv')
clinical_results = pd.read_csv('lung_cox_clinical.csv')
ensemble_cox_results = pd.read_csv('lung_cox_cox.csv')
ensemble_aft_results = pd.read_csv('lung_cox_aft.csv')

The count of selevtions for each remaining variables

In [26]:
remaining_variables_all = []
for i in range(0,100):
    remaining_variables_all += eval(pd.read_csv("lung_cox_remaining_variables.csv")['variable'][i])

element_counts = {}

for element in remaining_variables_all:
    if element in element_counts:
        element_counts[element] += 1
    else:
        element_counts[element] = 1

print(element_counts)

{'ph.ecog': 74, 'pat.karno': 81, 'Sex': 100, 'Age': 100, 'OS': 100, 'Status': 100, 'ph.karno': 58, 'wt.loss': 54, 'meal.cal': 51}


The value of the C-index

In [27]:
print(round(np.mean(linear_results['train_C_index']),4))
print(round(np.std(linear_results['train_C_index']),4))
print(round(np.mean(linear_results['test_C_index']),4))
print(round(np.std(linear_results['test_C_index']),4))

0.5653
0.0415
0.5559
0.0455


In [28]:
print(round(np.mean(clinical_results['train_C_index']),4))
print(round(np.std(clinical_results['train_C_index']),4))
print(round(np.mean(clinical_results['test_C_index']),4))
print(round(np.std(clinical_results['test_C_index']),4))

0.667
0.0208
0.6316
0.0341


In [29]:
print(round(np.mean(ensemble_cox_results['train_C_index']),4))
print(round(np.std(ensemble_cox_results['train_C_index']),4))
print(round(np.mean(ensemble_cox_results['test_C_index']),4))
print(round(np.std(ensemble_cox_results['test_C_index']),4))

0.6612
0.0197
0.6292
0.0348


In [30]:
print(round(np.mean(ensemble_aft_results['train_C_index']),4))
print(round(np.std(ensemble_aft_results['train_C_index']),4))
print(round(np.mean(ensemble_aft_results['test_C_index']),4))
print(round(np.std(ensemble_aft_results['test_C_index']),4))

0.6622
0.021
0.6282
0.0347


## Scenario 3

In [44]:
aft_remaining_variable=pd.DataFrame()

aft_remaining_variable['variable']=[]

The C-index results

In [None]:
for i in range(100):
    x_train, x_test, target_train, target_test = split_data(df_onehot, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    # Separating OS and Status
    drop = x_train[['Age','Sex','OS','Status']]
    x_train_drop = x_train.drop(columns=['Age','Sex','OS','Status'])

    # Save column names of a DataFrame into a list
    columns = x_train_drop.columns

    column_groups = []

    for i in range(len(columns)):
        # Generate all combinations of columns taken i+1 at a time
        all_column_combinations = list(combinations(columns, i+1))

        # Create a DataFrame for each combination
        for column_combination in all_column_combinations:
            selected_columns = list(column_combination)
            sub_train1 = x_train_drop[selected_columns]
            sub_train2 = pd.concat([sub_train1, drop], axis=1)
            column_groups.append(sub_train2)
    
    num_groups = 3
    best_cindex = []

    for i in range(len(column_groups)):
        x_groups = []
        train = column_groups[i].sample(frac=1).reset_index(drop=True)

        # Splitting the train data into groups
        for i in range(num_groups):
            censored = train[train['Status'] == 0]  # Extract rows with a value of 0
            uncensored = train[train['Status'] == 1]  # Extract rows with a value of 1

            group_size1 = len(censored) // num_groups
            group_size2 = len(uncensored) // num_groups

            if i < num_groups - 1:
                # Adjusting the censoring ratio
                group1 = censored.iloc[i * group_size1:(i + 1) * group_size1]
                group2 = uncensored.iloc[i * group_size2:(i + 1) * group_size2]

                group = pd.concat([group1, group2], ignore_index=True)
            else:
                group1 = censored.iloc[i * group_size1:]
                group2 = uncensored.iloc[i * group_size2:]

                group = pd.concat([group1, group2], ignore_index=True)
    
            x_groups.append(group)

        cindex = []

        for i in range(len(x_groups)):

            temp = []
            for j in range(len(x_groups)):
                if i != j:
                    temp.append(x_groups[j])
            #train = train data(One out of the equally divided segments, excluding one)
            train=pd.concat(temp)

            #validation = validation data(One of the equally divided segments)
            validation = x_groups[i]
            
            train_target=train[['Status','OS']]
            test_target=validation[['Status','OS']]

            train_y=prepare_response_variable(train_target)
            test_y=prepare_response_variable(test_target)
            #Define data -> train, train_y, validation,  test_y

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_aft')
            cindex.append(result[1])

        #best_cindex: Store all validation c-index values for each selected variable as a list
        best_cindex.append(np.mean(cindex))

    #max_num: Index number of the best c-index in the list
    max_num = best_cindex.index(max(best_cindex))

    #train_column: Selected variables for the best c-index
    train_column = column_groups[max_num].columns
    
    aft_remaining_variable = aft_remaining_variable.append({"variable": list(train_column)}, ignore_index = True)

aft_remaining_variable.to_csv("lung_aft_remaining_variables.csv", index = False, encoding = 'cp949')

### Check results by Kernel type

Linear Kernel

In [31]:
linear_results=pd.DataFrame()

linear_results['train_C_index']=[]
linear_results['test_C_index']=[]

In [None]:
kernel_type = 'linear'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("lung_aft_remaining_variables.csv")['variable'][i])
    
    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    linear_results = linear_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

linear_results.to_csv("lung_aft_linear.csv", index = False, encoding = 'cp949')


Clinical Kernel

In [None]:
clinical_results=pd.DataFrame()

clinical_results['train_C_index']=[]
clinical_results['test_C_index']=[]

In [None]:
kernel_type = 'clinical'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("lung_aft_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    clinical_results = clinical_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

clinical_results.to_csv("lung_aft_clinical.csv", index = False, encoding = 'cp949')


Ensemble Cox Kernel

In [None]:
ensemble_cox_results=pd.DataFrame()

ensemble_cox_results['train_C_index']=[]
ensemble_cox_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_cox'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("lung_aft_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_cox_results = ensemble_cox_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_cox_results.to_csv("lung_aft_cox.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [None]:
ensemble_aft_results=pd.DataFrame()

ensemble_aft_results['train_C_index']=[]
ensemble_aft_results['test_C_index']=[]

In [None]:
kernel_type = 'ensemble_aft'

for i in range(100):
    #variables = remaining_variable[i]
    variables = eval(pd.read_csv("lung_aft_remaining_variables.csv")['variable'][i])

    df_onehot_re = df_onehot[variables]
    
    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)
    
    ensemble_aft_results = ensemble_aft_results.append({"train_C_index":results[0],"test_C_index":results[1]}, ignore_index=True)

ensemble_aft_results.to_csv("lung_aft_aft.csv", index = False, encoding = 'cp949')

### Result

In [33]:
linear_results = pd.read_csv('lung_aft_linear.csv')
clinical_results = pd.read_csv('lung_aft_clinical.csv')
ensemble_cox_results = pd.read_csv('lung_aft_cox.csv')
ensemble_aft_results = pd.read_csv('lung_aft_aft.csv')

The count of selections for each remaining variables

In [36]:
remaining_variables_all=[]
for i in range(0,100):
    remaining_variables_all += eval(pd.read_csv("lung_aft_remaining_variables.csv")['variable'][i])

element_counts = {}

for element in remaining_variables_all:
    if element in element_counts:
        element_counts[element] += 1
    else:
        element_counts[element] = 1

print(element_counts)

{'ph.ecog': 76, 'pat.karno': 76, 'meal.cal': 52, 'wt.loss': 46, 'Sex': 100, 'Age': 100, 'OS': 100, 'Status': 100, 'ph.karno': 46}


The value of the C-index

In [37]:
print(round(np.mean(linear_results['train_C_index']),4))
print(round(np.std(linear_results['train_C_index']),4))
print(round(np.mean(linear_results['test_C_index']),4))
print(round(np.std(linear_results['test_C_index']),4))

0.5616
0.039
0.5598
0.0486


In [38]:
print(round(np.mean(clinical_results['train_C_index']),4))
print(round(np.std(clinical_results['train_C_index']),4))
print(round(np.mean(clinical_results['test_C_index']),4))
print(round(np.std(clinical_results['test_C_index']),4))

0.669
0.0229
0.6304
0.0369


In [39]:
print(round(np.mean(ensemble_cox_results['train_C_index']),4))
print(round(np.std(ensemble_cox_results['train_C_index']),4))
print(round(np.mean(ensemble_cox_results['test_C_index']),4))
print(round(np.std(ensemble_cox_results['test_C_index']),4))

0.6623
0.0233
0.6268
0.0375


In [40]:
print(round(np.mean(ensemble_aft_results['train_C_index']),4))
print(round(np.std(ensemble_aft_results['train_C_index']),4))
print(round(np.mean(ensemble_aft_results['test_C_index']),4))
print(round(np.std(ensemble_aft_results['test_C_index']),4))

0.6638
0.023
0.6264
0.0362
