In [1]:
import ast
import copy
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from skopt.space import Real

from Kernel_Function_3 import split_data, prepare_response_variable, c_index_kernel_type

### Preprocessing

In [2]:
# Load the raw table; "melanoma.csv" is read relative to the working directory.
data = pd.read_csv("melanoma.csv")

# Work on an independent copy so `data` keeps the file contents unchanged.
df = data.copy()

# Rename the raw columns to the names the rest of the notebook indexes by.
column_mapping = {'age' : 'Age',
                  'sex' : 'Sex',
                  'time' : 'OS',
                  'status' : 'Status'}

df_rename = df.rename(columns = column_mapping)

# Drop the exported row-number column, then any row with a missing value.
df_drop = df_rename.drop(['Unnamed: 0'], axis=1).dropna()

df_drop = df_drop.astype({'Sex' : 'category','ulcer' : 'category'})

# The final variable name in the existing code is set as 'df_onehot' even if the one-hot encoding isn't performed.
df_onehot = df_drop.copy()

# Recode Status: 1 -> 0, 2 -> 1, 3 -> 1.
# The result is assigned back to the column: calling replace(inplace=True) on the
# selection df_onehot['Status'] is deprecated and leaves df_onehot unchanged when
# pandas Copy-on-Write is active.
# NOTE(review): the original comment described this step as keeping melanoma deaths
# (raw status 1) as the uncensored cases, yet raw status 1 is mapped to 0 and the
# variable-selection cells treat Status == 0 as censored -- confirm the event coding
# expected by prepare_response_variable.
df_onehot['Status'] = df_onehot['Status'].replace({1: 0, 2: 1, 3: 1})

df_onehot

Unnamed: 0,OS,Status,Sex,Age,year,thickness,ulcer
0,10,1,1,76,1972,6.76,1
1,30,1,1,56,1968,0.65,0
2,35,1,1,41,1977,1.34,0
3,99,1,0,71,1968,2.90,0
4,185,0,1,52,1965,12.08,1
...,...,...,...,...,...,...,...
200,4492,1,1,29,1965,7.06,1
201,4668,1,0,40,1965,6.12,0
202,4688,1,0,42,1965,0.48,0
203,4926,1,0,50,1964,2.26,0


K-fold cross-validation and hyperparameter search settings

In [3]:
# Shuffled 5-fold splitter with a fixed seed, shared by every kernel evaluation below.
cv = KFold(n_splits=5, shuffle=True, random_state=36)

# Discrete candidates for `alpha`: 2**-12, 2**-10, ..., 2**12.
alpha_exponents = np.arange(-12, 13, 2)
param_grid = dict(alpha=2. ** alpha_exponents)

# Continuous log-uniform range for `alpha`.
param_space = dict(alpha=Real(1e-6, 1e+6, 'log-uniform'))

Selection of 100 random state numbers

In [4]:
import random

# The seed file holds one integer per line; blank lines are skipped.
with open("random_state_100.txt", "r") as seed_file:
    seed_text = seed_file.read()

random_state = [int(token) for token in seed_text.split("\n") if token]

## Scenario 1

### C-index by Kernel type

Linear Kernel

In [5]:
# Empty result table; the evaluation loop adds one row per run.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [7]:
kernel_type = 'linear'

# One train/test split per stored seed; each run appends one row to the result table.
for run in range(100):
    x_train, x_test, target_train, target_test = split_data(
        df_onehot, randomState=random_state[run]
    )
    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )
    linear_results.loc[run] = [results[0], results[1]]

linear_results.to_csv("melanoma_linear.csv", index=False, encoding='cp949')

Clinical Kernel

In [8]:
# Empty result table; the evaluation loop adds one row per run.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'clinical'

# One train/test split per stored seed; each run appends one row to the result table.
for run in range(100):
    x_train, x_test, target_train, target_test = split_data(
        df_onehot, randomState=random_state[run]
    )
    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )
    clinical_results.loc[run] = [results[0], results[1]]

clinical_results.to_csv("melanoma_clinical.csv", index=False, encoding='cp949')

Ensemble Cox Kernel

In [10]:
# Empty result table; the evaluation loop adds one row per run.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'ensemble_cox'

# One train/test split per stored seed; each run appends one row to the result table.
for run in range(100):
    x_train, x_test, target_train, target_test = split_data(
        df_onehot, randomState=random_state[run]
    )
    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )
    ensemble_cox_results.loc[run] = [results[0], results[1]]

ensemble_cox_results.to_csv("melanoma_cox.csv", index=False, encoding='cp949')

Ensemble AFT Kernel

In [12]:
# Empty result table; the evaluation loop adds one row per run.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'ensemble_aft'

# One train/test split per stored seed; each run appends one row to the result table.
for run in range(100):
    x_train, x_test, target_train, target_test = split_data(
        df_onehot, randomState=random_state[run]
    )
    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(
        x_train, y_train, x_test, y_test,
        param_grid, param_space, cv,
        keywords=['Age', 'Sex'], type=kernel_type,
    )
    ensemble_aft_results.loc[run] = [results[0], results[1]]

ensemble_aft_results.to_csv("melanoma_aft.csv", index=False, encoding='cp949')

### Result

In [19]:
# Reload the four Scenario 1 result tables written by the cells above.
linear_results, clinical_results, ensemble_cox_results, ensemble_aft_results = map(
    pd.read_csv,
    ('melanoma_linear.csv', 'melanoma_clinical.csv', 'melanoma_cox.csv', 'melanoma_aft.csv'),
)

The value of C-index

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = linear_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

## Scenario 2

In [12]:
# Empty table for the column list selected in each of the 100 runs.
cox_remaining_variable = pd.DataFrame().assign(variable=[])

### CV of selection of variables

In [None]:
# Columns present in every candidate subset: the two covariates passed as `keywords`
# below plus the outcome columns used to build the response.
fixed_columns = ['Age', 'Sex', 'OS', 'Status']
num_groups = 3

# Each loop level has its own index name. Previously every level reused `i`, so the
# final `.loc[i]` write always used the last fold index (2): all runs overwrote the
# same row and the saved CSV contained a single row instead of one per run.
for run in range(100):
    run_seed = random_state[run]

    # Only the training part of the split is used for variable selection.
    x_train, _, _, _ = split_data(df_onehot, randomState = run_seed)

    fixed_part = x_train[fixed_columns]
    candidate_part = x_train.drop(columns=fixed_columns)
    candidate_names = candidate_part.columns

    # One frame per non-empty subset of the candidate columns, each with the fixed
    # columns appended on the right.
    column_groups = []
    for subset_size in range(1, len(candidate_names) + 1):
        for column_combination in combinations(candidate_names, subset_size):
            selected_part = candidate_part[list(column_combination)]
            column_groups.append(pd.concat([selected_part, fixed_part], axis=1))

    best_cindex = []

    for candidate_frame in column_groups:
        # Seed the shuffle with the run's seed so the selection is reproducible and
        # every candidate subset within a run is scored on the same row partition.
        shuffled = candidate_frame.sample(frac=1, random_state=run_seed).reset_index(drop=True)

        # Split by Status once (not once per fold) so each group keeps roughly the
        # same censoring ratio.
        censored = shuffled[shuffled['Status'] == 0]
        uncensored = shuffled[shuffled['Status'] == 1]
        censored_size = len(censored) // num_groups
        uncensored_size = len(uncensored) // num_groups

        x_groups = []
        for group_idx in range(num_groups):
            is_last = group_idx == num_groups - 1
            # The last group takes the remainder rows (open-ended slice).
            censored_stop = None if is_last else (group_idx + 1) * censored_size
            uncensored_stop = None if is_last else (group_idx + 1) * uncensored_size
            x_groups.append(pd.concat(
                [censored.iloc[group_idx * censored_size:censored_stop],
                 uncensored.iloc[group_idx * uncensored_size:uncensored_stop]],
                ignore_index=True,
            ))

        # Leave-one-group-out: each group is the validation set once.
        cindex = []
        for val_idx, validation in enumerate(x_groups):
            train = pd.concat(
                [group for idx, group in enumerate(x_groups) if idx != val_idx],
                ignore_index=True,
            )

            train_y = prepare_response_variable(train[['Status','OS']])
            test_y = prepare_response_variable(validation[['Status','OS']])

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_cox')
            # result[1] is the score on the validation data passed as the test set.
            cindex.append(result[1])

        # Mean validation C-index for this candidate subset.
        best_cindex.append(np.mean(cindex))

    # Index of the first candidate subset with the highest mean validation C-index.
    max_num = best_cindex.index(max(best_cindex))

    # Column names of the winning subset (includes the fixed columns).
    train_column = column_groups[max_num].columns

    cox_remaining_variable.loc[run] = [list(train_column)]

cox_remaining_variable.to_csv("melanoma_cox_remaining_variables.csv", index = False, encoding = 'cp949')

### C-index by Kernel type

Linear Kernel

In [14]:
# Empty result table; the evaluation loop adds one row per run.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'linear'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_cox_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    linear_results.loc[run] = [results[0], results[1]]

linear_results.to_csv("melanoma_cox_linear.csv", index = False, encoding = 'cp949')

Clinical Kernel

In [16]:
# Empty result table; the evaluation loop adds one row per run.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'clinical'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_cox_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    clinical_results.loc[run] = [results[0], results[1]]

clinical_results.to_csv("melanoma_cox_clinical.csv", index = False, encoding = 'cp949')

Ensemble Cox Kernel

In [18]:
# Empty result table; the evaluation loop adds one row per run.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'ensemble_cox'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_cox_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_cox_results.loc[run] = [results[0], results[1]]

ensemble_cox_results.to_csv("melanoma_cox_cox.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [None]:
# Empty result table; the evaluation loop adds one row per run.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'ensemble_aft'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_cox_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_aft_results.loc[run] = [results[0], results[1]]

ensemble_aft_results.to_csv("melanoma_cox_aft.csv", index = False, encoding = 'cp949')

### Result

In [None]:
# Reload the four Scenario 2 result tables written by the cells above.
linear_results, clinical_results, ensemble_cox_results, ensemble_aft_results = map(
    pd.read_csv,
    ('melanoma_cox_linear.csv', 'melanoma_cox_clinical.csv', 'melanoma_cox_cox.csv', 'melanoma_cox_aft.csv'),
)

Count of selections for remaining variables

In [None]:
# Read the selected-variable table once instead of once per run.
selected_variables = pd.read_csv("melanoma_cox_remaining_variables.csv")['variable']

# Concatenate the column lists of all 100 runs. Each cell holds a list literal as
# text; literal_eval parses it without executing code, unlike eval.
remaining_variables_all = []
for run in range(0,100):
    remaining_variables_all += ast.literal_eval(selected_variables[run])

# Count how many runs selected each column, keyed in first-seen order.
element_counts = {}

for element in remaining_variables_all:
    element_counts[element] = element_counts.get(element, 0) + 1

print(element_counts)

The value of C-index

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = linear_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

## Scenario 3

In [7]:
# Empty table for the column list selected in each of the 100 runs.
aft_remaining_variable = pd.DataFrame().assign(variable=[])

### CV of selection of variables

In [None]:
# Columns present in every candidate subset: the two covariates passed as `keywords`
# below plus the outcome columns used to build the response.
fixed_columns = ['Age', 'Sex', 'OS', 'Status']
num_groups = 3

# Each loop level has its own index name. Previously every level reused `i`, so the
# final `.loc[i]` write always used the last fold index (2): all runs overwrote the
# same row and the saved CSV contained a single row instead of one per run.
for run in range(100):
    run_seed = random_state[run]

    # Only the training part of the split is used for variable selection.
    x_train, _, _, _ = split_data(df_onehot, randomState = run_seed)

    fixed_part = x_train[fixed_columns]
    candidate_part = x_train.drop(columns=fixed_columns)
    candidate_names = candidate_part.columns

    # One frame per non-empty subset of the candidate columns, each with the fixed
    # columns appended on the right.
    column_groups = []
    for subset_size in range(1, len(candidate_names) + 1):
        for column_combination in combinations(candidate_names, subset_size):
            selected_part = candidate_part[list(column_combination)]
            column_groups.append(pd.concat([selected_part, fixed_part], axis=1))

    best_cindex = []

    for candidate_frame in column_groups:
        # Seed the shuffle with the run's seed so the selection is reproducible and
        # every candidate subset within a run is scored on the same row partition.
        shuffled = candidate_frame.sample(frac=1, random_state=run_seed).reset_index(drop=True)

        # Split by Status once (not once per fold) so each group keeps roughly the
        # same censoring ratio.
        censored = shuffled[shuffled['Status'] == 0]
        uncensored = shuffled[shuffled['Status'] == 1]
        censored_size = len(censored) // num_groups
        uncensored_size = len(uncensored) // num_groups

        x_groups = []
        for group_idx in range(num_groups):
            is_last = group_idx == num_groups - 1
            # The last group takes the remainder rows (open-ended slice).
            censored_stop = None if is_last else (group_idx + 1) * censored_size
            uncensored_stop = None if is_last else (group_idx + 1) * uncensored_size
            x_groups.append(pd.concat(
                [censored.iloc[group_idx * censored_size:censored_stop],
                 uncensored.iloc[group_idx * uncensored_size:uncensored_stop]],
                ignore_index=True,
            ))

        # Leave-one-group-out: each group is the validation set once.
        cindex = []
        for val_idx, validation in enumerate(x_groups):
            train = pd.concat(
                [group for idx, group in enumerate(x_groups) if idx != val_idx],
                ignore_index=True,
            )

            train_y = prepare_response_variable(train[['Status','OS']])
            test_y = prepare_response_variable(validation[['Status','OS']])

            result = c_index_kernel_type(train, train_y, validation, test_y, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = 'ensemble_aft')
            # result[1] is the score on the validation data passed as the test set.
            cindex.append(result[1])

        # Mean validation C-index for this candidate subset.
        best_cindex.append(np.mean(cindex))

    # Index of the first candidate subset with the highest mean validation C-index.
    max_num = best_cindex.index(max(best_cindex))

    # Column names of the winning subset (includes the fixed columns).
    train_column = column_groups[max_num].columns

    aft_remaining_variable.loc[run] = [list(train_column)]

aft_remaining_variable.to_csv("melanoma_aft_remaining_variables.csv", index = False, encoding = 'cp949')

### C-index by Kernel type

Linear Kernel

In [19]:
# Empty result table; the evaluation loop adds one row per run.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [21]:
kernel_type = 'linear'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_aft_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    linear_results.loc[run] = [results[0], results[1]]

linear_results.to_csv("melanoma_aft_linear.csv", index = False, encoding = 'cp949')

Clinical Kernel

In [22]:
# Empty result table; the evaluation loop adds one row per run.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
kernel_type = 'clinical'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_aft_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    clinical_results.loc[run] = [results[0], results[1]]

clinical_results.to_csv("melanoma_aft_clinical.csv", index = False, encoding = 'cp949')

Ensemble Cox Kernel

In [24]:
# Empty result table; the evaluation loop adds one row per run.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [25]:
kernel_type = 'ensemble_cox'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_aft_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_cox_results.loc[run] = [results[0], results[1]]

ensemble_cox_results.to_csv("melanoma_aft_cox.csv", index = False, encoding = 'cp949')

Ensemble AFT Kernel

In [26]:
# Empty result table; the evaluation loop adds one row per run.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [27]:
kernel_type = 'ensemble_aft'

# Read the selected-variable table once instead of re-reading the CSV on every run.
selected_variables = pd.read_csv("melanoma_aft_remaining_variables.csv")['variable']

for run in range(100):
    # Each cell stores a Python list literal as text. literal_eval parses literals
    # only, whereas eval would execute any expression found in the file.
    variables = ast.literal_eval(selected_variables[run])

    df_onehot_re = df_onehot[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[run])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_aft_results.loc[run] = [results[0], results[1]]

ensemble_aft_results.to_csv("melanoma_aft_aft.csv", index = False, encoding = 'cp949')

### Result

In [28]:
# Reload the four Scenario 3 result tables written by the cells above.
linear_results, clinical_results, ensemble_cox_results, ensemble_aft_results = map(
    pd.read_csv,
    ('melanoma_aft_linear.csv', 'melanoma_aft_clinical.csv', 'melanoma_aft_cox.csv', 'melanoma_aft_aft.csv'),
)

Count of selections for remaining variables

In [None]:
# Read the selected-variable table once instead of once per run.
selected_variables = pd.read_csv("melanoma_aft_remaining_variables.csv")['variable']

# Concatenate the column lists of all 100 runs. Each cell holds a list literal as
# text; literal_eval parses it without executing code, unlike eval.
remaining_variables_all = []
for run in range(0,100):
    remaining_variables_all += ast.literal_eval(selected_variables[run])

# Count how many runs selected each column, keyed in first-seen order.
element_counts = {}

for element in remaining_variables_all:
    element_counts[element] = element_counts.get(element, 0) + 1

print(element_counts)

The value of C-index

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = linear_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# For each column: mean, then np.std, both rounded to 4 decimals.
for column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))