In [22]:
import ast
import copy
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from skopt.space import Real

import sys
sys.path.append('G:/내 드라이브/ensemble kernel')

from Kernel_Function_3 import split_data, prepare_response_variable, c_index_kernel_type

### Preprocessing

In [27]:
# Load the raw cohort and keep an untouched copy in `data`.
data = pd.read_csv("G:/내 드라이브/ensemble kernel/pancreatic.csv")

df = data.copy(deep=True)

# Shorter names for the two demographic columns.
column_mapping = {
    'age_at_initial_pathologic_diagnosis': 'Age',
    'gender': 'Sex',
}
df_rename = df.rename(columns=column_mapping)

# Drop the patient identifier column, then every row that has a missing value.
df_drop = df_rename.drop(columns=['patient_barcode']).dropna()

# Columns that are cast to the pandas 'category' dtype before one-hot encoding.
columns_to_convert = [
    'Sex',
    'alcohol_history_documented',
    'history_of_diabetes',
    'history_of_chronic_pancreatitis',
    'anatomic_neoplasm_subdivision',
    'surgery_performed_type',
    'residual_tumor',
    'radiation_therapy',
    'postoperative_rx_tx',
    'person_neoplasm_cancer_status',
]
df_drop = df_drop.astype({name: 'category' for name in columns_to_convert})

# One-hot encode (first level of each category dropped); the single remaining
# sex indicator 'Sex_MALE' is renamed back to 'Sex'.
df_dummies = pd.get_dummies(df_drop, drop_first=True)
df_onehot = df_dummies.rename(columns={'Sex_MALE' : 'Sex'})

# The generated indicator columns are cast to 'category' as well.
columns_to_convert = [
    'Sex',
    'alcohol_history_documented_YES',
    'anatomic_neoplasm_subdivision_Head of Pancreas',
    'anatomic_neoplasm_subdivision_Other (please specify)',
    'history_of_chronic_pancreatitis_YES',
    'history_of_diabetes_YES',
    'person_neoplasm_cancer_status_WITH TUMOR',
    'postoperative_rx_tx_YES',
    'radiation_therapy_YES',
    'residual_tumor_R1',
    'surgery_performed_type_Other Method (please specify)',
    'surgery_performed_type_Whipple',
]
df_onehot = df_onehot.astype({name: 'category' for name in columns_to_convert})

### Remove variables with large standard errors

In [28]:
from lifelines import CoxPHFitter

# Fit a Cox proportional-hazards model on all encoded columns,
# with 'OS' as the duration and 'Status' as the event indicator.
coxph = CoxPHFitter()
coxph.fit(df_onehot, duration_col='OS', event_col='Status')

# Absolute value of the log hazard ratio of each covariate.
coef = np.log(coxph.hazard_ratios_).abs()

# Table of p-values (ascending) next to the coefficient standard errors.
cox_summary = coxph.summary
pvalue = cox_summary[['p']].sort_values('p')
sd = cox_summary[['se(coef)']]
multi = pvalue.join(sd).reset_index(drop=False)
multi




Unnamed: 0,covariate,p,se(coef)
0,postoperative_rx_tx_YES,0.00095,0.298517
1,person_neoplasm_cancer_status_WITH TUMOR,0.002332,0.38095
2,maximum_tumor_dimension,0.0031,0.121123
3,residual_tumor_R1,0.049057,0.270593
4,surgery_performed_type_Other Method (please sp...,0.050006,1.197213
5,alcohol_history_documented_YES,0.090091,0.291264
6,Age,0.267099,0.01263
7,Sex,0.303406,0.291458
8,radiation_therapy_YES,0.353805,0.387914
9,tobacco_smoking_history,0.419585,0.123152


In [29]:
# Indicator columns removed before the downstream analysis.
columns_to_remove = [
    'surgery_performed_type_Other Method (please specify)',
    'surgery_performed_type_Whipple',
    'anatomic_neoplasm_subdivision_Head of Pancreas',
    'anatomic_neoplasm_subdivision_Other (please specify)',
]
df_onehot_drop = df_onehot.drop(columns=columns_to_remove)
df_onehot_drop

Unnamed: 0,Age,tobacco_smoking_history,maximum_tumor_dimension,T_Stage,N_Stage,OS,Status,Sex,alcohol_history_documented_YES,history_of_diabetes_YES,history_of_chronic_pancreatitis_YES,residual_tumor_R1,radiation_therapy_YES,postoperative_rx_tx_YES,person_neoplasm_cancer_status_WITH TUMOR
0,45,4,4.25,3,2,1323,0,1,1,0,0,0,1,1,0
1,72,1,3.84,3,3,732,1,0,0,0,0,0,0,1,1
2,50,2,4.40,3,3,128,1,1,1,0,0,0,0,1,0
3,72,1,3.95,3,2,232,0,0,0,0,0,0,0,1,1
4,61,1,3.30,2,2,289,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,66,4,2.00,1,1,1021,0,0,1,0,1,0,0,1,1
120,40,4,2.00,1,1,1037,0,1,1,1,0,0,1,1,0
121,85,3,2.00,1,1,244,1,1,1,0,0,0,0,1,1
122,43,1,2.00,1,2,286,0,1,1,0,0,0,1,1,0


### K-fold setting

In [30]:
# Shuffled 5-fold splitter with a fixed seed; handed to c_index_kernel_type below.
cv = KFold(n_splits=5, shuffle=True, random_state=36)

# Discrete alpha candidates: 2**-12, 2**-10, ..., 2**12.
param_grid = dict(alpha=np.power(2., np.arange(-12, 13, 2)))
# Log-uniform alpha range from 1e-6 to 1e+6.
param_space = dict(alpha=Real(1e-6, 1e+6, 'log-uniform'))

### Selection of 100 random state numbers

In [31]:
import random

# Read the seed file: one integer per line, blank lines are ignored.
with open("random_state_100.txt", "r") as file:
    random_state = [int(token) for token in file.read().split("\n") if token]

## Scenario 2

In [33]:
# Empty frame with a single 'variable' column; it will hold the
# remaining-variable results from the 100 runs.
cox_remaining_variable = pd.DataFrame().assign(variable=[])

### C-index

In [34]:
# Scenario 2 variable selection.
# For each of the 100 seeded train/test splits, every non-empty subset of the
# candidate covariates is scored with the 'ensemble_cox' kernel on a 3-segment
# split of the training data (censored and uncensored rows are distributed
# separately so each segment keeps a similar censoring ratio). The subset with
# the highest mean validation C-index is recorded for that run.
selection_kernel = 'ensemble_cox'
fixed_columns = ['Sex', 'Age', 'OS', 'Status']  # present in every candidate frame
num_groups = 3                                  # number of validation segments

selected_variable_records = []

for run_idx in range(100):
    seed = random_state[run_idx]
    x_train, x_test, target_train, target_test = split_data(df_onehot_drop, randomState=seed)

    # Covariates that may be included or excluded by the search.
    candidate_columns = [col for col in x_train.columns if col not in fixed_columns]

    # All non-empty subsets, smallest first. Only the column names are stored;
    # the corresponding frame is built on demand to keep memory usage flat.
    candidate_subsets = [
        list(subset)
        for subset_size in range(1, len(candidate_columns) + 1)
        for subset in combinations(candidate_columns, subset_size)
    ]

    mean_validation_cindex = []

    for subset in candidate_subsets:
        # Seeded shuffle: the segment assignment is reproducible and identical
        # for every subset within a run, so subsets are compared on the same rows.
        shuffled = (
            x_train[subset + fixed_columns]
            .sample(frac=1, random_state=seed)
            .reset_index(drop=True)
        )

        censored = shuffled[shuffled['Status'] == 0]
        uncensored = shuffled[shuffled['Status'] == 1]

        censored_size = len(censored) // num_groups
        uncensored_size = len(uncensored) // num_groups

        # Split the training rows into segments; the last one takes the remainder.
        segments = []
        for segment_idx in range(num_groups):
            is_last = segment_idx == num_groups - 1
            censored_start = segment_idx * censored_size
            uncensored_start = segment_idx * uncensored_size
            censored_stop = None if is_last else censored_start + censored_size
            uncensored_stop = None if is_last else uncensored_start + uncensored_size

            segments.append(pd.concat(
                [censored.iloc[censored_start:censored_stop],
                 uncensored.iloc[uncensored_start:uncensored_stop]],
                ignore_index=True,
            ))

        # Leave-one-segment-out validation.
        fold_cindex = []
        for held_out_idx in range(num_groups):
            fold_train = pd.concat(
                [segment for k, segment in enumerate(segments) if k != held_out_idx]
            )
            fold_validation = segments[held_out_idx]

            fold_train_y = prepare_response_variable(fold_train[['Status', 'OS']])
            fold_validation_y = prepare_response_variable(fold_validation[['Status', 'OS']])

            # NOTE(review): the frames passed here still contain 'OS' and 'Status';
            # presumably c_index_kernel_type removes them internally — confirm.
            result = c_index_kernel_type(fold_train, fold_train_y, fold_validation, fold_validation_y,
                                         param_grid, param_space, cv,
                                         keywords=['Age', 'Sex'], type=selection_kernel)
            # result[1] is the C-index on the second (held-out) data set.
            fold_cindex.append(result[1])

        mean_validation_cindex.append(np.mean(fold_cindex))

    # First subset that reaches the best mean validation C-index.
    best_idx = mean_validation_cindex.index(max(mean_validation_cindex))
    selected_variable_records.append({"variable": candidate_subsets[best_idx] + fixed_columns})

# Build the result frame once (DataFrame.append no longer exists in pandas >= 2.0).
cox_remaining_variable = pd.DataFrame(selected_variable_records, columns=["variable"])
cox_remaining_variable.to_csv("pancreatic_cox_remaining_variables.csv", index = False, encoding = 'cp949')

  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


KeyboardInterrupt: 

### C-index by Kernel type

##### Linear Kernel

In [25]:
# Empty result frame for the linear kernel: one train and one test C-index column.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, linear kernel: train/test C-index over the 100 seeded splits,
# using the variable subset selected for each run.
kernel_type = 'linear'

# Read the selection results once. This is the file written by the Scenario 2
# selection cell; the previous name "ensemble_cox_remaining_variables.csv" is
# not produced anywhere in this notebook.
selected_variables = pd.read_csv("pancreatic_cox_remaining_variables.csv")['variable']

linear_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    # The stored lists already contain the response columns; add them only
    # when missing so that no column is selected twice.
    for response_column in ('Status', 'OS'):
        if response_column not in variables:
            variables.append(response_column)

    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    linear_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# Build the frame once (DataFrame.append no longer exists in pandas >= 2.0).
linear_results = pd.DataFrame(linear_records, columns=["train_C_index", "test_C_index"])
linear_results.to_csv("pancreatic_linear_repetition.csv", index = False, encoding = 'cp949')


##### Clinical Kernel

In [None]:
# Empty result frame for the clinical kernel: one train and one test C-index column.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, clinical kernel: train/test C-index over the 100 seeded splits,
# using the variable subset selected for each run.
kernel_type = 'clinical'

# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_cox_remaining_variables.csv")['variable']

clinical_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    # The stored lists already contain the response columns; add them only
    # when missing so that no column is selected twice.
    for response_column in ('Status', 'OS'):
        if response_column not in variables:
            variables.append(response_column)

    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    clinical_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# Build the frame once (DataFrame.append no longer exists in pandas >= 2.0).
clinical_results = pd.DataFrame(clinical_records, columns=["train_C_index", "test_C_index"])
clinical_results.to_csv("pancreatic_clinical_repetition.csv", index = False, encoding = 'cp949')

##### Ensemble Cox Kernel

In [None]:
# Empty result frame for the ensemble Cox kernel: one train and one test C-index column.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, ensemble Cox kernel: train/test C-index over the 100 seeded
# splits, using the variable subset selected for each run.
kernel_type = 'ensemble_cox'

# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_cox_remaining_variables.csv")['variable']

ensemble_cox_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    # The stored lists already contain the response columns; add them only
    # when missing so that no column is selected twice.
    for response_column in ('Status', 'OS'):
        if response_column not in variables:
            variables.append(response_column)

    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_cox_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# The results belong to ensemble_cox_results. The previous version appended to
# and saved ensemble_aft_results, which is not defined yet at this point.
ensemble_cox_results = pd.DataFrame(ensemble_cox_records, columns=["train_C_index", "test_C_index"])
ensemble_cox_results.to_csv("pancreatic_cox_repetition.csv", index = False, encoding = 'cp949')

##### Ensemble AFT Kernel

In [None]:
# Empty result frame for the ensemble AFT kernel: one train and one test C-index column.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 2, ensemble AFT kernel: train/test C-index over the 100 seeded
# splits, using the variable subset selected for each run.
kernel_type = 'ensemble_aft'

# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_cox_remaining_variables.csv")['variable']

ensemble_aft_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    # The stored lists already contain the response columns; add them only
    # when missing so that no column is selected twice.
    for response_column in ('Status', 'OS'):
        if response_column not in variables:
            variables.append(response_column)

    # Select from df_onehot_drop, the same frame the other kernel cells use.
    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_aft_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# Build the frame once (DataFrame.append no longer exists in pandas >= 2.0).
ensemble_aft_results = pd.DataFrame(ensemble_aft_records, columns=["train_C_index", "test_C_index"])
ensemble_aft_results.to_csv("pancreatic_aft_repetition.csv", index = False, encoding = 'cp949')

#### Result

In [None]:
# Reload the saved Scenario 2 C-index tables, one per kernel type.
(linear_results,
 clinical_results,
 ensemble_cox_results,
 ensemble_aft_results) = map(pd.read_csv, (
    'pancreatic_linear_repetition.csv',
    'pancreatic_clinical_repetition.csv',
    'pancreatic_cox_repetition.csv',
    'pancreatic_aft_repetition.csv',
))

##### Count of selections for each remaining variables

In [None]:
# Count how often each variable was selected across the 100 Scenario 2 runs.
# The selections are stored in "pancreatic_cox_remaining_variables.csv" under the
# 'variable' column; "pancreatic_cox_repetition.csv" only holds C-index columns.
selected_variables = pd.read_csv("pancreatic_cox_remaining_variables.csv")['variable']

remaining_variables_all = []
for i in range(0, 100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    remaining_variables_all += ast.literal_eval(selected_variables[i])

element_counts = {}

for element in remaining_variables_all:
    element_counts[element] = element_counts.get(element, 0) + 1

print(element_counts)

##### The value of C-index

In [None]:
# Linear kernel: mean, then standard deviation, of the train and the test C-index
# (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = linear_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Clinical kernel: mean, then standard deviation, of the train and the test C-index
# (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Ensemble Cox kernel: mean, then standard deviation, of the train and the test
# C-index (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Ensemble AFT kernel: mean, then standard deviation, of the train and the test
# C-index (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

## Scenario 3

In [None]:
# Empty frame with a single 'variable' column; it will hold the
# remaining-variable results from the 100 runs.
aft_remaining_variable = pd.DataFrame().assign(variable=[])

##### C-index results

In [None]:
# Scenario 3 variable selection.
# For each of the 100 seeded train/test splits, every non-empty subset of the
# candidate covariates is scored with the 'ensemble_aft' kernel on a 3-segment
# split of the training data (censored and uncensored rows are distributed
# separately so each segment keeps a similar censoring ratio). The subset with
# the highest mean validation C-index is recorded for that run.
selection_kernel = 'ensemble_aft'
fixed_columns = ['Sex', 'Age', 'OS', 'Status']  # present in every candidate frame
num_groups = 3                                  # number of validation segments

selected_variable_records = []

for run_idx in range(100):
    seed = random_state[run_idx]
    x_train, x_test, target_train, target_test = split_data(df_onehot_drop, randomState=seed)

    # Covariates that may be included or excluded by the search.
    candidate_columns = [col for col in x_train.columns if col not in fixed_columns]

    # All non-empty subsets, smallest first. Only the column names are stored;
    # the corresponding frame is built on demand to keep memory usage flat.
    candidate_subsets = [
        list(subset)
        for subset_size in range(1, len(candidate_columns) + 1)
        for subset in combinations(candidate_columns, subset_size)
    ]

    mean_validation_cindex = []

    for subset in candidate_subsets:
        # Seeded shuffle: the segment assignment is reproducible and identical
        # for every subset within a run, so subsets are compared on the same rows.
        shuffled = (
            x_train[subset + fixed_columns]
            .sample(frac=1, random_state=seed)
            .reset_index(drop=True)
        )

        censored = shuffled[shuffled['Status'] == 0]
        uncensored = shuffled[shuffled['Status'] == 1]

        censored_size = len(censored) // num_groups
        uncensored_size = len(uncensored) // num_groups

        # Split the training rows into segments; the last one takes the remainder.
        segments = []
        for segment_idx in range(num_groups):
            is_last = segment_idx == num_groups - 1
            censored_start = segment_idx * censored_size
            uncensored_start = segment_idx * uncensored_size
            censored_stop = None if is_last else censored_start + censored_size
            uncensored_stop = None if is_last else uncensored_start + uncensored_size

            segments.append(pd.concat(
                [censored.iloc[censored_start:censored_stop],
                 uncensored.iloc[uncensored_start:uncensored_stop]],
                ignore_index=True,
            ))

        # Leave-one-segment-out validation.
        fold_cindex = []
        for held_out_idx in range(num_groups):
            fold_train = pd.concat(
                [segment for k, segment in enumerate(segments) if k != held_out_idx]
            )
            fold_validation = segments[held_out_idx]

            fold_train_y = prepare_response_variable(fold_train[['Status', 'OS']])
            fold_validation_y = prepare_response_variable(fold_validation[['Status', 'OS']])

            # NOTE(review): the frames passed here still contain 'OS' and 'Status';
            # presumably c_index_kernel_type removes them internally — confirm.
            result = c_index_kernel_type(fold_train, fold_train_y, fold_validation, fold_validation_y,
                                         param_grid, param_space, cv,
                                         keywords=['Age', 'Sex'], type=selection_kernel)
            # result[1] is the C-index on the second (held-out) data set.
            fold_cindex.append(result[1])

        mean_validation_cindex.append(np.mean(fold_cindex))

    # First subset that reaches the best mean validation C-index.
    best_idx = mean_validation_cindex.index(max(mean_validation_cindex))
    selected_variable_records.append({"variable": candidate_subsets[best_idx] + fixed_columns})

# Build the result frame once (DataFrame.append no longer exists in pandas >= 2.0).
aft_remaining_variable = pd.DataFrame(selected_variable_records, columns=["variable"])
aft_remaining_variable.to_csv("pancreatic_aft_remaining_variables.csv", index = False, encoding = 'cp949')

### Check results by Kernel type

##### Linear Kernel

In [None]:
# Empty result frame for the linear kernel: one train and one test C-index column.
linear_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, linear kernel: train/test C-index over the 100 seeded splits,
# using the variable subset selected for each run.
kernel_type = 'linear'

# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_aft_remaining_variables.csv")['variable']

linear_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    linear_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# Build the frame once (DataFrame.append no longer exists in pandas >= 2.0).
linear_results = pd.DataFrame(linear_records, columns=["train_C_index", "test_C_index"])
linear_results.to_csv("pancreatic_aft_linear.csv", index = False, encoding = 'cp949')

##### Clinical Kernel

In [None]:
# Empty result frame for the clinical kernel: one train and one test C-index column.
clinical_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, clinical kernel: train/test C-index over the 100 seeded splits,
# using the variable subset selected for each run.
kernel_type = 'clinical'

# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_aft_remaining_variables.csv")['variable']

clinical_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    clinical_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# Build the frame once (DataFrame.append no longer exists in pandas >= 2.0).
clinical_results = pd.DataFrame(clinical_records, columns=["train_C_index", "test_C_index"])
clinical_results.to_csv("pancreatic_aft_clinical.csv", index = False, encoding = 'cp949')

##### Ensemble Cox Kernel

In [None]:
# Empty result frame for the ensemble Cox kernel: one train and one test C-index column.
ensemble_cox_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, ensemble Cox kernel: train/test C-index over the 100 seeded
# splits, using the variable subset selected for each run.
kernel_type = 'ensemble_cox'

# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_aft_remaining_variables.csv")['variable']

ensemble_cox_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_cox_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# Build the frame once (DataFrame.append no longer exists in pandas >= 2.0).
ensemble_cox_results = pd.DataFrame(ensemble_cox_records, columns=["train_C_index", "test_C_index"])
ensemble_cox_results.to_csv("pancreatic_aft_cox.csv", index = False, encoding = 'cp949')

##### Ensemble AFT Kernel

In [None]:
# Empty result frame for the ensemble AFT kernel: one train and one test C-index column.
ensemble_aft_results = pd.DataFrame().assign(train_C_index=[], test_C_index=[])

In [None]:
# Scenario 3, ensemble AFT kernel: train/test C-index over the 100 seeded
# splits, using the variable subset selected for each run.
kernel_type = 'ensemble_aft'

# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_aft_remaining_variables.csv")['variable']

ensemble_aft_records = []

for i in range(100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    variables = ast.literal_eval(selected_variables[i])

    df_onehot_re = df_onehot_drop[variables]

    x_train, x_test, target_train, target_test = split_data(df_onehot_re, randomState = random_state[i])

    y_train = prepare_response_variable(target_train)
    y_test = prepare_response_variable(target_test)

    results = c_index_kernel_type(x_train, y_train, x_test, y_test, param_grid, param_space, cv, keywords = ['Age', 'Sex'], type = kernel_type)

    ensemble_aft_records.append({"train_C_index": results[0], "test_C_index": results[1]})

# Build the frame once (DataFrame.append no longer exists in pandas >= 2.0).
ensemble_aft_results = pd.DataFrame(ensemble_aft_records, columns=["train_C_index", "test_C_index"])
ensemble_aft_results.to_csv("pancreatic_aft_aft.csv", index = False, encoding = 'cp949')

### Result

In [None]:
# Reload the saved Scenario 3 C-index tables, one per kernel type.
(linear_results,
 clinical_results,
 ensemble_cox_results,
 ensemble_aft_results) = map(pd.read_csv, (
    'pancreatic_aft_linear.csv',
    'pancreatic_aft_clinical.csv',
    'pancreatic_aft_cox.csv',
    'pancreatic_aft_aft.csv',
))

##### Count of selections for each remaining variables

In [None]:
# Count how often each variable was selected across the 100 Scenario 3 runs.
# Read the selection results once instead of on every iteration.
selected_variables = pd.read_csv("pancreatic_aft_remaining_variables.csv")['variable']

remaining_variables_all = []
for i in range(0, 100):
    # Each cell holds the text form of a Python list of column names.
    # ast.literal_eval parses literals only, so file content is never executed.
    remaining_variables_all += ast.literal_eval(selected_variables[i])

element_counts = {}

for element in remaining_variables_all:
    element_counts[element] = element_counts.get(element, 0) + 1

print(element_counts)

##### The value of the C-index

In [None]:
# Linear kernel: mean, then standard deviation, of the train and the test C-index
# (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = linear_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Clinical kernel: mean, then standard deviation, of the train and the test C-index
# (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = clinical_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Ensemble Cox kernel: mean, then standard deviation, of the train and the test
# C-index (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = ensemble_cox_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))

In [None]:
# Ensemble AFT kernel: mean, then standard deviation, of the train and the test
# C-index (rounded to 4 decimals).
for score_column in ('train_C_index', 'test_C_index'):
    scores = ensemble_aft_results[score_column]
    print(round(np.mean(scores), 4))
    print(round(np.std(scores), 4))