<a href="https://colab.research.google.com/github/Seong-Gyu-Choi/Koraen-specific_CVD_risk_model/blob/main/CVD_Risk_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터

In [None]:
!pip install lifelines
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

!pip install pyreadstat # SPSS 파일
!pip install scikit-survival # COX 모델을 포함한 머신러닝 모델들 사용 가능, 추가적인 작업이 필요함
!pip install shap
!pip install optuna
!pip install koreanize-matplotlib

import pandas as pd
import numpy as np
import optuna

from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.tree import SurvivalTree
from sksurv.svm import FastKernelSurvivalSVM
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis
from sksurv.util import Surv
from sksurv.metrics import concordance_index_censored, cumulative_dynamic_auc

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

import koreanize_matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42) # fix the global NumPy random seed

from google.colab import drive # connect Colab to Google Drive
drive.mount('/content/drive')
#dir_total = '/content/drive/MyDrive/CVD 생존분석/Total_KMI.sav' # full data (raw, before missing-value imputation)
dir_dev = '/content/drive/MyDrive/CVD 생존분석/Model building data.sav' # development data (missing values already imputed)
dir_val = '/content/drive/MyDrive/CVD 생존분석/Validation data.sav' # validation data (missing values already imputed)
# dir_rsf = '/content/drive/MyDrive/CVD 생존분석/Data_rsf.sav' # subsample of the development data (RSF/GBS models exceed RAM on the full set)

# Data_total = pd.read_spss(dir_total) # raw full data (no preprocessing); only used for EDA
Data_train = pd.read_spss(dir_dev) # preprocessed train data
Data_test = pd.read_spss(dir_val) # preprocessed test data
Data_total = pd.concat([Data_train,Data_test]) # train and test stacked (index not reset); this is the data actually analysed

In [None]:
# Work on a copy so later edits to Data_temp do not touch Data_total, then list columns/dtypes.
Data_temp = Data_total.copy()
Data_temp.info()

In [None]:
# Group size, mean age and age standard deviation for each sex.
by_sex = Data_total.groupby('성별')
age_by_sex = by_sex['나이']
display(by_sex.size(), age_by_sex.mean(), age_by_sex.std())

In [None]:
# Shapes of the combined, train and test frames.
print(Data_total.shape , Data_train.shape , Data_test.shape)
# Absolute and relative frequency of the CVD-death flag per dataset (NaN counted via dropna=False).
# Original author's note: no subject has a missing CVD-death flag.
print('전체',Data_total['CVD사망'].value_counts(dropna=False) , Data_total['CVD사망'].value_counts(dropna=False , normalize=True),'\n')
print('Train',Data_train['CVD사망'].value_counts(dropna=False) , Data_train['CVD사망'].value_counts(dropna=False , normalize=True),'\n')
print('Test',Data_test['CVD사망'].value_counts(dropna=False) , Data_test['CVD사망'].value_counts(dropna=False , normalize=True))

In [None]:
# Show subjects whose follow-up time (years) is zero or negative; this cell only displays them.
# Original author's note: such subjects were removed from the data.
mask = Data_temp['추적기간_연'] <= 0
Data_temp[mask] # original note: 67 subjects observed before baseline were found -> now deleted

# EDA

### 결측치

In [None]:
pd.options.display.max_rows = 65

# Missing-value count and percentage per column, largest first; only columns with gaps are shown.
n_subjects = Data_temp.shape[0]
missing = Data_temp.isnull().sum().rename_axis('feature').reset_index(name='num')
missing['%'] = 100 * missing['num'] / n_subjects
missing = missing.sort_values(by='num', ascending=False)
has_missing = missing['num'] > 0
missing[has_missing].round(2)

### 고유값 확인

In [None]:
# Number of distinct values in each column.
Data_temp.nunique().T

### 범주형 변수 더미화
- Cox PH , Logistic regression

In [None]:
Data_total = pd.concat([Data_train,Data_test]) # combined data actually used for the analysis
Data_temp = Data_total.copy() # fresh working copy of the combined data

In [None]:
# Columns treated as categorical
cat_features = ['성별','대사증후군여부','당뇨병여부','고혈압여부',
                'IPAQ_4group_mets_days', 'PA_3group' ,'PA_2group',
                '나이_대' ,'나이_5그룹' ,'나이_4그룹' ,'나이_3그룹' ,'나이_2그룹']
# Cast each of them to pandas 'category' dtype, column by column, in all three frames.
for frame in (Data_train, Data_test, Data_temp):
    for feature in cat_features:
        frame[feature] = frame[feature].astype('category')

In [None]:
# NOTE(review): this rebinds Data_temp to Data_total itself (no .copy()), so the 'category'
# casts applied earlier to the Data_temp copy are not carried over - confirm this is intended.
Data_temp = Data_total
dummy_features = ['IPAQ_4group_mets_days', 'PA_3group']
Data_temp_dummy = pd.get_dummies(Data_temp, columns=dummy_features, drop_first=True) # drop_first drops the first level of each variable

In [None]:
# Names of the dummy columns selected from Data_temp_dummy.
dummy=['IPAQ_4group_mets_days_1.0', 'IPAQ_4group_mets_days_2.0','IPAQ_4group_mets_days_3.0',
       'PA_3group_2.0', 'PA_3group_3.0']
# The int-cast frame is only shown as the cell output; it is not assigned back to Data_temp_dummy.
Data_temp_dummy[dummy].astype('int')

## 다중 공선성 확인
 - 범주형 변수는 다중 공선성을 일으키지 않음
 - 연속형 변수에 대해서 VIF 확인

In [None]:
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
var_table = ['나이','나이_시간','WHtR','고혈압여부','당뇨병여부','대사증후군여부','성별', 'PA_3group']

var_all = [ '성별', '나이', '신장', '체중', 'BMI', '허리둘레', 'WHtR', '허벅지둘레', 'SBP', 'DBP',
            '공복혈당', 'LDL', 'HDL', 'TG', '대사증후군여부', '당뇨병여부', '고혈압여부', '고강도mets',
            'TotalMets', '중강도mets', '걷기mets', '고강도PA', '중강도PA', '걷기PA',
            'IPAQ_4group_mets_days', '추적기간_연', 'CVD사망', '나이_대', 'PA_3group','PA_2group',
            '나이_시간']

# This cell looks at the reduced variable set only.
var = var_table

# Correlation matrix of the selected variables in each split.
corr_matrix1 = Data_train[var].corr()
corr_matrix2 = Data_test[var].corr()

# One figure, two side-by-side heatmaps (train on the left, test on the right).
plt.figure(figsize=(15, 6))
panels = [
    (corr_matrix1, 'Correlation Matrix - Train set'),
    (corr_matrix2, 'Correlation Matrix - Test set'),
]
for position, (matrix, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title(panel_title)

plt.tight_layout()  # avoid overlapping subplots
plt.show()

In [None]:
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
var_table = ['나이','나이_시간','WHtR','고혈압여부','당뇨병여부','대사증후군여부','성별', 'PA_3group']

var_all = [ '성별', '나이', '신장', '체중', 'BMI', '허리둘레', 'WHtR', '허벅지둘레', 'SBP', 'DBP',
            '공복혈당', 'LDL', 'HDL', 'TG', '대사증후군여부', '당뇨병여부', '고혈압여부', '고강도mets',
            'TotalMets', '중강도mets', '걷기mets', '고강도PA', '중강도PA', '걷기PA',
            'IPAQ_4group_mets_days', '추적기간_연', 'CVD사망', '나이_대', 'PA_3group','PA_2group',
            '나이_시간']

# This cell looks at the full variable set.
var = var_all

# Correlation matrix of the selected variables in each split.
corr_matrix1 = Data_train[var].corr()
corr_matrix2 = Data_test[var].corr()

# One large figure, two side-by-side heatmaps (train on the left, test on the right).
plt.figure(figsize=(30, 12))
panels = [
    (corr_matrix1, 'Correlation Matrix - Train set'),
    (corr_matrix2, 'Correlation Matrix - Test set'),
]
for position, (matrix, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title(panel_title)

plt.tight_layout()  # avoid overlapping subplots
plt.show()

In [None]:
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

from statsmodels.tools.tools import add_constant

continuous_vars = ['WHtR', '나이_시간']
X = Data_train[continuous_vars]

# VIF per continuous predictor.
# variance_inflation_factor regresses column i on the remaining columns exactly as given and
# does not add an intercept itself. Without a constant column the R^2 is the uncentered one
# and the VIFs are inflated by the variables' non-zero means, so add the intercept explicitly
# and report only the real predictors (the constant is column 0).
X_design = add_constant(X, has_constant='add')
vif_data = pd.DataFrame({
    "Variable": continuous_vars,
    "VIF": [variance_inflation_factor(X_design.values, i) for i in range(1, X_design.shape[1])],
})
print(vif_data)

# Condition number of the raw (unscaled, no-intercept) predictor matrix.
condition_number = np.linalg.cond(X)
print("조건수:", condition_number)

In [None]:
!pip install researchpy
import researchpy as rp
import pandas as pd

In [None]:
def cramers_v(x, y):
    """Return Cramér's V between two categorical series.

    The chi-square statistic is read from researchpy's chi-square crosstab
    results; phi² and the table dimensions are then adjusted with the
    ``(n - 1)`` small-sample correction terms before taking the square root.
    """
    table = pd.crosstab(x, y)
    chi2_stat = rp.crosstab(x, y, test="chi-square", expected_freqs=True, prop="cell")[1].results[0]
    total = table.sum().sum()
    n_rows, n_cols = table.shape

    phi2 = chi2_stat / total
    phi2_adj = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rows_adj = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_adj = n_cols - ((n_cols - 1) ** 2) / (total - 1)
    smaller_dim = min((cols_adj - 1), (rows_adj - 1))
    return np.sqrt(phi2_adj / smaller_dim)

cat_vars = ['고혈압여부', '당뇨병여부', '대사증후군여부']

# Cramér's V for every unordered pair of the categorical variables (train data).
for position, var1 in enumerate(cat_vars):
    for var2 in cat_vars[position + 1:]:
        v = cramers_v(Data_train[var1], Data_train[var2])
        print(f"{var1}와 {var2}의 Cramer's V: {v:.4f}")

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

def chi2_test(data, var1, var2):
    """Chi-square test of independence between two columns of ``data``.

    Builds the contingency table of ``data[var1]`` against ``data[var2]`` and
    returns ``(chi2 statistic, p-value)`` from ``chi2_contingency``.
    """
    table = pd.crosstab(data[var1], data[var2])
    outcome = chi2_contingency(table)
    return outcome[0], outcome[1]

# Pairwise chi-square tests between the categorical variables
cat_vars = ['고혈압여부', '당뇨병여부', '대사증후군여부']
results = {}

for position, var1 in enumerate(cat_vars):
    for var2 in cat_vars[position + 1:]:
        chi2, p = chi2_test(Data_temp, var1, var2)
        results[f'{var1} - {var2}'] = {'chi2': chi2, 'p-value': p}

# Print one line per variable pair
for pair in results:
    result = results[pair]
    print(f'{pair}: chi2 = {result["chi2"]}, p-value = {result["p-value"]:.4f}')

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency
from decimal import Decimal
# Contingency table: (hypertension x diabetes) combinations as rows, metabolic syndrome as columns.
crosstab = pd.crosstab([Data_train['고혈압여부'], Data_train['당뇨병여부']], Data_train['대사증후군여부'])
# Chi-square test on the combined table
chi2, p, dof, expected = chi2_contingency(crosstab)
# Convert the p-value to Decimal before formatting.
# NOTE(review): the value is still printed with '.4f', so a very small p-value shows as 0.0000.
p_decimal = Decimal(p)
print("다변량 카이제곱 검정 결과:")
print(f"  Chi-square statistic: {chi2}")
print(f"  p-value (소수점 넷째 자리까지): {p_decimal:.4f}")  # printed to four decimal places
print(f"  Degrees of freedom: {dof}")

In [None]:
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
# Show plain numbers instead of scientific notation.
# NOTE(review): this changes the pandas float display format for the whole session, not just
# this cell; the heatmap annotations below are formatted by fmt='d' - confirm it is still needed.
pd.set_option('display.float_format', lambda x: '{:.0f}'.format(x))
# Contingency table: (hypertension x diabetes) combinations as rows, metabolic syndrome as columns.
crosstab = pd.crosstab([Data_temp['고혈압여부'], Data_temp['당뇨병여부']], Data_temp['대사증후군여부'])
# Heatmap of the cell counts
plt.figure(figsize=(8, 6))
sns.heatmap(crosstab, annot=True, cmap="YlGnBu", fmt='d')
plt.title("Contingency Table (고혈압여부, 당뇨병여부, 대사증후군여부)")
plt.show()

### 통계량 확인

In [None]:
# (rows, columns) of the current working frame.
Data_temp.shape

# 변수 선정

In [None]:
Data_temp[Data_temp['CVD사망']==1]['성별'].value_counts() # number of CVD deaths per sex

### Cox - life lines

In [None]:
from lifelines import CoxPHFitter

In [None]:
# Model without blood-test variables; only formula4 is fitted, the others are kept as alternatives.
cph_ni = CoxPHFitter()
formula1 = '나이 + WHtR + C(성별) + C(PA_3group)'
formula2 = '나이 + WHtR + C(성별) + C(고혈압여부)'
formula3 = '나이 + WHtR + C(성별) + C(고혈압여부) + C(당뇨병여부) + C(대사증후군여부) + C(PA_3group)' # author's finding: PA_3group not significant, all IPAQ_4group levels significant
formula4 = '나이 + WHtR + C(성별) + C(고혈압여부) + C(당뇨병여부) + C(대사증후군여부) + C(IPAQ_4group_mets_days)'
cph_ni.fit(Data_train , duration_col = '추적기간_연' , event_col = 'CVD사망' , formula= formula4)
cph_ni.print_summary()

# Author's note: fasting glucose and diabetes status cannot be used in the same model.
# Model with blood-test variables; only formula6 is fitted.
cph_i = CoxPHFitter()
formula5 = '나이 + WHtR + C(성별) + C(고혈압여부) + C(대사증후군여부) + C(IPAQ_4group_mets_days) + TG + LDL + 공복혈당' #  C(당뇨병여부)
formula6 = '나이 + WHtR + C(성별) + C(고혈압여부) + C(당뇨병여부) + C(IPAQ_4group_mets_days) + TG + LDL' #  variant without metabolic syndrome
cph_i.fit(Data_train , duration_col = '추적기간_연' , event_col = 'CVD사망' , formula= formula6)
cph_i.print_summary()

## Feature importance
- LightGBM

In [None]:
import lightgbm
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import koreanize_matplotlib
from sklearn.metrics import roc_auc_score

In [None]:
# Full candidate column list, including follow-up time and the CVD-death outcome.
cols_all = ['성별', '나이', '신장', '체중', 'BMI', '허리둘레', 'WHtR', '허벅지둘레', 'SBP', 'DBP',
            '공복혈당', 'LDL', 'HDL', 'TG', '대사증후군여부', '당뇨병여부', '고혈압여부',
            '고강도mets','TotalMets', '중강도mets', '걷기mets', '고강도PA', '중강도PA', '걷기PA','IPAQ_4group_mets_days',  'PA_3group', 'PA_2group',
            '추적기간_연', 'CVD사망']

In [None]:
# Physical-activity columns plus the CVD-death outcome; age is left out (see trailing note).
cols_temp = ['TotalMets', '고강도mets', '중강도mets', '걷기mets', '고강도PA', '중강도PA', '걷기PA','IPAQ_4group_mets_days', 'PA_3group', 'PA_2group',
             'CVD사망'] # '나이'

In [None]:
# Column sets after removing highly correlated variables
cols1 = ['나이','성별','WHtR', 'SBP', 'TG', 'LDL','HDL', '공복혈당',
        'TotalMets','IPAQ_4group_mets_days','걷기mets','PA_3group',
         '당뇨병여부','고혈압여부', '대사증후군여부',
         'CVD사망'] # weight, BMI, waist -> represented by WHtR / DBP -> represented by SBP

cols2 = ['나이','성별','WHtR',  'LDL','HDL', '공복혈당',
        'TotalMets','IPAQ_4group_mets_days','걷기mets','PA_3group', '당뇨병여부','고혈압여부', '대사증후군여부',
         'CVD사망'] # SBP and TG excluded

# Same as cols1 without PA_3group and metabolic syndrome.
cols_opt = ['나이','성별','WHtR', 'SBP', 'TG', 'LDL','HDL', '공복혈당', 'TotalMets','IPAQ_4group_mets_days','걷기mets', '당뇨병여부','고혈압여부',
            'CVD사망']

In [None]:
cols = cols_temp
data = Data_temp[cols]

Feature = data.drop('CVD사망', axis=1)
Label = data.CVD사망

# LightGBM hyper-parameters
lgbm_params = {
    'objective': 'binary',
    'class_weight': 'balanced',
    'metric': 'auc',
    'num_leaves': 16,
    'max_depth': 8,
    'n_estimators': 14,
    'reg_alpha': 0.05,
    'reg_lambda': 0.8,
    'learning_rate' : 0.1,
    'min_child_samples': 20
}

# Number of stratified folds; also the number of subplot columns (one importance plot per fold).
n_splits = 4
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fig, axes = plt.subplots(nrows=1, ncols=n_splits, figsize=(25, 8))
fig.subplots_adjust(hspace=0.3)

# Per-fold feature importances and validation AUCs, in fold order.
feature_importance_list = []
fold_auc_list = []

# Fit one model per fold, score it on that fold's validation part and plot its importances.
for i, (train_idx, valid_idx) in enumerate(skf.split(Feature, Label)):
    X_train, X_valid = Feature.iloc[train_idx], Feature.iloc[valid_idx]
    y_train, y_valid = Label.iloc[train_idx], Label.iloc[valid_idx]

    model = lightgbm.LGBMClassifier(**lgbm_params)
    model.fit(X_train, y_train)
    feature_importance_list.append(model.feature_importances_)

    y_valid_pred_prob = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_valid_pred_prob)
    # Keep this fold's AUC: after the loop, `model` and `valid_idx` only refer to the last
    # fold, so the weight for each fold has to be recorded here.
    fold_auc_list.append(auc_score)
    print(f'Fold {i + 1} AUC: {auc_score}')

    lightgbm.plot_importance(model, ax=axes[i], height=0.8, max_num_features=32)
    axes[i].set_title(f'Fold {i + 1}\nAUC: {auc_score:.3f}')

plt.show()

# Weight each fold's importance vector by that fold's own validation AUC, then average.
weighted_importance_list = [
    importance * fold_auc
    for importance, fold_auc in zip(feature_importance_list, fold_auc_list)
]
average_weighted_importance = np.mean(weighted_importance_list, axis=0)

# Top-n features by AUC-weighted mean importance.
top_n = 10
top_features_indices = np.argsort(average_weighted_importance)[::-1][:top_n]
top_features = Feature.columns[top_features_indices]

print("가중 평균 중요도 상위 변수:")
for feature, importance in zip(top_features, average_weighted_importance[top_features_indices]):
    print(f"{feature}: {importance:.4f}")

# Unweighted mean importance across folds.
average_importance = np.mean(feature_importance_list, axis=0)

top_features_indices_simple = np.argsort(average_importance)[::-1][:top_n]
top_features_simple = Feature.columns[top_features_indices_simple]

print("\n단순 평균 중요도 상위 변수:")
for feature, importance in zip(top_features_simple, average_importance[top_features_indices_simple]):
    print(f"{feature}: {importance:.4f}")

# Both importance measures side by side, one row per feature.
df = pd.DataFrame({
    'Feature': Feature.columns,
    'Weighted Importance': average_weighted_importance,
    'Simple Average Importance': average_importance
})

# Ascending sort so that tail(20) holds the 20 largest and barh draws the largest at the top.
df_sorted = df.sort_values(by='Weighted Importance', ascending=True)
df_top_20 = df_sorted.tail(20)
plt.barh(df_top_20['Feature'], df_top_20['Weighted Importance'], label='Weighted Importance', color='b', alpha=0.7)
plt.barh(df_top_20['Feature'], df_top_20['Simple Average Importance'], label='Simple Average Importance', color='r', alpha=0.5)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 20 Features by Importance')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Split each dataset by outcome: CVD deaths are kept in full, survivors are down-sampled.
train_deaths = Data_train.loc[Data_train['CVD사망'] == 1]
train_survivors = Data_train.loc[Data_train['CVD사망'] == 0]
test_deaths = Data_test.loc[Data_test['CVD사망'] == 1]
test_survivors = Data_test.loc[Data_test['CVD사망'] == 0]

# Random 3% of the survivors from each dataset (fixed seed; adjust the fraction as needed).
survivor_frac = 0.03
train_survivors_sampled = train_survivors.sample(frac=survivor_frac, random_state=42)
test_survivors_sampled = test_survivors.sample(frac=survivor_frac, random_state=42)

# Re-assemble: all deaths + sampled survivors, per split and combined.
sampled_train = pd.concat([train_deaths, train_survivors_sampled])
sampled_test = pd.concat([test_deaths, test_survivors_sampled])
sampled_total = pd.concat([sampled_train, sampled_test])

In [None]:
import shap
# NOTE(review): cols_temp holds only physical-activity columns, while the dependence plots in
# the following cells ask for '나이', 'SBP', 'WHtR' and '허벅지둘레' - confirm the intended column set.
cols = cols_temp
data = sampled_total[cols]
Feature = data.drop('CVD사망', axis=1)
Label = data.CVD사망

# Fit one final model on every row of the down-sampled data (sampled_total) and compute SHAP values.
final_model = lightgbm.LGBMClassifier(**lgbm_params)
final_model.fit(Feature, Label)

explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(Feature)

shap.summary_plot(shap_values, Feature, plot_type="bar") # SHAP summary plot as a bar chart
shap.summary_plot(shap_values, Feature) # SHAP summary plot (default plot type)
shap_interaction_values = explainer.shap_interaction_values(Feature) # SHAP interaction values, used by the dependence plots

In [None]:
# SHAP interaction dependence plot for the ('나이', 'SBP') pair.
# NOTE(review): Feature is built from cols_temp, which lists neither '나이' nor 'SBP' - confirm the column set.
shap.dependence_plot(('나이', 'SBP'), shap_interaction_values, Feature)

In [None]:
# SHAP interaction dependence plot for the ('나이', 'WHtR') pair.
# NOTE(review): Feature is built from cols_temp, which lists neither '나이' nor 'WHtR' - confirm the column set.
shap.dependence_plot(('나이', 'WHtR'), shap_interaction_values, Feature)

In [None]:
# SHAP interaction dependence plot for the ('나이', '허벅지둘레') pair.
# NOTE(review): Feature is built from cols_temp, which lists neither '나이' nor '허벅지둘레' - confirm the column set.
shap.dependence_plot(('나이', '허벅지둘레'), shap_interaction_values, Feature)

# 생존분석

## 데이터 샘플링
- RAM 용량 한계로 인한 데이터 샘플링
- CVD 사망자수 유지, 사망하지 않은 대상자 랜덤 샘플링
- 샘플링 비율을 높일수록(생존자가 많아질수록) 예측 모델 간 성능 차이가 작아짐. 생존자가 압도적으로 많아, 사망자를 생존자로 잘못 예측하더라도 성능 지표가 높게 나오기 때문으로 생각됨

In [None]:
# Shapes of the combined, train and test frames before sampling.
print(Data_total.shape, Data_train.shape, Data_test.shape)

In [None]:
# CVD deaths are kept in full; survivors are randomly down-sampled.
train_deaths = Data_train.loc[Data_train['CVD사망'] == 1]
train_survivors = Data_train.loc[Data_train['CVD사망'] == 0]
test_deaths = Data_test.loc[Data_test['CVD사망'] == 1]
test_survivors = Data_test.loc[Data_test['CVD사망'] == 0]

# Author's notes: with 1% of survivors the gap between CoxPH and the ensemble models is large
# and fitting is fast; with a larger share (more survivors) the gap between models shrinks.
survivor_frac = 0.01
train_survivors_sampled = train_survivors.sample(frac=survivor_frac, random_state=42)
test_survivors_sampled = test_survivors.sample(frac=survivor_frac, random_state=42)

# Re-assemble: all deaths + sampled survivors, per split and combined.
sampled_train = pd.concat([train_deaths, train_survivors_sampled])
sampled_test = pd.concat([test_deaths, test_survivors_sampled])
sampled_total = pd.concat([sampled_train, sampled_test])
Data_temp = sampled_total.copy()

# Outcome counts in each sampled frame
for heading, frame in (
    ('\nSampled Total Data:', sampled_total),
    ("Sampled Train Data:", sampled_train),
    ("\nSampled Test Data:", sampled_test),
):
    print(heading)
    print(frame['CVD사망'].value_counts())

# Time dependent AUC

## COX PH 전체 대상자

In [None]:
# 1. Data and predictor sets
data = Data_temp
non_invasive = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']
invasive = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days', 'TG', 'LDL','HDL']

results = {}  # results[var_type]['train' | 'test'] -> dict of metrics
n_bootstraps = 100


def _bootstrap_replicate(event, time, risk_scores, times, reference_surv=None):
    """Score one bootstrap resample (drawn with replacement, same size as the input).

    Returns ``(c_index, auc_per_time, mean_auc)`` for the resample. ``reference_surv`` is the
    survival array passed as first argument to ``cumulative_dynamic_auc``; when omitted the
    resample itself is used.
    """
    idx = np.random.choice(np.arange(len(risk_scores)), size=len(risk_scores), replace=True)
    boot_event = event.iloc[idx].astype(bool)
    boot_time = time.iloc[idx]
    boot_risk = risk_scores[idx]
    boot_surv = Surv.from_arrays(boot_event, boot_time)
    c_index = concordance_index_censored(boot_event, boot_time, boot_risk)[0]
    reference = boot_surv if reference_surv is None else reference_surv
    auc_per_time, mean_auc = cumulative_dynamic_auc(reference, boot_surv, boot_risk, times)
    return c_index, auc_per_time, mean_auc


def _percentile_ci(values, axis=None):
    """Return the (2.5th, 97.5th) percentiles of ``values`` along ``axis``."""
    return np.percentile(values, 2.5, axis=axis), np.percentile(values, 97.5, axis=axis)


def _summarise(c_index, auc_values, mean_auc, replicates):
    """Combine point estimates with percentile intervals taken over bootstrap replicates."""
    boot_c, boot_auc, boot_mean_auc = (np.array(part) for part in zip(*replicates))
    return {
        'c_index': c_index,
        'c_index_ci': _percentile_ci(boot_c),
        'auc_values': auc_values,
        'mean_auc': mean_auc,
        'auc_ci': _percentile_ci(boot_auc, axis=0),  # one interval per time point
        # Interval of the same mean AUC that cumulative_dynamic_auc reports as point estimate,
        # not of a plain average of the per-time AUCs.
        'mean_auc_ci': _percentile_ci(boot_mean_auc),
    }


# 2. Fit and evaluate one Cox model per predictor set
for var_type in ['non_invasive', 'invasive']:
    print(f"\n=== {var_type.title()} Model Analysis ===")

    features = non_invasive if var_type == 'non_invasive' else invasive
    X = data[features]

    # 70/30 split stratified on the event flag; the fixed seed gives the same rows for both models
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
         X, data['CVD사망'], data['추적기간_연'],
         test_size=0.3, stratify=data['CVD사망'], random_state=42
    )

    # Structured survival arrays for scikit-survival
    y_train_surv = Surv.from_arrays(y_train_event.astype(bool), y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event.astype(bool), y_test_time)

    model = CoxPHSurvivalAnalysis()
    model.fit(X_train, y_train_surv)

    risk_scores_train = model.predict(X_train)
    risk_scores_test = model.predict(X_test)

    # 3. Point estimates
    c_index_train = concordance_index_censored(y_train_event.astype(bool), y_train_time, risk_scores_train)[0]
    c_index_test = concordance_index_censored(y_test_event.astype(bool), y_test_time, risk_scores_test)[0]

    # Evaluation times: 10 quantiles (10%..90%) of the event times in the train split
    times = np.quantile(y_train_time[y_train_event.astype(bool)], np.linspace(0.1, 0.9, 10))

    auc_train_values, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, risk_scores_train, times)
    auc_test_values, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores_test, times)

    # 4. Bootstrap replicates for the 95% intervals.
    # Train resamples are scored against themselves, test resamples against the full train split.
    boot_train, boot_test = [], []
    for _ in range(n_bootstraps):
        boot_train.append(_bootstrap_replicate(y_train_event, y_train_time, risk_scores_train, times))
        boot_test.append(_bootstrap_replicate(y_test_event, y_test_time, risk_scores_test, times,
                                              reference_surv=y_train_surv))

    # 5. Store and report
    results[var_type] = {
        'train': _summarise(c_index_train, auc_train_values, mean_auc_train, boot_train),
        'test': _summarise(c_index_test, auc_test_values, mean_auc_test, boot_test),
    }
    train_res = results[var_type]['train']
    test_res = results[var_type]['test']

    model_name = "Non_Invasive Model" if var_type == 'non_invasive' else "Invasive Model"
    print(f"\n{model_name}:")
    print(f"  C-index (Test Data): {test_res['c_index']:.3f} ({test_res['c_index_ci'][0]:.3f}-{test_res['c_index_ci'][1]:.3f})")
    print(f"  Train Data Mean AUC: {train_res['mean_auc']:.3f} ({train_res['mean_auc_ci'][0]:.3f}-{train_res['mean_auc_ci'][1]:.3f})")
    print(f"  Test Data Mean AUC: {test_res['mean_auc']:.3f} ({test_res['mean_auc_ci'][0]:.3f}-{test_res['mean_auc_ci'][1]:.3f})")

In [None]:
# 6. Test-set time-dependent AUC curves for both models (uses `results` and `times` from the cell above)
plt.figure(figsize=(12, 8))
colors = {'non_invasive': '#1f77b4', 'invasive': '#ff7f0e'}

for var_type, line_color in colors.items():
    res = results[var_type]['test']
    # Legend entry: test mean AUC with its 95% interval
    label = (f"{var_type.title()} (Test Mean AUC={res['mean_auc']:.3f} "
             f"({res['mean_auc_ci'][0]:.3f}-{res['mean_auc_ci'][1]:.3f}))")
    ci_lower, ci_upper = res['auc_ci'][0], res['auc_ci'][1]
    plt.plot(times, res['auc_values'], color=line_color, linewidth=2, label=label)
    plt.fill_between(times, ci_lower, ci_upper, color=line_color, alpha=0.2)

plt.title('Time-Dependent AUC of the Cox Proportional Hazard Model', fontsize=14)
plt.xlabel('Follow-up Time (Years)', fontsize=12)
plt.ylabel('Time-dependent AUC', fontsize=12)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## COX PH 성별 분할

In [None]:
# Sex strata (1: male, 2: female)
genders = {1: "Male", 2: "Female"}

# final_results[sex][var_type] -> {'train': metrics, 'test': metrics, 'times': evaluation times}
final_results = {}
n_bootstraps = 100


def _bootstrap_replicate(event, time, risk_scores, times, reference_surv=None):
    """Score one bootstrap resample (drawn with replacement, same size as the input).

    Returns ``(c_index, auc_per_time, mean_auc)`` for the resample. ``reference_surv`` is the
    survival array passed as first argument to ``cumulative_dynamic_auc``; when omitted the
    resample itself is used.
    """
    idx = np.random.choice(np.arange(len(risk_scores)), size=len(risk_scores), replace=True)
    boot_event = event.iloc[idx].astype(bool)
    boot_time = time.iloc[idx]
    boot_risk = risk_scores[idx]
    boot_surv = Surv.from_arrays(boot_event, boot_time)
    c_index = concordance_index_censored(boot_event, boot_time, boot_risk)[0]
    reference = boot_surv if reference_surv is None else reference_surv
    auc_per_time, mean_auc = cumulative_dynamic_auc(reference, boot_surv, boot_risk, times)
    return c_index, auc_per_time, mean_auc


def _percentile_ci(values, axis=None):
    """Return the (2.5th, 97.5th) percentiles of ``values`` along ``axis``."""
    return np.percentile(values, 2.5, axis=axis), np.percentile(values, 97.5, axis=axis)


def _summarise(c_index, auc_values, mean_auc, replicates):
    """Combine point estimates with percentile intervals taken over bootstrap replicates."""
    boot_c, boot_auc, boot_mean_auc = (np.array(part) for part in zip(*replicates))
    return {
        'c_index': c_index,
        'c_index_ci': _percentile_ci(boot_c),
        'auc_values': auc_values,
        'mean_auc': mean_auc,
        'auc_ci': _percentile_ci(boot_auc, axis=0),  # one interval per time point
        # Interval of the same mean AUC that cumulative_dynamic_auc reports as point estimate,
        # not of a plain average of the per-time AUCs.
        'mean_auc_ci': _percentile_ci(boot_mean_auc),
    }


for gender_value, gender_label in genders.items():
    print("\n=====================================")
    print(f"=== {gender_label} 대상 분석 ===")
    print("=====================================")

    # Rows of this sex only (uses `data`, `non_invasive`, `invasive` defined in an earlier cell)
    data_gender = data[data['성별'] == gender_value].copy()
    final_results[gender_label] = {}

    for var_type in ['non_invasive', 'invasive']:
        print(f"\n=== {var_type.title()} Model Analysis ===")

        # Sex is constant within a stratum, so it is dropped from the predictors
        base_features = non_invasive if var_type == 'non_invasive' else invasive
        features = [col for col in base_features if col != '성별']
        X = data_gender[features]

        # 70/30 split stratified on the event flag
        X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
             X, data_gender['CVD사망'], data_gender['추적기간_연'],
             test_size=0.3, stratify=data_gender['CVD사망'], random_state=42
        )

        # Structured survival arrays for scikit-survival
        y_train_surv = Surv.from_arrays(y_train_event.astype(bool), y_train_time)
        y_test_surv = Surv.from_arrays(y_test_event.astype(bool), y_test_time)

        model = CoxPHSurvivalAnalysis()
        model.fit(X_train, y_train_surv)

        risk_scores_train = model.predict(X_train)
        risk_scores_test = model.predict(X_test)

        # Point estimates
        c_index_train = concordance_index_censored(y_train_event.astype(bool), y_train_time, risk_scores_train)[0]
        c_index_test = concordance_index_censored(y_test_event.astype(bool), y_test_time, risk_scores_test)[0]

        # Evaluation times: 10 quantiles (10%..90%) of the event times in the train split
        times = np.quantile(y_train_time[y_train_event.astype(bool)], np.linspace(0.1, 0.9, 10))

        auc_train_values, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, risk_scores_train, times)
        auc_test_values, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores_test, times)

        # Bootstrap replicates for the 95% intervals.
        # Train resamples are scored against themselves, test resamples against the full train split.
        boot_train, boot_test = [], []
        for _ in range(n_bootstraps):
            boot_train.append(_bootstrap_replicate(y_train_event, y_train_time, risk_scores_train, times))
            boot_test.append(_bootstrap_replicate(y_test_event, y_test_time, risk_scores_test, times,
                                                  reference_surv=y_train_surv))

        train_res = _summarise(c_index_train, auc_train_values, mean_auc_train, boot_train)
        test_res = _summarise(c_index_test, auc_test_values, mean_auc_test, boot_test)
        final_results[gender_label][var_type] = {
            'train': train_res,
            'test': test_res,
            'times': times,  # kept for the plot below
        }

        model_name = "Non_Invasive Model" if var_type == 'non_invasive' else "Invasive Model"
        print(f"\n{model_name}:")
        print(f"  C-index (Test Data): {test_res['c_index']:.3f} ({test_res['c_index_ci'][0]:.3f}-{test_res['c_index_ci'][1]:.3f})")
        print(f"  Train Data Mean AUC: {train_res['mean_auc']:.3f} ({train_res['mean_auc_ci'][0]:.3f}-{train_res['mean_auc_ci'][1]:.3f})")
        print(f"  Test Data Mean AUC: {test_res['mean_auc']:.3f} ({test_res['mean_auc_ci'][0]:.3f}-{test_res['mean_auc_ci'][1]:.3f})")

    # Test-set AUC curves for this sex (both predictor sets on one figure)
    plt.figure(figsize=(12, 8))
    colors = {'non_invasive': '#1f77b4', 'invasive': '#ff7f0e'}

    for var_type in ['non_invasive', 'invasive']:
        res = final_results[gender_label][var_type]['test']
        plot_times = final_results[gender_label][var_type]['times']
        label = (f"{var_type.title()} (Test Mean AUC={res['mean_auc']:.3f} "
                 f"({res['mean_auc_ci'][0]:.3f}-{res['mean_auc_ci'][1]:.3f}))")
        plt.plot(plot_times, res['auc_values'], color=colors[var_type], linewidth=2, label=label)
        plt.fill_between(plot_times, res['auc_ci'][0], res['auc_ci'][1],
                         color=colors[var_type], alpha=0.2)

    plt.xlabel('Follow-up Time (Years)', fontsize=12)
    plt.ylabel('Time-dependent AUC', fontsize=12)
    plt.title(f"Test Data AUC Comparison for {gender_label} (Invasive vs Non-Invasive)", fontsize=14)
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

## COX PH 전체 + 성별 분할 통합

In [None]:
# 1. Predictor definitions
# The *_orig lists include the sex variable ('성별').
non_invasive_orig = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']
invasive_orig = non_invasive_orig + ['TG', 'LDL', 'HDL']

# Same lists with the sex variable removed, for the per-sex analyses.
non_invasive = [col for col in non_invasive_orig if col != '성별']
invasive = [col for col in invasive_orig if col != '성별']

# 2. Data: the current working frame (no copy is made)
data = Data_temp

# 3. Analysis function definition

# 3. 분석 수행 함수 정의
def perform_analysis(data_subset, features_non_inv, features_inv, analysis_label="Overall"):
    """
    Fit, evaluate and plot the non-invasive and invasive Cox PH models for one cohort.

    For each predictor set the rows are split 70/30 (stratified on 'CVD사망',
    random_state=42), a CoxPHSurvivalAnalysis model is fitted on the training
    part, and the concordance index and cumulative/dynamic AUC are computed for
    both parts. 95% percentile intervals come from 100 bootstrap resamples of
    the already-predicted risk scores (the model is never refitted). The
    test-set AUC curves of both models are drawn in a single figure.

    Bootstrap indices are drawn from NumPy's global random state, so the
    intervals depend on how that state was seeded/consumed before the call.

    Args:
        data_subset: DataFrame holding the predictor columns plus 'CVD사망'
            (event indicator, cast to bool) and '추적기간_연' (follow-up time).
        features_non_inv: Column names used by the non-invasive model.
        features_inv: Column names used by the invasive model.
        analysis_label: Cohort name inserted into the figure title.

    Returns:
        dict keyed by 'non_invasive' / 'invasive'. Each value has 'train' and
        'test' dicts ('c_index', 'c_index_ci', 'auc_values', 'mean_auc',
        'auc_ci', 'mean_auc_ci') and 'times', the evaluated time points.
    """
    n_bootstraps = 100

    def percentile_ci(samples, axis=None):
        """Return the (2.5th, 97.5th) percentile pair of bootstrap samples."""
        return (np.percentile(samples, 2.5, axis=axis), np.percentile(samples, 97.5, axis=axis))

    results = {}

    for model_type in ['non_invasive', 'invasive']:
        features = features_non_inv if model_type == 'non_invasive' else features_inv

        # Stratifying on the event column keeps the event rate equal in both parts.
        X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
            data_subset[features], data_subset['CVD사망'], data_subset['추적기간_연'],
            test_size=0.3, stratify=data_subset['CVD사망'], random_state=42
        )
        # Cast once so every metric call below receives boolean event flags.
        y_train_event = y_train_event.astype(bool)
        y_test_event = y_test_event.astype(bool)

        # Structured (event, time) arrays in the format sksurv expects.
        y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
        y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

        # Fit the Cox proportional hazards model.
        model = CoxPHSurvivalAnalysis()
        model.fit(X_train, y_train_surv)

        risk_scores_train = model.predict(X_train)
        risk_scores_test = model.predict(X_test)

        c_index_train = concordance_index_censored(y_train_event, y_train_time, risk_scores_train)[0]
        c_index_test = concordance_index_censored(y_test_event, y_test_time, risk_scores_test)[0]

        # Evaluation grid: 10 evenly spaced quantiles (10%..90%) of the
        # training event times.
        times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

        # The training data is the censoring reference for both parts.
        auc_train_values, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, risk_scores_train, times)
        auc_test_values, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores_test, times)

        # ---- Bootstrap 95% confidence intervals ----
        train_indices = np.arange(len(X_train))
        test_indices = np.arange(len(X_test))
        boot = {part: {'c_index': [], 'auc': [], 'mean_auc': []} for part in ('train', 'test')}

        for _ in range(n_bootstraps):
            # Train resample is drawn before the test resample in every
            # iteration; keep this order so seeded runs stay comparable.
            boot_idx_train = np.random.choice(train_indices, size=len(train_indices), replace=True)
            boot_event = y_train_event.iloc[boot_idx_train]
            boot_time = y_train_time.iloc[boot_idx_train]
            boot_scores = risk_scores_train[boot_idx_train]
            boot_surv = Surv.from_arrays(boot_event, boot_time)

            boot['train']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )
            # Here the resampled training set is also the censoring reference.
            boot_auc, boot_mean_auc = cumulative_dynamic_auc(boot_surv, boot_surv, boot_scores, times)
            boot['train']['auc'].append(boot_auc)
            # Keep sksurv's own summary value: the interval must describe the
            # same statistic as the reported point estimate. A plain np.mean
            # over the time points is a different, unweighted quantity.
            boot['train']['mean_auc'].append(boot_mean_auc)

            # Test resample; the full training set stays the censoring reference.
            boot_idx_test = np.random.choice(test_indices, size=len(test_indices), replace=True)
            boot_event = y_test_event.iloc[boot_idx_test]
            boot_time = y_test_time.iloc[boot_idx_test]
            boot_scores = risk_scores_test[boot_idx_test]

            boot_auc, boot_mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_time),
                boot_scores,
                times
            )
            boot['test']['auc'].append(boot_auc)
            boot['test']['mean_auc'].append(boot_mean_auc)
            boot['test']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )

        point_estimates = {
            'train': (c_index_train, auc_train_values, mean_auc_train),
            'test': (c_index_test, auc_test_values, mean_auc_test),
        }
        results[model_type] = {
            part: {
                'c_index': c_index,
                'c_index_ci': percentile_ci(boot[part]['c_index']),
                'auc_values': auc_values,
                'mean_auc': mean_auc,
                # Point-wise band: one percentile pair per time point.
                'auc_ci': percentile_ci(boot[part]['auc'], axis=0),
                'mean_auc_ci': percentile_ci(boot[part]['mean_auc']),
            }
            for part, (c_index, auc_values, mean_auc) in point_estimates.items()
        }
        results[model_type]['times'] = times

        # Console summary (English).
        c_lo, c_hi = results[model_type]['test']['c_index_ci']
        train_lo, train_hi = results[model_type]['train']['mean_auc_ci']
        test_lo, test_hi = results[model_type]['test']['mean_auc_ci']
        model_name = "Non_Invasive Model" if model_type == 'non_invasive' else "Invasive Model"
        print(f"\n{model_name}:")
        print(f"  C-index (Test Data): {c_index_test:.3f} ({c_lo:.3f}-{c_hi:.3f})")
        print(f"  Train Data Mean AUC: {mean_auc_train:.3f} ({train_lo:.3f}-{train_hi:.3f})")
        print(f"  Test Data Mean AUC: {mean_auc_test:.3f} ({test_lo:.3f}-{test_hi:.3f})")

    # Figure: test-set time-dependent AUC of both models with bootstrap bands.
    plt.figure(figsize=(12, 8))
    colors = {'non_invasive': '#1f77b4', 'invasive': '#ff7f0e'}

    for model_type in ['non_invasive', 'invasive']:
        res = results[model_type]['test']
        label = (f"{model_type.replace('_',' ').title()} (Test Mean AUC={res['mean_auc']:.3f} "
                 f"({res['mean_auc_ci'][0]:.3f}-{res['mean_auc_ci'][1]:.3f}))")
        plt.plot(results[model_type]['times'], res['auc_values'],
                 color=colors[model_type], linewidth=2, label=label)
        plt.fill_between(results[model_type]['times'],
                         res['auc_ci'][0], res['auc_ci'][1],
                         color=colors[model_type], alpha=0.2)

    plt.xlabel('Follow-up Time (Years)', fontsize=12)
    plt.ylabel('Time-dependent AUC', fontsize=12)
    plt.title(f"Time-Dependent AUC of the Cox Proportional Hazards Model ({analysis_label})", fontsize=14)
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return results

# 4. Overall analysis on every subject, using the predictor lists that include '성별'.
print(
    "=====================================",
    "=== Overall Analysis ===",
    "=====================================",
    sep="\n",
)
results_overall = perform_analysis(
    data, non_invasive_orig, invasive_orig, analysis_label="All Subjects"
)

# 5. Sex-specific analysis. The '성별' column is coded 1 = Male, 2 = Female.
genders = dict(zip((1, 2), ("Male", "Female")))
results_gender = {}

for gender_value, gender_label in genders.items():
    print(
        "\n=====================================",
        f"=== {gender_label} Group Analysis ===",
        "=====================================",
        sep="\n",
    )
    # Keep one sex only; '성별' is constant in this subset, so the predictor
    # lists without it are passed.
    data_gender = data.loc[data['성별'] == gender_value].copy()
    results_gender[gender_label] = perform_analysis(
        data_gender, non_invasive, invasive, analysis_label=gender_label
    )

## RSF 전체 + 성별 분할

### 공복혈당 추가 모델

In [None]:
# 1. Predictor definitions (variant with fasting glucose '공복혈당' in the invasive set).
# The pooled (all-subject) models keep the sex column '성별' as a predictor.
non_invasive_orig = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL', '공복혈당']

# The sex-specific models use the same lists with '성별' removed.
non_invasive, invasive = (
    [name for name in predictors if name != '성별']
    for predictors in (non_invasive_orig, invasive_orig)
)

# 2. Bind the working dataset. Data_temp is defined outside this cell;
#    this is a plain alias, not a copy.
data = Data_temp

# 3. 분석 수행 함수 정의
def perform_analysis(data_subset, features_non_inv, features_inv, analysis_label="Overall"):
    """
    Fit, evaluate and plot the non-invasive and invasive Random Survival Forest models for one cohort.

    For each predictor set the rows are split 70/30 (stratified on 'CVD사망',
    random_state=42), a RandomSurvivalForest is fitted on the training part,
    and the concordance index and cumulative/dynamic AUC are computed for both
    parts. 95% percentile intervals come from 100 bootstrap resamples of the
    already-predicted risk scores (the model is never refitted). The test-set
    AUC curves of both models are drawn in a single figure.

    The forest hyper-parameters are read from the module-level
    `best_rsf_params`, which must be assigned before this function is called.
    Bootstrap indices are drawn from NumPy's global random state, so the
    intervals depend on how that state was seeded/consumed before the call.

    Args:
        data_subset: DataFrame holding the predictor columns plus 'CVD사망'
            (event indicator, cast to bool) and '추적기간_연' (follow-up time).
        features_non_inv: Column names used by the non-invasive model.
        features_inv: Column names used by the invasive model.
        analysis_label: Cohort name inserted into the figure title.

    Returns:
        dict keyed by 'non_invasive' / 'invasive'. Each value has 'train' and
        'test' dicts ('c_index', 'c_index_ci', 'auc_values', 'mean_auc',
        'auc_ci', 'mean_auc_ci') and 'times', the evaluated time points.
    """
    n_bootstraps = 100

    def percentile_ci(samples, axis=None):
        """Return the (2.5th, 97.5th) percentile pair of bootstrap samples."""
        return (np.percentile(samples, 2.5, axis=axis), np.percentile(samples, 97.5, axis=axis))

    results = {}

    for model_type in ['non_invasive', 'invasive']:
        features = features_non_inv if model_type == 'non_invasive' else features_inv

        # Stratifying on the event column keeps the event rate equal in both parts.
        X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
            data_subset[features], data_subset['CVD사망'], data_subset['추적기간_연'],
            test_size=0.3, stratify=data_subset['CVD사망'], random_state=42
        )
        # Cast once so every metric call below receives boolean event flags.
        y_train_event = y_train_event.astype(bool)
        y_test_event = y_test_event.astype(bool)

        # Structured (event, time) arrays in the format sksurv expects.
        y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
        y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

        # Fit the Random Survival Forest with the tuned hyper-parameters.
        model = RandomSurvivalForest(**best_rsf_params, n_jobs=-1, random_state=42)
        model.fit(X_train, y_train_surv)

        risk_scores_train = model.predict(X_train)
        risk_scores_test = model.predict(X_test)

        c_index_train = concordance_index_censored(y_train_event, y_train_time, risk_scores_train)[0]
        c_index_test = concordance_index_censored(y_test_event, y_test_time, risk_scores_test)[0]

        # Evaluation grid: 10 evenly spaced quantiles (10%..90%) of the
        # training event times.
        times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

        # The training data is the censoring reference for both parts.
        auc_train_values, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, risk_scores_train, times)
        auc_test_values, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores_test, times)

        # ---- Bootstrap 95% confidence intervals ----
        train_indices = np.arange(len(X_train))
        test_indices = np.arange(len(X_test))
        boot = {part: {'c_index': [], 'auc': [], 'mean_auc': []} for part in ('train', 'test')}

        for _ in range(n_bootstraps):
            # Train resample is drawn before the test resample in every
            # iteration; keep this order so seeded runs stay comparable.
            boot_idx_train = np.random.choice(train_indices, size=len(train_indices), replace=True)
            boot_event = y_train_event.iloc[boot_idx_train]
            boot_time = y_train_time.iloc[boot_idx_train]
            boot_scores = risk_scores_train[boot_idx_train]
            boot_surv = Surv.from_arrays(boot_event, boot_time)

            boot['train']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )
            # Here the resampled training set is also the censoring reference.
            boot_auc, boot_mean_auc = cumulative_dynamic_auc(boot_surv, boot_surv, boot_scores, times)
            boot['train']['auc'].append(boot_auc)
            # Keep sksurv's own summary value: the interval must describe the
            # same statistic as the reported point estimate. A plain np.mean
            # over the time points is a different, unweighted quantity.
            boot['train']['mean_auc'].append(boot_mean_auc)

            # Test resample; the full training set stays the censoring reference.
            boot_idx_test = np.random.choice(test_indices, size=len(test_indices), replace=True)
            boot_event = y_test_event.iloc[boot_idx_test]
            boot_time = y_test_time.iloc[boot_idx_test]
            boot_scores = risk_scores_test[boot_idx_test]

            boot_auc, boot_mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_time),
                boot_scores,
                times
            )
            boot['test']['auc'].append(boot_auc)
            boot['test']['mean_auc'].append(boot_mean_auc)
            boot['test']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )

        point_estimates = {
            'train': (c_index_train, auc_train_values, mean_auc_train),
            'test': (c_index_test, auc_test_values, mean_auc_test),
        }
        results[model_type] = {
            part: {
                'c_index': c_index,
                'c_index_ci': percentile_ci(boot[part]['c_index']),
                'auc_values': auc_values,
                'mean_auc': mean_auc,
                # Point-wise band: one percentile pair per time point.
                'auc_ci': percentile_ci(boot[part]['auc'], axis=0),
                'mean_auc_ci': percentile_ci(boot[part]['mean_auc']),
            }
            for part, (c_index, auc_values, mean_auc) in point_estimates.items()
        }
        results[model_type]['times'] = times

        # Console summary (English).
        c_lo, c_hi = results[model_type]['test']['c_index_ci']
        train_lo, train_hi = results[model_type]['train']['mean_auc_ci']
        test_lo, test_hi = results[model_type]['test']['mean_auc_ci']
        model_name = "Non_Invasive Model" if model_type == 'non_invasive' else "Invasive Model"
        print(f"\n{model_name}:")
        print(f"  C-index (Test Data): {c_index_test:.3f} ({c_lo:.3f}-{c_hi:.3f})")
        print(f"  Train Data Mean AUC: {mean_auc_train:.3f} ({train_lo:.3f}-{train_hi:.3f})")
        print(f"  Test Data Mean AUC: {mean_auc_test:.3f} ({test_lo:.3f}-{test_hi:.3f})")

    # Figure: test-set time-dependent AUC of both models with bootstrap bands.
    plt.figure(figsize=(12, 8))
    colors = {'non_invasive': '#1f77b4', 'invasive': '#ff7f0e'}

    for model_type in ['non_invasive', 'invasive']:
        res = results[model_type]['test']
        label = (f"{model_type.replace('_',' ').title()} (Test Mean AUC={res['mean_auc']:.3f} "
                 f"({res['mean_auc_ci'][0]:.3f}-{res['mean_auc_ci'][1]:.3f}))")
        plt.plot(results[model_type]['times'], res['auc_values'],
                 color=colors[model_type], linewidth=2, label=label)
        plt.fill_between(results[model_type]['times'],
                         res['auc_ci'][0], res['auc_ci'][1],
                         color=colors[model_type], alpha=0.2)

    plt.xlabel('Follow-up Time (Years)', fontsize=12)
    plt.ylabel('Time-dependent AUC', fontsize=12)
    plt.title(f"Time-Dependent AUC of the Random Survival Forest Model ({analysis_label})", fontsize=14)
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return results

# 4. Overall analysis on every subject, using the predictor lists that include '성별'.
print(
    "=====================================",
    "=== Overall Analysis ===",
    "=====================================",
    sep="\n",
)
results_overall = perform_analysis(
    data, non_invasive_orig, invasive_orig, analysis_label="All Subjects"
)

# 5. Sex-specific analysis. The '성별' column is coded 1 = Male, 2 = Female.
genders = dict(zip((1, 2), ("Male", "Female")))
results_gender = {}

for gender_value, gender_label in genders.items():
    print(
        "\n=====================================",
        f"=== {gender_label} Group Analysis ===",
        "=====================================",
        sep="\n",
    )
    # Keep one sex only; '성별' is constant in this subset, so the predictor
    # lists without it are passed.
    data_gender = data.loc[data['성별'] == gender_value].copy()
    results_gender[gender_label] = perform_analysis(
        data_gender, non_invasive, invasive, analysis_label=gender_label
    )

In [None]:
# Random Survival Forest hyper-parameters as currently recorded in the paper.
best_rsf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=14,
    max_leaf_nodes=70,
    max_features='log2',
)

In [None]:
# 1. Predictor definitions.
# The pooled (all-subject) models keep the sex column '성별' as a predictor.
non_invasive_orig = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL']

# The sex-specific models use the same lists with '성별' removed.
non_invasive, invasive = (
    [name for name in predictors if name != '성별']
    for predictors in (non_invasive_orig, invasive_orig)
)

# 2. Bind the working dataset. Data_temp is defined outside this cell;
#    this is a plain alias, not a copy.
data = Data_temp

# 3. 분석 수행 함수 정의
def perform_analysis(data_subset, features_non_inv, features_inv, analysis_label="Overall"):
    """
    Fit, evaluate and plot the non-invasive and invasive Random Survival Forest models for one cohort.

    For each predictor set the rows are split 70/30 (stratified on 'CVD사망',
    random_state=42), a RandomSurvivalForest is fitted on the training part,
    and the concordance index and cumulative/dynamic AUC are computed for both
    parts. 95% percentile intervals come from 100 bootstrap resamples of the
    already-predicted risk scores (the model is never refitted). The test-set
    AUC curves of both models are drawn in a single figure.

    The forest hyper-parameters are read from the module-level
    `best_rsf_params`, which must be assigned before this function is called.
    Bootstrap indices are drawn from NumPy's global random state, so the
    intervals depend on how that state was seeded/consumed before the call.

    Args:
        data_subset: DataFrame holding the predictor columns plus 'CVD사망'
            (event indicator, cast to bool) and '추적기간_연' (follow-up time).
        features_non_inv: Column names used by the non-invasive model.
        features_inv: Column names used by the invasive model.
        analysis_label: Cohort name inserted into the figure title.

    Returns:
        dict keyed by 'non_invasive' / 'invasive'. Each value has 'train' and
        'test' dicts ('c_index', 'c_index_ci', 'auc_values', 'mean_auc',
        'auc_ci', 'mean_auc_ci') and 'times', the evaluated time points.
    """
    n_bootstraps = 100

    def percentile_ci(samples, axis=None):
        """Return the (2.5th, 97.5th) percentile pair of bootstrap samples."""
        return (np.percentile(samples, 2.5, axis=axis), np.percentile(samples, 97.5, axis=axis))

    results = {}

    for model_type in ['non_invasive', 'invasive']:
        features = features_non_inv if model_type == 'non_invasive' else features_inv

        # Stratifying on the event column keeps the event rate equal in both parts.
        X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
            data_subset[features], data_subset['CVD사망'], data_subset['추적기간_연'],
            test_size=0.3, stratify=data_subset['CVD사망'], random_state=42
        )
        # Cast once so every metric call below receives boolean event flags.
        y_train_event = y_train_event.astype(bool)
        y_test_event = y_test_event.astype(bool)

        # Structured (event, time) arrays in the format sksurv expects.
        y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
        y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

        # Fit the Random Survival Forest with the tuned hyper-parameters.
        model = RandomSurvivalForest(**best_rsf_params, n_jobs=-1, random_state=42)
        model.fit(X_train, y_train_surv)

        risk_scores_train = model.predict(X_train)
        risk_scores_test = model.predict(X_test)

        c_index_train = concordance_index_censored(y_train_event, y_train_time, risk_scores_train)[0]
        c_index_test = concordance_index_censored(y_test_event, y_test_time, risk_scores_test)[0]

        # Evaluation grid: 10 evenly spaced quantiles (10%..90%) of the
        # training event times.
        times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

        # The training data is the censoring reference for both parts.
        auc_train_values, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, risk_scores_train, times)
        auc_test_values, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores_test, times)

        # ---- Bootstrap 95% confidence intervals ----
        train_indices = np.arange(len(X_train))
        test_indices = np.arange(len(X_test))
        boot = {part: {'c_index': [], 'auc': [], 'mean_auc': []} for part in ('train', 'test')}

        for _ in range(n_bootstraps):
            # Train resample is drawn before the test resample in every
            # iteration; keep this order so seeded runs stay comparable.
            boot_idx_train = np.random.choice(train_indices, size=len(train_indices), replace=True)
            boot_event = y_train_event.iloc[boot_idx_train]
            boot_time = y_train_time.iloc[boot_idx_train]
            boot_scores = risk_scores_train[boot_idx_train]
            boot_surv = Surv.from_arrays(boot_event, boot_time)

            boot['train']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )
            # Here the resampled training set is also the censoring reference.
            boot_auc, boot_mean_auc = cumulative_dynamic_auc(boot_surv, boot_surv, boot_scores, times)
            boot['train']['auc'].append(boot_auc)
            # Keep sksurv's own summary value: the interval must describe the
            # same statistic as the reported point estimate. A plain np.mean
            # over the time points is a different, unweighted quantity.
            boot['train']['mean_auc'].append(boot_mean_auc)

            # Test resample; the full training set stays the censoring reference.
            boot_idx_test = np.random.choice(test_indices, size=len(test_indices), replace=True)
            boot_event = y_test_event.iloc[boot_idx_test]
            boot_time = y_test_time.iloc[boot_idx_test]
            boot_scores = risk_scores_test[boot_idx_test]

            boot_auc, boot_mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_time),
                boot_scores,
                times
            )
            boot['test']['auc'].append(boot_auc)
            boot['test']['mean_auc'].append(boot_mean_auc)
            boot['test']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )

        point_estimates = {
            'train': (c_index_train, auc_train_values, mean_auc_train),
            'test': (c_index_test, auc_test_values, mean_auc_test),
        }
        results[model_type] = {
            part: {
                'c_index': c_index,
                'c_index_ci': percentile_ci(boot[part]['c_index']),
                'auc_values': auc_values,
                'mean_auc': mean_auc,
                # Point-wise band: one percentile pair per time point.
                'auc_ci': percentile_ci(boot[part]['auc'], axis=0),
                'mean_auc_ci': percentile_ci(boot[part]['mean_auc']),
            }
            for part, (c_index, auc_values, mean_auc) in point_estimates.items()
        }
        results[model_type]['times'] = times

        # Console summary (English).
        c_lo, c_hi = results[model_type]['test']['c_index_ci']
        train_lo, train_hi = results[model_type]['train']['mean_auc_ci']
        test_lo, test_hi = results[model_type]['test']['mean_auc_ci']
        model_name = "Non_Invasive Model" if model_type == 'non_invasive' else "Invasive Model"
        print(f"\n{model_name}:")
        print(f"  C-index (Test Data): {c_index_test:.3f} ({c_lo:.3f}-{c_hi:.3f})")
        print(f"  Train Data Mean AUC: {mean_auc_train:.3f} ({train_lo:.3f}-{train_hi:.3f})")
        print(f"  Test Data Mean AUC: {mean_auc_test:.3f} ({test_lo:.3f}-{test_hi:.3f})")

    # Figure: test-set time-dependent AUC of both models with bootstrap bands.
    plt.figure(figsize=(12, 8))
    colors = {'non_invasive': '#1f77b4', 'invasive': '#ff7f0e'}

    for model_type in ['non_invasive', 'invasive']:
        res = results[model_type]['test']
        label = (f"{model_type.replace('_',' ').title()} (Test Mean AUC={res['mean_auc']:.3f} "
                 f"({res['mean_auc_ci'][0]:.3f}-{res['mean_auc_ci'][1]:.3f}))")
        plt.plot(results[model_type]['times'], res['auc_values'],
                 color=colors[model_type], linewidth=2, label=label)
        plt.fill_between(results[model_type]['times'],
                         res['auc_ci'][0], res['auc_ci'][1],
                         color=colors[model_type], alpha=0.2)

    plt.xlabel('Follow-up Time (Years)', fontsize=12)
    plt.ylabel('Time-dependent AUC', fontsize=12)
    plt.title(f"Time-Dependent AUC of the Random Survival Forest Model ({analysis_label})", fontsize=14)
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return results

# 4. Overall analysis on every subject, using the predictor lists that include '성별'.
print(
    "=====================================",
    "=== Overall Analysis ===",
    "=====================================",
    sep="\n",
)
results_overall = perform_analysis(
    data, non_invasive_orig, invasive_orig, analysis_label="All Subjects"
)

# 5. Sex-specific analysis. The '성별' column is coded 1 = Male, 2 = Female.
genders = dict(zip((1, 2), ("Male", "Female")))
results_gender = {}

for gender_value, gender_label in genders.items():
    print(
        "\n=====================================",
        f"=== {gender_label} Group Analysis ===",
        "=====================================",
        sep="\n",
    )
    # Keep one sex only; '성별' is constant in this subset, so the predictor
    # lists without it are passed.
    data_gender = data.loc[data['성별'] == gender_value].copy()
    results_gender[gender_label] = perform_analysis(
        data_gender, non_invasive, invasive, analysis_label=gender_label
    )

## GBS 전체 + 성별 분할

In [None]:
# Gradient Boosting Survival hyper-parameters passed to the GBS models below.
best_gbs_params = dict(
    learning_rate=0.06,
    n_estimators=121,
    max_depth=5,
    min_samples_split=8,
    subsample=0.568,
    max_features='sqrt',
    dropout_rate=0.313,
)

In [None]:
# 1. Predictor definitions.
# The pooled (all-subject) models keep the sex column '성별' as a predictor.
non_invasive_orig = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL']

# The sex-specific models use the same lists with '성별' removed.
non_invasive, invasive = (
    [name for name in predictors if name != '성별']
    for predictors in (non_invasive_orig, invasive_orig)
)

# 2. Bind the working dataset. Data_temp is defined outside this cell;
#    this is a plain alias, not a copy.
data = Data_temp

# 3. 분석 수행 함수 정의
def perform_analysis(data_subset, features_non_inv, features_inv, analysis_label="Overall"):
    """
    Fit, evaluate and plot the non-invasive and invasive Gradient Boosting Survival models for one cohort.

    For each predictor set the rows are split 70/30 (stratified on 'CVD사망',
    random_state=42), a GradientBoostingSurvivalAnalysis model (loss='coxph')
    is fitted on the training part, and the concordance index and
    cumulative/dynamic AUC are computed for both parts. 95% percentile
    intervals come from 100 bootstrap resamples of the already-predicted risk
    scores (the model is never refitted). The test-set AUC curves of both
    models are drawn in a single figure.

    The boosting hyper-parameters are read from the module-level
    `best_gbs_params`, which must be assigned before this function is called.
    Bootstrap indices are drawn from NumPy's global random state, so the
    intervals depend on how that state was seeded/consumed before the call.

    Args:
        data_subset: DataFrame holding the predictor columns plus 'CVD사망'
            (event indicator, cast to bool) and '추적기간_연' (follow-up time).
        features_non_inv: Column names used by the non-invasive model.
        features_inv: Column names used by the invasive model.
        analysis_label: Cohort name inserted into the figure title.

    Returns:
        dict keyed by 'non_invasive' / 'invasive'. Each value has 'train' and
        'test' dicts ('c_index', 'c_index_ci', 'auc_values', 'mean_auc',
        'auc_ci', 'mean_auc_ci') and 'times', the evaluated time points.
    """
    n_bootstraps = 100

    def percentile_ci(samples, axis=None):
        """Return the (2.5th, 97.5th) percentile pair of bootstrap samples."""
        return (np.percentile(samples, 2.5, axis=axis), np.percentile(samples, 97.5, axis=axis))

    results = {}

    for model_type in ['non_invasive', 'invasive']:
        features = features_non_inv if model_type == 'non_invasive' else features_inv

        # Stratifying on the event column keeps the event rate equal in both parts.
        X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
            data_subset[features], data_subset['CVD사망'], data_subset['추적기간_연'],
            test_size=0.3, stratify=data_subset['CVD사망'], random_state=42
        )
        # Cast once so every metric call below receives boolean event flags.
        y_train_event = y_train_event.astype(bool)
        y_test_event = y_test_event.astype(bool)

        # Structured (event, time) arrays in the format sksurv expects.
        y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
        y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

        # Fit the Gradient Boosting Survival model with the tuned hyper-parameters.
        model = GradientBoostingSurvivalAnalysis(**best_gbs_params, loss='coxph', random_state=42)
        model.fit(X_train, y_train_surv)

        risk_scores_train = model.predict(X_train)
        risk_scores_test = model.predict(X_test)

        c_index_train = concordance_index_censored(y_train_event, y_train_time, risk_scores_train)[0]
        c_index_test = concordance_index_censored(y_test_event, y_test_time, risk_scores_test)[0]

        # Evaluation grid: 10 evenly spaced quantiles (10%..90%) of the
        # training event times.
        times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

        # The training data is the censoring reference for both parts.
        auc_train_values, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, risk_scores_train, times)
        auc_test_values, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores_test, times)

        # ---- Bootstrap 95% confidence intervals ----
        train_indices = np.arange(len(X_train))
        test_indices = np.arange(len(X_test))
        boot = {part: {'c_index': [], 'auc': [], 'mean_auc': []} for part in ('train', 'test')}

        for _ in range(n_bootstraps):
            # Train resample is drawn before the test resample in every
            # iteration; keep this order so seeded runs stay comparable.
            boot_idx_train = np.random.choice(train_indices, size=len(train_indices), replace=True)
            boot_event = y_train_event.iloc[boot_idx_train]
            boot_time = y_train_time.iloc[boot_idx_train]
            boot_scores = risk_scores_train[boot_idx_train]
            boot_surv = Surv.from_arrays(boot_event, boot_time)

            boot['train']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )
            # Here the resampled training set is also the censoring reference.
            boot_auc, boot_mean_auc = cumulative_dynamic_auc(boot_surv, boot_surv, boot_scores, times)
            boot['train']['auc'].append(boot_auc)
            # Keep sksurv's own summary value: the interval must describe the
            # same statistic as the reported point estimate. A plain np.mean
            # over the time points is a different, unweighted quantity.
            boot['train']['mean_auc'].append(boot_mean_auc)

            # Test resample; the full training set stays the censoring reference.
            boot_idx_test = np.random.choice(test_indices, size=len(test_indices), replace=True)
            boot_event = y_test_event.iloc[boot_idx_test]
            boot_time = y_test_time.iloc[boot_idx_test]
            boot_scores = risk_scores_test[boot_idx_test]

            boot_auc, boot_mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_time),
                boot_scores,
                times
            )
            boot['test']['auc'].append(boot_auc)
            boot['test']['mean_auc'].append(boot_mean_auc)
            boot['test']['c_index'].append(
                concordance_index_censored(boot_event, boot_time, boot_scores)[0]
            )

        point_estimates = {
            'train': (c_index_train, auc_train_values, mean_auc_train),
            'test': (c_index_test, auc_test_values, mean_auc_test),
        }
        results[model_type] = {
            part: {
                'c_index': c_index,
                'c_index_ci': percentile_ci(boot[part]['c_index']),
                'auc_values': auc_values,
                'mean_auc': mean_auc,
                # Point-wise band: one percentile pair per time point.
                'auc_ci': percentile_ci(boot[part]['auc'], axis=0),
                'mean_auc_ci': percentile_ci(boot[part]['mean_auc']),
            }
            for part, (c_index, auc_values, mean_auc) in point_estimates.items()
        }
        results[model_type]['times'] = times

        # Console summary (English).
        c_lo, c_hi = results[model_type]['test']['c_index_ci']
        train_lo, train_hi = results[model_type]['train']['mean_auc_ci']
        test_lo, test_hi = results[model_type]['test']['mean_auc_ci']
        model_name = "Non_Invasive Model" if model_type == 'non_invasive' else "Invasive Model"
        print(f"\n{model_name}:")
        print(f"  C-index (Test Data): {c_index_test:.3f} ({c_lo:.3f}-{c_hi:.3f})")
        print(f"  Train Data Mean AUC: {mean_auc_train:.3f} ({train_lo:.3f}-{train_hi:.3f})")
        print(f"  Test Data Mean AUC: {mean_auc_test:.3f} ({test_lo:.3f}-{test_hi:.3f})")

    # Figure: test-set time-dependent AUC of both models with bootstrap bands.
    plt.figure(figsize=(12, 8))
    colors = {'non_invasive': '#1f77b4', 'invasive': '#ff7f0e'}

    for model_type in ['non_invasive', 'invasive']:
        res = results[model_type]['test']
        label = (f"{model_type.replace('_',' ').title()} (Test Mean AUC={res['mean_auc']:.3f} "
                 f"({res['mean_auc_ci'][0]:.3f}-{res['mean_auc_ci'][1]:.3f}))")
        plt.plot(results[model_type]['times'], res['auc_values'],
                 color=colors[model_type], linewidth=2, label=label)
        plt.fill_between(results[model_type]['times'],
                         res['auc_ci'][0], res['auc_ci'][1],
                         color=colors[model_type], alpha=0.2)

    plt.xlabel('Follow-up Time (Years)', fontsize=12)
    plt.ylabel('Time-dependent AUC', fontsize=12)
    plt.title(f"Time-Dependent AUC of the Gradient Boosting Survival Model ({analysis_label})", fontsize=14)
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return results

# 4. Overall analysis on every subject, using the predictor lists that include '성별'.
print(
    "=====================================",
    "=== Overall Analysis ===",
    "=====================================",
    sep="\n",
)
results_overall = perform_analysis(
    data, non_invasive_orig, invasive_orig, analysis_label="All Subjects"
)

# 5. Sex-specific analysis. The '성별' column is coded 1 = Male, 2 = Female.
genders = dict(zip((1, 2), ("Male", "Female")))
results_gender = {}

for gender_value, gender_label in genders.items():
    print(
        "\n=====================================",
        f"=== {gender_label} Group Analysis ===",
        "=====================================",
        sep="\n",
    )
    # Keep one sex only; '성별' is constant in this subset, so the predictor
    # lists without it are passed.
    data_gender = data.loc[data['성별'] == gender_value].copy()
    results_gender[gender_label] = perform_analysis(
        data_gender, non_invasive, invasive, analysis_label=gender_label
    )

# 예측 모델별 비교

## 5가지 모델 비교
- 현재 논문 Figure에 기록된 수치 재현
- COX PH, COX Net, Survival Tree, RSF, GBS

In [None]:
from sksurv.util import Surv
from sksurv.metrics import concordance_index_censored, cumulative_dynamic_auc
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.tree import SurvivalTree
from sksurv.ensemble import RandomSurvivalForest

In [None]:
# Hyper-parameters forwarded to the RSF and GBS estimators in the registry below
best_rsf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=14,
    max_leaf_nodes=70,
    max_features='log2',
)
best_gbs_params = dict(
    learning_rate=0.06,
    n_estimators=121,
    max_depth=5,
    min_samples_split=8,
    subsample=0.568,
    max_features='sqrt',
    dropout_rate=0.313,
)

# Model registry: one dict per model holding its display name, the (unfitted)
# estimator object and the name of the method that produces risk scores.
MODELS = [
    {"name": model_label, "estimator": model_obj, "predict_method": "predict"}
    for model_label, model_obj in (
        ("Cox PH", CoxPHSurvivalAnalysis()),
        ("Cox Net", CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01)),
        ("Survival Tree", SurvivalTree(max_depth=16, max_leaf_nodes=32, min_samples_split=10,
                                       min_samples_leaf=10, min_weight_fraction_leaf=0.01,
                                       low_memory=False, random_state=42)),
        ("RSF", RandomSurvivalForest(**best_rsf_params, n_jobs=-1, random_state=42)),
        ("GBS", GradientBoostingSurvivalAnalysis(**best_gbs_params, loss='coxph', random_state=42)),
    )
]

# Analysis routine: fit and evaluate every registered model on one cohort
def perform_analysis(data_subset, features, analysis_label, variable_type, n_boot=100):
    """Fit every model in ``MODELS`` on one cohort and report C-index / time-dependent AUC.

    The rows of ``data_subset`` are split 70/30 (stratified on the event
    indicator, ``random_state=42``). Each estimator is fitted on the training
    part and scored on both parts. The AUC is evaluated at ten time points:
    the 10%-90% quantiles of the event times observed in the training part.

    Parameters
    ----------
    data_subset : pandas.DataFrame
        Must contain the columns in ``features`` plus 'CVD사망' (event
        indicator, cast to bool) and '추적기간_연' (follow-up time).
    features : list of str
        Predictor column names.
    analysis_label, variable_type : str
        Only used in the printed report and in the plot title.
    n_boot : int, default 100
        Number of bootstrap resamples behind every 95% percentile interval.

    Returns
    -------
    dict
        ``{model name: {'c_index', 'auc', 'times', 'auc_curve_test'}}``;
        'c_index' and 'auc' map 'train'/'test' to ``(estimate, ci)``.
        A model that raises while fitting or scoring is reported and left out.

    Notes
    -----
    * The estimator objects stored in ``MODELS`` are fitted in place, so each
      call refits them.
    * The C-index interval is bootstrapped from the C-index itself (it used
      to be a copy of the mean-AUC interval). Both metrics are computed on
      the same resampled indices, so the draws taken from the global NumPy
      random state - and therefore the AUC intervals - are unchanged. The
      extra C-index evaluations do add run time on large cohorts.
    """
    X = data_subset[features]
    y_event = data_subset['CVD사망'].astype(bool)
    y_time = data_subset['추적기간_연']

    # Train/test split
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
        X, y_event, y_time, test_size=0.3, stratify=y_event, random_state=42
    )

    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # Evaluation time points
    times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

    def bootstrap_ci(event, duration, scores):
        """Return (mean-AUC CI, C-index CI) as 2.5/97.5 percentiles over n_boot resamples."""
        event_arr = np.asarray(event)
        duration_arr = np.asarray(duration)
        indices = np.arange(len(scores))
        boot_mean_aucs = []
        boot_c_indices = []
        for _ in range(n_boot):
            boot_idx = np.random.choice(indices, size=len(indices), replace=True)
            boot_event = event_arr[boot_idx]
            boot_duration = duration_arr[boot_idx]
            boot_scores = scores[boot_idx]
            # Censoring weights always come from the training data, as in the point estimate
            _, mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_duration),
                boot_scores,
                times
            )
            boot_mean_aucs.append(mean_auc)
            boot_c_indices.append(
                concordance_index_censored(boot_event, boot_duration, boot_scores)[0]
            )
        return (np.percentile(boot_mean_aucs, [2.5, 97.5]),
                np.percentile(boot_c_indices, [2.5, 97.5]))

    model_results = {}
    for model_info in MODELS:
        model_name = model_info["name"]
        estimator = model_info["estimator"]
        predict_method = model_info["predict_method"]
        score_sign = model_info.get("score_sign", 1)  # optional flip so that higher = higher risk

        # Deliberately broad: one failing model must not abort the remaining ones.
        try:
            # Fit and predict
            estimator.fit(X_train, y_train_surv)
            predictor = getattr(estimator, predict_method)
            train_scores = score_sign * predictor(X_train)
            test_scores = score_sign * predictor(X_test)

            # Harrell's C-index
            c_index_train = concordance_index_censored(y_train_event, y_train_time, train_scores)[0]
            c_index_test = concordance_index_censored(y_test_event, y_test_time, test_scores)[0]

            # Time-dependent AUC
            _, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, train_scores, times)
            auc_test, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, test_scores, times)

            # Bootstrap intervals (train first, then test, to keep the random stream order)
            train_auc_ci, train_c_ci = bootstrap_ci(y_train_event, y_train_time, train_scores)
            test_auc_ci, test_c_ci = bootstrap_ci(y_test_event, y_test_time, test_scores)

            model_results[model_name] = {
                'c_index': {
                    'train': (c_index_train, train_c_ci),
                    'test': (c_index_test, test_c_ci)
                },
                'auc': {
                    'train': (mean_auc_train, train_auc_ci),
                    'test': (mean_auc_test, test_auc_ci)
                },
                'times': times,
                'auc_curve_test': auc_test
            }

            # Report
            print(f"\n[{model_name}] {analysis_label} ({variable_type})")
            print(f"Train C-index: {c_index_train:.3f} (95% CI: {train_c_ci[0]:.3f}-{train_c_ci[1]:.3f})")
            print(f"Test C-index:  {c_index_test:.3f} (95% CI: {test_c_ci[0]:.3f}-{test_c_ci[1]:.3f})")
            print(f"Train Mean AUC: {mean_auc_train:.3f} (95% CI: {train_auc_ci[0]:.3f}-{train_auc_ci[1]:.3f})")
            print(f"Test Mean AUC:  {mean_auc_test:.3f} (95% CI: {test_auc_ci[0]:.3f}-{test_auc_ci[1]:.3f})")

        except Exception as e:
            print(f"{model_name} 모델 오류: {str(e)}")
            continue

    # Plot the test-set AUC curves of all models that succeeded
    plot_results(model_results, analysis_label, variable_type)
    return model_results

def plot_results(results, analysis_label, variable_type):
    """Plot the test-set time-dependent AUC curve of every model in ``results``.

    ``results`` is the dict returned by ``perform_analysis``: for each model
    name it provides 'times', 'auc_curve_test' and 'auc' -> 'test' ->
    ``(mean_auc, (ci_lower, ci_upper))``. The two labels only go into the
    title. Nothing is returned; the figure is shown with ``plt.show()``.
    """
    if not results:
        # Every model failed upstream: there is nothing to draw.
        print(f"[{analysis_label} - {variable_type}] no model results to plot")
        return

    plt.figure(figsize=(14, 6))
    # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap takes the same (name, lut).
    colors = plt.get_cmap('tab10', len(results))

    for idx, (model_name, result) in enumerate(results.items()):
        color = colors(idx)
        mean_auc = result['auc']['test'][0]
        ci_lower, ci_upper = result['auc']['test'][1]

        label = (f"{model_name}\n"
                 f"Mean AUC: {mean_auc:.3f} (95% CI: {ci_lower:.3f}-{ci_upper:.3f})")

        plt.plot(
            result['times'],
            result['auc_curve_test'],
            color=color,
            linewidth=2,
            label=label
        )

    plt.xlabel('Follow-up Time (Years)')
    plt.ylabel('Time-dependent AUC')
    plt.title(f"{analysis_label} - {variable_type} Model Performance (Test Set)")
    # Legend is placed outside the axes, to the right
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Run the complete analysis
def run_full_analysis(data, non_invasive_vars, invasive_vars):
    """Evaluate both predictor sets on all subjects, on men and on women.

    ``perform_analysis`` is called six times, in this order: non-invasive
    (overall, male, female), then invasive (overall, male, female). The
    sex-specific runs keep the rows where ``data['성별']`` equals 1 (male) or
    2 (female) and drop '성별' from the predictors.

    Returns ``(non_invasive_results, invasive_results)``; each is a dict keyed
    by "Overall", "Male" and "Female".
    """
    def _by_cohort(variables, variable_type):
        # Insertion order below is also the execution order of the analyses.
        cohort_results = {}
        cohort_results["Overall"] = perform_analysis(data, variables, "All Subjects", variable_type)
        for cohort_name, sex_code in (("Male", 1), ("Female", 2)):
            cohort_results[cohort_name] = perform_analysis(
                data[data['성별'] == sex_code],
                [v for v in variables if v != '성별'],
                cohort_name,
                variable_type,
            )
        return cohort_results

    # Non-invasive predictors
    print("\n\n=== Non-invasive Variables Analysis ===")
    non_invasive_results = _by_cohort(non_invasive_vars, "Non-invasive")

    # Invasive predictors
    print("\n\n=== Invasive Variables Analysis ===")
    invasive_results = _by_cohort(invasive_vars, "Invasive")

    return non_invasive_results, invasive_results

# Predictor sets: the invasive set is the non-invasive set plus three blood lipid columns
non_invasive_orig = [
    '나이',
    'WHtR',
    '성별',
    '고혈압여부',
    '당뇨병여부',
    'IPAQ_4group_mets_days',
]
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL']

# Run the analysis on Data_temp (defined outside this cell)
non_invasive_results, invasive_results = run_full_analysis(
    Data_temp,
    non_invasive_orig,
    invasive_orig,
)

## 5가지 모델 비교 + 대상자별 하이퍼 파라미터
- 전체, 남자, 여자 대상자별로 최적화를 해도 유의한 성능 차이 없음
- 전체 대상자에서 최적화한 모델을 공통으로 사용하는 것으로 최종 결정
- 즉, 위의 ***5가지 모델 비교*** 의 수치를 Figure로 사용

In [None]:
# Random survival forest hyper-parameters: all subjects / men / women
best_rsf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=14,
    max_leaf_nodes=70,
    max_features='log2',
)
best_rsf_params_man = dict(
    n_estimators=186,
    max_depth=14,
    min_samples_split=14,
    min_samples_leaf=13,
    max_leaf_nodes=70,
    max_features='sqrt',
)
best_rsf_params_woman = dict(
    n_estimators=200,
    max_depth=8,
    min_samples_split=17,
    min_samples_leaf=7,
    max_leaf_nodes=32,
    max_features='log2',
)

# Gradient boosting survival hyper-parameters: all subjects / men / women
best_gbs_params = dict(
    learning_rate=0.06,
    n_estimators=121,
    max_depth=5,
    min_samples_split=8,
    subsample=0.568,
    max_features='sqrt',
    dropout_rate=0.313,
)
best_gbs_params_man = dict(
    learning_rate=0.298,
    n_estimators=185,
    max_depth=6,
    min_samples_split=5,
    subsample=0.796,
    max_features='sqrt',
    dropout_rate=0.067752,
)
best_gbs_params_woman = dict(
    learning_rate=0.0232,
    n_estimators=104,
    max_depth=7,
    min_samples_split=6,
    subsample=0.583,
    max_features='log2',
    dropout_rate=0.00326,
)

In [None]:
def _model_specs(rsf_params, gbs_params):
    """Build the five-model registry with fresh, unfitted estimators.

    Cox PH, Cox Net and the survival tree use fixed settings; only the RSF
    and GBS hyper-parameters differ between cohorts.
    """
    named_estimators = (
        ("Cox PH", CoxPHSurvivalAnalysis()),
        ("Cox Net", CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01)),
        ("Survival Tree", SurvivalTree(max_depth=16, max_leaf_nodes=32, min_samples_split=10,
                                       min_samples_leaf=10, min_weight_fraction_leaf=0.01,
                                       low_memory=False, random_state=42)),
        ("RSF", RandomSurvivalForest(**rsf_params, n_jobs=-1, random_state=42)),
        ("GBS", GradientBoostingSurvivalAnalysis(**gbs_params, loss='coxph', random_state=42)),
    )
    return [
        {"name": model_label, "estimator": model_obj, "predict_method": "predict"}
        for model_label, model_obj in named_estimators
    ]


# Registry for all subjects
MODELS_overall = _model_specs(best_rsf_params, best_gbs_params)

# Registry for men
MODELS_male = _model_specs(best_rsf_params_man, best_gbs_params_man)

# Registry for women
MODELS_female = _model_specs(best_rsf_params_woman, best_gbs_params_woman)

# --- Analysis routine (takes the model registry as an argument) ---
def perform_analysis(data_subset, features, analysis_label, variable_type, models, n_boot=100):
    """Fit every model in ``models`` on one cohort and report C-index / time-dependent AUC.

    The rows of ``data_subset`` are split 70/30 (stratified on the event
    indicator, ``random_state=42``). Each estimator is fitted on the training
    part and scored on both parts. The AUC is evaluated at ten time points:
    the 10%-90% quantiles of the event times observed in the training part.

    Parameters
    ----------
    data_subset : pandas.DataFrame
        Must contain the columns in ``features`` plus 'CVD사망' (event
        indicator, cast to bool) and '추적기간_연' (follow-up time).
    features : list of str
        Predictor column names.
    analysis_label, variable_type : str
        Only used in the printed report and in the plot title.
    models : list of dict
        Registry entries with keys "name", "estimator", "predict_method" and,
        optionally, "score_sign".
    n_boot : int, default 100
        Number of bootstrap resamples behind every 95% percentile interval.

    Returns
    -------
    dict
        ``{model name: {'c_index', 'auc', 'times', 'auc_curve_test'}}``;
        'c_index' and 'auc' map 'train'/'test' to ``(estimate, ci)``.
        A model that raises while fitting or scoring is reported and left out.

    Notes
    -----
    * The estimator objects inside ``models`` are fitted in place.
    * The C-index interval is bootstrapped from the C-index itself (it used
      to be a copy of the mean-AUC interval). Both metrics are computed on
      the same resampled indices, so the draws taken from the global NumPy
      random state - and therefore the AUC intervals - are unchanged. The
      extra C-index evaluations do add run time on large cohorts.
    """
    # Local imports keep this cell runnable on its own
    from sklearn.model_selection import train_test_split
    from sksurv.metrics import concordance_index_censored, cumulative_dynamic_auc
    from sksurv.util import Surv

    X = data_subset[features]
    y_event = data_subset['CVD사망'].astype(bool)
    y_time = data_subset['추적기간_연']

    # Train/test split
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
        X, y_event, y_time, test_size=0.3, stratify=y_event, random_state=42
    )

    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # Evaluation time points: 10%-90% quantiles of the training event times
    times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

    def bootstrap_ci(event, duration, scores):
        """Return (mean-AUC CI, C-index CI) as 2.5/97.5 percentiles over n_boot resamples."""
        event_arr = np.asarray(event)
        duration_arr = np.asarray(duration)
        indices = np.arange(len(scores))
        boot_mean_aucs = []
        boot_c_indices = []
        for _ in range(n_boot):
            boot_idx = np.random.choice(indices, size=len(indices), replace=True)
            boot_event = event_arr[boot_idx]
            boot_duration = duration_arr[boot_idx]
            boot_scores = scores[boot_idx]
            # Censoring weights always come from the training data, as in the point estimate
            _, mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_duration),
                boot_scores,
                times
            )
            boot_mean_aucs.append(mean_auc)
            boot_c_indices.append(
                concordance_index_censored(boot_event, boot_duration, boot_scores)[0]
            )
        return (np.percentile(boot_mean_aucs, [2.5, 97.5]),
                np.percentile(boot_c_indices, [2.5, 97.5]))

    model_results = {}
    for model_info in models:
        model_name = model_info["name"]
        estimator = model_info["estimator"]
        predict_method = model_info["predict_method"]
        score_sign = model_info.get("score_sign", 1)  # optional flip so that higher = higher risk

        # Deliberately broad: one failing model must not abort the remaining ones.
        try:
            # Fit and predict
            estimator.fit(X_train, y_train_surv)
            predictor = getattr(estimator, predict_method)
            train_scores = score_sign * predictor(X_train)
            test_scores = score_sign * predictor(X_test)

            # Harrell's C-index
            c_index_train = concordance_index_censored(y_train_event, y_train_time, train_scores)[0]
            c_index_test = concordance_index_censored(y_test_event, y_test_time, test_scores)[0]

            # Time-dependent AUC
            _, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, train_scores, times)
            auc_test, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, test_scores, times)

            # Bootstrap intervals (train first, then test, to keep the random stream order)
            train_auc_ci, train_c_ci = bootstrap_ci(y_train_event, y_train_time, train_scores)
            test_auc_ci, test_c_ci = bootstrap_ci(y_test_event, y_test_time, test_scores)

            model_results[model_name] = {
                'c_index': {
                    'train': (c_index_train, train_c_ci),
                    'test': (c_index_test, test_c_ci)
                },
                'auc': {
                    'train': (mean_auc_train, train_auc_ci),
                    'test': (mean_auc_test, test_auc_ci)
                },
                'times': times,
                'auc_curve_test': auc_test
            }

            # Report
            print(f"\n[{model_name}] {analysis_label} ({variable_type})")
            print(f"Train C-index: {c_index_train:.3f} (95% CI: {train_c_ci[0]:.3f}-{train_c_ci[1]:.3f})")
            print(f"Test C-index:  {c_index_test:.3f} (95% CI: {test_c_ci[0]:.3f}-{test_c_ci[1]:.3f})")
            print(f"Train Mean AUC: {mean_auc_train:.3f} (95% CI: {train_auc_ci[0]:.3f}-{train_auc_ci[1]:.3f})")
            print(f"Test Mean AUC:  {mean_auc_test:.3f} (95% CI: {test_auc_ci[0]:.3f}-{test_auc_ci[1]:.3f})")

        except Exception as e:
            print(f"{model_name} 모델 오류: {str(e)}")
            continue

    # Plot the test-set AUC curves of all models that succeeded
    plot_results(model_results, analysis_label, variable_type)
    return model_results

def plot_results(results, analysis_label, variable_type):
    """Plot the test-set time-dependent AUC curve of every model in ``results``.

    ``results`` is the dict returned by ``perform_analysis``: for each model
    name it provides 'times', 'auc_curve_test' and 'auc' -> 'test' ->
    ``(mean_auc, (ci_lower, ci_upper))``. The two labels only go into the
    title. Nothing is returned; the figure is shown with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    if not results:
        # Every model failed upstream: there is nothing to draw.
        print(f"[{analysis_label} - {variable_type}] no model results to plot")
        return

    plt.figure(figsize=(14, 6))
    # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap takes the same (name, lut).
    colors = plt.get_cmap('tab10', len(results))

    for idx, (model_name, result) in enumerate(results.items()):
        color = colors(idx)
        mean_auc = result['auc']['test'][0]
        ci_lower, ci_upper = result['auc']['test'][1]

        label = (f"{model_name}\n"
                 f"Mean AUC: {mean_auc:.3f} (95% CI: {ci_lower:.3f}-{ci_upper:.3f})")

        plt.plot(
            result['times'],
            result['auc_curve_test'],
            color=color,
            linewidth=2,
            label=label
        )

    plt.xlabel('Follow-up Time (Years)')
    plt.ylabel('Time-dependent AUC')
    plt.title(f"{analysis_label} - {variable_type} Model Performance (Test Set)")
    # Legend is placed outside the axes, to the right
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# --- Full analysis driver ---
def run_full_analysis(data, non_invasive_vars, invasive_vars):
    """Evaluate both predictor sets on all subjects, on men and on women.

    ``perform_analysis`` is called six times, in this order: non-invasive
    (overall, male, female), then invasive (overall, male, female). The
    overall runs use ``MODELS_overall``; the sex-specific runs keep the rows
    where ``data['성별']`` equals 1 (male) or 2 (female), drop '성별' from the
    predictors and use ``MODELS_male`` / ``MODELS_female``.

    Returns ``(non_invasive_results, invasive_results)``; each is a dict keyed
    by "Overall", "Male" and "Female".
    """
    def _by_cohort(variables, variable_type):
        # Insertion order below is also the execution order of the analyses.
        cohort_results = {}
        cohort_results["Overall"] = perform_analysis(
            data, variables, "All Subjects", variable_type, MODELS_overall
        )
        sex_cohorts = (("Male", 1, MODELS_male), ("Female", 2, MODELS_female))
        for cohort_name, sex_code, cohort_models in sex_cohorts:
            cohort_results[cohort_name] = perform_analysis(
                data[data['성별'] == sex_code],
                [v for v in variables if v != '성별'],
                cohort_name,
                variable_type,
                cohort_models,
            )
        return cohort_results

    # Non-invasive predictors
    print("\n\n=== Non-invasive Variables Analysis ===")
    non_invasive_results = _by_cohort(non_invasive_vars, "Non-invasive")

    # Invasive predictors
    print("\n\n=== Invasive Variables Analysis ===")
    invasive_results = _by_cohort(invasive_vars, "Invasive")

    return non_invasive_results, invasive_results

# --- Predictor sets (extended): the invasive set adds four blood-test columns ---
non_invasive_orig = [
    '나이',
    'WHtR',
    '성별',
    '고혈압여부',
    '당뇨병여부',
    'IPAQ_4group_mets_days',
    'BMI',
    'SBP',
    'DBP',
    'TotalMets',
    '중강도mets',
    '걷기mets',
]
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL', '공복혈당']

# --- Run the analysis on Data_temp (defined outside this cell) ---
non_invasive_results, invasive_results = run_full_analysis(
    Data_temp,
    non_invasive_orig,
    invasive_orig,
)

In [None]:
# Hyper-parameters forwarded to the RSF and GBS estimators in the registry below
best_rsf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=14,
    max_leaf_nodes=70,
    max_features='log2',
)
best_gbs_params = dict(
    learning_rate=0.06,
    n_estimators=121,
    max_depth=5,
    min_samples_split=8,
    subsample=0.568,
    max_features='sqrt',
    dropout_rate=0.313,
)

# Model registry (three models, spelled-out display names): each entry holds
# the display name, the unfitted estimator and the scoring method name.
MODELS = [
    {"name": model_label, "estimator": model_obj, "predict_method": "predict"}
    for model_label, model_obj in (
        ("Cox proportional hazards", CoxPHSurvivalAnalysis()),
        ("Random survival forest",
         RandomSurvivalForest(**best_rsf_params, n_jobs=-1, random_state=42)),
        ("Gradient boosting survival",
         GradientBoostingSurvivalAnalysis(**best_gbs_params, loss='coxph', random_state=42)),
    )
]

# Analysis routine: fit and evaluate every registered model on one cohort
def perform_analysis(data_subset, features, analysis_label, variable_type, n_boot=100):
    """Fit every model in ``MODELS`` on one cohort and report C-index / time-dependent AUC.

    The rows of ``data_subset`` are split 70/30 (stratified on the event
    indicator, ``random_state=42``). Each estimator is fitted on the training
    part and scored on both parts. The AUC is evaluated at ten time points:
    the 10%-90% quantiles of the event times observed in the training part.

    Parameters
    ----------
    data_subset : pandas.DataFrame
        Must contain the columns in ``features`` plus 'CVD사망' (event
        indicator, cast to bool) and '추적기간_연' (follow-up time).
    features : list of str
        Predictor column names.
    analysis_label, variable_type : str
        Only used in the printed report and in the plot title.
    n_boot : int, default 100
        Number of bootstrap resamples behind every 95% percentile interval.

    Returns
    -------
    dict
        ``{model name: {'c_index', 'auc', 'times', 'auc_curve_test'}}``;
        'c_index' and 'auc' map 'train'/'test' to ``(estimate, ci)``.
        A model that raises while fitting or scoring is reported and left out.

    Notes
    -----
    * The estimator objects stored in ``MODELS`` are fitted in place, so each
      call refits them.
    * The C-index interval is bootstrapped from the C-index itself (it used
      to be a copy of the mean-AUC interval). Both metrics are computed on
      the same resampled indices, so the draws taken from the global NumPy
      random state - and therefore the AUC intervals - are unchanged. The
      extra C-index evaluations do add run time on large cohorts.
    """
    X = data_subset[features]
    y_event = data_subset['CVD사망'].astype(bool)
    y_time = data_subset['추적기간_연']

    # Train/test split
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
        X, y_event, y_time, test_size=0.3, stratify=y_event, random_state=42
    )

    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # Evaluation time points
    times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

    def bootstrap_ci(event, duration, scores):
        """Return (mean-AUC CI, C-index CI) as 2.5/97.5 percentiles over n_boot resamples."""
        event_arr = np.asarray(event)
        duration_arr = np.asarray(duration)
        indices = np.arange(len(scores))
        boot_mean_aucs = []
        boot_c_indices = []
        for _ in range(n_boot):
            boot_idx = np.random.choice(indices, size=len(indices), replace=True)
            boot_event = event_arr[boot_idx]
            boot_duration = duration_arr[boot_idx]
            boot_scores = scores[boot_idx]
            # Censoring weights always come from the training data, as in the point estimate
            _, mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_duration),
                boot_scores,
                times
            )
            boot_mean_aucs.append(mean_auc)
            boot_c_indices.append(
                concordance_index_censored(boot_event, boot_duration, boot_scores)[0]
            )
        return (np.percentile(boot_mean_aucs, [2.5, 97.5]),
                np.percentile(boot_c_indices, [2.5, 97.5]))

    model_results = {}
    for model_info in MODELS:
        model_name = model_info["name"]
        estimator = model_info["estimator"]
        predict_method = model_info["predict_method"]
        score_sign = model_info.get("score_sign", 1)  # optional flip so that higher = higher risk

        # Deliberately broad: one failing model must not abort the remaining ones.
        try:
            # Fit and predict
            estimator.fit(X_train, y_train_surv)
            predictor = getattr(estimator, predict_method)
            train_scores = score_sign * predictor(X_train)
            test_scores = score_sign * predictor(X_test)

            # Harrell's C-index
            c_index_train = concordance_index_censored(y_train_event, y_train_time, train_scores)[0]
            c_index_test = concordance_index_censored(y_test_event, y_test_time, test_scores)[0]

            # Time-dependent AUC
            _, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, train_scores, times)
            auc_test, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, test_scores, times)

            # Bootstrap intervals (train first, then test, to keep the random stream order)
            train_auc_ci, train_c_ci = bootstrap_ci(y_train_event, y_train_time, train_scores)
            test_auc_ci, test_c_ci = bootstrap_ci(y_test_event, y_test_time, test_scores)

            model_results[model_name] = {
                'c_index': {
                    'train': (c_index_train, train_c_ci),
                    'test': (c_index_test, test_c_ci)
                },
                'auc': {
                    'train': (mean_auc_train, train_auc_ci),
                    'test': (mean_auc_test, test_auc_ci)
                },
                'times': times,
                'auc_curve_test': auc_test
            }

            # Report
            print(f"\n[{model_name}] {analysis_label} ({variable_type})")
            print(f"Train C-index: {c_index_train:.3f} (95% CI: {train_c_ci[0]:.3f}-{train_c_ci[1]:.3f})")
            print(f"Test C-index:  {c_index_test:.3f} (95% CI: {test_c_ci[0]:.3f}-{test_c_ci[1]:.3f})")
            print(f"Train Mean AUC: {mean_auc_train:.3f} (95% CI: {train_auc_ci[0]:.3f}-{train_auc_ci[1]:.3f})")
            print(f"Test Mean AUC:  {mean_auc_test:.3f} (95% CI: {test_auc_ci[0]:.3f}-{test_auc_ci[1]:.3f})")

        except Exception as e:
            print(f"{model_name} 모델 오류: {str(e)}")
            continue

    # Plot the test-set AUC curves of all models that succeeded
    plot_results(model_results, analysis_label, variable_type)
    return model_results

def plot_results(results, analysis_label, variable_type):
    """Plot the test-set time-dependent AUC curve of every model in ``results``.

    ``results`` is the dict returned by ``perform_analysis``: for each model
    name it provides 'times', 'auc_curve_test' and 'auc' -> 'test' ->
    ``(mean_auc, (ci_lower, ci_upper))``. The two labels only go into the
    title. Nothing is returned; the figure is shown with ``plt.show()``.
    """
    if not results:
        # Every model failed upstream: there is nothing to draw.
        print(f"[{analysis_label} - {variable_type}] no model results to plot")
        return

    plt.figure(figsize=(14, 6))
    # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap takes the same (name, lut).
    colors = plt.get_cmap('tab10', len(results))

    for idx, (model_name, result) in enumerate(results.items()):
        color = colors(idx)
        mean_auc = result['auc']['test'][0]
        ci_lower, ci_upper = result['auc']['test'][1]

        label = (f"{model_name}\n"
                 f"Mean AUC: {mean_auc:.3f} (95% CI: {ci_lower:.3f}-{ci_upper:.3f})")

        plt.plot(
            result['times'],
            result['auc_curve_test'],
            color=color,
            linewidth=2,
            label=label
        )

    plt.xlabel('Follow-up Time (Years)')
    plt.ylabel('Time-dependent AUC')
    plt.title(f"{analysis_label} - {variable_type} Model Performance (Test Set)")
    plt.legend(loc='lower right', fontsize=10)  # legend inside the axes, bottom right
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Run the complete analysis
def run_full_analysis(data, non_invasive_vars, invasive_vars):
    """Evaluate both predictor sets on all subjects, on men and on women.

    ``perform_analysis`` is called six times, in this order: non-invasive
    (overall, male, female), then invasive (overall, male, female). The
    sex-specific runs keep the rows where ``data['성별']`` equals 1 (male) or
    2 (female) and drop '성별' from the predictors.

    Returns ``(non_invasive_results, invasive_results)``; each is a dict keyed
    by "Overall", "Male" and "Female".
    """
    def _by_cohort(variables, variable_type):
        # Insertion order below is also the execution order of the analyses.
        cohort_results = {}
        cohort_results["Overall"] = perform_analysis(data, variables, "All Subjects", variable_type)
        for cohort_name, sex_code in (("Male", 1), ("Female", 2)):
            cohort_results[cohort_name] = perform_analysis(
                data[data['성별'] == sex_code],
                [v for v in variables if v != '성별'],
                cohort_name,
                variable_type,
            )
        return cohort_results

    # Non-invasive predictors
    print("\n\n=== Non-invasive Variables Analysis ===")
    non_invasive_results = _by_cohort(non_invasive_vars, "Non-invasive")

    # Invasive predictors
    print("\n\n=== Invasive Variables Analysis ===")
    invasive_results = _by_cohort(invasive_vars, "Invasive")

    return non_invasive_results, invasive_results

# Predictor sets: the invasive set is the non-invasive set plus three blood lipid columns
non_invasive_orig = [
    '나이',
    'WHtR',
    '성별',
    '고혈압여부',
    '당뇨병여부',
    'IPAQ_4group_mets_days',
]
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL']

# Run the analysis on Data_temp (defined outside this cell)
non_invasive_results, invasive_results = run_full_analysis(
    Data_temp,
    non_invasive_orig,
    invasive_orig,
)

In [None]:
# Hyper-parameters forwarded to the RSF and GBS estimators in the registry below
best_rsf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=14,
    max_leaf_nodes=70,
    max_features='log2',
)
best_gbs_params = dict(
    learning_rate=0.06,
    n_estimators=121,
    max_depth=5,
    min_samples_split=8,
    subsample=0.568,
    max_features='sqrt',
    dropout_rate=0.313,
)

# Model registry (three models): each entry holds the display name, the
# unfitted estimator and the scoring method name.
# NOTE(review): the first display name ends with a space; it is reproduced
# as-is because the name is also used as a result key and legend label.
MODELS = [
    {"name": model_label, "estimator": model_obj, "predict_method": "predict"}
    for model_label, model_obj in (
        ("Cox proportional hazards ", CoxPHSurvivalAnalysis()),
        ("Random survival forest",
         RandomSurvivalForest(**best_rsf_params, n_jobs=-1, random_state=42)),
        ("Gradient boosting survival",
         GradientBoostingSurvivalAnalysis(**best_gbs_params, loss='coxph', random_state=42)),
    )
]

# Analysis routine: fit and evaluate every registered model on one cohort
def perform_analysis(data_subset, features, analysis_label, variable_type, n_boot=100):
    """Fit every model in ``MODELS`` on one cohort and report C-index / time-dependent AUC.

    The rows of ``data_subset`` are split 70/30 (stratified on the event
    indicator, ``random_state=42``). Each estimator is fitted on the training
    part and scored on both parts. The AUC is evaluated at ten time points:
    the 10%-90% quantiles of the event times observed in the training part.

    Parameters
    ----------
    data_subset : pandas.DataFrame
        Must contain the columns in ``features`` plus 'CVD사망' (event
        indicator, cast to bool) and '추적기간_연' (follow-up time).
    features : list of str
        Predictor column names.
    analysis_label, variable_type : str
        Only used in the printed report and in the plot title.
    n_boot : int, default 100
        Number of bootstrap resamples behind every 95% percentile interval.

    Returns
    -------
    dict
        ``{model name: {'c_index', 'auc', 'times', 'auc_curve_test'}}``;
        'c_index' and 'auc' map 'train'/'test' to ``(estimate, ci)``.
        A model that raises while fitting or scoring is reported and left out.

    Notes
    -----
    * The estimator objects stored in ``MODELS`` are fitted in place, so each
      call refits them.
    * The C-index interval is bootstrapped from the C-index itself (it used
      to be a copy of the mean-AUC interval). Both metrics are computed on
      the same resampled indices, so the draws taken from the global NumPy
      random state - and therefore the AUC intervals - are unchanged. The
      extra C-index evaluations do add run time on large cohorts.
    """
    X = data_subset[features]
    y_event = data_subset['CVD사망'].astype(bool)
    y_time = data_subset['추적기간_연']

    # Train/test split
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
        X, y_event, y_time, test_size=0.3, stratify=y_event, random_state=42
    )

    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # Evaluation time points
    times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))

    def bootstrap_ci(event, duration, scores):
        """Return (mean-AUC CI, C-index CI) as 2.5/97.5 percentiles over n_boot resamples."""
        event_arr = np.asarray(event)
        duration_arr = np.asarray(duration)
        indices = np.arange(len(scores))
        boot_mean_aucs = []
        boot_c_indices = []
        for _ in range(n_boot):
            boot_idx = np.random.choice(indices, size=len(indices), replace=True)
            boot_event = event_arr[boot_idx]
            boot_duration = duration_arr[boot_idx]
            boot_scores = scores[boot_idx]
            # Censoring weights always come from the training data, as in the point estimate
            _, mean_auc = cumulative_dynamic_auc(
                y_train_surv,
                Surv.from_arrays(boot_event, boot_duration),
                boot_scores,
                times
            )
            boot_mean_aucs.append(mean_auc)
            boot_c_indices.append(
                concordance_index_censored(boot_event, boot_duration, boot_scores)[0]
            )
        return (np.percentile(boot_mean_aucs, [2.5, 97.5]),
                np.percentile(boot_c_indices, [2.5, 97.5]))

    model_results = {}
    for model_info in MODELS:
        model_name = model_info["name"]
        estimator = model_info["estimator"]
        predict_method = model_info["predict_method"]
        score_sign = model_info.get("score_sign", 1)  # optional flip so that higher = higher risk

        # Deliberately broad: one failing model must not abort the remaining ones.
        try:
            # Fit and predict
            estimator.fit(X_train, y_train_surv)
            predictor = getattr(estimator, predict_method)
            train_scores = score_sign * predictor(X_train)
            test_scores = score_sign * predictor(X_test)

            # Harrell's C-index
            c_index_train = concordance_index_censored(y_train_event, y_train_time, train_scores)[0]
            c_index_test = concordance_index_censored(y_test_event, y_test_time, test_scores)[0]

            # Time-dependent AUC
            _, mean_auc_train = cumulative_dynamic_auc(y_train_surv, y_train_surv, train_scores, times)
            auc_test, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, test_scores, times)

            # Bootstrap intervals (train first, then test, to keep the random stream order)
            train_auc_ci, train_c_ci = bootstrap_ci(y_train_event, y_train_time, train_scores)
            test_auc_ci, test_c_ci = bootstrap_ci(y_test_event, y_test_time, test_scores)

            model_results[model_name] = {
                'c_index': {
                    'train': (c_index_train, train_c_ci),
                    'test': (c_index_test, test_c_ci)
                },
                'auc': {
                    'train': (mean_auc_train, train_auc_ci),
                    'test': (mean_auc_test, test_auc_ci)
                },
                'times': times,
                'auc_curve_test': auc_test
            }

            # Report
            print(f"\n[{model_name}] {analysis_label} ({variable_type})")
            print(f"Train C-index: {c_index_train:.3f} (95% CI: {train_c_ci[0]:.3f}-{train_c_ci[1]:.3f})")
            print(f"Test C-index:  {c_index_test:.3f} (95% CI: {test_c_ci[0]:.3f}-{test_c_ci[1]:.3f})")
            print(f"Train Mean AUC: {mean_auc_train:.3f} (95% CI: {train_auc_ci[0]:.3f}-{train_auc_ci[1]:.3f})")
            print(f"Test Mean AUC:  {mean_auc_test:.3f} (95% CI: {test_auc_ci[0]:.3f}-{test_auc_ci[1]:.3f})")

        except Exception as e:
            print(f"{model_name} 모델 오류: {str(e)}")
            continue

    # Plot the test-set AUC curves of all models that succeeded
    plot_results(model_results, analysis_label, variable_type)
    return model_results

def plot_results(results, analysis_label, variable_type):
    """Plot the test-set time-dependent AUC curve of every model in ``results``.

    ``results`` is the dict returned by ``perform_analysis``: for each model
    name it provides 'times', 'auc_curve_test' and 'auc' -> 'test' ->
    ``(mean_auc, (ci_lower, ci_upper))``. The two labels only go into the
    title. Nothing is returned; exactly one figure is shown.

    A second labelling/``plt.show()`` sequence that used to follow the first
    ``plt.show()`` was removed: it acted on a new, empty figure and only
    produced a blank plot.
    """
    if not results:
        # Every model failed upstream: there is nothing to draw.
        print(f"[{analysis_label} - {variable_type}] no model results to plot")
        return

    plt.figure(figsize=(14, 6))
    # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap takes the same (name, lut).
    colors = plt.get_cmap('tab10', len(results))

    for idx, (model_name, result) in enumerate(results.items()):
        color = colors(idx)
        mean_auc = result['auc']['test'][0]
        ci_lower, ci_upper = result['auc']['test'][1]

        label = (f"{model_name}\n"
                 f"Mean AUC: {mean_auc:.3f} (95% CI: {ci_lower:.3f}-{ci_upper:.3f})")

        plt.plot(
            result['times'],
            result['auc_curve_test'],
            color=color,
            linewidth=2,
            label=label
        )

    plt.xlabel('Follow-up Time (Years)')
    plt.ylabel('Time-dependent AUC')
    plt.title(f"{analysis_label} - {variable_type} Model Performance (Test Set)")
    plt.legend(loc='lower right', fontsize=10)  # legend inside the axes, bottom right
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# 전체 분석 실행
def run_full_analysis(data, non_invasive_vars, invasive_vars):
    """Run ``perform_analysis`` for both variable sets on three cohorts.

    For each variable set the analysis is run on all rows, on the rows with
    ``성별 == 1`` (labelled "Male") and on the rows with ``성별 == 2``
    (labelled "Female"). The ``성별`` column is removed from the feature list
    for the two sex-specific runs.

    Args:
        data: DataFrame containing the ``성별`` column and all listed features.
        non_invasive_vars: Feature names of the non-invasive variable set.
        invasive_vars: Feature names of the invasive variable set.

    Returns:
        Tuple ``(non_invasive_results, invasive_results)``; each is a dict
        with keys ``"Overall"``, ``"Male"`` and ``"Female"`` holding the value
        returned by ``perform_analysis``.
    """
    # (result key, label passed to perform_analysis, sex code or None for all rows)
    cohorts = (
        ("Overall", "All Subjects", None),
        ("Male", "Male", 1),
        ("Female", "Female", 2),
    )

    def _analyse_cohorts(variables, variable_type):
        collected = {}
        for key, label, sex_code in cohorts:
            if sex_code is None:
                subset, columns = data, variables
            else:
                subset = data[data['성별'] == sex_code]
                columns = [col for col in variables if col != '성별']
            collected[key] = perform_analysis(subset, columns, label, variable_type)
        return collected

    # Non-invasive variable set
    print("\n\n=== Non-invasive Variables Analysis ===")
    non_invasive_results = _analyse_cohorts(non_invasive_vars, "Non-invasive")

    # Invasive variable set
    print("\n\n=== Invasive Variables Analysis ===")
    invasive_results = _analyse_cohorts(invasive_vars, "Invasive")

    return non_invasive_results, invasive_results

# Predictor sets: the invasive set is the non-invasive set plus 'TG', 'LDL', 'HDL'
non_invasive_orig = [
    '나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
]
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL']

# Run the analysis for both predictor sets
non_invasive_results, invasive_results = run_full_analysis(
    Data_temp,
    non_invasive_orig,
    invasive_orig,
)

# Brier Score

## Brier Score 백본 코드
- Calibration plot도 추가해야됨

In [None]:
# Backbone call: time-dependent Brier score of the test-set predictions.
# NOTE(review): the later analysis cell in this notebook unpacks the same
# function as `times_brier_*, brier_*_values`, i.e. (times, scores). With that
# return order `brier_scores` holds the evaluated time points here and
# `integrated_brier` holds the per-time Brier scores, not an integrated
# score. The names are kept because the following display cells use them.
brier_scores, integrated_brier = brier_score(
    y_train_surv,  # (n_train,) shaped Surv
    y_test_surv,   # (n_test,) shaped Surv
    pred_surv_test_mat,  # (n_test, n_times)
    times        # (n_times,)
)

In [None]:
brier_scores

In [None]:
integrated_brier

In [None]:
print(brier_scores.shape , integrated_brier.shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored, cumulative_dynamic_auc, brier_score
from sksurv.util import Surv

# 1. Data and predictor sets
data = Data_temp  # analysis DataFrame
non_invasive = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']
invasive = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days', 'TG', 'LDL', 'HDL']

results = {}  # results per variable set

for var_type in ['non_invasive', 'invasive']:
    print(f"\n=== {var_type.title()} Model Analysis ===")
    results[var_type] = {}

    # Select the predictors of this variable set
    features = non_invasive if var_type == 'non_invasive' else invasive
    X = data[features]
    y_event = data['CVD사망'].astype(bool)
    y_time = data['추적기간_연']

    # Train/test split, stratified on the event indicator
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
         X, y_event, y_time,
         test_size=0.3, stratify=y_event, random_state=42
    )

    # Structured survival targets
    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # 2. Fit the Cox PH model
    model = CoxPHSurvivalAnalysis()
    model.fit(X_train, y_train_surv)

    # Risk scores
    risk_scores_train = model.predict(X_train)
    risk_scores_test = model.predict(X_test)

    # 3. Basic metrics (C-index, time-dependent AUC)
    c_index_train = concordance_index_censored(y_train_event, y_train_time, risk_scores_train)[0]
    c_index_test = concordance_index_censored(y_test_event, y_test_time, risk_scores_test)[0]

    # Evaluation times: 10 quantiles (10%..90%) of the training event times
    times = np.quantile(y_train_time[y_train_event], np.linspace(0.1, 0.9, 10))
    times = times.ravel()  # must be a 1-D array

    # Time-dependent AUC
    auc_train_values, mean_auc_train = cumulative_dynamic_auc(
        y_train_surv, y_train_surv, risk_scores_train, times
    )
    auc_test_values, mean_auc_test = cumulative_dynamic_auc(
        y_train_surv, y_test_surv, risk_scores_test, times
    )

    # 4. Time-dependent Brier score
    # Predicted survival functions, one callable per subject
    pred_surv_train = model.predict_survival_function(X_train)
    pred_surv_test = model.predict_survival_function(X_test)

    # Survival probability of every subject at `times` -> (n_samples, len(times)).
    # np.vstack replaces np.row_stack, a deprecated alias in NumPy 2.0.
    pred_surv_train_mat = np.vstack([fn(times) for fn in pred_surv_train])
    pred_surv_test_mat = np.vstack([fn(times) for fn in pred_surv_test])

    # brier_score() returns (times, time-dependent Brier scores)
    times_brier_train, brier_train_values = brier_score(
        y_train_surv, y_train_surv, pred_surv_train_mat, times
    )
    times_brier_test, brier_test_values = brier_score(
        y_train_surv, y_test_surv, pred_surv_test_mat, times
    )

    # 5. Bootstrap 95% confidence intervals of the time-dependent Brier score
    n_bootstraps = 100
    boot_brier_train = np.zeros((n_bootstraps, len(times)))
    boot_brier_test  = np.zeros((n_bootstraps, len(times)))

    train_indices = np.arange(len(X_train))
    test_indices = np.arange(len(X_test))

    for i in range(n_bootstraps):
        # --- Train bootstrap ---
        boot_idx_train = np.random.choice(train_indices, size=len(train_indices), replace=True)
        boot_y_train_event = y_train_event.iloc[boot_idx_train]
        boot_y_train_time = y_train_time.iloc[boot_idx_train]
        boot_y_train_surv = Surv.from_arrays(boot_y_train_event, boot_y_train_time)

        # Row-index the matrix computed once above instead of re-evaluating
        # every resampled step function; the resulting rows are identical.
        boot_pred_surv_train_mat = pred_surv_train_mat[boot_idx_train]

        _, brier_tr = brier_score(
            boot_y_train_surv, boot_y_train_surv, boot_pred_surv_train_mat, times
        )
        boot_brier_train[i, :] = brier_tr

        # --- Test bootstrap ---
        boot_idx_test = np.random.choice(test_indices, size=len(test_indices), replace=True)
        boot_y_test_event = y_test_event.iloc[boot_idx_test]
        boot_y_test_time = y_test_time.iloc[boot_idx_test]
        boot_y_test_surv = Surv.from_arrays(boot_y_test_event, boot_y_test_time)

        boot_pred_surv_test_mat = pred_surv_test_mat[boot_idx_test]

        _, brier_ts = brier_score(
            y_train_surv, boot_y_test_surv, boot_pred_surv_test_mat, times
        )
        boot_brier_test[i, :] = brier_ts

    # Percentile confidence band per time point
    lower_brier_train = np.percentile(boot_brier_train, 2.5, axis=0)
    upper_brier_train = np.percentile(boot_brier_train, 97.5, axis=0)
    lower_brier_test = np.percentile(boot_brier_test, 2.5, axis=0)
    upper_brier_test = np.percentile(boot_brier_test, 97.5, axis=0)

    # 6. Store the results (per-time Brier scores, their bands and the time grid)
    results[var_type]['train'] = {
        'c_index': c_index_train,
        'auc_values': auc_train_values,
        'mean_auc': mean_auc_train,
        'brier_values': brier_train_values,  # one value per entry of `times`
        'brier_ci': (lower_brier_train, upper_brier_train),
        'times': times
    }
    results[var_type]['test'] = {
        'c_index': c_index_test,
        'auc_values': auc_test_values,
        'mean_auc': mean_auc_test,
        'brier_values': brier_test_values,  # one value per entry of `times`
        'brier_ci': (lower_brier_test, upper_brier_test),
        'times': times
    }

    model_name = "Non_Invasive Model" if var_type == 'non_invasive' else "Invasive Model"
    print(f"\n{model_name}:")
    print(f"  C-index (Test Data): {c_index_test:.3f}")
    print(f"  Test Data Mean AUC: {mean_auc_test:.3f}")
    # No integrated Brier score is computed here; the per-time values are plotted below.
    print(f"  Time-dependent Brier scores (Test): {brier_test_values}")

# 7. Plot the time-dependent Brier score with its confidence band
plt.figure(figsize=(10, 6))
colors = {'non_invasive': '#1f77b4', 'invasive': '#ff7f0e'}

for var_type in ['non_invasive', 'invasive']:
    res = results[var_type]['test']
    # Use the time grid stored with each result rather than whatever `times`
    # was left over from the last loop iteration.
    label = (f"{var_type.title()} (Mean Brier: {np.mean(res['brier_values']):.3f})")
    plt.plot(res['times'], res['brier_values'], color=colors[var_type], lw=2, label=label)
    plt.fill_between(res['times'], res['brier_ci'][0], res['brier_ci'][1],
                     color=colors[var_type], alpha=0.2)

plt.xlabel('Follow-up Time (Years)', fontsize=12)
plt.ylabel('Time-dependent Brier Score', fontsize=12)
plt.title('Time-Dependent Brier Score of the Cox PH Model', fontsize=14)
plt.legend(loc='upper left', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# 최종 수정 Ver

## 모두 음영 표시

In [None]:
from joblib import Parallel, delayed

def bootstrap_time_dep_metrics(y_train_surv, y_true, y_time, scores, surv_probs, times, n_boot=50):
    """Bootstrap 95% percentile intervals of the test-set survival metrics.

    Each replicate resamples the test subjects with replacement and recomputes
    the time-dependent AUC curve and its mean, the time-dependent Brier score
    curve, the integrated Brier score (IBS) and the C-index.

    Args:
        y_train_surv: Structured survival array of the training set, passed
            as the first argument to the sksurv metric functions.
        y_true: Event indicators of the test set (array-like, one per subject).
        y_time: Follow-up times of the test set.
        scores: Risk scores of the test set.
        surv_probs: Predicted survival probabilities of the test set, one row
            per subject and one column per entry of ``times``.
        times: Evaluation time points.
        n_boot: Number of bootstrap replicates.

    Returns:
        dict with keys ``'auc_curve_ci'`` and ``'brier_curve_ci'`` (tuples of
        lower/upper arrays, one value per time point) and ``'auc_mean_ci'``,
        ``'ibs_ci'``, ``'cindex_ci'`` (arrays ``[lower, upper]``).
    """
    y_true = np.asarray(y_true)
    y_time = np.asarray(y_time)
    scores = np.asarray(scores)
    surv_probs = np.asarray(surv_probs)

    n_times = len(times)
    n_samples = len(y_true)

    # Draw one seed per replicate in the parent process. joblib workers are
    # separate processes whose global NumPy RNG is not controlled by the
    # notebook's np.random.seed(), so sampling inside the worker made the
    # intervals irreproducible. Seeds drawn here follow the parent's RNG state.
    seeds = np.random.randint(0, np.iinfo(np.int32).max, size=n_boot)

    def single_bootstrap(seed):
        rng = np.random.RandomState(seed)
        boot_idx = rng.choice(n_samples, size=n_samples, replace=True)
        y_boot_event = y_true[boot_idx]
        y_boot_time = y_time[boot_idx]
        surv_boot = Surv.from_arrays(y_boot_event, y_boot_time)
        scores_boot = scores[boot_idx]
        surv_probs_boot = surv_probs[boot_idx]

        # Time-dependent AUC curve and its mean
        auc_curve, mean_auc = cumulative_dynamic_auc(
            y_train_surv, surv_boot, scores_boot, times
        )

        # Time-dependent Brier score curve
        _, brier_curve = brier_score(
            y_train_surv, surv_boot, surv_probs_boot, times
        )

        # Integrated Brier score
        ibs_val = integrated_brier_score(
            y_train_surv, surv_boot, surv_probs_boot, times
        )

        # C-index
        c_idx = concordance_index_censored(y_boot_event, y_boot_time, scores_boot)[0]

        return auc_curve, mean_auc, brier_curve, ibs_val, c_idx

    boot_results = Parallel(n_jobs=-1)(
        delayed(single_bootstrap)(int(seed)) for seed in seeds
    )

    # Collect the replicates into arrays, one row per replicate
    boot_auc_curves = np.zeros((n_boot, n_times))
    boot_mean_aucs = np.zeros(n_boot)
    boot_brier_curves = np.zeros((n_boot, n_times))
    boot_ibs = np.zeros(n_boot)
    boot_cindex = np.zeros(n_boot)
    for i, (auc_curve_i, mean_auc_i, brier_curve_i, ibs_i, c_idx_i) in enumerate(boot_results):
        boot_auc_curves[i, :] = auc_curve_i
        boot_mean_aucs[i] = mean_auc_i
        boot_brier_curves[i, :] = brier_curve_i
        boot_ibs[i] = ibs_i
        boot_cindex[i] = c_idx_i

    percentiles = [2.5, 97.5]
    lower_auc_curve, upper_auc_curve = np.percentile(boot_auc_curves, percentiles, axis=0)
    lower_brier_curve, upper_brier_curve = np.percentile(boot_brier_curves, percentiles, axis=0)

    return {
        'auc_curve_ci': (lower_auc_curve, upper_auc_curve),
        'auc_mean_ci': np.percentile(boot_mean_aucs, percentiles),
        'brier_curve_ci': (lower_brier_curve, upper_brier_curve),
        'ibs_ci': np.percentile(boot_ibs, percentiles),
        'cindex_ci': np.percentile(boot_cindex, percentiles)
    }

In [None]:
def plot_results(results, analysis_label, variable_type):
    """Draw three test-set figures comparing all models.

    1) Time-dependent AUC curves with their confidence bands.
    2) Integrated Brier score (IBS) bar chart with error bars.
    3) Time-dependent Brier score curves with their confidence bands.

    Args:
        results: Mapping of model name to a result dict with the keys
            ``'times'``, ``'auc'`` (``'curve'``, optional ``'curve_ci'``,
            ``'test'`` = ``(mean_auc, ci)``), ``'brier'`` (``'curve'``,
            optional ``'curve_ci'``) and ``'ibs'`` (``'test'`` =
            ``(ibs, ci)``), where each ``ci`` is ``(lower, upper)``.
        analysis_label: Subgroup label used in the titles.
        variable_type: Variable-set label used in the titles.
    """
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
    # takes the same (name, lut) arguments.
    colors = plt.get_cmap('tab10', max(len(results), 1))

    def _finish_figure(ylabel, title, xlabel=None):
        """Apply the labelling shared by all three figures and show the figure."""
        if xlabel is not None:
            plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

    def _curve_figure(curve_key, summary_key, summary_name, ylabel, title):
        """Draw one curve per model plus its optional confidence band."""
        plt.figure(figsize=(10, 6))
        for idx, (model_name, res) in enumerate(results.items()):
            color = colors(idx)
            times = res['times']
            summary_val, summary_ci = res[summary_key]['test']

            label = (f"{model_name} ({summary_name}: {summary_val:.3f} "
                     f"({summary_ci[0]:.3f}-{summary_ci[1]:.3f}))")
            plt.plot(times, res[curve_key]['curve'], color=color, linewidth=2, label=label)

            curve_ci = res[curve_key].get('curve_ci', None)
            if curve_ci is not None:
                lower, upper = curve_ci
                plt.fill_between(times, lower, upper, color=color, alpha=0.2)
        _finish_figure(ylabel, title, xlabel='Follow-up Time (Years)')

    # 1) Time-dependent AUC
    _curve_figure(
        'auc', 'auc', 'Mean AUC', 'Time-dependent AUC',
        f"{analysis_label} - {variable_type} Model (Time-dependent AUC)",
    )

    # 2) IBS bar chart
    plt.figure(figsize=(10, 6))
    for idx, (model_name, res) in enumerate(results.items()):
        ibs_val, ibs_ci = res['ibs']['test']
        label = (f"{model_name} IBS: {ibs_val:.3f} "
                 f"({ibs_ci[0]:.3f}-{ibs_ci[1]:.3f})")
        plt.bar(idx, ibs_val, color=colors(idx), alpha=0.7, label=label)
        # A percentile CI does not have to contain the point estimate, and
        # errorbar rejects negative half-widths, so clamp them at zero.
        err_low = max(ibs_val - ibs_ci[0], 0.0)
        err_high = max(ibs_ci[1] - ibs_val, 0.0)
        plt.errorbar(idx, ibs_val, yerr=[[err_low], [err_high]],
                     fmt='o', color='black')
    plt.xticks(range(len(results)), list(results.keys()), rotation=45)
    _finish_figure('Integrated Brier Score (IBS)',
                   f"{analysis_label} - {variable_type} IBS (Test Set)")

    # 3) Time-dependent Brier score
    _curve_figure(
        'brier', 'ibs', 'IBS', 'Time-dependent Brier Score',
        f"{analysis_label} - {variable_type} Model (Time-dependent Brier Score)",
    )

In [None]:
def perform_analysis(data_subset, features, analysis_label, variable_type):
    """Fit every model in ``MODELS`` and evaluate it on a held-out split.

    For each model this computes the test-set time-dependent AUC curve and
    mean AUC, the time-dependent Brier score curve, the integrated Brier
    score (IBS) and the C-index, plus bootstrap confidence intervals from
    ``bootstrap_time_dep_metrics``. The metrics are printed and the
    comparison figures are drawn with ``plot_results``.

    Args:
        data_subset: DataFrame holding ``features``, the event column
            ``CVD사망`` and the follow-up column ``추적기간_연``.
        features: Names of the predictor columns.
        analysis_label: Subgroup label used in printed output and titles.
        variable_type: Variable-set label used in printed output and titles.

    Returns:
        dict mapping model name to a dict with keys ``'c_index'``, ``'auc'``,
        ``'brier'``, ``'ibs'`` and ``'times'``. A model that raises during
        fitting or evaluation is reported and left out.
    """
    X = data_subset[features].values
    y_event = data_subset['CVD사망'].values.astype(bool)
    y_time = data_subset['추적기간_연'].values

    # 70/30 split, stratified on the event indicator
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
        X, y_event, y_time, test_size=0.3, stratify=y_event, random_state=42
    )
    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # Evaluation times: 10 quantiles (10%..90%) of the training event times
    event_times = y_train_time[y_train_event]
    times = np.quantile(event_times, np.linspace(0.1, 0.9, 10)) if len(event_times) > 0 else np.array([])

    model_results = {}
    for model_info in MODELS:
        model_name = model_info["name"]
        estimator = model_info["estimator"]
        predict_method = model_info["predict_method"]

        try:
            estimator.fit(X_train, y_train_surv)

            # Only test-set predictions are evaluated below. The training-set
            # scores and survival functions were computed but never used, so
            # they are no longer produced.
            test_scores = getattr(estimator, predict_method)(X_test)
            pred_surv_test = estimator.predict_survival_function(X_test)
            # np.vstack replaces np.row_stack, a deprecated alias in NumPy 2.0
            surv_probs_test = np.vstack([fn(times) for fn in pred_surv_test])

            # Time-dependent AUC curve
            auc_curve_test, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, test_scores, times)
            # Time-dependent Brier score curve
            _, brier_curve_test = brier_score(y_train_surv, y_test_surv, surv_probs_test, times)
            # Integrated Brier score
            ibs_test = integrated_brier_score(y_train_surv, y_test_surv, surv_probs_test, times)
            # C-index
            c_index_test = concordance_index_censored(y_test_event, y_test_time, test_scores)[0]

            # Bootstrap CIs for the curves and for the summary metrics
            boot_test = bootstrap_time_dep_metrics(
                y_train_surv, y_test_event, y_test_time, test_scores, surv_probs_test, times, n_boot=50
            )

            model_results[model_name] = {
                'c_index': c_index_test,
                'auc': {
                    'curve': auc_curve_test,
                    'test': (mean_auc_test, boot_test['auc_mean_ci']),
                    'curve_ci': boot_test['auc_curve_ci']
                },
                'brier': {
                    'curve': brier_curve_test,
                    'curve_ci': boot_test['brier_curve_ci']
                },
                'ibs': {
                    'test': (ibs_test, boot_test['ibs_ci'])
                },
                'times': times
            }

            print(f"\n[{model_name}] {analysis_label} ({variable_type})")
            print(f"Test C-index: {c_index_test:.3f} (95% CI: {boot_test['cindex_ci'][0]:.3f}-{boot_test['cindex_ci'][1]:.3f})")
            print(f"Time-dependent Mean AUC (Test): {mean_auc_test:.3f} (95% CI: {boot_test['auc_mean_ci'][0]:.3f}-{boot_test['auc_mean_ci'][1]:.3f})")
            print(f"IBS (Test): {ibs_test:.3f} (95% CI: {boot_test['ibs_ci'][0]:.3f}-{boot_test['ibs_ci'][1]:.3f})")

        except Exception as e:
            # Best effort: report the failing model and continue with the rest
            print(f"{model_name} Error: {str(e)}")
            continue

    # Draw the comparison figures only when at least one model was evaluated
    if model_results:
        plot_results(model_results, analysis_label, variable_type)
    else:
        print(f"No model could be evaluated for {analysis_label} ({variable_type}); skipping plots.")
    return model_results

In [None]:
def run_full_analysis(data, non_invasive_vars, invasive_vars):
    """Evaluate both variable sets on the whole cohort and on each sex.

    Every variable set is passed to ``perform_analysis`` three times: with all
    rows, with the rows where ``성별 == 1`` (label "Male") and with the rows
    where ``성별 == 2`` (label "Female"). The sex-specific runs exclude the
    ``성별`` column from the feature list.

    Returns:
        ``(non_invasive_results, invasive_results)``, each a dict keyed by
        ``"Overall"``, ``"Male"`` and ``"Female"``.
    """
    def _sex_specific(sex_code, label, variables, variable_type):
        rows = data[data['성별'] == sex_code]
        kept = [name for name in variables if name != '성별']
        return perform_analysis(rows, kept, label, variable_type)

    def _three_cohorts(variables, variable_type):
        summary = {}
        summary["Overall"] = perform_analysis(data, variables, "All Subjects", variable_type)
        summary["Male"] = _sex_specific(1, "Male", variables, variable_type)
        summary["Female"] = _sex_specific(2, "Female", variables, variable_type)
        return summary

    # Non-invasive variables
    print("\n\n=== Non-invasive Variables Analysis ===")
    non_invasive_results = _three_cohorts(non_invasive_vars, "Non-invasive")

    # Invasive variables
    print("\n\n=== Invasive Variables Analysis ===")
    invasive_results = _three_cohorts(invasive_vars, "Invasive")

    return non_invasive_results, invasive_results

In [None]:
# Predictor sets: the invasive set extends the non-invasive one with 'TG', 'LDL', 'HDL'
non_invasive_orig = [
    '나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
]
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL']

# Run the analysis
non_invasive_results, invasive_results = run_full_analysis(
    Data_temp,
    non_invasive_orig,
    invasive_orig,
)

## 95%CI 음영 없는 그림

In [None]:
def plot_results_no_shade(results, analysis_label, variable_type):
    """Draw the three comparison figures without confidence bands.

    1) Time-dependent AUC curves (no shading).
    2) Integrated Brier score (IBS) bar chart with error bars.
    3) Time-dependent Brier score curves (no shading).

    Args:
        results: Mapping of model name to a result dict with the keys
            ``'times'``, ``'auc'`` (``'curve'``, ``'test'`` =
            ``(mean_auc, ci)``), ``'brier'`` (``'curve'``) and ``'ibs'``
            (``'test'`` = ``(ibs, ci)``), where each ``ci`` is
            ``(lower, upper)``.
        analysis_label: Subgroup label used in the titles.
        variable_type: Variable-set label used in the titles.
    """
    import matplotlib.pyplot as plt

    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
    # takes the same (name, lut) arguments.
    colors = plt.get_cmap('tab10', max(len(results), 1))

    def _finish_figure(ylabel, title, xlabel=None):
        """Apply the labelling shared by all three figures and show the figure."""
        if xlabel is not None:
            plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(loc='lower right')  # inside the axes, bottom right
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

    def _curve_figure(curve_key, summary_key, summary_name, ylabel, title):
        """Draw one curve per model; deliberately no fill_between band."""
        plt.figure(figsize=(10, 6))
        for idx, (model_name, res) in enumerate(results.items()):
            summary_val, summary_ci = res[summary_key]['test']
            label = (f"{model_name} ({summary_name}: {summary_val:.3f} "
                     f"({summary_ci[0]:.3f}-{summary_ci[1]:.3f}))")
            plt.plot(res['times'], res[curve_key]['curve'],
                     color=colors(idx), linewidth=2, label=label)
        _finish_figure(ylabel, title, xlabel='Follow-up Time (Years)')

    # 1) Time-dependent AUC (no shading)
    _curve_figure(
        'auc', 'auc', 'Mean AUC', 'Time-dependent AUC',
        f"{analysis_label} - {variable_type} (No-Shade AUC)",
    )

    # 2) IBS bar chart
    plt.figure(figsize=(10, 6))
    for idx, (model_name, res) in enumerate(results.items()):
        ibs_val, ibs_ci = res['ibs']['test']
        label = (f"{model_name} IBS: {ibs_val:.3f} "
                 f"({ibs_ci[0]:.3f}-{ibs_ci[1]:.3f})")
        plt.bar(idx, ibs_val, color=colors(idx), alpha=0.7, label=label)
        # A percentile CI does not have to contain the point estimate, and
        # errorbar rejects negative half-widths, so clamp them at zero.
        err_low = max(ibs_val - ibs_ci[0], 0.0)
        err_high = max(ibs_ci[1] - ibs_val, 0.0)
        plt.errorbar(idx, ibs_val, yerr=[[err_low], [err_high]],
                     fmt='o', color='black')
    plt.xticks(range(len(results)), list(results.keys()), rotation=45)
    _finish_figure('Integrated Brier Score (IBS)',
                   f"{analysis_label} - {variable_type} IBS (No-Shade)")

    # 3) Time-dependent Brier score (no shading)
    _curve_figure(
        'brier', 'ibs', 'IBS', 'Time-dependent Brier Score',
        f"{analysis_label} - {variable_type} Brier (No-Shade)",
    )

## Best model alpha(투명도) 조정

In [None]:
def plot_results_with_alpha(results, analysis_label, variable_type):
    """Draw the comparison figures with the best model visually emphasised.

    - AUC figure: the model with the highest mean AUC is drawn as an opaque
      solid line; all others as faint dotted lines.
    - Brier figure: the model with the lowest IBS is drawn as an opaque solid
      line; all others as faint dotted lines.
    - Confidence bands use alpha 0.2 for the best model and 0.1 otherwise.
    - Legends are placed inside the axes at the lower right.

    Args:
        results: Mapping of model name to a result dict with the keys
            ``'times'``, ``'auc'`` (``'curve'``, optional ``'curve_ci'``,
            ``'test'`` = ``(mean_auc, ci)``), ``'brier'`` (``'curve'``,
            optional ``'curve_ci'``) and ``'ibs'`` (``'test'`` =
            ``(ibs, ci)``), where each ``ci`` is ``(lower, upper)``.
        analysis_label: Subgroup label used in the titles.
        variable_type: Variable-set label used in the titles.
    """
    import matplotlib.pyplot as plt

    # max()/min() below raise ValueError on an empty mapping
    if not results:
        print(f"No results to plot for {analysis_label} ({variable_type}).")
        return

    # Best model per metric: highest mean AUC, lowest IBS
    best_auc_model = max(results.items(), key=lambda x: x[1]['auc']['test'][0])[0]
    best_ibs_model = min(results.items(), key=lambda x: x[1]['ibs']['test'][0])[0]

    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
    # takes the same (name, lut) arguments.
    colors = plt.get_cmap('tab10', len(results))

    def _finish_figure(ylabel, title, xlabel=None):
        """Apply the labelling shared by all three figures and show the figure."""
        if xlabel is not None:
            plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

    def _curve_figure(curve_key, summary_key, summary_name, best_model, ylabel, title):
        """Draw one curve per model, emphasising ``best_model``."""
        plt.figure(figsize=(10, 6))
        for idx, (model_name, res) in enumerate(results.items()):
            color = colors(idx)
            times = res['times']
            is_best = model_name == best_model
            summary_val, summary_ci = res[summary_key]['test']
            label = (f"{model_name} ({summary_name}: {summary_val:.3f} "
                     f"({summary_ci[0]:.3f}-{summary_ci[1]:.3f}))")
            # The line alpha used to be computed but never passed to plot(),
            # so every model was drawn fully opaque.
            plt.plot(times, res[curve_key]['curve'], color=color, linewidth=2,
                     linestyle='solid' if is_best else 'dotted',
                     alpha=1.0 if is_best else 0.3, label=label)
            # The band is stored under 'curve_ci'. The previous membership
            # test looked for a different key, so no band was ever drawn.
            curve_ci = res[curve_key].get('curve_ci')
            if curve_ci is not None:
                lower, upper = curve_ci
                plt.fill_between(times, lower, upper, color=color,
                                 alpha=0.2 if is_best else 0.1)
        _finish_figure(ylabel, title, xlabel='Follow-up Time (Years)')

    # 1) Time-dependent AUC
    _curve_figure(
        'auc', 'auc', 'Mean AUC', best_auc_model, 'Time-dependent AUC',
        f"{analysis_label} - {variable_type} (AUC)",
    )

    # 2) IBS bar chart (legend and error bars unchanged)
    plt.figure(figsize=(10, 6))
    for idx, (model_name, res) in enumerate(results.items()):
        ibs_val, ibs_ci = res['ibs']['test']
        label = f"{model_name} (IBS: {ibs_val:.3f} ({ibs_ci[0]:.3f}-{ibs_ci[1]:.3f}))"
        plt.bar(idx, ibs_val, color=colors(idx), alpha=0.7, label=label)
        # A percentile CI does not have to contain the point estimate, and
        # errorbar rejects negative half-widths, so clamp them at zero.
        err_low = max(ibs_val - ibs_ci[0], 0.0)
        err_high = max(ibs_ci[1] - ibs_val, 0.0)
        plt.errorbar(idx, ibs_val, yerr=[[err_low], [err_high]], fmt='o', color='black')
    plt.xticks(range(len(results)), list(results.keys()), rotation=45)
    _finish_figure('Integrated Brier Score (IBS)',
                   f"{analysis_label} - {variable_type} IBS")

    # 3) Time-dependent Brier score
    _curve_figure(
        'brier', 'ibs', 'IBS', best_ibs_model, 'Time-dependent Brier Score',
        f"{analysis_label} - {variable_type} (Brier Score)",
    )

## 위에서 정의한 함수를 실행하는 코드

In [None]:
def perform_analysis(data_subset, features, analysis_label, variable_type):
    """Fit every model in ``MODELS``, evaluate it and draw all figure variants.

    For each model this computes the test-set time-dependent AUC curve and
    mean AUC, the time-dependent Brier score curve, the integrated Brier
    score (IBS) and the C-index, plus bootstrap confidence intervals from
    ``bootstrap_time_dep_metrics``. The metrics are printed and the results
    are drawn with ``plot_results``, ``plot_results_no_shade`` and
    ``plot_results_with_alpha``.

    Args:
        data_subset: DataFrame holding ``features``, the event column
            ``CVD사망`` and the follow-up column ``추적기간_연``.
        features: Names of the predictor columns.
        analysis_label: Subgroup label used in printed output and titles.
        variable_type: Variable-set label used in printed output and titles.

    Returns:
        dict mapping model name to a dict with keys ``'c_index'``, ``'auc'``,
        ``'brier'``, ``'ibs'`` and ``'times'``. A model that raises during
        fitting or evaluation is reported and left out.
    """
    X = data_subset[features].values
    y_event = data_subset['CVD사망'].values.astype(bool)
    y_time = data_subset['추적기간_연'].values

    # 70/30 split, stratified on the event indicator
    X_train, X_test, y_train_event, y_test_event, y_train_time, y_test_time = train_test_split(
        X, y_event, y_time, test_size=0.3, stratify=y_event, random_state=42
    )
    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # Evaluation times: 10 quantiles (10%..90%) of the training event times
    event_times = y_train_time[y_train_event]
    times = np.quantile(event_times, np.linspace(0.1, 0.9, 10)) if len(event_times) > 0 else np.array([])

    model_results = {}
    for model_info in MODELS:
        model_name = model_info["name"]
        estimator = model_info["estimator"]
        predict_method = model_info["predict_method"]

        try:
            estimator.fit(X_train, y_train_surv)

            # Only test-set predictions are evaluated below. The training-set
            # scores and survival functions were computed but never used, so
            # they are no longer produced.
            test_scores = getattr(estimator, predict_method)(X_test)
            pred_surv_test = estimator.predict_survival_function(X_test)
            # np.vstack replaces np.row_stack, a deprecated alias in NumPy 2.0
            surv_probs_test = np.vstack([fn(times) for fn in pred_surv_test])

            # Time-dependent AUC curve
            auc_curve_test, mean_auc_test = cumulative_dynamic_auc(y_train_surv, y_test_surv, test_scores, times)
            # Time-dependent Brier score curve
            _, brier_curve_test = brier_score(y_train_surv, y_test_surv, surv_probs_test, times)
            # Integrated Brier score
            ibs_test = integrated_brier_score(y_train_surv, y_test_surv, surv_probs_test, times)
            # C-index
            c_index_test = concordance_index_censored(y_test_event, y_test_time, test_scores)[0]

            # Bootstrap CIs for the curves and for the summary metrics
            boot_test = bootstrap_time_dep_metrics(
                y_train_surv, y_test_event, y_test_time, test_scores, surv_probs_test, times, n_boot=50
            )

            model_results[model_name] = {
                'c_index': c_index_test,
                'auc': {
                    'curve': auc_curve_test,
                    'test': (mean_auc_test, boot_test['auc_mean_ci']),
                    'curve_ci': boot_test['auc_curve_ci']
                },
                'brier': {
                    'curve': brier_curve_test,
                    'curve_ci': boot_test['brier_curve_ci']
                },
                'ibs': {
                    'test': (ibs_test, boot_test['ibs_ci'])
                },
                'times': times
            }

            print(f"\n[{model_name}] {analysis_label} ({variable_type})")
            print(f"Test C-index: {c_index_test:.3f} (95% CI: {boot_test['cindex_ci'][0]:.3f}-{boot_test['cindex_ci'][1]:.3f})")
            print(f"Time-dependent Mean AUC (Test): {mean_auc_test:.3f} (95% CI: {boot_test['auc_mean_ci'][0]:.3f}-{boot_test['auc_mean_ci'][1]:.3f})")
            print(f"IBS (Test): {ibs_test:.3f} (95% CI: {boot_test['ibs_ci'][0]:.3f}-{boot_test['ibs_ci'][1]:.3f})")

        except Exception as e:
            # Best effort: report the failing model and continue with the rest
            print(f"{model_name} Error: {str(e)}")
            continue

    # plot_results_with_alpha picks the best model with max()/min(), which
    # raises on an empty mapping, so plotting is skipped when every model failed.
    if model_results:
        plot_results(model_results, analysis_label, variable_type)
        plot_results_no_shade(model_results, analysis_label, variable_type)
        plot_results_with_alpha(model_results, analysis_label, variable_type)
    else:
        print(f"No model could be evaluated for {analysis_label} ({variable_type}); skipping plots.")
    return model_results

In [None]:
# Run the whole analysis (run_full_analysis calls perform_analysis internally)
non_invasive_results, invasive_results = run_full_analysis(Data_temp, non_invasive_orig, invasive_orig)

# Feature selection

In [None]:
import lightgbm as lgb

# Candidate predictors and the outcome column used for the importance ranking
X = sampled_total[['성별', '나이', 'BMI', 'WHtR', 'SBP', 'DBP', '공복혈당', 'LDL', 'HDL', 'TG',
                   '당뇨병여부', '고혈압여부', '고강도mets', 'TotalMets', '중강도mets', '걷기mets',
                   'IPAQ_4group_mets_days']]
y = sampled_total['CVD사망']

# Hyper-parameter combinations; each one is refitted n_repeats times
param_grid = [
    {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 3},
    {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 4},
    {'n_estimators': 150, 'learning_rate': 0.01, 'max_depth': 5},
]
n_repeats = 5

# One column of feature importances per (parameter set, repeat), keyed by name
importance_columns = {}
for params in param_grid:
    for repeat in range(n_repeats):
        # A different seed per repeat gives a different split and model seed
        seed = 42 + repeat
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed
        )
        model = lgb.LGBMClassifier(**params, random_state=seed)
        model.fit(X_train, y_train)

        # LightGBM's default feature importance of the fitted model
        col_name = f"n_est={params['n_estimators']}_rep={repeat}"
        importance_columns[col_name] = model.feature_importances_

# Features as rows, one fitted model per column
feature_importance_df = pd.DataFrame(importance_columns, index=X.columns)

# Average importance per feature across all fitted models, highest first
feature_importance_df['mean_importance'] = feature_importance_df.mean(axis=1)
stable_features = feature_importance_df['mean_importance'].sort_values(ascending=False)
print("안정적인 피처 순위:")
print(stable_features)

# Bar chart of the averaged importances
plt.figure(figsize=(10, 6))
stable_features.plot.bar(color='skyblue')
plt.title('Stability Feature Importances')
plt.xlabel('Features')
plt.ylabel('Average Importance')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Non-invasive predictor columns
non_invasive = [
    '성별', '나이', '신장', 'BMI', 'WHtR', 'SBP', 'DBP',
    '당뇨병여부', '고혈압여부', 'TotalMets', '중강도mets', '걷기mets',
    'IPAQ_4group_mets_days',
]
# Invasive set: the non-invasive columns followed by four additional columns
invasive = [*non_invasive, '공복혈당', 'LDL', 'HDL', 'TG']

# Model Optimization

In [None]:
def prepare_data_from_split(train_df, test_df, features):
    """Build covariates, survival targets and AUC evaluation times from a fixed split.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain every column in ``features`` plus 'CVD사망' (event
        indicator, cast to bool here) and '추적기간_연' (follow-up time;
        presumably in years, going by the column name).
    features : list of str
        Covariate columns to select.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train_surv, y_test_surv, times)`` where the two
        ``y_*_surv`` values come from ``Surv.from_arrays`` and ``times`` is a
        1-D array of evaluation time points.

    Raises
    ------
    ValueError
        If none of the candidate time points lies inside the test follow-up range.
    """
    X_train = train_df[features]
    X_test = test_df[features]

    y_train_event = train_df['CVD사망'].astype(bool)
    y_train_time = train_df['추적기간_연']
    y_test_event = test_df['CVD사망'].astype(bool)
    y_test_time = test_df['추적기간_연']

    y_train_surv = Surv.from_arrays(y_train_event, y_train_time)
    y_test_surv = Surv.from_arrays(y_test_event, y_test_time)

    # Candidate grid: 10%, 20%, ..., 100% quantiles of the training *event* times
    # (censored rows are excluded by the boolean mask).
    times = np.asarray(np.quantile(y_train_time[y_train_event], np.linspace(0.1, 1, 10)))

    # cumulative_dynamic_auc only accepts times in [min(test time), max(test time));
    # the 100% quantile in particular can reach or exceed the test maximum, so drop
    # anything outside that half-open interval instead of failing later in the objective.
    test_lo, test_hi = y_test_time.min(), y_test_time.max()
    times = times[(times >= test_lo) & (times < test_hi)]
    if times.size == 0:
        raise ValueError(
            "No evaluation time point lies within the test follow-up range "
            f"[{test_lo}; {test_hi}[."
        )
    return X_train, X_test, y_train_surv, y_test_surv, times

# RSF hyperparameter tuning
def optimize_rsf(trial, X_train, X_test, y_train_surv, y_test_surv, times):
    """Optuna objective for a random survival forest.

    Samples one hyperparameter set from ``trial``, fits the forest on the
    training data and returns the mean value reported by
    ``cumulative_dynamic_auc`` for the predictions on ``X_test``.
    """
    # Search space; the suggestions are drawn in this fixed order
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 200),
        max_depth=trial.suggest_int('max_depth', 8, 15),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 15),
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 20, 100),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    )

    forest = RandomSurvivalForest(n_jobs=-1, random_state=42, **search_space)
    risk_scores = forest.fit(X_train, y_train_surv).predict(X_test)

    # Element 1 of the result is the mean AUC; the per-time curve is not needed here
    mean_auc = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores, times)[1]
    return mean_auc

# GBS hyperparameter tuning
def optimize_gbs(trial, X_train, X_test, y_train_surv, y_test_surv, times):
    """Optuna objective for gradient-boosted survival analysis (Cox PH loss).

    Samples one hyperparameter set from ``trial``, fits the booster on the
    training data and returns the mean value reported by
    ``cumulative_dynamic_auc`` for the predictions on ``X_test``.
    """
    # Search space; the suggestions are drawn in this fixed order
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        n_estimators=trial.suggest_int('n_estimators', 100, 200),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        dropout_rate=trial.suggest_float('dropout_rate', 0.0, 0.5),
    )

    booster = GradientBoostingSurvivalAnalysis(loss='coxph', random_state=42, **search_space)
    risk_scores = booster.fit(X_train, y_train_surv).predict(X_test)

    # Element 1 of the result is the mean AUC; the per-time curve is not needed here
    mean_auc = cumulative_dynamic_auc(y_train_surv, y_test_surv, risk_scores, times)[1]
    return mean_auc

def run_optimization_from_split(model_type, train_df, test_df, features, n_trials=100, seed=None):
    """Run an Optuna search for one survival model on a fixed train/test split.

    Parameters
    ----------
    model_type : str
        'RSF' selects ``optimize_rsf``; any other value selects ``optimize_gbs``.
    train_df, test_df : pandas.DataFrame
        Passed to ``prepare_data_from_split`` together with ``features``.
    features : list of str
        Covariate columns.
    n_trials : int, default 100
        Number of Optuna trials.
    seed : int or None, default None
        When given, the study uses ``optuna.samplers.TPESampler(seed=seed)`` so
        the sequence of suggested hyperparameters is repeatable. ``None`` keeps
        Optuna's default (unseeded) sampler.

    Returns
    -------
    tuple
        ``(study.best_params, study)``.
    """
    X_train, X_test, y_train_surv, y_test_surv, times = prepare_data_from_split(train_df, test_df, features)

    # Only build an explicit sampler when a seed is requested; sampler=None lets
    # create_study fall back to its own default.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
    study = optuna.create_study(direction='maximize', sampler=sampler)

    tune = optimize_rsf if model_type == 'RSF' else optimize_gbs

    def objective(trial):
        return tune(trial, X_train, X_test, y_train_surv, y_test_surv, times)

    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"\nBest {model_type} parameters:")
    print(study.best_params)
    print(f"Best Mean AUC: {study.best_value:.4f}")

    # Return the study object as well so the caller can visualise the search
    return study.best_params, study

In [None]:
# Previously used predictor sets, kept for reference:
# non_invasive_orig = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']
# invasive_orig = non_invasive_orig + ['TG', 'LDL', 'HDL','공복혈당']
non_invasive_orig = [
    '나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
    'BMI', 'SBP', 'DBP', 'TotalMets', '중강도mets', '걷기mets',
]
invasive_orig = [*non_invasive_orig, 'TG', 'LDL', 'HDL', '공복혈당']

# Sex-specific subsets: rows with 성별 == 1 go to *_man, rows with 성별 == 2 to *_woman
sampled_train_man = sampled_train.loc[sampled_train['성별'] == 1]
sampled_train_woman = sampled_train.loc[sampled_train['성별'] == 2]
sampled_test_man = sampled_test.loc[sampled_test['성별'] == 1]
sampled_test_woman = sampled_test.loc[sampled_test['성별'] == 2]

In [None]:
# Hyperparameter search on all subjects (30 Optuna trials per model)
# NOTE(review): RSF is tuned on non_invasive_orig but GBS on invasive_orig, whereas the
# sex-specific searches use non_invasive_orig for both models — confirm this is intended.
best_rsf_params, rsf_study = run_optimization_from_split('RSF', sampled_train, sampled_test, non_invasive_orig, n_trials=30) # RSF model
best_gbs_params, gbs_study = run_optimization_from_split('GBS', sampled_train, sampled_test, invasive_orig, n_trials=30) # GBS model

In [None]:
# Hyperparameter search restricted to the male subsets (30 Optuna trials per model)
best_rsf_params_man, rsf_study_man = run_optimization_from_split('RSF', sampled_train_man, sampled_test_man, non_invasive_orig, n_trials=30)
best_gbs_params_man, gbs_study_man = run_optimization_from_split('GBS', sampled_train_man, sampled_test_man, non_invasive_orig, n_trials=30)

In [None]:
# Hyperparameter search restricted to the female subsets (30 Optuna trials per model)
best_rsf_params_woman, rsf_study_woman = run_optimization_from_split('RSF', sampled_train_woman, sampled_test_woman, non_invasive_orig, n_trials=30)
best_gbs_params_woman, gbs_study_woman = run_optimization_from_split('GBS', sampled_train_woman, sampled_test_woman, non_invasive_orig, n_trials=30)

## 최적화 결과 시각화

In [None]:
import optuna.visualization as vis

In [None]:
# RSF study: optimization history (how the objective value changes over trials)
vis.plot_optimization_history(rsf_study)

In [None]:
# RSF study: hyperparameter importances
vis.plot_param_importances(rsf_study)

In [None]:
# RSF study: parallel-coordinate plot
vis.plot_parallel_coordinate(rsf_study)

In [None]:
# RSF study: slice plot (distribution of objective values)
vis.plot_slice(rsf_study)

In [None]:
# GBS study: optimization history (how the objective value changes over trials)
vis.plot_optimization_history(gbs_study)

In [None]:
# GBS study: hyperparameter importances
vis.plot_param_importances(gbs_study)

In [None]:
# GBS study: parallel-coordinate plot (relationships between hyperparameters)
vis.plot_parallel_coordinate(gbs_study)

In [None]:
# GBS study: slice plot (distribution of objective values)
vis.plot_slice(gbs_study)

# 예전에 작성한 코드 (참고용)

In [None]:
# Column lists: covariates followed by the event and follow-up-time columns
비침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'CVD사망' ,'추적기간_연']
침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'TG' ,'LDL', 'CVD사망' ,'추적기간_연']  # HDL is not in this list
범주형_변수 = ['성별','고혈압여부' ,'당뇨병여부' ,'IPAQ_4group_mets_days']

# New frame with the categorical columns cast to the category dtype
Data_temp = sampled_total.astype(dict.fromkeys(범주형_변수, 'category'))

# Column subsets for the non-invasive and invasive models
data_subset_non_invasive = Data_temp[비침습]
data_subset_invasive = Data_temp[침습]

# Structured survival targets (event indicator + follow-up time)
y_non_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=data_subset_non_invasive)
y_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=data_subset_invasive)

# Covariates only, with categorical columns dummy-encoded (first level dropped)
X_non_invasive = pd.get_dummies(
    data_subset_non_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)
X_invasive = pd.get_dummies(
    data_subset_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)

# 70/30 split, stratified on the event indicator
X_train_non_invasive, X_test_non_invasive, y_train_non_invasive, y_test_non_invasive = train_test_split(
    X_non_invasive, y_non_invasive,
    test_size=0.3, random_state=42, stratify=y_non_invasive['CVD사망'])
X_train_invasive, X_test_invasive, y_train_invasive, y_test_invasive = train_test_split(
    X_invasive, y_invasive,
    test_size=0.3, random_state=42, stratify=y_invasive['CVD사망'])

In [None]:
# Standardise each design matrix with statistics learned on its own training split
scaler_non_invasive = StandardScaler().fit(X_train_non_invasive)
scaler_invasive = StandardScaler().fit(X_train_invasive)

X_train_non_invasive_scaled = scaler_non_invasive.transform(X_train_non_invasive)
X_test_non_invasive_scaled = scaler_non_invasive.transform(X_test_non_invasive)
X_train_invasive_scaled = scaler_invasive.transform(X_train_invasive)
X_test_invasive_scaled = scaler_invasive.transform(X_test_invasive)

# Fit one Cox model per predictor set on the scaled training data
cox_model_non_invasive = CoxPHSurvivalAnalysis().fit(X_train_non_invasive_scaled, y_train_non_invasive)
cox_model_invasive = CoxPHSurvivalAnalysis().fit(X_train_invasive_scaled, y_train_invasive)

# Non-invasive model: predictions and concordance on train and test
y_pred_train_non_invasive = cox_model_non_invasive.predict(X_train_non_invasive_scaled)
y_pred_test_non_invasive = cox_model_non_invasive.predict(X_test_non_invasive_scaled)

c_index_train_non_invasive = concordance_index_censored(
    y_train_non_invasive['CVD사망'], y_train_non_invasive['추적기간_연'], y_pred_train_non_invasive)
c_index_test_non_invasive = concordance_index_censored(
    y_test_non_invasive['CVD사망'], y_test_non_invasive['추적기간_연'], y_pred_test_non_invasive)

print(f"비침습 모델 Train - Concordance Index: {c_index_train_non_invasive[0]:.4f}")
print(f"비침습 모델 Test - Concordance Index: {c_index_test_non_invasive[0]:.4f}")

# Invasive model: predictions and concordance on train and test
y_pred_train_invasive = cox_model_invasive.predict(X_train_invasive_scaled)
y_pred_test_invasive = cox_model_invasive.predict(X_test_invasive_scaled)

c_index_train_invasive = concordance_index_censored(
    y_train_invasive['CVD사망'], y_train_invasive['추적기간_연'], y_pred_train_invasive)
c_index_test_invasive = concordance_index_censored(
    y_test_invasive['CVD사망'], y_test_invasive['추적기간_연'], y_pred_test_invasive)

print(f"침습 모델 Train - Concordance Index: {c_index_train_invasive[0]:.4f}")
print(f"침습 모델 Test - Concordance Index: {c_index_test_invasive[0]:.4f}")

- original scale

In [None]:
# No scaling here: the models are fitted on the variables in their original units

# Fit one Cox model per predictor set
cox_model_non_invasive = CoxPHSurvivalAnalysis().fit(X_train_non_invasive, y_train_non_invasive)
cox_model_invasive = CoxPHSurvivalAnalysis().fit(X_train_invasive, y_train_invasive)

# Non-invasive model: predictions and concordance on train and test
y_pred_train_non_invasive = cox_model_non_invasive.predict(X_train_non_invasive)
y_pred_test_non_invasive = cox_model_non_invasive.predict(X_test_non_invasive)

c_index_train_non_invasive = concordance_index_censored(
    y_train_non_invasive['CVD사망'], y_train_non_invasive['추적기간_연'], y_pred_train_non_invasive)
c_index_test_non_invasive = concordance_index_censored(
    y_test_non_invasive['CVD사망'], y_test_non_invasive['추적기간_연'], y_pred_test_non_invasive)

print(f"비침습 모델 Train - Concordance Index: {c_index_train_non_invasive[0]:.4f}")
print(f"비침습 모델 Test - Concordance Index: {c_index_test_non_invasive[0]:.4f}")

# Invasive model: predictions and concordance on train and test
y_pred_train_invasive = cox_model_invasive.predict(X_train_invasive)
y_pred_test_invasive = cox_model_invasive.predict(X_test_invasive)

c_index_train_invasive = concordance_index_censored(
    y_train_invasive['CVD사망'], y_train_invasive['추적기간_연'], y_pred_train_invasive)
c_index_test_invasive = concordance_index_censored(
    y_test_invasive['CVD사망'], y_test_invasive['추적기간_연'], y_pred_test_invasive)

print(f"침습 모델 Train - Concordance Index: {c_index_train_invasive[0]:.4f}")
print(f"침습 모델 Test - Concordance Index: {c_index_test_invasive[0]:.4f}")

## Cox PH
- Cox's Proportional Hazards model

### CPH 전체
- C index and Time dependent AUC

In [None]:
# Print formula4 and formula6 (both are defined outside this section of the notebook)
print(formula4,'\n',formula6)

In [None]:
# Column lists: covariates followed by the event and follow-up-time columns
비침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'CVD사망' ,'추적기간_연']
침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'TG' ,'LDL', 'CVD사망' ,'추적기간_연']  # HDL is not in this list
범주형_변수 = ['성별','고혈압여부' ,'당뇨병여부' ,'IPAQ_4group_mets_days']

# New frame with the categorical columns cast to the category dtype
Data_temp = sampled_total.astype(dict.fromkeys(범주형_변수, 'category'))

# Column subsets for the non-invasive and invasive models
data_subset_non_invasive = Data_temp[비침습]
data_subset_invasive = Data_temp[침습]

# Structured survival targets (event indicator + follow-up time)
y_non_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=data_subset_non_invasive)
y_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=data_subset_invasive)

# Covariates only, with categorical columns dummy-encoded (first level dropped)
X_non_invasive = pd.get_dummies(
    data_subset_non_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)
X_invasive = pd.get_dummies(
    data_subset_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)

# 70/30 split, stratified on the event indicator
X_train_non_invasive, X_test_non_invasive, y_train_non_invasive, y_test_non_invasive = train_test_split(
    X_non_invasive, y_non_invasive,
    test_size=0.3, random_state=42, stratify=y_non_invasive['CVD사망'])
X_train_invasive, X_test_invasive, y_train_invasive, y_test_invasive = train_test_split(
    X_invasive, y_invasive,
    test_size=0.3, random_state=42, stratify=y_invasive['CVD사망'])

In [None]:
def build_and_evaluate_model_cph(X_train, y_train, X_test, y_test):
    """Fit a Cox PH model and compute its concordance on the train and test sets.

    Returns ``(model, y_pred_train, y_pred_test, c_index_train, c_index_test)``.
    The two c-index entries are whatever ``concordance_index_censored``
    returns; callers in this notebook read element 0 of each.
    """
    model = CoxPHSurvivalAnalysis().fit(X_train, y_train)

    def _concordance(y, risk):
        # Event indicator and follow-up time are fields of the structured target
        return concordance_index_censored(y['CVD사망'], y['추적기간_연'], risk)

    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)
    c_index_train = _concordance(y_train, y_pred_train)
    c_index_test = _concordance(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, c_index_train, c_index_test

def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot both time-dependent AUC curves with shaded confidence bands.

    Each ``conf_int_*`` is indexed as [0] = lower band, [1] = upper band;
    each ``ci_*`` is a (lower, upper) pair shown in the legend next to the mean AUC.
    """
    plt.figure(figsize=(10, 6))

    def _draw(t, curve, band, colour, legend_text):
        # Line for the AUC curve plus a translucent band between the two bounds
        plt.plot(t, curve, color=colour, label=legend_text)
        plt.fill_between(t, band[0], band[1], color=colour, alpha=0.2)

    # Non-invasive model
    _draw(times_non_invasive, auc_non_invasive, conf_int_non_invasive, 'blue',
          f'비침습 모델 (Mean AUC = {mean_auc_non_invasive:.3f} (95% CI: {ci_non_invasive[0]:.3f} - {ci_non_invasive[1]:.3f}))')

    # Invasive model
    _draw(times_invasive, auc_invasive, conf_int_invasive, 'orange',
          f'침습 모델 (Mean AUC = {mean_auc_invasive:.3f} (95% CI: {ci_invasive[0]:.3f} - {ci_invasive[1]:.3f}))')

    plt.title('Time-Dependent AUC for Cox Models with 95% Confidence Interval (Test Set)')
    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100):
    """Compute the time-dependent AUC curve and a pointwise bootstrap band.

    Returns ``(auc, mean_auc, conf_int)``: ``auc`` is the per-time curve from
    ``cumulative_dynamic_auc``, ``mean_auc`` is the plain average of that curve,
    and ``conf_int`` holds the 2.5th and 97.5th percentiles of the resampled
    curves at each entry of ``times``.
    """
    # Cumulative/dynamic AUC on the full test set (only the per-time curve is kept)
    auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)[0]

    rng = np.random.RandomState(42)
    n_test = len(y_test)
    train_time = y_train['추적기간_연']
    train_lo, train_hi = train_time.min(), train_time.max()

    resampled_curves = []
    for _ in range(n_bootstraps):
        # Resample test rows with replacement
        picks = rng.randint(0, n_test, n_test)
        y_boot, pred_boot = y_test[picks], y_pred_test[picks]

        # Monthly grid limited to the overlap of train and resampled follow-up ranges
        boot_time = y_boot['추적기간_연']
        grid = np.arange(
            max(train_lo, boot_time.min()),
            min(train_hi, boot_time.max()),
            1/12
        )

        boot_auc = cumulative_dynamic_auc(y_train, y_boot, pred_boot, grid)[0]

        # Bring the resampled curve onto the caller's time grid
        resampled_curves.append(np.interp(times, grid, boot_auc))

    # Pointwise 2.5% / 97.5% percentiles across resamples
    conf_int = np.percentile(np.array(resampled_curves), [2.5, 97.5], axis=0)

    # Unweighted average of the full-sample AUC curve
    mean_auc = np.mean(auc)

    return auc, mean_auc, conf_int

In [None]:
# Fit and evaluate the CPH model on the non-invasive predictors
cox_model_non_invasive_cph, y_pred_train_non_invasive_cph, y_pred_test_non_invasive_cph, c_index_train_non_invasive_cph, c_index_test_non_invasive_cph = build_and_evaluate_model_cph(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive)

# Fit and evaluate the CPH model on the invasive predictors
cox_model_invasive_cph, y_pred_train_invasive_cph, y_pred_test_invasive_cph, c_index_train_invasive_cph, c_index_test_invasive_cph = build_and_evaluate_model_cph(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive)

print(f"비침습 모델 Train - Concordance Index: {c_index_train_non_invasive_cph[0]:.4f}")
print(f"비침습 모델 Test - Concordance Index: {c_index_test_non_invasive_cph[0]:.4f}")
print(f"침습 모델 Train - Concordance Index: {c_index_train_invasive_cph[0]:.4f}")
print(f"침습 모델 Test - Concordance Index: {c_index_test_invasive_cph[0]:.4f}")

# Evaluation time grid: bounded by the min/max follow-up among rows with an event
y_train_events = y_train_non_invasive[y_train_non_invasive['CVD사망']]
train_min_non_invasive, train_max_non_invasive = y_train_events['추적기간_연'].min(), y_train_events['추적기간_연'].max()

y_test_events = y_test_non_invasive[y_test_non_invasive['CVD사망']]
test_min_non_invasive, test_max_non_invasive = y_test_events['추적기간_연'].min(), y_test_events['추적기간_연'].max()

# If the test event-time range is not nested inside the training one, warn and clip it
if not (train_min_non_invasive <= test_min_non_invasive < test_max_non_invasive <= train_max_non_invasive):
    print("경고: 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_non_invasive = max(train_min_non_invasive, test_min_non_invasive)
    test_max_non_invasive = min(train_max_non_invasive, test_max_non_invasive)

# Grid with a step of 1/12 of the follow-up unit
times_non_invasive = np.arange(test_min_non_invasive, test_max_non_invasive, 1/12)

# Same time-range handling for the invasive data (y_train_events / y_test_events are rebound)
y_train_events = y_train_invasive[y_train_invasive['CVD사망']]
train_min_invasive, train_max_invasive = y_train_events['추적기간_연'].min(), y_train_events['추적기간_연'].max()

y_test_events = y_test_invasive[y_test_invasive['CVD사망']]
test_min_invasive, test_max_invasive = y_test_events['추적기간_연'].min(), y_test_events['추적기간_연'].max()

# If the test event-time range is not nested inside the training one, warn and clip it
if not (train_min_invasive <= test_min_invasive < test_max_invasive <= train_max_invasive):
    print("경고: 침습 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_invasive = max(train_min_invasive, test_min_invasive)
    test_max_invasive = min(train_max_invasive, test_max_invasive)

times_invasive = np.arange(test_min_invasive, test_max_invasive, 1/12)

# AUC curves and pointwise bootstrap bands for both models
cox_auc_non_invasive_cph, cox_mean_auc_non_invasive_cph, cox_conf_int_non_invasive_cph = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_cph, times_non_invasive)

cox_auc_invasive_cph, cox_mean_auc_invasive_cph, cox_conf_int_invasive_cph = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_cph, times_invasive)

# Average the lower and the upper band over the time grid for the legend.
# NOTE(review): this is the mean of the pointwise bounds, which is not the same thing as a
# bootstrap interval for the mean AUC itself — confirm this is what should be reported.
ci_non_invasive_cph = (np.mean(cox_conf_int_non_invasive_cph[0]), np.mean(cox_conf_int_non_invasive_cph[1]))
ci_invasive_cph = (np.mean(cox_conf_int_invasive_cph[0]), np.mean(cox_conf_int_invasive_cph[1]))

# Plot the results
plot_results(
    times_non_invasive, cox_auc_non_invasive_cph, cox_conf_int_non_invasive_cph,
    times_invasive, cox_auc_invasive_cph, cox_conf_int_invasive_cph,
    cox_mean_auc_non_invasive_cph, cox_mean_auc_invasive_cph,
    ci_non_invasive_cph, ci_invasive_cph
)

In [None]:
def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100):
    """Compute the time-dependent AUC curve and a pointwise bootstrap band.

    Returns ``(auc, mean_auc, conf_int)``: ``auc`` is the per-time curve from
    ``cumulative_dynamic_auc``, ``mean_auc`` is the plain average of that curve,
    and ``conf_int`` holds the 2.5th and 97.5th percentiles of the resampled
    curves at each entry of ``times``.
    """
    auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)[0]

    rng = np.random.RandomState(42)
    n_test = len(y_test)
    resampled_curves = []

    for _ in range(n_bootstraps):
        # Resample test rows with replacement
        picks = rng.randint(0, n_test, n_test)
        y_boot, pred_boot = y_test[picks], y_pred_test[picks]

        # Monthly grid spanning the resampled follow-up range
        boot_time = y_boot['추적기간_연']
        grid = np.arange(np.min(boot_time), np.max(boot_time), 1/12)

        boot_auc = cumulative_dynamic_auc(y_train, y_boot, pred_boot, grid)[0]

        # Bring the resampled curve onto the caller's time grid
        resampled_curves.append(np.interp(times, grid, boot_auc))

    # Pointwise 2.5% / 97.5% percentiles across resamples
    conf_int = np.percentile(np.array(resampled_curves), [2.5, 97.5], axis=0)
    mean_auc = np.mean(auc)

    return auc, mean_auc, conf_int

def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot both time-dependent AUC curves with shaded confidence bands.

    Each ``conf_int_*`` is indexed as [0] = lower band, [1] = upper band;
    each ``ci_*`` is a (lower, upper) pair shown in the legend next to the mean AUC.
    """
    plt.figure(figsize=(10, 6))

    def _draw(t, curve, band, colour, legend_text):
        # Line for the AUC curve plus a translucent band between the two bounds
        plt.plot(t, curve, color=colour, label=legend_text)
        plt.fill_between(t, band[0], band[1], color=colour, alpha=0.2)

    # Non-invasive model
    _draw(times_non_invasive, auc_non_invasive, conf_int_non_invasive, 'blue',
          f'비침습 모델 (Mean AUC = {mean_auc_non_invasive:.3f} (95% CI: {ci_non_invasive[0]:.3f} - {ci_non_invasive[1]:.3f}))')

    # Invasive model
    _draw(times_invasive, auc_invasive, conf_int_invasive, 'orange',
          f'침습 모델 (Mean AUC = {mean_auc_invasive:.3f} (95% CI: {ci_invasive[0]:.3f} - {ci_invasive[1]:.3f}))')

    plt.title('Time-Dependent AUC for Cox Models with 95% Confidence Interval (Test Set)')
    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.legend()
    plt.grid(True)
    plt.show()

def build_and_evaluate_model_cph(X_train, y_train, X_test, y_test):
    """Fit a Cox PH model and compute its concordance on the train and test sets.

    Returns ``(model, y_pred_train, y_pred_test, c_index_train, c_index_test)``.
    The two c-index entries are whatever ``concordance_index_censored``
    returns; callers in this notebook read element 0 of each.
    """
    model = CoxPHSurvivalAnalysis().fit(X_train, y_train)

    def _concordance(y, risk):
        # Event indicator and follow-up time are fields of the structured target
        return concordance_index_censored(y['CVD사망'], y['추적기간_연'], risk)

    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)
    c_index_train = _concordance(y_train, y_pred_train)
    c_index_test = _concordance(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, c_index_train, c_index_test

# Fit and evaluate the CPH model on the non-invasive predictors
cox_model_non_invasive_cph, y_pred_train_non_invasive_cph, y_pred_test_non_invasive_cph, c_index_train_non_invasive_cph, c_index_test_non_invasive_cph = build_and_evaluate_model_cph(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive)

# Fit and evaluate the CPH model on the invasive predictors
cox_model_invasive_cph, y_pred_train_invasive_cph, y_pred_test_invasive_cph, c_index_train_invasive_cph, c_index_test_invasive_cph = build_and_evaluate_model_cph(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive)

print(f"비침습 모델 Train - Concordance Index: {c_index_train_non_invasive_cph[0]:.4f}")
print(f"비침습 모델 Test - Concordance Index: {c_index_test_non_invasive_cph[0]:.4f}")
print(f"침습 모델 Train - Concordance Index: {c_index_train_invasive_cph[0]:.4f}")
print(f"침습 모델 Test - Concordance Index: {c_index_test_invasive_cph[0]:.4f}")

# Evaluation time grid: from the minimum to the maximum test follow-up time, step 1/12
max_follow_up_non_invasive = y_test_non_invasive['추적기간_연'].max()
min_follow_up_non_invasive = y_test_non_invasive['추적기간_연'].min()
times_non_invasive = np.arange(min_follow_up_non_invasive, max_follow_up_non_invasive, 1/12)

max_follow_up_invasive = y_test_invasive['추적기간_연'].max()
min_follow_up_invasive = y_test_invasive['추적기간_연'].min()
times_invasive = np.arange(min_follow_up_invasive, max_follow_up_invasive, 1/12)

# AUC curves and pointwise bootstrap bands for both models
cox_auc_non_invasive_cph, cox_mean_auc_non_invasive_cph, cox_conf_int_non_invasive_cph = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_cph, times_non_invasive)

cox_auc_invasive_cph, cox_mean_auc_invasive_cph, cox_conf_int_invasive_cph = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_cph, times_invasive)

# Average the lower and the upper band over the time grid for the legend.
# NOTE(review): this is the mean of the pointwise bounds, which is not the same thing as a
# bootstrap interval for the mean AUC itself — confirm this is what should be reported.
ci_non_invasive_cph = (np.mean(cox_conf_int_non_invasive_cph[0]), np.mean(cox_conf_int_non_invasive_cph[1]))
ci_invasive_cph = (np.mean(cox_conf_int_invasive_cph[0]), np.mean(cox_conf_int_invasive_cph[1]))

# Plot the results
plot_results(
    times_non_invasive, cox_auc_non_invasive_cph, cox_conf_int_non_invasive_cph,
    times_invasive, cox_auc_invasive_cph, cox_conf_int_invasive_cph,
    cox_mean_auc_non_invasive_cph, cox_mean_auc_invasive_cph,
    ci_non_invasive_cph, ci_invasive_cph
)

### CPH 성별 분할
- C index and Time dependent AUC

In [None]:
# Cox PH model, fitted separately by sex
# Column lists: covariates followed by the event and follow-up-time columns
비침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'CVD사망' ,'추적기간_연']
침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'TG' ,'LDL', 'CVD사망' ,'추적기간_연']  # HDL is not in this list
범주형_변수 = ['성별','고혈압여부' ,'당뇨병여부' ,'IPAQ_4group_mets_days']

# Work on a copy with the categorical columns cast to the category dtype
Data_temp = sampled_total.copy()
Data_temp[범주형_변수] = Data_temp[범주형_변수].astype('category')

# Column subsets for the non-invasive and invasive models
data_subset_non_invasive = Data_temp[비침습]
data_subset_invasive = Data_temp[침습]

# Rows with 성별 == 1 form the male frames; the sex column is removed afterwards
male_data_non_invasive = data_subset_non_invasive[data_subset_non_invasive['성별'] == 1].drop(columns=['성별'])
male_data_invasive = data_subset_invasive[data_subset_invasive['성별'] == 1].drop(columns=['성별'])

# Rows with 성별 == 2 form the female frames; the sex column is removed afterwards
female_data_non_invasive = data_subset_non_invasive[data_subset_non_invasive['성별'] == 2].drop(columns=['성별'])
female_data_invasive = data_subset_invasive[data_subset_invasive['성별'] == 2].drop(columns=['성별'])

# Structured survival targets (event indicator + follow-up time)
y_male_non_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=male_data_non_invasive)
y_female_non_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=female_data_non_invasive)
y_male_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=male_data_invasive)
y_female_invasive = Surv.from_dataframe(event='CVD사망', time='추적기간_연', data=female_data_invasive)

# Covariates only, with categorical columns dummy-encoded (first level dropped)
X_male_non_invasive = pd.get_dummies(
    male_data_non_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)
X_female_non_invasive = pd.get_dummies(
    female_data_non_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)
X_male_invasive = pd.get_dummies(
    male_data_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)
X_female_invasive = pd.get_dummies(
    female_data_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)

# 70/30 splits, each stratified on its own event indicator
X_train_male_non_invasive, X_test_male_non_invasive, y_train_male_non_invasive, y_test_male_non_invasive = train_test_split(
    X_male_non_invasive, y_male_non_invasive,
    test_size=0.3, random_state=42, stratify=y_male_non_invasive['CVD사망'])
X_train_female_non_invasive, X_test_female_non_invasive, y_train_female_non_invasive, y_test_female_non_invasive = train_test_split(
    X_female_non_invasive, y_female_non_invasive,
    test_size=0.3, random_state=42, stratify=y_female_non_invasive['CVD사망'])
X_train_male_invasive, X_test_male_invasive, y_train_male_invasive, y_test_male_invasive = train_test_split(
    X_male_invasive, y_male_invasive,
    test_size=0.3, random_state=42, stratify=y_male_invasive['CVD사망'])
X_train_female_invasive, X_test_female_invasive, y_train_female_invasive, y_test_female_invasive = train_test_split(
    X_female_invasive, y_female_invasive,
    test_size=0.3, random_state=42, stratify=y_female_invasive['CVD사망'])

In [None]:
def build_and_evaluate_model_cph(X_train, y_train, X_test, y_test):
    """Fit a Cox PH model and compute its concordance on the train and test sets.

    Returns ``(model, y_pred_train, y_pred_test, c_index_train, c_index_test)``.
    The two c-index entries are whatever ``concordance_index_censored``
    returns; callers in this notebook read element 0 of each.
    """
    model = CoxPHSurvivalAnalysis().fit(X_train, y_train)

    def _concordance(y, risk):
        # Event indicator and follow-up time are fields of the structured target
        return concordance_index_censored(y['CVD사망'], y['추적기간_연'], risk)

    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)
    c_index_train = _concordance(y_train, y_pred_train)
    c_index_test = _concordance(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, c_index_train, c_index_test

def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100):
    """Compute the time-dependent AUC curve and a pointwise bootstrap band.

    Returns ``(auc, mean_auc, conf_int)``: ``auc`` is the per-time curve from
    ``cumulative_dynamic_auc``, ``mean_auc`` is the plain average of that curve,
    and ``conf_int`` holds the 2.5th and 97.5th percentiles of the resampled
    curves at each entry of ``times``.
    """
    auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)[0]

    rng = np.random.RandomState(42)
    n_test = len(y_test)
    resampled_curves = []

    for _ in range(n_bootstraps):
        # Resample test rows with replacement
        picks = rng.randint(0, n_test, n_test)
        y_boot, pred_boot = y_test[picks], y_pred_test[picks]

        # Monthly grid spanning the resampled follow-up range
        boot_time = y_boot['추적기간_연']
        grid = np.arange(np.min(boot_time), np.max(boot_time), 1/12)

        boot_auc = cumulative_dynamic_auc(y_train, y_boot, pred_boot, grid)[0]

        # Bring the resampled curve onto the caller's time grid
        resampled_curves.append(np.interp(times, grid, boot_auc))

    # Pointwise 2.5% / 97.5% percentiles across resamples
    conf_int = np.percentile(np.array(resampled_curves), [2.5, 97.5], axis=0)
    mean_auc = np.mean(auc)

    return auc, mean_auc, conf_int

def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive, gender):
    """Plot non-invasive vs. invasive AUC curves for one sex with shaded bands.

    Each ``conf_int_*`` is indexed as [0] = lower band, [1] = upper band;
    each ``ci_*`` is a (lower, upper) pair shown in the legend. ``gender`` is
    a string that is capitalised and used in the figure title.
    """
    plt.figure(figsize=(12, 8))

    def _draw(t, curve, band, colour, legend_text):
        # Line for the AUC curve plus a translucent band between the two bounds
        plt.plot(t, curve, color=colour, label=legend_text)
        plt.fill_between(t, band[0], band[1], color=colour, alpha=0.2)

    # Non-invasive model
    _draw(times_non_invasive, auc_non_invasive, conf_int_non_invasive, 'blue',
          f'Non-Invasive (Mean AUC = {mean_auc_non_invasive:.3f} (95% CI: {ci_non_invasive[0]:.3f} - {ci_non_invasive[1]:.3f}))')

    # Invasive model
    _draw(times_invasive, auc_invasive, conf_int_invasive, 'orange',
          f'Invasive (Mean AUC = {mean_auc_invasive:.3f} (95% CI: {ci_invasive[0]:.3f} - {ci_invasive[1]:.3f}))')

    plt.title(f'{gender.capitalize()} Models AUC Comparison')
    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.legend()
    plt.grid(True)
    plt.show()

def evaluate_and_plot_all_models():
    """Fit the four sex-by-predictor-set Cox models and plot one AUC figure per sex.

    The train/test splits are read from module-level names of the form
    ``X_train_<gender>_<invasiveness>`` (and likewise ``y_train_``, ``X_test_``,
    ``y_test_``). All four models are evaluated first; the male figure is then
    drawn before the female one. Nothing is returned.
    """
    results = {}
    module_vars = globals()

    for gender in ('male', 'female'):
        for invasiveness in ('non_invasive', 'invasive'):
            key = f'{gender}_{invasiveness}'

            # Look up the split that belongs to this combination
            X_train = module_vars[f'X_train_{key}']
            y_train = module_vars[f'y_train_{key}']
            X_test = module_vars[f'X_test_{key}']
            y_test = module_vars[f'y_test_{key}']

            # Build and evaluate the model
            model, y_pred_train, y_pred_test, c_index_train, c_index_test = build_and_evaluate_model_cph(
                X_train, y_train, X_test, y_test)

            # Monthly grid over the test follow-up range
            follow_up = y_test['추적기간_연']
            times = np.arange(follow_up.min(), follow_up.max(), 1/12)

            # AUC curve and pointwise bootstrap band
            auc, mean_auc, conf_int = calculate_auc_and_confidence_interval(
                y_train, y_test, y_pred_test, times)

            results[key] = dict(
                model=model,
                y_pred_test=y_pred_test,
                c_index_train=c_index_train,
                c_index_test=c_index_test,
                auc=auc,
                mean_auc=mean_auc,
                conf_int=conf_int,
                times=times,
            )

    # One figure per sex, comparing the non-invasive and invasive models
    for gender in ('male', 'female'):
        non_inv = results[f'{gender}_non_invasive']
        inv = results[f'{gender}_invasive']

        # Legend interval: lower and upper bands averaged over the time grid
        ci_non_invasive = (np.mean(non_inv['conf_int'][0]), np.mean(non_inv['conf_int'][1]))
        ci_invasive = (np.mean(inv['conf_int'][0]), np.mean(inv['conf_int'][1]))

        plot_results(
            non_inv['times'], non_inv['auc'], non_inv['conf_int'],
            inv['times'], inv['auc'], inv['conf_int'],
            non_inv['mean_auc'], inv['mean_auc'],
            ci_non_invasive, ci_invasive,
            gender=gender
        )

# Run the sex-specific evaluation; this fits the models and shows the two figures
evaluate_and_plot_all_models()

In [None]:
# Model building and evaluation function
def build_and_evaluate_model_cph(X_train, y_train, X_test, y_test, label):
    """Fit a Cox proportional hazards model and evaluate it on the test split.

    Prints the train/test concordance index and the mean time-dependent AUC,
    then bootstraps the test set (100 resamples, fixed seed 42) to obtain a
    pointwise 95% band around the time-dependent AUC curve.

    Parameters
    ----------
    X_train, X_test
        Covariates passed to ``CoxPHSurvivalAnalysis.fit`` / ``predict``.
    y_train, y_test
        Survival outcomes indexable by the fields ``'CVD사망'`` (event
        indicator) and ``'추적기간_연'`` (follow-up time) and by an integer
        index array.
    label : str
        Prefix used in the printed messages.

    Returns
    -------
    tuple
        ``(times, auc, mean_auc, lower, upper, c_index_train, c_index_test)``.
        ``lower`` / ``upper`` are the 2.5th / 97.5th bootstrap percentiles per
        time point (all-NaN arrays of ``len(times)`` if every resample
        failed). ``c_index_train`` / ``c_index_test`` are the full tuples
        returned by ``concordance_index_censored``; element ``[0]`` is the
        concordance index.
    """
    # Build model
    cox_model = CoxPHSurvivalAnalysis()
    cox_model.fit(X_train, y_train)

    # Risk scores used for both the C-index and the AUC
    y_pred_train = cox_model.predict(X_train)
    y_pred_test = cox_model.predict(X_test)

    c_index_train = concordance_index_censored(y_train['CVD사망'], y_train['추적기간_연'], y_pred_train)
    c_index_test = concordance_index_censored(y_test['CVD사망'], y_test['추적기간_연'], y_pred_test)

    print(f"{label} 모델 Train - Concordance Index: {c_index_train[0]:.4f}")
    print(f"{label} 모델 Test - Concordance Index: {c_index_test[0]:.4f}")

    # Evaluation grid: steps of 1/12 spanning the test follow-up range
    # (half-open, so the maximum follow-up time itself is excluded).
    max_follow_up = y_test['추적기간_연'].max()
    min_follow_up = y_test['추적기간_연'].min()
    times = np.arange(min_follow_up, max_follow_up, 1/12)

    # Calculate AUC for the model on the test set
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)
    print(f"{label} 모델 Mean AUC: {mean_auc:.4f}")

    # Bootstrap the test set for a pointwise 95% band
    n_bootstraps = 100
    rng = np.random.RandomState(42)  # reproducible results
    bootstrapped_scores = []
    n_skipped = 0

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(X_test), len(X_test))
        try:
            score, _mean = cumulative_dynamic_auc(y_train, y_test[indices], y_pred_test[indices], times)
        except ValueError:
            # Presumably the resample's follow-up range no longer covers
            # `times`; count it instead of dropping it silently.
            n_skipped += 1
            continue
        bootstrapped_scores.append(score)

    if n_skipped:
        print(f"{label} 모델 Bootstrap: skipped {n_skipped}/{n_bootstraps} resamples (ValueError)")

    if bootstrapped_scores:
        scores = np.array(bootstrapped_scores)
        lower = np.percentile(scores, 2.5, axis=0)
        upper = np.percentile(scores, 97.5, axis=0)
    else:
        # Keep the return shape stable so callers can still index / plot.
        lower = np.full(len(times), np.nan)
        upper = np.full(len(times), np.nan)

    return times, auc, mean_auc, lower, upper, c_index_train, c_index_test

# Fit and evaluate one CPH model per sex / feature-set split. Each value is the
# tuple returned by build_and_evaluate_model_cph, stored under the same label
# that is passed to the function for its printed messages.
results_cph = {
    label: build_and_evaluate_model_cph(X_tr, y_tr, X_te, y_te, label)
    for label, X_tr, y_tr, X_te, y_te in (
        ("Male Non-Invasive",
         X_train_male_non_invasive, y_train_male_non_invasive,
         X_test_male_non_invasive, y_test_male_non_invasive),
        ("Male Invasive",
         X_train_male_invasive, y_train_male_invasive,
         X_test_male_invasive, y_test_male_invasive),
        ("Female Non-Invasive",
         X_train_female_non_invasive, y_train_female_non_invasive,
         X_test_female_non_invasive, y_test_female_non_invasive),
        ("Female Invasive",
         X_train_female_invasive, y_train_female_invasive,
         X_test_female_invasive, y_test_female_invasive),
    )
}

In [None]:
results_dict = results_cph

# Bar labels are taken from the results dict itself so that labels and values
# cannot drift apart if a model is added or reordered.
x_labels = list(results_dict)
c_index_values_train = [results_dict[key][5][0] for key in x_labels]  # Train C-index
c_index_values_test = [results_dict[key][6][0] for key in x_labels]  # Test C-index

# Plotting C-index as grouped bar plot (train vs. test)
plt.figure(figsize=(8, 6))

x = np.arange(len(x_labels))
bar_width = 0.35

bars1 = plt.bar(x - bar_width/2, c_index_values_train, bar_width, label='Train', color='skyblue')
bars2 = plt.bar(x + bar_width/2, c_index_values_test, bar_width, label='Test', color='orange')

plt.xlabel('Model Type')
plt.ylabel('C-index')
plt.title('C-index Comparison for Cox’s proportional hazard model by Feature Type')
plt.xticks(x, x_labels)
plt.ylim(0.5, 1)  # Adjust y-axis limit for better visibility
plt.legend(loc='upper right')
plt.grid(axis='y')

# Add C-index values on top of bars
for bar in (*bars1, *bars2):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, round(yval, 3), ha='center', va='bottom')

plt.show()


def _plot_auc_with_band(results, series, title):
    """Draw one figure with an AUC curve and shaded band per (key, color) pair.

    Each ``results[key]`` is read as ``(times, auc, mean_auc, lower, upper, ...)``.
    The legend reports the time-averaged lower/upper band, as the other cells
    of this notebook do (``np.mean(conf_int[0])`` / ``np.mean(conf_int[1])``).
    Previously it showed the band at the first time point only.
    NOTE(review): this is still not a bootstrap CI of the mean AUC itself.
    """
    plt.figure(figsize=(12, 6))
    for key, color in series:
        times, auc, mean_auc, lower, upper = results[key][:5]
        plt.plot(times, auc, color=color,
                 label=f'{key} (Mean AUC = {mean_auc:.3f} (95% CI: {np.mean(lower):.3f} - {np.mean(upper):.3f}))')
        plt.fill_between(times, lower, upper, color=color, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()


# Male models with band in the same figure
_plot_auc_with_band(
    results_dict,
    (("Male Non-Invasive", 'blue'), ("Male Invasive", 'green')),
    'Time-Dependent AUC for Cox’s proportional hazard model - Males',
)

# Female models with band in the same figure
_plot_auc_with_band(
    results_dict,
    (("Female Non-Invasive", 'purple'), ("Female Invasive", 'red')),
    'Time-Dependent AUC for Cox’s proportional hazard model - Females',
)

## Cox Net
- Cox’s proportional hazard model with elastic net penalty.

### CNS 전체

In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
def build_and_evaluate_model_coxnet(X_train, y_train, X_test, y_test):
    """Fit a Coxnet (elastic-net penalised Cox) model and score both splits.

    Returns ``(model, y_pred_train, y_pred_test, c_index_train, c_index_test)``
    where the two ``c_index_*`` entries are the full results of
    ``concordance_index_censored``.
    """
    def _c_index(y, risk):
        # Event indicator and follow-up time are read from the outcome fields.
        return concordance_index_censored(y['CVD사망'], y['추적기간_연'], risk)

    estimator = CoxnetSurvivalAnalysis(alpha_min_ratio=0.01, l1_ratio=0.9)
    estimator.fit(X_train, y_train)

    risk_train = estimator.predict(X_train)
    risk_test = estimator.predict(X_test)

    return (
        estimator,
        risk_train,
        risk_test,
        _c_index(y_train, risk_train),
        _c_index(y_test, risk_test),
    )

def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot the time-dependent AUC of the two Cox Net models with shaded bands.

    ``conf_int_*`` supply the lower (index 0) and upper (index 1) band per
    time point; ``ci_*`` supply the two numbers shown in the legend.
    """
    curves = (
        # (legend template, colour, times, auc, band, mean auc, legend ci)
        ('Cox net (비침습) 모델 (Mean AUC = {:.3f} (95% CI: {:.3f} - {:.3f}))', 'blue',
         times_non_invasive, auc_non_invasive, conf_int_non_invasive,
         mean_auc_non_invasive, ci_non_invasive),
        ('Cox net (침습) 모델 (Mean AUC = {:.3f} (95% CI: {:.3f} - {:.3f}))', 'orange',
         times_invasive, auc_invasive, conf_int_invasive,
         mean_auc_invasive, ci_invasive),
    )

    plt.figure(figsize=(10, 6))
    for template, colour, grid, curve, band, mean_value, ci in curves:
        legend_text = template.format(mean_value, ci[0], ci[1])
        plt.plot(grid, curve, color=colour, label=legend_text)
        plt.fill_between(grid, band[0], band[1], color=colour, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title('Time-Dependent AUC for Cox Net Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Build and evaluate the Coxnet model (non-invasive features)
coxnet_model_non_invasive, y_pred_train_non_invasive_coxnet, y_pred_test_non_invasive_coxnet, c_index_train_non_invasive_coxnet, c_index_test_non_invasive_coxnet = build_and_evaluate_model_coxnet(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive)

# Build and evaluate the Coxnet model (invasive features)
coxnet_model_invasive, y_pred_train_invasive_coxnet, y_pred_test_invasive_coxnet, c_index_train_invasive_coxnet, c_index_test_invasive_coxnet = build_and_evaluate_model_coxnet(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive)

# Element [0] of each concordance result is the concordance index itself
print(f"CoxNet 비침습 모델 Train - Concordance Index: {c_index_train_non_invasive_coxnet[0]:.4f}")
print(f"CoxNet 비침습 모델 Test - Concordance Index: {c_index_test_non_invasive_coxnet[0]:.4f}")
print(f"CoxNet 침습 모델 Train - Concordance Index: {c_index_train_invasive_coxnet[0]:.4f}")
print(f"CoxNet 침습 모델 Test - Concordance Index: {c_index_test_invasive_coxnet[0]:.4f}")

# Derive the evaluation time range from the observed event times only
# (rows where the 'CVD사망' indicator is true), for train and test separately.
y_train_events = y_train_non_invasive[y_train_non_invasive['CVD사망']]
train_min_non_invasive, train_max_non_invasive = y_train_events['추적기간_연'].min(), y_train_events['추적기간_연'].max()

y_test_events = y_test_non_invasive[y_test_non_invasive['CVD사망']]
test_min_non_invasive, test_max_non_invasive = y_test_events['추적기간_연'].min(), y_test_events['추적기간_연'].max()

# If the test event-time range is not nested inside the training range,
# print a warning and clip it to the overlap of the two ranges.
if not (train_min_non_invasive <= test_min_non_invasive < test_max_non_invasive <= train_max_non_invasive):
    print("경고: 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_non_invasive = max(train_min_non_invasive, test_min_non_invasive)
    test_max_non_invasive = min(train_max_non_invasive, test_max_non_invasive)

# Evaluation grid in steps of 1/12 (half-open: the upper bound is excluded)
times_non_invasive = np.arange(test_min_non_invasive, test_max_non_invasive, 1/12)

# Same time-range handling for the invasive data
# (y_train_events / y_test_events are reused and overwritten here)
y_train_events = y_train_invasive[y_train_invasive['CVD사망']]
train_min_invasive, train_max_invasive = y_train_events['추적기간_연'].min(), y_train_events['추적기간_연'].max()

y_test_events = y_test_invasive[y_test_invasive['CVD사망']]
test_min_invasive, test_max_invasive = y_test_events['추적기간_연'].min(), y_test_events['추적기간_연'].max()

# Warn and clip to the overlap when the ranges are not nested
if not (train_min_invasive <= test_min_invasive < test_max_invasive <= train_max_invasive):
    print("경고: 침습 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_invasive = max(train_min_invasive, test_min_invasive)
    test_max_invasive = min(train_max_invasive, test_max_invasive)

times_invasive = np.arange(test_min_invasive, test_max_invasive, 1/12)

# Compute the AUC curve, mean AUC and confidence band.
# NOTE: calculate_auc_and_confidence_interval is not defined in this cell;
# it must already exist in the notebook session when this cell runs.
coxnet_auc_non_invasive, coxnet_mean_auc_non_invasive, coxnet_conf_int_non_invasive = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_coxnet, times_non_invasive)

coxnet_auc_invasive, coxnet_mean_auc_invasive, coxnet_conf_int_invasive = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_coxnet, times_invasive)

# Average the lower ([0]) and upper ([1]) band over time for the legend text
ci_non_invasive_coxnet = (np.mean(coxnet_conf_int_non_invasive[0]), np.mean(coxnet_conf_int_non_invasive[1]))
ci_invasive_coxnet = (np.mean(coxnet_conf_int_invasive[0]), np.mean(coxnet_conf_int_invasive[1]))

# Plot the results
plot_results(
    times_non_invasive, coxnet_auc_non_invasive, coxnet_conf_int_non_invasive,
    times_invasive, coxnet_auc_invasive, coxnet_conf_int_invasive,
    coxnet_mean_auc_non_invasive, coxnet_mean_auc_invasive,
    ci_non_invasive_coxnet, ci_invasive_coxnet
)

In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
def build_and_evaluate_model_coxnet(X_train, y_train, X_test, y_test):
    """Fit a Coxnet (elastic-net penalised Cox) model and score both splits.

    Returns ``(model, y_pred_train, y_pred_test, c_index_train, c_index_test)``
    where the two ``c_index_*`` entries are the full results of
    ``concordance_index_censored``.
    """
    def _c_index(y, risk):
        # Event indicator and follow-up time are read from the outcome fields.
        return concordance_index_censored(y['CVD사망'], y['추적기간_연'], risk)

    estimator = CoxnetSurvivalAnalysis(alpha_min_ratio=0.01, l1_ratio=0.9)
    estimator.fit(X_train, y_train)

    risk_train = estimator.predict(X_train)
    risk_test = estimator.predict(X_test)

    return (
        estimator,
        risk_train,
        risk_test,
        _c_index(y_train, risk_train),
        _c_index(y_test, risk_test),
    )

def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot the time-dependent AUC of the two Cox Net models with shaded bands.

    ``conf_int_*`` supply the lower (index 0) and upper (index 1) band per
    time point; ``ci_*`` supply the two numbers shown in the legend.
    """
    curves = (
        # (legend template, colour, times, auc, band, mean auc, legend ci)
        ('Cox net (비침습) 모델 (Mean AUC = {:.3f} (95% CI: {:.3f} - {:.3f}))', 'blue',
         times_non_invasive, auc_non_invasive, conf_int_non_invasive,
         mean_auc_non_invasive, ci_non_invasive),
        ('Cox net (침습) 모델 (Mean AUC = {:.3f} (95% CI: {:.3f} - {:.3f}))', 'orange',
         times_invasive, auc_invasive, conf_int_invasive,
         mean_auc_invasive, ci_invasive),
    )

    plt.figure(figsize=(10, 6))
    for template, colour, grid, curve, band, mean_value, ci in curves:
        legend_text = template.format(mean_value, ci[0], ci[1])
        plt.plot(grid, curve, color=colour, label=legend_text)
        plt.fill_between(grid, band[0], band[1], color=colour, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title('Time-Dependent AUC for Cox Net Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Build and evaluate the Coxnet model (non-invasive features)
coxnet_model_non_invasive, y_pred_train_non_invasive_coxnet, y_pred_test_non_invasive_coxnet, c_index_train_non_invasive_coxnet, c_index_test_non_invasive_coxnet = build_and_evaluate_model_coxnet(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive)

# Build and evaluate the Coxnet model (invasive features)
coxnet_model_invasive, y_pred_train_invasive_coxnet, y_pred_test_invasive_coxnet, c_index_train_invasive_coxnet, c_index_test_invasive_coxnet = build_and_evaluate_model_coxnet(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive)

# Element [0] of each concordance result is the concordance index itself
print(f"CoxNet 비침습 모델 Train - Concordance Index: {c_index_train_non_invasive_coxnet[0]:.4f}")
print(f"CoxNet 비침습 모델 Test - Concordance Index: {c_index_test_non_invasive_coxnet[0]:.4f}")
print(f"CoxNet 침습 모델 Train - Concordance Index: {c_index_train_invasive_coxnet[0]:.4f}")
print(f"CoxNet 침습 모델 Test - Concordance Index: {c_index_test_invasive_coxnet[0]:.4f}")

# Evaluation grid: steps of 1/12 over the full test follow-up range
# (all rows, not only events; half-open, so the maximum is excluded)
max_follow_up_non_invasive = y_test_non_invasive['추적기간_연'].max()
min_follow_up_non_invasive = y_test_non_invasive['추적기간_연'].min()
times_non_invasive = np.arange(min_follow_up_non_invasive, max_follow_up_non_invasive, 1/12)

max_follow_up_invasive = y_test_invasive['추적기간_연'].max()
min_follow_up_invasive = y_test_invasive['추적기간_연'].min()
times_invasive = np.arange(min_follow_up_invasive, max_follow_up_invasive, 1/12)

# Compute the AUC curve, mean AUC and confidence band.
# NOTE: calculate_auc_and_confidence_interval is not defined in this cell;
# it must already exist in the notebook session when this cell runs.
coxnet_auc_non_invasive, coxnet_mean_auc_non_invasive, coxnet_conf_int_non_invasive = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_coxnet, times_non_invasive)

coxnet_auc_invasive, coxnet_mean_auc_invasive, coxnet_conf_int_invasive = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_coxnet, times_invasive)

# Average the lower ([0]) and upper ([1]) band over time for the legend text
ci_non_invasive_coxnet = (np.mean(coxnet_conf_int_non_invasive[0]), np.mean(coxnet_conf_int_non_invasive[1]))
ci_invasive_coxnet = (np.mean(coxnet_conf_int_invasive[0]), np.mean(coxnet_conf_int_invasive[1]))

# Plot the results
plot_results(
    times_non_invasive, coxnet_auc_non_invasive, coxnet_conf_int_non_invasive,
    times_invasive, coxnet_auc_invasive, coxnet_conf_int_invasive,
    coxnet_mean_auc_non_invasive, coxnet_mean_auc_invasive,
    ci_non_invasive_coxnet, ci_invasive_coxnet
)

In [None]:
# Generic fit-and-score helper (usable with any estimator offering fit/predict)
def build_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit *model* in place and return its predictions and concordance indices.

    The returned dict has the keys ``model``, ``y_pred_train``,
    ``y_pred_test``, ``c_index_train`` and ``c_index_test``; the two C-index
    entries hold element ``[0]`` of the ``concordance_index_censored`` result.
    """
    def _c_index(y, risk):
        return concordance_index_censored(y['CVD사망'], y['추적기간_연'], risk)[0]

    model.fit(X_train, y_train)

    outcome = {"model": model}
    outcome["y_pred_train"] = model.predict(X_train)
    outcome["y_pred_test"] = model.predict(X_test)
    outcome["c_index_train"] = _c_index(y_train, outcome["y_pred_train"])
    outcome["c_index_test"] = _c_index(y_test, outcome["y_pred_test"])
    return outcome

# AUC and confidence-band helper
def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100, random_state=42):
    """Compute the time-dependent AUC on the test set plus a bootstrap band.

    Parameters
    ----------
    y_train, y_test
        Survival outcomes passed to ``cumulative_dynamic_auc``; ``y_test``
        must be indexable by the field ``'추적기간_연'`` (follow-up time) and
        by an integer index array.
    y_pred_test
        Risk scores for the test rows, indexable by an integer index array.
    times
        Time points at which the AUC is evaluated.
    n_bootstraps : int
        Number of bootstrap resamples of the test set.
    random_state : int
        Seed of the resampling generator (default 42, the value that was
        previously hard-coded).

    Returns
    -------
    tuple
        ``(auc, mean_auc, conf_int)``. ``mean_auc`` is the summary value
        returned by ``cumulative_dynamic_auc`` — the same quantity the other
        evaluation functions in this notebook report as "Mean AUC" — rather
        than an unweighted ``np.mean(auc)``. ``conf_int`` stacks the 2.5th
        (row 0) and 97.5th (row 1) bootstrap percentiles per entry of *times*.
    """
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)

    rng = np.random.RandomState(random_state)
    bootstrapped_auc = []

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(y_test), len(y_test))
        y_test_bootstrap = y_test[indices]
        y_pred_test_bootstrap = y_pred_test[indices]

        # Each resample gets its own 1/12-step grid inside its follow-up range ...
        follow_up = y_test_bootstrap['추적기간_연']
        times_bootstrap = np.arange(np.min(follow_up), np.max(follow_up), 1/12)
        auc_bootstrap, _mean = cumulative_dynamic_auc(y_train, y_test_bootstrap, y_pred_test_bootstrap, times_bootstrap)

        # ... and is then interpolated back onto the caller's grid so that
        # percentiles can be taken per time point.
        bootstrapped_auc.append(np.interp(times, times_bootstrap, auc_bootstrap))

    conf_int = np.percentile(np.array(bootstrapped_auc), [2.5, 97.5], axis=0)

    return auc, mean_auc, conf_int

# Result plotting helper
def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot the time-dependent AUC of both models with shaded bands.

    ``conf_int_*`` supply the lower (index 0) and upper (index 1) band per
    time point; ``ci_*`` supply the two numbers shown in the legend.
    """
    curves = (
        # (legend template, colour, times, auc, band, mean auc, legend ci)
        ('비침습 모델 (Mean AUC = {:.3f} (95% CI: {:.3f} - {:.3f}))', 'blue',
         times_non_invasive, auc_non_invasive, conf_int_non_invasive,
         mean_auc_non_invasive, ci_non_invasive),
        ('침습 모델 (Mean AUC = {:.3f} (95% CI: {:.3f} - {:.3f}))', 'orange',
         times_invasive, auc_invasive, conf_int_invasive,
         mean_auc_invasive, ci_invasive),
    )

    plt.figure(figsize=(10, 6))
    for template, colour, grid, curve, band, mean_value, ci in curves:
        legend_text = template.format(mean_value, ci[0], ci[1])
        plt.plot(grid, curve, color=colour, label=legend_text)
        plt.fill_between(grid, band[0], band[1], color=colour, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title('Time-Dependent AUC for Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Collect one model's evaluation results in a dictionary
def evaluate_and_store_results(model_name, X_train, y_train, X_test, y_test, times, model=None):
    """Fit and evaluate a survival model, returning its metrics as a dict.

    Parameters
    ----------
    model_name : str
        Stored verbatim under ``"model_name"``.
    X_train, y_train, X_test, y_test
        Train/test data forwarded to ``build_and_evaluate_model``.
    times
        Time points forwarded to ``calculate_auc_and_confidence_interval``.
    model : estimator, optional
        Estimator to fit. When omitted, a
        ``CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01)`` is
        created, as before; passing one makes the helper usable for other
        models without editing it.

    Returns
    -------
    dict
        Keys ``model_name``, ``c_index_train``, ``c_index_test``,
        ``mean_auc``, ``ci_mean`` (time-averaged lower/upper band), ``auc``,
        ``conf_int`` and ``times``.
    """
    if model is None:
        model = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01)

    # Fit the model and compute the concordance indices
    results = build_and_evaluate_model(model, X_train, y_train, X_test, y_test)

    # AUC curve and bootstrap band on the test split
    auc, mean_auc, conf_int = calculate_auc_and_confidence_interval(y_train, y_test, results["y_pred_test"], times)
    ci_mean = (np.mean(conf_int[0]), np.mean(conf_int[1]))

    return {
        "model_name": model_name,
        "c_index_train": results["c_index_train"],
        "c_index_test": results["c_index_test"],
        "mean_auc": mean_auc,
        "ci_mean": ci_mean,
        "auc": auc,
        "conf_int": conf_int,
        "times": times,
    }

In [None]:
# Model building and evaluation function (Coxnet, all subjects)
def build_and_evaluate_model_cns(X_train, y_train, X_test, y_test, label):
    """Fit a Coxnet (elastic-net penalised Cox) model and evaluate it.

    Prints the train/test concordance index and the mean time-dependent AUC,
    then bootstraps the test set (100 resamples, fixed seed 42) to obtain a
    pointwise 95% band around the time-dependent AUC curve.

    Parameters
    ----------
    X_train, X_test
        Covariates passed to ``CoxnetSurvivalAnalysis.fit`` / ``predict``.
    y_train, y_test
        Survival outcomes indexable by the fields ``'CVD사망'`` (event
        indicator) and ``'추적기간_연'`` (follow-up time) and by an integer
        index array.
    label : str
        Prefix used in the printed messages.

    Returns
    -------
    tuple
        ``(times, auc, mean_auc, lower, upper, c_index_train, c_index_test)``.
        ``lower`` / ``upper`` are the 2.5th / 97.5th bootstrap percentiles per
        time point (all-NaN arrays of ``len(times)`` if every resample
        failed). ``c_index_train`` / ``c_index_test`` are the full tuples
        returned by ``concordance_index_censored``; element ``[0]`` is the
        concordance index.
    """
    # Build model
    cox_net = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01)
    cox_net.fit(X_train, y_train)

    # Risk scores used for both the C-index and the AUC
    y_pred_train = cox_net.predict(X_train)
    y_pred_test = cox_net.predict(X_test)

    c_index_train = concordance_index_censored(y_train['CVD사망'], y_train['추적기간_연'], y_pred_train)
    c_index_test = concordance_index_censored(y_test['CVD사망'], y_test['추적기간_연'], y_pred_test)

    print(f"{label} 모델 Train - Concordance Index: {c_index_train[0]:.4f}")
    print(f"{label} 모델 Test - Concordance Index: {c_index_test[0]:.4f}")

    # Evaluation grid: steps of 1/12 spanning the test follow-up range
    # (half-open, so the maximum follow-up time itself is excluded).
    max_follow_up = y_test['추적기간_연'].max()
    min_follow_up = y_test['추적기간_연'].min()
    times = np.arange(min_follow_up, max_follow_up, 1/12)

    # AUC of the model on the test set
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)
    print(f"{label} 모델 Mean AUC: {mean_auc:.4f}")

    # Bootstrap the test set for a pointwise 95% band
    n_bootstraps = 100
    rng = np.random.RandomState(42)  # reproducible results
    bootstrapped_scores = []
    n_skipped = 0

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(X_test), len(X_test))
        try:
            score, _mean = cumulative_dynamic_auc(y_train, y_test[indices], y_pred_test[indices], times)
        except ValueError:
            # Presumably the resample's follow-up range no longer covers
            # `times`; count it instead of dropping it silently.
            n_skipped += 1
            continue
        bootstrapped_scores.append(score)

    if n_skipped:
        print(f"{label} 모델 Bootstrap: skipped {n_skipped}/{n_bootstraps} resamples (ValueError)")

    if bootstrapped_scores:
        scores = np.array(bootstrapped_scores)
        lower = np.percentile(scores, 2.5, axis=0)
        upper = np.percentile(scores, 97.5, axis=0)
    else:
        # Keep the return shape stable so callers can still index / plot.
        lower = np.full(len(times), np.nan)
        upper = np.full(len(times), np.nan)

    return times, auc, mean_auc, lower, upper, c_index_train, c_index_test

# Fit and evaluate one Coxnet model per feature set (all subjects). Each value
# is the tuple returned by build_and_evaluate_model_cns, stored under the same
# label that is passed to the function for its printed messages.
results_cns_all = {
    label: build_and_evaluate_model_cns(X_tr, y_tr, X_te, y_te, label)
    for label, X_tr, y_tr, X_te, y_te in (
        ("Non-Invasive",
         X_train_non_invasive, y_train_non_invasive,
         X_test_non_invasive, y_test_non_invasive),
        ("Invasive",
         X_train_invasive, y_train_invasive,
         X_test_invasive, y_test_invasive),
    )
}

In [None]:
results_dict = results_cns_all

# Bar labels are taken from the results dict itself so that labels and values
# cannot drift apart if a model is added or reordered.
x_labels = list(results_dict)
c_index_values_train = [results_dict[key][5][0] for key in x_labels]  # Train C-index
c_index_values_test = [results_dict[key][6][0] for key in x_labels]  # Test C-index

# Plotting C-index as grouped bar plot (train vs. test)
plt.figure(figsize=(8, 6))

x = np.arange(len(x_labels))
bar_width = 0.35

bars1 = plt.bar(x - bar_width/2, c_index_values_train, bar_width, label='Train', color='skyblue')
bars2 = plt.bar(x + bar_width/2, c_index_values_test, bar_width, label='Test', color='orange')

plt.xlabel('Model Type')
plt.ylabel('C-index')
plt.title('C-index Comparison for Cox’s proportional hazard model with elastic net by Feature Type')
plt.xticks(x, x_labels)
plt.ylim(0.5, 1)  # Adjust y-axis limit for better visibility
plt.legend()
plt.grid(axis='y')

# Display C-index values on top of the bars
for bar in (*bars1, *bars2):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, round(yval, 3), ha='center', va='bottom')

plt.show()


def _plot_auc_with_band(results, series, title):
    """Draw one figure with an AUC curve and shaded band per (key, color) pair.

    Each ``results[key]`` is read as ``(times, auc, mean_auc, lower, upper, ...)``.
    The legend reports the time-averaged lower/upper band, as the other cells
    of this notebook do (``np.mean(conf_int[0])`` / ``np.mean(conf_int[1])``).
    Previously it showed the band at the first time point only.
    NOTE(review): this is still not a bootstrap CI of the mean AUC itself.
    """
    plt.figure(figsize=(12, 6))
    for key, color in series:
        times, auc, mean_auc, lower, upper = results[key][:5]
        plt.plot(times, auc, color=color,
                 label=f'{key} (Mean AUC = {mean_auc:.3f} (95% CI: {np.mean(lower):.3f} - {np.mean(upper):.3f}))')
        plt.fill_between(times, lower, upper, color=color, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()


# Both feature sets with band in the same figure
_plot_auc_with_band(
    results_dict,
    (("Non-Invasive", 'blue'), ("Invasive", 'green')),
    'Time-Dependent AUC for Cox’s proportional hazard model with elastic net - All Subjects',
)

### CNS 성별 분할

In [None]:
# Variable lists: covariates plus the outcome columns ('CVD사망' event
# indicator, '추적기간_연' follow-up time). The invasive set adds 'TG' and 'LDL'.
비침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'CVD사망' ,'추적기간_연']
침습 = ['나이' , 'WHtR' , '성별' , '고혈압여부','당뇨병여부', 'IPAQ_4group_mets_days', 'TG' ,'LDL', 'CVD사망' ,'추적기간_연'] # HDL is not in the list (NOTE(review): presumably deliberate — confirm)
범주형_변수 = ['성별','고혈압여부' ,'당뇨병여부' ,'IPAQ_4group_mets_days']

# Convert categorical variables to category type (on a copy, so that
# sampled_total itself is left untouched)
Data_temp = sampled_total.copy()
Data_temp[범주형_변수] = Data_temp[범주형_변수].astype('category')

# Non-invasive variable data
data_subset_non_invasive = Data_temp[비침습]
# Invasive variable data
data_subset_invasive = Data_temp[침습]

# Extract male data (성별 == 1)
male_data_non_invasive = data_subset_non_invasive[data_subset_non_invasive['성별'] == 1].copy()
male_data_invasive = data_subset_invasive[data_subset_invasive['성별'] == 1].copy()

# Extract female data (성별 == 2)
female_data_non_invasive = data_subset_non_invasive[data_subset_non_invasive['성별'] == 2].copy()
female_data_invasive = data_subset_invasive[data_subset_invasive['성별'] == 2].copy()

# Remove gender column (constant within each sex-specific subset)
male_data_non_invasive.drop(columns=['성별'], inplace=True)
female_data_non_invasive.drop(columns=['성별'], inplace=True)
male_data_invasive.drop(columns=['성별'], inplace=True)
female_data_invasive.drop(columns=['성별'], inplace=True)

# Create Surv objects (event column first, time column second)
y_male_non_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', male_data_non_invasive)
y_female_non_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', female_data_non_invasive)
y_male_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', male_data_invasive)
y_female_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', female_data_invasive)

# Extract covariates (outcome columns removed) and dummy encode; drop_first
# drops the first level of each categorical variable
X_male_non_invasive = pd.get_dummies(male_data_non_invasive.drop(['CVD사망', '추적기간_연'], axis=1), drop_first=True)
X_female_non_invasive = pd.get_dummies(female_data_non_invasive.drop(['CVD사망', '추적기간_연'], axis=1), drop_first=True)
X_male_invasive = pd.get_dummies(male_data_invasive.drop(['CVD사망', '추적기간_연'], axis=1), drop_first=True)
X_female_invasive = pd.get_dummies(female_data_invasive.drop(['CVD사망', '추적기간_연'], axis=1), drop_first=True)

# Stratified 70/30 data split: stratifying on the event indicator keeps the
# event rate comparable between train and test; fixed random_state for
# reproducibility
X_train_male_non_invasive, X_test_male_non_invasive, y_train_male_non_invasive, y_test_male_non_invasive = train_test_split(
    X_male_non_invasive, y_male_non_invasive, test_size=0.3, stratify=y_male_non_invasive['CVD사망'], random_state=42)
X_train_female_non_invasive, X_test_female_non_invasive, y_train_female_non_invasive, y_test_female_non_invasive = train_test_split(
    X_female_non_invasive, y_female_non_invasive, test_size=0.3, stratify=y_female_non_invasive['CVD사망'], random_state=42)
X_train_male_invasive, X_test_male_invasive, y_train_male_invasive, y_test_male_invasive = train_test_split(
    X_male_invasive, y_male_invasive, test_size=0.3, stratify=y_male_invasive['CVD사망'], random_state=42)
X_train_female_invasive, X_test_female_invasive, y_train_female_invasive, y_test_female_invasive = train_test_split(
    X_female_invasive, y_female_invasive, test_size=0.3, stratify=y_female_invasive['CVD사망'], random_state=42)

In [None]:
# Model building and evaluation function (Coxnet, per-sex splits)
def build_and_evaluate_model_cns(X_train, y_train, X_test, y_test, label):
    """Fit a Coxnet (elastic-net penalised Cox) model and evaluate it.

    Prints the train/test concordance index and the mean time-dependent AUC,
    then bootstraps the test set (100 resamples, fixed seed 42) to obtain a
    pointwise 95% band around the time-dependent AUC curve.

    Parameters
    ----------
    X_train, X_test
        Covariates passed to ``CoxnetSurvivalAnalysis.fit`` / ``predict``.
    y_train, y_test
        Survival outcomes indexable by the fields ``'CVD사망'`` (event
        indicator) and ``'추적기간_연'`` (follow-up time) and by an integer
        index array.
    label : str
        Prefix used in the printed messages.

    Returns
    -------
    tuple
        ``(times, auc, mean_auc, lower, upper, c_index_train, c_index_test)``.
        ``lower`` / ``upper`` are the 2.5th / 97.5th bootstrap percentiles per
        time point (all-NaN arrays of ``len(times)`` if every resample
        failed). ``c_index_train`` / ``c_index_test`` are the full tuples
        returned by ``concordance_index_censored``; element ``[0]`` is the
        concordance index.
    """
    # Build model
    cox_net = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01)
    cox_net.fit(X_train, y_train)

    # Risk scores used for both the C-index and the AUC
    y_pred_train = cox_net.predict(X_train)
    y_pred_test = cox_net.predict(X_test)

    c_index_train = concordance_index_censored(y_train['CVD사망'], y_train['추적기간_연'], y_pred_train)
    c_index_test = concordance_index_censored(y_test['CVD사망'], y_test['추적기간_연'], y_pred_test)

    print(f"{label} 모델 Train - Concordance Index: {c_index_train[0]:.4f}")
    print(f"{label} 모델 Test - Concordance Index: {c_index_test[0]:.4f}")

    # Evaluation grid: steps of 1/12 spanning the test follow-up range
    # (half-open, so the maximum follow-up time itself is excluded).
    max_follow_up = y_test['추적기간_연'].max()
    min_follow_up = y_test['추적기간_연'].min()
    times = np.arange(min_follow_up, max_follow_up, 1/12)

    # Calculate AUC for the model on the test set
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)
    print(f"{label} 모델 Mean AUC: {mean_auc:.4f}")

    # Bootstrap the test set for a pointwise 95% band
    n_bootstraps = 100
    rng = np.random.RandomState(42)  # reproducible results
    bootstrapped_scores = []
    n_skipped = 0

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(X_test), len(X_test))
        try:
            score, _mean = cumulative_dynamic_auc(y_train, y_test[indices], y_pred_test[indices], times)
        except ValueError:
            # Presumably the resample's follow-up range no longer covers
            # `times`; count it instead of dropping it silently.
            n_skipped += 1
            continue
        bootstrapped_scores.append(score)

    if n_skipped:
        print(f"{label} 모델 Bootstrap: skipped {n_skipped}/{n_bootstraps} resamples (ValueError)")

    if bootstrapped_scores:
        scores = np.array(bootstrapped_scores)
        lower = np.percentile(scores, 2.5, axis=0)
        upper = np.percentile(scores, 97.5, axis=0)
    else:
        # Keep the return shape stable so callers can still index / plot.
        lower = np.full(len(times), np.nan)
        upper = np.full(len(times), np.nan)

    return times, auc, mean_auc, lower, upper, c_index_train, c_index_test

# Fit and evaluate one Coxnet model per sex / feature-set split. Each value is
# the tuple returned by build_and_evaluate_model_cns, stored under the same
# label that is passed to the function for its printed messages.
results_cns = {
    label: build_and_evaluate_model_cns(X_tr, y_tr, X_te, y_te, label)
    for label, X_tr, y_tr, X_te, y_te in (
        ("Male Non-Invasive",
         X_train_male_non_invasive, y_train_male_non_invasive,
         X_test_male_non_invasive, y_test_male_non_invasive),
        ("Male Invasive",
         X_train_male_invasive, y_train_male_invasive,
         X_test_male_invasive, y_test_male_invasive),
        ("Female Non-Invasive",
         X_train_female_non_invasive, y_train_female_non_invasive,
         X_test_female_non_invasive, y_test_female_non_invasive),
        ("Female Invasive",
         X_train_female_invasive, y_train_female_invasive,
         X_test_female_invasive, y_test_female_invasive),
    )
}

In [None]:
# Extract C-index values for plotting
results_dict = results_cns

# Bar labels are taken from the results dict itself so that labels and values
# cannot drift apart if a model is added or reordered.
x_labels = list(results_dict)
c_index_values_train = [results_dict[key][5][0] for key in x_labels]  # Train C-index
c_index_values_test = [results_dict[key][6][0] for key in x_labels]  # Test C-index

# Plotting C-index as grouped bar plot (train vs. test)
plt.figure(figsize=(8, 6))

x = np.arange(len(x_labels))
bar_width = 0.35

bars1 = plt.bar(x - bar_width/2, c_index_values_train, bar_width, label='Train', color='skyblue')
bars2 = plt.bar(x + bar_width/2, c_index_values_test, bar_width, label='Test', color='orange')

plt.xlabel('Model Type')
plt.ylabel('C-index')
plt.title('C-index Comparison for Cox’s proportional hazard model with elastic net by Feature Type')
plt.xticks(x, x_labels)
plt.ylim(0.5, 1)  # Adjust y-axis limit for better visibility
plt.legend(loc='upper right')
plt.grid(axis='y')

# Add C-index values on top of bars
for bar in (*bars1, *bars2):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, round(yval, 3), ha='center', va='bottom')

plt.show()


def _plot_auc_with_band(results, series, title):
    """Draw one figure with an AUC curve and shaded band per (key, color) pair.

    Each ``results[key]`` is read as ``(times, auc, mean_auc, lower, upper, ...)``.
    The legend reports the time-averaged lower/upper band, as the other cells
    of this notebook do (``np.mean(conf_int[0])`` / ``np.mean(conf_int[1])``).
    Previously it showed the band at the first time point only.
    NOTE(review): this is still not a bootstrap CI of the mean AUC itself.
    """
    plt.figure(figsize=(12, 6))
    for key, color in series:
        times, auc, mean_auc, lower, upper = results[key][:5]
        plt.plot(times, auc, color=color,
                 label=f'{key} (Mean AUC = {mean_auc:.3f} (95% CI: {np.mean(lower):.3f} - {np.mean(upper):.3f}))')
        plt.fill_between(times, lower, upper, color=color, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()


# Per-model aliases are kept in case other cells refer to them.
male_noninvasive_data = results_dict["Male Non-Invasive"]
male_invasive_data = results_dict["Male Invasive"]
female_noninvasive_data = results_dict["Female Non-Invasive"]
female_invasive_data = results_dict["Female Invasive"]

# Male models with band in the same figure
_plot_auc_with_band(
    results_dict,
    (("Male Non-Invasive", 'blue'), ("Male Invasive", 'green')),
    'Time-Dependent AUC for Cox’s proportional hazard model with elastic net  - Males',
)

# Female models with band in the same figure
_plot_auc_with_band(
    results_dict,
    (("Female Non-Invasive", 'purple'), ("Female Invasive", 'red')),
    'Time-Dependent AUC for Cox’s proportional hazard model with elastic net - Females',
)

## FksSVM
- Fast Kernel Survival Support Vector Machine
- 스케일링 여부에 따라 수행 속도 차이가 매우 큼 → 스케일링 적용

In [None]:
from sksurv.svm import FastKernelSurvivalSVM

In [None]:
# Column sets: predictors first, then the event flag and the follow-up time
비침습 = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days', 'CVD사망', '추적기간_연']
침습 = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
      'TG', 'LDL', 'CVD사망', '추적기간_연']  # HDL is not part of this set
범주형_변수 = ['성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']

# Cast the categorical predictors on a copy so sampled_total itself is left as is
Data_temp = sampled_total.copy()
Data_temp[범주형_변수] = Data_temp[범주형_변수].astype('category')

# Non-invasive and invasive views of the data
data_subset_non_invasive, data_subset_invasive = Data_temp[비침습], Data_temp[침습]

# Structured outcome arrays built from the event and follow-up columns
y_non_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', data_subset_non_invasive)
y_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', data_subset_invasive)

# Covariates: drop the outcome columns, then one-hot encode (first level dropped)
X_non_invasive = pd.get_dummies(
    data_subset_non_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)
X_invasive = pd.get_dummies(
    data_subset_invasive.drop(columns=['CVD사망', '추적기간_연']), drop_first=True)

# 70/30 split, stratified on the event flag
(X_train_non_invasive, X_test_non_invasive,
 y_train_non_invasive, y_test_non_invasive) = train_test_split(
    X_non_invasive, y_non_invasive,
    test_size=0.3, stratify=y_non_invasive['CVD사망'], random_state=42)
(X_train_invasive, X_test_invasive,
 y_train_invasive, y_test_invasive) = train_test_split(
    X_invasive, y_invasive,
    test_size=0.3, stratify=y_invasive['CVD사망'], random_state=42)

# Standardise: each scaler is fitted on its training split only
scaler_non_invasive = StandardScaler()
scaler_invasive = StandardScaler()

scaler_non_invasive.fit(X_train_non_invasive)
X_train_non_invasive_scaled = scaler_non_invasive.transform(X_train_non_invasive)
X_test_non_invasive_scaled = scaler_non_invasive.transform(X_test_non_invasive)

scaler_invasive.fit(X_train_invasive)
X_train_invasive_scaled = scaler_invasive.transform(X_train_invasive)
X_test_invasive_scaled = scaler_invasive.transform(X_test_invasive)

### FksSVM 전체

In [None]:
from sksurv.kernels import clinical_kernel

# Precompute the clinical kernel on the unscaled training features
kernel_matrix = clinical_kernel(X_train_non_invasive)

# Author's note from the original cell: clinical_kernel c-index 0.818
kssvm = FastKernelSurvivalSVM(kernel="precomputed", optimizer="rbtree", random_state=0)
kssvm.fit(kernel_matrix, y_train_non_invasive)
model = kssvm

# Training-set C-index; the bare expression on the last line is the cell output
y_pred_train = model.predict(kernel_matrix)
c_index_train = concordance_index_censored(
    y_train_non_invasive['CVD사망'],
    y_train_non_invasive['추적기간_연'],
    y_pred_train,
)
c_index_train

In [None]:
def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100):
    """Compute time-dependent AUC on the test set with a pointwise bootstrap band.

    Args:
        y_train: structured survival array passed to cumulative_dynamic_auc as
            the training survival data.
        y_test: structured survival array of the test set; its '추적기간_연'
            field is read to find each resample's follow-up range.
        y_pred_test: risk scores for the test samples, in the same order as y_test.
        times: time points at which AUC is evaluated.
        n_bootstraps: number of bootstrap resamples of the test set.

    Returns:
        (auc, mean_auc, conf_int): auc is the AUC at each entry of times,
        mean_auc is the unweighted np.mean of auc, and conf_int has shape
        (2, len(times)) with the 2.5th and 97.5th bootstrap percentiles.
    """
    times = np.asarray(times, dtype=float)
    y_pred_test = np.asarray(y_pred_test)
    auc, _ = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)

    rng = np.random.RandomState(42)  # fixed seed keeps the band reproducible
    n_samples = len(y_test)
    # One row per resample; entries stay NaN where a resample has no usable AUC.
    bootstrapped_auc = np.full((n_bootstraps, times.shape[0]), np.nan)

    for b in range(n_bootstraps):
        indices = rng.randint(0, n_samples, n_samples)
        y_test_bootstrap = y_test[indices]
        y_pred_test_bootstrap = y_pred_test[indices]

        # Score the resample on the requested grid directly rather than on a
        # shifted per-resample grid followed by np.interp. Only the time points
        # inside [min, max) of the resample's follow-up are evaluated.
        follow_up = y_test_bootstrap['추적기간_연']
        in_range = (times >= follow_up.min()) & (times < follow_up.max())
        if not in_range.any():
            continue

        try:
            auc_bootstrap, _ = cumulative_dynamic_auc(
                y_train, y_test_bootstrap, y_pred_test_bootstrap, times[in_range])
        except ValueError:
            # Resample incompatible with the estimator; its row stays NaN.
            continue
        bootstrapped_auc[b, in_range] = auc_bootstrap

    # NaN-aware percentiles: one undefined resample no longer wipes out the
    # interval at that time point.
    conf_int = np.nanpercentile(bootstrapped_auc, [2.5, 97.5], axis=0)
    mean_auc = np.mean(auc)

    return auc, mean_auc, conf_int

def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot the FksSVM non-invasive and invasive AUC curves with shaded bands.

    Args:
        times_non_invasive, times_invasive: x values for each curve.
        auc_non_invasive, auc_invasive: AUC value at each time point.
        conf_int_non_invasive, conf_int_invasive: indexable pairs whose element
            [0] is the lower band and [1] the upper band.
        mean_auc_non_invasive, mean_auc_invasive: scalars shown in the legend.
        ci_non_invasive, ci_invasive: (low, high) scalars shown in the legend.
    """
    plt.figure(figsize=(10, 6))

    # Non-invasive model
    plt.plot(times_non_invasive, auc_non_invasive, color='blue',
             label=f'비침습 모델 (Mean AUC = {mean_auc_non_invasive:.3f} (95% CI: {ci_non_invasive[0]:.3f} - {ci_non_invasive[1]:.3f}))')
    plt.fill_between(times_non_invasive, conf_int_non_invasive[0], conf_int_non_invasive[1], color='blue', alpha=0.2)

    # Invasive model
    plt.plot(times_invasive, auc_invasive, color='orange',
             label=f'침습 모델 (Mean AUC = {mean_auc_invasive:.3f} (95% CI: {ci_invasive[0]:.3f} - {ci_invasive[1]:.3f}))')
    plt.fill_between(times_invasive, conf_int_invasive[0], conf_int_invasive[1], color='orange', alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    # This cell plots the FksSVM results; the title previously read "Cox Models".
    plt.title('Time-Dependent AUC for FksSVM Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
from sksurv.svm import FastKernelSurvivalSVM

def build_and_evaluate_model_fkssvm(X_train, y_train, X_test, y_test):
    """Fit an RBF-kernel FastKernelSurvivalSVM and score it on both splits.

    Returns:
        (model, y_pred_train, y_pred_test, c_index_train, c_index_test), where
        each c_index is the full result of concordance_index_censored
        (callers read element [0]).
    """
    def _c_index(outcome, risk):
        # Concordance between predicted risk and the observed (event, time) pairs
        return concordance_index_censored(outcome['CVD사망'], outcome['추적기간_연'], risk)

    # Original note beside these settings: alpha=1, gamma=None
    # (presumably the library defaults; confirm)
    svm = FastKernelSurvivalSVM(
        kernel='rbf',
        optimizer='rbtree',
        alpha=1024,
        gamma=0.01,
        max_iter=20,
        random_state=42,
    )
    svm.fit(X_train, y_train)

    risk_train = svm.predict(X_train)
    risk_test = svm.predict(X_test)

    return svm, risk_train, risk_test, _c_index(y_train, risk_train), _c_index(y_test, risk_test)

def _event_time_bounds(y):
    """Return the event-only rows of y plus the min and max of their follow-up times."""
    events = y[y['CVD사망']]
    event_times = events['추적기간_연']
    return events, event_times.min(), event_times.max()


# Fit and score the FksSVM on the scaled non-invasive features
(fkssvm_model_non_invasive,
 y_pred_train_non_invasive_fkssvm,
 y_pred_test_non_invasive_fkssvm,
 c_index_train_non_invasive_fkssvm,
 c_index_test_non_invasive_fkssvm) = build_and_evaluate_model_fkssvm(
    X_train_non_invasive_scaled, y_train_non_invasive, X_test_non_invasive_scaled, y_test_non_invasive)

# Fit and score the FksSVM on the scaled invasive features
(fkssvm_model_invasive,
 y_pred_train_invasive_fkssvm,
 y_pred_test_invasive_fkssvm,
 c_index_train_invasive_fkssvm,
 c_index_test_invasive_fkssvm) = build_and_evaluate_model_fkssvm(
    X_train_invasive_scaled, y_train_invasive, X_test_invasive_scaled, y_test_invasive)

for caption, c_index in (
        ("비침습 모델 Train", c_index_train_non_invasive_fkssvm),
        ("비침습 모델 Test", c_index_test_non_invasive_fkssvm),
        ("침습 모델 Train", c_index_train_invasive_fkssvm),
        ("침습 모델 Test", c_index_test_invasive_fkssvm)):
    print(f"{caption} - Concordance Index: {c_index[0]:.4f}")

# Evaluation grid for the non-invasive model: monthly steps over the range of
# test-set event times, clipped to the range of training-set event times
y_train_events, train_min_non_invasive, train_max_non_invasive = _event_time_bounds(y_train_non_invasive)
y_test_events, test_min_non_invasive, test_max_non_invasive = _event_time_bounds(y_test_non_invasive)

# Warn and clip when the test range is not nested inside the training range
non_invasive_range_ok = (
    train_min_non_invasive <= test_min_non_invasive < test_max_non_invasive <= train_max_non_invasive)
if not non_invasive_range_ok:
    print("경고: 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_non_invasive = max(train_min_non_invasive, test_min_non_invasive)
    test_max_non_invasive = min(train_max_non_invasive, test_max_non_invasive)

times_non_invasive = np.arange(test_min_non_invasive, test_max_non_invasive, 1/12)

# Same procedure for the invasive model
y_train_events, train_min_invasive, train_max_invasive = _event_time_bounds(y_train_invasive)
y_test_events, test_min_invasive, test_max_invasive = _event_time_bounds(y_test_invasive)

invasive_range_ok = train_min_invasive <= test_min_invasive < test_max_invasive <= train_max_invasive
if not invasive_range_ok:
    print("경고: 침습 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_invasive = max(train_min_invasive, test_min_invasive)
    test_max_invasive = min(train_max_invasive, test_max_invasive)

times_invasive = np.arange(test_min_invasive, test_max_invasive, 1/12)

# Time-dependent AUC and bootstrap band for each model
(fkssvm_auc_non_invasive,
 fkssvm_mean_auc_non_invasive,
 fkssvm_conf_int_non_invasive) = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_fkssvm, times_non_invasive)

(fkssvm_auc_invasive,
 fkssvm_mean_auc_invasive,
 fkssvm_conf_int_invasive) = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_fkssvm, times_invasive)

# Average the lower and the upper band over time for the legend text
ci_non_invasive_fkssvm = (np.mean(fkssvm_conf_int_non_invasive[0]),
                          np.mean(fkssvm_conf_int_non_invasive[1]))
ci_invasive_fkssvm = (np.mean(fkssvm_conf_int_invasive[0]),
                      np.mean(fkssvm_conf_int_invasive[1]))

# Draw both curves
plot_results(
    times_non_invasive, fkssvm_auc_non_invasive, fkssvm_conf_int_non_invasive,
    times_invasive, fkssvm_auc_invasive, fkssvm_conf_int_invasive,
    fkssvm_mean_auc_non_invasive, fkssvm_mean_auc_invasive,
    ci_non_invasive_fkssvm, ci_invasive_fkssvm,
)

## Survival Tree

In [None]:
from sksurv.tree import SurvivalTree

### ST 전체

In [None]:
# Build and evaluate the survival tree (ST) model
def build_and_evaluate_model_st(X_train, y_train, X_test, y_test, label):
    """Fit a SurvivalTree and score it with the C-index on both splits.

    The label argument is accepted but not used inside the function.

    Returns:
        (model, y_pred_train, y_pred_test, c_index_train, c_index_test), where
        each c_index is the full result of concordance_index_censored.
    """
    def _c_index(outcome, risk):
        return concordance_index_censored(outcome['CVD사망'], outcome['추적기간_연'], risk)

    # Single survival tree with the size limits used in this notebook
    tree = SurvivalTree(
        max_depth=16,
        max_leaf_nodes=32,
        min_samples_split=10,
        min_samples_leaf=10,
        min_weight_fraction_leaf=0.01,
        low_memory=False,
        random_state=42,
    )
    tree.fit(X_train, y_train)

    # Predictions on each split
    risk_train = tree.predict(X_train)
    risk_test = tree.predict(X_test)

    return tree, risk_train, risk_test, _c_index(y_train, risk_train), _c_index(y_test, risk_test)

# AUC and bootstrap confidence band
def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100):
    """Compute time-dependent AUC on the test set with a pointwise bootstrap band.

    Args:
        y_train: structured survival array passed to cumulative_dynamic_auc as
            the training survival data.
        y_test: structured survival array of the test set; its '추적기간_연'
            field is read to find each resample's follow-up range.
        y_pred_test: risk scores for the test samples, in the same order as y_test.
        times: time points at which AUC is evaluated.
        n_bootstraps: number of bootstrap resamples of the test set.

    Returns:
        (auc, mean_auc, conf_int): auc is the AUC at each entry of times,
        mean_auc is the unweighted np.mean of auc, and conf_int has shape
        (2, len(times)) with the 2.5th and 97.5th bootstrap percentiles.
    """
    times = np.asarray(times, dtype=float)
    y_pred_test = np.asarray(y_pred_test)
    auc, _ = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)

    rng = np.random.RandomState(42)  # fixed seed keeps the band reproducible
    n_samples = len(y_test)
    # One row per resample; entries stay NaN where a resample has no usable AUC.
    bootstrapped_auc = np.full((n_bootstraps, times.shape[0]), np.nan)

    for b in range(n_bootstraps):
        indices = rng.randint(0, n_samples, n_samples)
        y_test_bootstrap = y_test[indices]
        y_pred_test_bootstrap = y_pred_test[indices]

        # Score the resample on the requested grid directly rather than on a
        # shifted per-resample grid followed by np.interp. Only the time points
        # inside [min, max) of the resample's follow-up are evaluated.
        follow_up = y_test_bootstrap['추적기간_연']
        in_range = (times >= follow_up.min()) & (times < follow_up.max())
        if not in_range.any():
            continue

        try:
            auc_bootstrap, _ = cumulative_dynamic_auc(
                y_train, y_test_bootstrap, y_pred_test_bootstrap, times[in_range])
        except ValueError:
            # Resample incompatible with the estimator; its row stays NaN.
            continue
        bootstrapped_auc[b, in_range] = auc_bootstrap

    # NaN-aware percentiles: one undefined resample no longer wipes out the
    # interval at that time point.
    conf_int = np.nanpercentile(bootstrapped_auc, [2.5, 97.5], axis=0)
    mean_auc = np.mean(auc)

    return auc, mean_auc, conf_int

# Plotting helper for the survival tree results
def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot the ST non-invasive and invasive AUC curves with shaded bands.

    Each conf_int_* is indexed as [0] lower band, [1] upper band; each ci_* is
    a (low, high) pair shown in the legend next to the mean AUC.
    """
    # (legend tag, colour, x values, AUC curve, band, mean AUC, legend CI)
    curves = (
        ('비침습 모델', 'blue', times_non_invasive, auc_non_invasive,
         conf_int_non_invasive, mean_auc_non_invasive, ci_non_invasive),
        ('침습 모델', 'orange', times_invasive, auc_invasive,
         conf_int_invasive, mean_auc_invasive, ci_invasive),
    )

    plt.figure(figsize=(10, 6))
    for tag, color, grid, curve, band, mean_auc, ci in curves:
        plt.plot(grid, curve, color=color,
                 label=f'ST {tag} (Mean AUC = {mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))')
        plt.fill_between(grid, band[0], band[1], color=color, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title('Time-Dependent AUC for Survival Tree Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Fit and score the survival tree on the non-invasive features
(st_model_non_invasive,
 y_pred_train_non_invasive_st,
 y_pred_test_non_invasive_st,
 c_index_train_non_invasive_st,
 c_index_test_non_invasive_st) = build_and_evaluate_model_st(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive,
    label="Non-Invasive")

# Fit and score the survival tree on the invasive features
(st_model_invasive,
 y_pred_train_invasive_st,
 y_pred_test_invasive_st,
 c_index_train_invasive_st,
 c_index_test_invasive_st) = build_and_evaluate_model_st(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive,
    label="Invasive")

for caption, c_index in (
        ("ST 비침습 모델 Train", c_index_train_non_invasive_st),
        ("ST 비침습 모델 Test", c_index_test_non_invasive_st),
        ("ST 침습 모델 Train", c_index_train_invasive_st),
        ("ST 침습 모델 Test", c_index_test_invasive_st)):
    print(f"{caption} - Concordance Index: {c_index[0]:.4f}")

# Time-dependent AUC and bootstrap band (reuses the time grids computed earlier)
(st_auc_non_invasive,
 st_mean_auc_non_invasive,
 st_conf_int_non_invasive) = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_st, times_non_invasive)

(st_auc_invasive,
 st_mean_auc_invasive,
 st_conf_int_invasive) = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_st, times_invasive)

# Average the lower and the upper band over time for the legend text
ci_non_invasive_st = (np.mean(st_conf_int_non_invasive[0]),
                      np.mean(st_conf_int_non_invasive[1]))
ci_invasive_st = (np.mean(st_conf_int_invasive[0]),
                  np.mean(st_conf_int_invasive[1]))

# Draw both curves
plot_results(
    times_non_invasive, st_auc_non_invasive, st_conf_int_non_invasive,
    times_invasive, st_auc_invasive, st_conf_int_invasive,
    st_mean_auc_non_invasive, st_mean_auc_invasive,
    ci_non_invasive_st, ci_invasive_st,
)

## RSF
- Random Survival Forest

### RSF 전체
- Cox 모델과 다른 예측 변수가 들어감
- C index and Time dependent AUC

In [None]:
from sksurv.ensemble import RandomSurvivalForest

In [None]:
# Column sets: predictors first, then the event flag and the follow-up time
비침습 = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
       'CVD사망', '추적기간_연']
침습 = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
      'TG', 'LDL', 'HDL', 'CVD사망', '추적기간_연']
범주형_변수 = ['성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']

# Cast the categorical predictors on a copy so sampled_total itself is left as is
Data_temp = sampled_total.copy()
Data_temp[범주형_변수] = Data_temp[범주형_변수].astype('category')

# Non-invasive and invasive views of the data
data_subset_non_invasive, data_subset_invasive = Data_temp[비침습], Data_temp[침습]

# Structured outcome arrays built from the event and follow-up columns
y_non_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', data_subset_non_invasive)
y_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', data_subset_invasive)

# Covariates are the remaining columns; no dummy encoding is applied here
X_non_invasive = data_subset_non_invasive.drop(columns=['CVD사망', '추적기간_연'])
X_invasive = data_subset_invasive.drop(columns=['CVD사망', '추적기간_연'])

# 70/30 split, stratified on the event flag
(X_train_non_invasive, X_test_non_invasive,
 y_train_non_invasive, y_test_non_invasive) = train_test_split(
    X_non_invasive, y_non_invasive,
    test_size=0.3, stratify=y_non_invasive['CVD사망'], random_state=42)
(X_train_invasive, X_test_invasive,
 y_train_invasive, y_test_invasive) = train_test_split(
    X_invasive, y_invasive,
    test_size=0.3, stratify=y_invasive['CVD사망'], random_state=42)

In [None]:
# Build and evaluate the random survival forest (RSF) model
def build_and_evaluate_model_rsf(X_train, y_train, X_test, y_test, label):
    """Fit a RandomSurvivalForest and score it with the C-index on both splits.

    The label argument is accepted but not used inside the function.

    Returns:
        (model, y_pred_train, y_pred_test, c_index_train, c_index_test), where
        each c_index is the full result of concordance_index_censored.
    """
    def _c_index(outcome, risk):
        return concordance_index_censored(outcome['CVD사망'], outcome['추적기간_연'], risk)

    # 200-tree forest using all available cores
    forest = RandomSurvivalForest(
        n_estimators=200,
        max_depth=16,
        max_leaf_nodes=32,
        min_samples_split=8,
        min_samples_leaf=8,
        low_memory=False,
        n_jobs=-1,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    # Predictions on each split
    risk_train = forest.predict(X_train)
    risk_test = forest.predict(X_test)

    return forest, risk_train, risk_test, _c_index(y_train, risk_train), _c_index(y_test, risk_test)

# AUC and bootstrap confidence band
def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100):
    """Compute time-dependent AUC on the test set with a pointwise bootstrap band.

    Args:
        y_train: structured survival array passed to cumulative_dynamic_auc as
            the training survival data.
        y_test: structured survival array of the test set; its '추적기간_연'
            field is read to find each resample's follow-up range.
        y_pred_test: risk scores for the test samples, in the same order as y_test.
        times: time points at which AUC is evaluated.
        n_bootstraps: number of bootstrap resamples of the test set.

    Returns:
        (auc, mean_auc, conf_int): auc is the AUC at each entry of times,
        mean_auc is the unweighted np.mean of auc, and conf_int has shape
        (2, len(times)) with the 2.5th and 97.5th bootstrap percentiles.
    """
    times = np.asarray(times, dtype=float)
    y_pred_test = np.asarray(y_pred_test)
    auc, _ = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)

    rng = np.random.RandomState(42)  # fixed seed keeps the band reproducible
    n_samples = len(y_test)
    # One row per resample; entries stay NaN where a resample has no usable AUC.
    bootstrapped_auc = np.full((n_bootstraps, times.shape[0]), np.nan)

    for b in range(n_bootstraps):
        indices = rng.randint(0, n_samples, n_samples)
        y_test_bootstrap = y_test[indices]
        y_pred_test_bootstrap = y_pred_test[indices]

        # Score the resample on the requested grid directly rather than on a
        # shifted per-resample grid followed by np.interp. Only the time points
        # inside [min, max) of the resample's follow-up are evaluated.
        follow_up = y_test_bootstrap['추적기간_연']
        in_range = (times >= follow_up.min()) & (times < follow_up.max())
        if not in_range.any():
            continue

        try:
            auc_bootstrap, _ = cumulative_dynamic_auc(
                y_train, y_test_bootstrap, y_pred_test_bootstrap, times[in_range])
        except ValueError:
            # Resample incompatible with the estimator; its row stays NaN.
            continue
        bootstrapped_auc[b, in_range] = auc_bootstrap

    # NaN-aware percentiles: one undefined resample no longer wipes out the
    # interval at that time point.
    conf_int = np.nanpercentile(bootstrapped_auc, [2.5, 97.5], axis=0)
    mean_auc = np.mean(auc)

    return auc, mean_auc, conf_int

# Plotting helper for the RSF results
def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot the RSF non-invasive and invasive AUC curves with shaded bands.

    Each conf_int_* is indexed as [0] lower band, [1] upper band; each ci_* is
    a (low, high) pair shown in the legend next to the mean AUC.
    """
    # (legend tag, colour, x values, AUC curve, band, mean AUC, legend CI)
    curves = (
        ('비침습 모델', 'blue', times_non_invasive, auc_non_invasive,
         conf_int_non_invasive, mean_auc_non_invasive, ci_non_invasive),
        ('침습 모델', 'orange', times_invasive, auc_invasive,
         conf_int_invasive, mean_auc_invasive, ci_invasive),
    )

    plt.figure(figsize=(10, 6))
    for tag, color, grid, curve, band, mean_auc, ci in curves:
        plt.plot(grid, curve, color=color,
                 label=f'RSF {tag} (Mean AUC = {mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))')
        plt.fill_between(grid, band[0], band[1], color=color, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title('Time-Dependent AUC for RSF Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Fit and score the RSF on the non-invasive features
(rsf_model_non_invasive,
 y_pred_train_non_invasive_rsf,
 y_pred_test_non_invasive_rsf,
 c_index_train_non_invasive_rsf,
 c_index_test_non_invasive_rsf) = build_and_evaluate_model_rsf(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive,
    label="Non-Invasive")

# Fit and score the RSF on the invasive features
(rsf_model_invasive,
 y_pred_train_invasive_rsf,
 y_pred_test_invasive_rsf,
 c_index_train_invasive_rsf,
 c_index_test_invasive_rsf) = build_and_evaluate_model_rsf(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive,
    label="Invasive")

for caption, c_index in (
        ("RSF 비침습 모델 Train", c_index_train_non_invasive_rsf),
        ("RSF 비침습 모델 Test", c_index_test_non_invasive_rsf),
        ("RSF 침습 모델 Train", c_index_train_invasive_rsf),
        ("RSF 침습 모델 Test", c_index_test_invasive_rsf)):
    print(f"{caption} - Concordance Index: {c_index[0]:.4f}")

# Time-dependent AUC and bootstrap band (reuses the time grids computed earlier)
(rsf_auc_non_invasive,
 rsf_mean_auc_non_invasive,
 rsf_conf_int_non_invasive) = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_rsf, times_non_invasive)

(rsf_auc_invasive,
 rsf_mean_auc_invasive,
 rsf_conf_int_invasive) = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_rsf, times_invasive)

# Average the lower and the upper band over time for the legend text
ci_non_invasive_rsf = (np.mean(rsf_conf_int_non_invasive[0]),
                       np.mean(rsf_conf_int_non_invasive[1]))
ci_invasive_rsf = (np.mean(rsf_conf_int_invasive[0]),
                   np.mean(rsf_conf_int_invasive[1]))

# Draw both curves
plot_results(
    times_non_invasive, rsf_auc_non_invasive, rsf_conf_int_non_invasive,
    times_invasive, rsf_auc_invasive, rsf_conf_int_invasive,
    rsf_mean_auc_non_invasive, rsf_mean_auc_invasive,
    ci_non_invasive_rsf, ci_invasive_rsf,
)

In [None]:
# Build and evaluate the random survival forest (RSF) model
def build_and_evaluate_model_rsf(X_train, y_train, X_test, y_test, label):
    """Fit a RandomSurvivalForest and score it with the C-index on both splits.

    The label argument is accepted but not used inside the function.

    Returns:
        (model, y_pred_train, y_pred_test, c_index_train, c_index_test), where
        each c_index is the full result of concordance_index_censored.
    """
    def _c_index(outcome, risk):
        return concordance_index_censored(outcome['CVD사망'], outcome['추적기간_연'], risk)

    # 200-tree forest using all available cores
    forest = RandomSurvivalForest(
        n_estimators=200,
        max_depth=16,
        max_leaf_nodes=32,
        min_samples_split=8,
        min_samples_leaf=8,
        low_memory=False,
        n_jobs=-1,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    # Predictions on each split
    risk_train = forest.predict(X_train)
    risk_test = forest.predict(X_test)

    return forest, risk_train, risk_test, _c_index(y_train, risk_train), _c_index(y_test, risk_test)

# AUC and bootstrap confidence band
def calculate_auc_and_confidence_interval(y_train, y_test, y_pred_test, times, n_bootstraps=100):
    """Compute time-dependent AUC on the test set with a pointwise bootstrap band.

    Args:
        y_train: structured survival array passed to cumulative_dynamic_auc as
            the training survival data.
        y_test: structured survival array of the test set; its '추적기간_연'
            field is read to find each resample's follow-up range.
        y_pred_test: risk scores for the test samples, in the same order as y_test.
        times: time points at which AUC is evaluated.
        n_bootstraps: number of bootstrap resamples of the test set.

    Returns:
        (auc, mean_auc, conf_int): auc is the AUC at each entry of times,
        mean_auc is the unweighted np.mean of auc, and conf_int has shape
        (2, len(times)) with the 2.5th and 97.5th bootstrap percentiles.
    """
    times = np.asarray(times, dtype=float)
    y_pred_test = np.asarray(y_pred_test)
    auc, _ = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)

    rng = np.random.RandomState(42)  # fixed seed keeps the band reproducible
    n_samples = len(y_test)
    # One row per resample; entries stay NaN where a resample has no usable AUC.
    bootstrapped_auc = np.full((n_bootstraps, times.shape[0]), np.nan)

    for b in range(n_bootstraps):
        indices = rng.randint(0, n_samples, n_samples)
        y_test_bootstrap = y_test[indices]
        y_pred_test_bootstrap = y_pred_test[indices]

        # Score the resample on the requested grid directly rather than on a
        # shifted per-resample grid followed by np.interp. Only the time points
        # inside [min, max) of the resample's follow-up are evaluated.
        follow_up = y_test_bootstrap['추적기간_연']
        in_range = (times >= follow_up.min()) & (times < follow_up.max())
        if not in_range.any():
            continue

        try:
            auc_bootstrap, _ = cumulative_dynamic_auc(
                y_train, y_test_bootstrap, y_pred_test_bootstrap, times[in_range])
        except ValueError:
            # Resample incompatible with the estimator; its row stays NaN.
            continue
        bootstrapped_auc[b, in_range] = auc_bootstrap

    # NaN-aware percentiles: one undefined resample no longer wipes out the
    # interval at that time point.
    conf_int = np.nanpercentile(bootstrapped_auc, [2.5, 97.5], axis=0)
    mean_auc = np.mean(auc)

    return auc, mean_auc, conf_int

# Plotting helper for the RSF results
def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot the RSF non-invasive and invasive AUC curves with shaded bands.

    Each conf_int_* is indexed as [0] lower band, [1] upper band; each ci_* is
    a (low, high) pair shown in the legend next to the mean AUC.
    """
    # (legend tag, colour, x values, AUC curve, band, mean AUC, legend CI)
    curves = (
        ('비침습 모델', 'blue', times_non_invasive, auc_non_invasive,
         conf_int_non_invasive, mean_auc_non_invasive, ci_non_invasive),
        ('침습 모델', 'orange', times_invasive, auc_invasive,
         conf_int_invasive, mean_auc_invasive, ci_invasive),
    )

    plt.figure(figsize=(10, 6))
    for tag, color, grid, curve, band, mean_auc, ci in curves:
        plt.plot(grid, curve, color=color,
                 label=f'RSF {tag} (Mean AUC = {mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))')
        plt.fill_between(grid, band[0], band[1], color=color, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title('Time-Dependent AUC for RSF Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Fit and score the RSF on the non-invasive features
(rsf_model_non_invasive,
 y_pred_train_non_invasive_rsf,
 y_pred_test_non_invasive_rsf,
 c_index_train_non_invasive_rsf,
 c_index_test_non_invasive_rsf) = build_and_evaluate_model_rsf(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive,
    label="Non-Invasive")

# Fit and score the RSF on the invasive features
(rsf_model_invasive,
 y_pred_train_invasive_rsf,
 y_pred_test_invasive_rsf,
 c_index_train_invasive_rsf,
 c_index_test_invasive_rsf) = build_and_evaluate_model_rsf(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive,
    label="Invasive")

for caption, c_index in (
        ("RSF 비침습 모델 Train", c_index_train_non_invasive_rsf),
        ("RSF 비침습 모델 Test", c_index_test_non_invasive_rsf),
        ("RSF 침습 모델 Train", c_index_train_invasive_rsf),
        ("RSF 침습 모델 Test", c_index_test_invasive_rsf)):
    print(f"{caption} - Concordance Index: {c_index[0]:.4f}")

# Time-dependent AUC and bootstrap band (reuses the time grids computed earlier)
(rsf_auc_non_invasive,
 rsf_mean_auc_non_invasive,
 rsf_conf_int_non_invasive) = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_rsf, times_non_invasive)

(rsf_auc_invasive,
 rsf_mean_auc_invasive,
 rsf_conf_int_invasive) = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_rsf, times_invasive)

# Average the lower and the upper band over time for the legend text
ci_non_invasive_rsf = (np.mean(rsf_conf_int_non_invasive[0]),
                       np.mean(rsf_conf_int_non_invasive[1]))
ci_invasive_rsf = (np.mean(rsf_conf_int_invasive[0]),
                   np.mean(rsf_conf_int_invasive[1]))

# Draw both curves
plot_results(
    times_non_invasive, rsf_auc_non_invasive, rsf_conf_int_non_invasive,
    times_invasive, rsf_auc_invasive, rsf_conf_int_invasive,
    rsf_mean_auc_non_invasive, rsf_mean_auc_invasive,
    ci_non_invasive_rsf, ci_invasive_rsf,
)

### RSF 성별 분할
- C index and Time dependent AUC

In [None]:
# Column sets: predictors first, then the event flag and the follow-up time
비침습 = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
       'CVD사망', '추적기간_연']
침습 = ['나이', 'WHtR', '성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days',
      'TG', 'LDL', 'HDL', 'CVD사망', '추적기간_연']
범주형_변수 = ['성별', '고혈압여부', '당뇨병여부', 'IPAQ_4group_mets_days']

# Cast the categorical predictors on a copy so sampled_total itself is left as is
Data_temp = sampled_total.copy()
Data_temp[범주형_변수] = Data_temp[범주형_변수].astype('category')

# Non-invasive and invasive views of the data
data_subset_non_invasive = Data_temp[비침습]
data_subset_invasive = Data_temp[침습]

# Rows with 성별 == 1 form the male subsets and 성별 == 2 the female subsets;
# the sex column is removed from each subset afterwards
male_data_non_invasive = data_subset_non_invasive[
    data_subset_non_invasive['성별'] == 1].drop(columns=['성별'])
female_data_non_invasive = data_subset_non_invasive[
    data_subset_non_invasive['성별'] == 2].drop(columns=['성별'])
male_data_invasive = data_subset_invasive[
    data_subset_invasive['성별'] == 1].drop(columns=['성별'])
female_data_invasive = data_subset_invasive[
    data_subset_invasive['성별'] == 2].drop(columns=['성별'])

# Structured outcome arrays for each subset
y_male_non_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', male_data_non_invasive)
y_female_non_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', female_data_non_invasive)
y_male_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', male_data_invasive)
y_female_invasive = Surv.from_dataframe('CVD사망', '추적기간_연', female_data_invasive)

# Covariates are the remaining columns; no dummy encoding is applied here
X_male_non_invasive = male_data_non_invasive.drop(columns=['CVD사망', '추적기간_연'])
X_female_non_invasive = female_data_non_invasive.drop(columns=['CVD사망', '추적기간_연'])
X_male_invasive = male_data_invasive.drop(columns=['CVD사망', '추적기간_연'])
X_female_invasive = female_data_invasive.drop(columns=['CVD사망', '추적기간_연'])

# 70/30 splits, each stratified on its own event flag
(X_train_male_non_invasive, X_test_male_non_invasive,
 y_train_male_non_invasive, y_test_male_non_invasive) = train_test_split(
    X_male_non_invasive, y_male_non_invasive,
    test_size=0.3, stratify=y_male_non_invasive['CVD사망'], random_state=42)
(X_train_female_non_invasive, X_test_female_non_invasive,
 y_train_female_non_invasive, y_test_female_non_invasive) = train_test_split(
    X_female_non_invasive, y_female_non_invasive,
    test_size=0.3, stratify=y_female_non_invasive['CVD사망'], random_state=42)
(X_train_male_invasive, X_test_male_invasive,
 y_train_male_invasive, y_test_male_invasive) = train_test_split(
    X_male_invasive, y_male_invasive,
    test_size=0.3, stratify=y_male_invasive['CVD사망'], random_state=42)
(X_train_female_invasive, X_test_female_invasive,
 y_train_female_invasive, y_test_female_invasive) = train_test_split(
    X_female_invasive, y_female_invasive,
    test_size=0.3, stratify=y_female_invasive['CVD사망'], random_state=42)

In [None]:
# Model building and evaluation function
def build_and_evaluate_model_rsf(X_train, y_train, X_test, y_test, label):
    """Fit a random survival forest and report C-index, AUC over time and a bootstrap band.

    Args:
        X_train, X_test: covariates for the training and test splits.
        y_train, y_test: structured survival arrays with the fields 'CVD사망'
            and '추적기간_연'.
        label: text prefixed to the printed metrics.

    Returns:
        (times, auc, mean_auc, lower, upper, c_index_train, c_index_test).
        times is a monthly grid over the test follow-up range, auc is the AUC at
        each time, mean_auc is the second value returned by
        cumulative_dynamic_auc, lower/upper are the 2.5th/97.5th bootstrap
        percentiles per time point (NaN where no resample gave a value), and
        each c_index is the full result of concordance_index_censored.
    """
    # Build model
    rsf_model = RandomSurvivalForest(n_estimators=100, max_depth=16, max_leaf_nodes=32,
                                     min_samples_split=8, min_samples_leaf=8, low_memory=False,
                                     n_jobs=-1, random_state=42)
    rsf_model.fit(X_train, y_train)

    # Calculate C-index for RSF model
    y_pred_train = rsf_model.predict(X_train)
    y_pred_test = rsf_model.predict(X_test)

    c_index_train = concordance_index_censored(y_train['CVD사망'], y_train['추적기간_연'], y_pred_train)
    c_index_test = concordance_index_censored(y_test['CVD사망'], y_test['추적기간_연'], y_pred_test)

    print(f"{label} 모델 Train - Concordance Index: {c_index_train[0]:.4f}")
    print(f"{label} 모델 Test - Concordance Index: {c_index_test[0]:.4f}")

    # Monthly grid over the follow-up range of the test set
    follow_up = y_test['추적기간_연']
    times = np.arange(follow_up.min(), follow_up.max(), 1/12)

    # AUC of the fitted model on the test set
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, y_pred_test, times)
    print(f"{label} 모델 Mean AUC: {mean_auc:.4f}")

    # Pointwise 95% bootstrap band
    n_bootstraps = 100
    rng = np.random.RandomState(42)  # reproducible results
    n_test = len(X_test)
    # One row per resample; entries stay NaN where a resample has no usable AUC.
    bootstrapped_scores = np.full((n_bootstraps, times.shape[0]), np.nan)

    for b in range(n_bootstraps):
        indices = rng.randint(0, n_test, n_test)
        y_test_resampled = y_test[indices]
        y_pred_resampled = y_pred_test[indices]

        # A resample rarely spans the full test follow-up range. Score only the
        # grid points inside its own [min, max) range; the whole resample used
        # to be discarded through the ValueError path.
        resampled_follow_up = y_test_resampled['추적기간_연']
        in_range = (times >= resampled_follow_up.min()) & (times < resampled_follow_up.max())
        if not in_range.any():
            continue

        try:
            score, _ = cumulative_dynamic_auc(y_train, y_test_resampled, y_pred_resampled, times[in_range])
        except ValueError:
            continue  # Skip any bootstraps that are still incompatible
        bootstrapped_scores[b, in_range] = score

    # NaN-aware percentiles keep the band defined when some resamples are undefined
    lower = np.nanpercentile(bootstrapped_scores, 2.5, axis=0)
    upper = np.nanpercentile(bootstrapped_scores, 97.5, axis=0)

    return times, auc, mean_auc, lower, upper, c_index_train, c_index_test

# Train/test splits for each sex and feature set, in the order used by the plots
_rsf_splits_by_label = {
    "Male Non-Invasive": (X_train_male_non_invasive, y_train_male_non_invasive,
                          X_test_male_non_invasive, y_test_male_non_invasive),
    "Male Invasive": (X_train_male_invasive, y_train_male_invasive,
                      X_test_male_invasive, y_test_male_invasive),
    "Female Non-Invasive": (X_train_female_non_invasive, y_train_female_non_invasive,
                            X_test_female_non_invasive, y_test_female_non_invasive),
    "Female Invasive": (X_train_female_invasive, y_train_female_invasive,
                        X_test_female_invasive, y_test_female_invasive),
}

# Fit and evaluate one RSF per entry; each value is the tuple returned by
# build_and_evaluate_model_rsf
results_rsf = {
    split_label: build_and_evaluate_model_rsf(X_tr, y_tr, X_te, y_te, split_label)
    for split_label, (X_tr, y_tr, X_te, y_te) in _rsf_splits_by_label.items()
}

In [None]:
results_dict = results_rsf


def _mean_ci_bounds(lower, upper):
    """Average the pointwise lower/upper CI bounds over the time grid, ignoring NaNs.

    This is the same summary the FksSVM/ST/RSF cells print next to "Mean AUC".
    It is an average of the band, not a bootstrap interval of the mean AUC itself.
    """
    return float(np.nanmean(lower)), float(np.nanmean(upper))


def _plot_auc_with_band(data, name, color):
    """Draw one AUC curve and its shaded band on the current figure.

    data is read positionally: [0] x values (time), [1] AUC per time point,
    [2] mean AUC, [3] lower band, [4] upper band.
    """
    ci_low, ci_high = _mean_ci_bounds(data[3], data[4])
    # The legend previously showed data[3][0] / data[4][0], i.e. the band at the
    # first time point only, which does not describe the mean AUC.
    plt.plot(data[0], data[1], color=color,
             label=f'{name} (Mean AUC = {data[2]:.3f} (95% CI: {ci_low:.3f} - {ci_high:.3f}))')
    plt.fill_between(data[0], data[3], data[4], color=color, alpha=0.2)


# Bars are looked up by label so values and tick labels cannot drift apart if the
# dictionary is ever built in a different order.
x_labels = ["Male Non-Invasive", "Male Invasive", "Female Non-Invasive", "Female Invasive"]
c_index_values_train = [results_dict[name][5][0] for name in x_labels]  # Train C-index
c_index_values_test = [results_dict[name][6][0] for name in x_labels]  # Test C-index

# Plotting C-index as bar plot
plt.figure(figsize=(8, 6))

x = np.arange(len(x_labels))
bar_width = 0.35

bars1 = plt.bar(x - bar_width/2, c_index_values_train, bar_width, label='Train', color='skyblue')
bars2 = plt.bar(x + bar_width/2, c_index_values_test, bar_width, label='Test', color='orange')

plt.xlabel('Model Type')
plt.ylabel('C-index')
plt.title('C-index Comparison for Random Survival Forest Models by Feature Type')
plt.xticks(x, x_labels)
plt.ylim(0.5, 1)  # Adjust y-axis limit for better visibility
plt.legend()
plt.grid(axis='y')

# Display C-index values on top of the bars
for bars in (bars1, bars2):
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, round(yval, 3), ha='center', va='bottom')

plt.show()

# Plotting male models with CI in the same figure
plt.figure(figsize=(12, 6))
_plot_auc_with_band(results_dict["Male Non-Invasive"], 'Male Non-Invasive', 'blue')
_plot_auc_with_band(results_dict["Male Invasive"], 'Male Invasive', 'green')

plt.xlabel('Time (years)')
plt.ylabel('AUC')
plt.title('Time-Dependent AUC for Random Survival Forest Models - Males')
plt.legend(loc='lower right')
plt.grid()
plt.show()

# Plotting female models with CI in the same figure
plt.figure(figsize=(12, 6))
_plot_auc_with_band(results_dict["Female Non-Invasive"], 'Female Non-Invasive', 'purple')
_plot_auc_with_band(results_dict["Female Invasive"], 'Female Invasive', 'red')

plt.xlabel('Time (years)')
plt.ylabel('AUC')
plt.title('Time-Dependent AUC for Random Survival Forest Models - Females')
plt.legend(loc='lower right')
plt.grid()
plt.show()

## GBS
- Gradient Boosting Survival Analysis

In [None]:
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

### GBS 전체

In [None]:
# Keep every CVD death and down-sample the non-events so the data set is
# small enough for the memory-hungry ensemble models.
train_deaths = Data_train.loc[Data_train['CVD사망'] == 1]
train_survivors = Data_train.loc[Data_train['CVD사망'] == 0]
test_deaths = Data_test.loc[Data_test['CVD사망'] == 1]
test_survivors = Data_test.loc[Data_test['CVD사망'] == 0]

# Reproducible random subsets of the non-events:
# 0.3% of the train survivors and 1% of the test survivors.
train_survivors_sampled = train_survivors.sample(frac=0.003, random_state=42)
test_survivors_sampled = test_survivors.sample(frac=0.01, random_state=42)

# Re-assemble deaths + sampled survivors per split, then both splits together.
sampled_train = pd.concat([train_deaths, train_survivors_sampled])
sampled_test = pd.concat([test_deaths, test_survivors_sampled])
sampled_total = pd.concat([sampled_train, sampled_test])

# Report the event / non-event counts of each sampled frame.
for _heading, _sampled in (("Sampled Train Data:", sampled_train),
                           ("\nSampled Test Data:", sampled_test),
                           ('\nSampled Total Data:', sampled_total)):
    print(_heading)
    print(_sampled['CVD사망'].value_counts())

In [None]:
# Build and evaluate a Gradient Boosting Survival Analysis (GBS) model
def build_and_evaluate_model_gbs(X_train, y_train, X_test, y_test):
    """Fit a gradient-boosted Cox model and score it on train and test data.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test
        Outcome arrays indexed by field name: 'CVD사망' is used as the event
        indicator and '추적기간_연' as the follow-up time.

    Returns
    -------
    tuple
        ``(model, y_pred_train, y_pred_test, c_index_train, c_index_test)``
        where the two ``c_index_*`` items are the complete results of
        ``concordance_index_censored`` (callers read element 0 as the index).
    """
    hyper_params = dict(
        loss='coxph', learning_rate=0.1, n_estimators=100, subsample=1.0,
        criterion='friedman_mse',
        min_samples_split=10, min_samples_leaf=10, min_weight_fraction_leaf=0.01,
        max_depth=3, min_impurity_decrease=0.0,
        max_features=None, max_leaf_nodes=16, warm_start=False,
        validation_fraction=0.1, n_iter_no_change=None, tol=0.0001,
        dropout_rate=0.0, verbose=0, ccp_alpha=0.0,
        random_state=42,  # fixed seed for reproducible fits
    )
    model = GradientBoostingSurvivalAnalysis(**hyper_params)
    model.fit(X_train, y_train)

    def _concordance(outcome, risk_scores):
        # Concordance of predicted risk with the observed (event, time) pairs.
        return concordance_index_censored(outcome['CVD사망'], outcome['추적기간_연'], risk_scores)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    c_index_train = _concordance(y_train, y_pred_train)
    c_index_test = _concordance(y_test, y_pred_test)

    return model, y_pred_train, y_pred_test, c_index_train, c_index_test

# Plot the GBS time-dependent AUC curves (non-invasive vs. invasive)
def plot_results(times_non_invasive, auc_non_invasive, conf_int_non_invasive,
                 times_invasive, auc_invasive, conf_int_invasive,
                 mean_auc_non_invasive, mean_auc_invasive,
                 ci_non_invasive, ci_invasive):
    """Plot time-dependent AUC curves of the two GBS models with shaded bands.

    The legend and title name the model "GBS" / "Gradient Boosting Survival":
    the model fitted in this section is ``GradientBoostingSurvivalAnalysis``,
    not the componentwise variant the previous "CGBS" wording referred to.

    Parameters
    ----------
    times_non_invasive, times_invasive
        x values (evaluation times) of each curve.
    auc_non_invasive, auc_invasive
        AUC value at each evaluation time.
    conf_int_non_invasive, conf_int_invasive
        Indexable pair; element 0 and element 1 are drawn as the two edges of
        the shaded band.
    mean_auc_non_invasive, mean_auc_invasive
        Scalar mean AUC shown in the legend.
    ci_non_invasive, ci_invasive
        Indexable pair of scalars shown in the legend as the 95% CI.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))

    curves = (
        # (legend name, color, times, auc, band, mean auc, legend CI)
        ('GBS 비침습 모델', 'blue', times_non_invasive, auc_non_invasive,
         conf_int_non_invasive, mean_auc_non_invasive, ci_non_invasive),
        ('GBS 침습 모델', 'orange', times_invasive, auc_invasive,
         conf_int_invasive, mean_auc_invasive, ci_invasive),
    )
    for name, color, times, auc, band, mean_auc, ci in curves:
        plt.plot(times, auc, color=color,
                 label=f'{name} (Mean AUC = {mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))')
        plt.fill_between(times, band[0], band[1], color=color, alpha=0.2)

    plt.xlabel('Time (years)')
    plt.ylabel('AUC')
    plt.title('Time-Dependent AUC for Gradient Boosting Survival Models with 95% Confidence Interval (Test Set)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Fit and evaluate the GBS model on the non-invasive feature set
gbs_model_non_invasive, y_pred_train_non_invasive_gbs, y_pred_test_non_invasive_gbs, c_index_train_non_invasive_gbs, c_index_test_non_invasive_gbs = build_and_evaluate_model_gbs(
    X_train_non_invasive, y_train_non_invasive, X_test_non_invasive, y_test_non_invasive)

# Fit and evaluate the GBS model on the invasive feature set
gbs_model_invasive, y_pred_train_invasive_gbs, y_pred_test_invasive_gbs, c_index_train_invasive_gbs, c_index_test_invasive_gbs = build_and_evaluate_model_gbs(
    X_train_invasive, y_train_invasive, X_test_invasive, y_test_invasive)

# Print the concordance index (element 0 of each returned result)
print(f"비침습 모델 Train - Concordance Index: {c_index_train_non_invasive_gbs[0]:.4f}")
print(f"비침습 모델 Test - Concordance Index: {c_index_test_non_invasive_gbs[0]:.4f}")
print(f"침습 모델 Train - Concordance Index: {c_index_train_invasive_gbs[0]:.4f}")
print(f"침습 모델 Test - Concordance Index: {c_index_test_invasive_gbs[0]:.4f}")

# Build the evaluation time grid from the observed event times.
# Only rows whose 'CVD사망' flag is set are kept (the field is used directly as
# a boolean mask), then the earliest / latest follow-up time is taken.
y_train_events = y_train_non_invasive[y_train_non_invasive['CVD사망']]
train_min_non_invasive, train_max_non_invasive = y_train_events['추적기간_연'].min(), y_train_events['추적기간_연'].max()

y_test_events = y_test_non_invasive[y_test_non_invasive['CVD사망']]
test_min_non_invasive, test_max_non_invasive = y_test_events['추적기간_연'].min(), y_test_events['추적기간_연'].max()

# If the test event-time range is not contained in the train range, warn and
# clip it to the overlap of the two ranges
if not (train_min_non_invasive <= test_min_non_invasive < test_max_non_invasive <= train_max_non_invasive):
    print("경고: 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_non_invasive = max(train_min_non_invasive, test_min_non_invasive)
    test_max_non_invasive = min(train_max_non_invasive, test_max_non_invasive)

# Evaluation grid in steps of 1/12 (presumably one month, with follow-up in
# years — confirm); np.arange excludes the upper end point
times_non_invasive = np.arange(test_min_non_invasive, test_max_non_invasive, 1/12)

# Same time-range handling for the invasive data
# (y_train_events / y_test_events are re-bound here)
y_train_events = y_train_invasive[y_train_invasive['CVD사망']]
train_min_invasive, train_max_invasive = y_train_events['추적기간_연'].min(), y_train_events['추적기간_연'].max()

y_test_events = y_test_invasive[y_test_invasive['CVD사망']]
test_min_invasive, test_max_invasive = y_test_events['추적기간_연'].min(), y_test_events['추적기간_연'].max()

# If the ranges differ, warn and clip to the overlap
if not (train_min_invasive <= test_min_invasive < test_max_invasive <= train_max_invasive):
    print("경고: 침습 테스트 데이터의 시간 범위가 훈련 데이터의 시간 범위와 일치하지 않습니다. 시간 범위를 조정합니다.")
    test_min_invasive = max(train_min_invasive, test_min_invasive)
    test_max_invasive = min(train_max_invasive, test_max_invasive)

times_invasive = np.arange(test_min_invasive, test_max_invasive, 1/12)

# Time-dependent AUC, its mean and its confidence band on the test set
# (calculate_auc_and_confidence_interval is defined elsewhere in the notebook)
gbs_auc_non_invasive, gbs_mean_auc_non_invasive, gbs_conf_int_non_invasive = calculate_auc_and_confidence_interval(
    y_train_non_invasive, y_test_non_invasive, y_pred_test_non_invasive_gbs, times_non_invasive)

gbs_auc_invasive, gbs_mean_auc_invasive, gbs_conf_int_invasive = calculate_auc_and_confidence_interval(
    y_train_invasive, y_test_invasive, y_pred_test_invasive_gbs, times_invasive)

# Summarise each band by the mean of element 0 and the mean of element 1;
# these two numbers are what the plot legends show as the "95% CI"
ci_non_invasive_gbs = (np.mean(gbs_conf_int_non_invasive[0]), np.mean(gbs_conf_int_non_invasive[1]))
ci_invasive_gbs = (np.mean(gbs_conf_int_invasive[0]), np.mean(gbs_conf_int_invasive[1]))

# Plot the results
plot_results(
    times_non_invasive, gbs_auc_non_invasive, gbs_conf_int_non_invasive,
    times_invasive, gbs_auc_invasive, gbs_conf_int_invasive,
    gbs_mean_auc_non_invasive, gbs_mean_auc_invasive,
    ci_non_invasive_gbs, ci_invasive_gbs
)

# 모델별 성능 비교

## 전체 - 구분없이 그림

In [None]:
# Combined plot of all six non-invasive models
def plot_combined_results_non_invasive(times_non_invasive, cox_auc_non_invasive_cph, cox_conf_int_non_invasive_cph,
                                       coxnet_auc_non_invasive, coxnet_conf_int_non_invasive,
                                       rsf_auc_non_invasive, rsf_conf_int_non_invasive,
                                       fkssvm_auc_non_invasive, fkssvm_conf_int_non_invasive,
                                       st_auc_non_invasive, st_conf_int_non_invasive,
                                       gbs_auc_non_invasive, gbs_conf_int_non_invasive,
                                       cox_mean_auc_non_invasive_cph, coxnet_mean_auc_non_invasive,
                                       rsf_mean_auc_non_invasive, fkssvm_mean_auc_non_invasive,
                                       st_mean_auc_non_invasive, gbs_mean_auc_non_invasive,
                                       ci_non_invasive_cph, ci_non_invasive_coxnet, ci_non_invasive_rsf,
                                       ci_non_invasive_fkssvm, ci_non_invasive_st, ci_non_invasive_gbs):
    """Overlay the time-dependent AUC curves of six non-invasive models.

    All curves share ``times_non_invasive`` as x values. For every model the
    caller supplies an AUC curve (``*_auc_*``), a band whose elements 0 and 1
    are drawn as the shaded edges (``*_conf_int_*``), a scalar mean AUC
    (``*_mean_auc_*``) and a two-element CI shown in the legend (``ci_*``).
    Models are drawn in the order Cox PH, Cox Net, RSF, FKSSVM, Survival Tree,
    GBS. The figure is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 8))
    plt.title("Non Invasive - Time dependent AUC with 95% Confidence Interval")

    # One row per model: (legend name, color, AUC curve, band, mean AUC, legend CI)
    model_rows = (
        ("Cox PH", 'blue', cox_auc_non_invasive_cph, cox_conf_int_non_invasive_cph,
         cox_mean_auc_non_invasive_cph, ci_non_invasive_cph),
        ("Cox Net", 'green', coxnet_auc_non_invasive, coxnet_conf_int_non_invasive,
         coxnet_mean_auc_non_invasive, ci_non_invasive_coxnet),
        ("RSF", 'red', rsf_auc_non_invasive, rsf_conf_int_non_invasive,
         rsf_mean_auc_non_invasive, ci_non_invasive_rsf),
        ("FKSSVM", 'purple', fkssvm_auc_non_invasive, fkssvm_conf_int_non_invasive,
         fkssvm_mean_auc_non_invasive, ci_non_invasive_fkssvm),
        ("Survival Tree", 'cyan', st_auc_non_invasive, st_conf_int_non_invasive,
         st_mean_auc_non_invasive, ci_non_invasive_st),
        ("GBS", 'orange', gbs_auc_non_invasive, gbs_conf_int_non_invasive,
         gbs_mean_auc_non_invasive, ci_non_invasive_gbs),
    )
    for name, color, auc_curve, band, mean_auc, ci in model_rows:
        legend_text = f"{name} (Mean AUC={mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))"
        plt.plot(times_non_invasive, auc_curve, label=legend_text, color=color)
        plt.fill_between(times_non_invasive, band[0], band[1], color=color, alpha=0.2)

    plt.xlabel("Time (Years)")
    plt.ylabel("Time dependent AUC")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Combined plot of all six invasive models
def plot_combined_results_invasive(times_invasive, cox_auc_invasive_cph, cox_conf_int_invasive_cph,
                                   coxnet_auc_invasive, coxnet_conf_int_invasive,
                                   rsf_auc_invasive, rsf_conf_int_invasive,
                                   fkssvm_auc_invasive, fkssvm_conf_int_invasive,
                                   st_auc_invasive, st_conf_int_invasive,
                                   gbs_auc_invasive, gbs_conf_int_invasive,
                                   cox_mean_auc_invasive_cph, coxnet_mean_auc_invasive,
                                   rsf_mean_auc_invasive, fkssvm_mean_auc_invasive,
                                   st_mean_auc_invasive, gbs_mean_auc_invasive,
                                   ci_invasive_cph, ci_invasive_coxnet, ci_invasive_rsf,
                                   ci_invasive_fkssvm, ci_invasive_st, ci_invasive_gbs):
    """Overlay the time-dependent AUC curves of six invasive models.

    All curves share ``times_invasive`` as x values. For every model the
    caller supplies an AUC curve (``*_auc_*``), a band whose elements 0 and 1
    are drawn as the shaded edges (``*_conf_int_*``), a scalar mean AUC
    (``*_mean_auc_*``) and a two-element CI shown in the legend (``ci_*``).
    Models are drawn in the order Cox PH, Cox Net, RSF, FKSSVM, Survival Tree,
    GBS. The figure is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 8))
    plt.title("Invasive - Time dependent AUC with 95% Confidence Interval")

    # One row per model: (legend name, color, AUC curve, band, mean AUC, legend CI)
    model_rows = (
        ("Cox PH", 'blue', cox_auc_invasive_cph, cox_conf_int_invasive_cph,
         cox_mean_auc_invasive_cph, ci_invasive_cph),
        ("Cox Net", 'green', coxnet_auc_invasive, coxnet_conf_int_invasive,
         coxnet_mean_auc_invasive, ci_invasive_coxnet),
        ("RSF", 'red', rsf_auc_invasive, rsf_conf_int_invasive,
         rsf_mean_auc_invasive, ci_invasive_rsf),
        ("FKSSVM", 'purple', fkssvm_auc_invasive, fkssvm_conf_int_invasive,
         fkssvm_mean_auc_invasive, ci_invasive_fkssvm),
        ("Survival Tree", 'cyan', st_auc_invasive, st_conf_int_invasive,
         st_mean_auc_invasive, ci_invasive_st),
        ("GBS", 'orange', gbs_auc_invasive, gbs_conf_int_invasive,
         gbs_mean_auc_invasive, ci_invasive_gbs),
    )
    for name, color, auc_curve, band, mean_auc, ci in model_rows:
        legend_text = f"{name} (Mean AUC={mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))"
        plt.plot(times_invasive, auc_curve, label=legend_text, color=color)
        plt.fill_between(times_invasive, band[0], band[1], color=color, alpha=0.2)

    plt.xlabel("Time (Years)")
    plt.ylabel("Time dependent AUC")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Draw the combined non-invasive comparison. Arguments are passed by keyword
# so the 25 values cannot be silently mis-ordered.
plot_combined_results_non_invasive(
    times_non_invasive=times_non_invasive,
    # per-model AUC curve and band
    cox_auc_non_invasive_cph=cox_auc_non_invasive_cph,
    cox_conf_int_non_invasive_cph=cox_conf_int_non_invasive_cph,
    coxnet_auc_non_invasive=coxnet_auc_non_invasive,
    coxnet_conf_int_non_invasive=coxnet_conf_int_non_invasive,
    rsf_auc_non_invasive=rsf_auc_non_invasive,
    rsf_conf_int_non_invasive=rsf_conf_int_non_invasive,
    fkssvm_auc_non_invasive=fkssvm_auc_non_invasive,
    fkssvm_conf_int_non_invasive=fkssvm_conf_int_non_invasive,
    st_auc_non_invasive=st_auc_non_invasive,
    st_conf_int_non_invasive=st_conf_int_non_invasive,
    gbs_auc_non_invasive=gbs_auc_non_invasive,
    gbs_conf_int_non_invasive=gbs_conf_int_non_invasive,
    # per-model mean AUC
    cox_mean_auc_non_invasive_cph=cox_mean_auc_non_invasive_cph,
    coxnet_mean_auc_non_invasive=coxnet_mean_auc_non_invasive,
    rsf_mean_auc_non_invasive=rsf_mean_auc_non_invasive,
    fkssvm_mean_auc_non_invasive=fkssvm_mean_auc_non_invasive,
    st_mean_auc_non_invasive=st_mean_auc_non_invasive,
    gbs_mean_auc_non_invasive=gbs_mean_auc_non_invasive,
    # per-model CI shown in the legend
    ci_non_invasive_cph=ci_non_invasive_cph,
    ci_non_invasive_coxnet=ci_non_invasive_coxnet,
    ci_non_invasive_rsf=ci_non_invasive_rsf,
    ci_non_invasive_fkssvm=ci_non_invasive_fkssvm,
    ci_non_invasive_st=ci_non_invasive_st,
    ci_non_invasive_gbs=ci_non_invasive_gbs,
)

# Draw the combined invasive comparison.
plot_combined_results_invasive(
    times_invasive=times_invasive,
    # per-model AUC curve and band
    cox_auc_invasive_cph=cox_auc_invasive_cph,
    cox_conf_int_invasive_cph=cox_conf_int_invasive_cph,
    coxnet_auc_invasive=coxnet_auc_invasive,
    coxnet_conf_int_invasive=coxnet_conf_int_invasive,
    rsf_auc_invasive=rsf_auc_invasive,
    rsf_conf_int_invasive=rsf_conf_int_invasive,
    fkssvm_auc_invasive=fkssvm_auc_invasive,
    fkssvm_conf_int_invasive=fkssvm_conf_int_invasive,
    st_auc_invasive=st_auc_invasive,
    st_conf_int_invasive=st_conf_int_invasive,
    gbs_auc_invasive=gbs_auc_invasive,
    gbs_conf_int_invasive=gbs_conf_int_invasive,
    # per-model mean AUC
    cox_mean_auc_invasive_cph=cox_mean_auc_invasive_cph,
    coxnet_mean_auc_invasive=coxnet_mean_auc_invasive,
    rsf_mean_auc_invasive=rsf_mean_auc_invasive,
    fkssvm_mean_auc_invasive=fkssvm_mean_auc_invasive,
    st_mean_auc_invasive=st_mean_auc_invasive,
    gbs_mean_auc_invasive=gbs_mean_auc_invasive,
    # per-model CI shown in the legend
    ci_invasive_cph=ci_invasive_cph,
    ci_invasive_coxnet=ci_invasive_coxnet,
    ci_invasive_rsf=ci_invasive_rsf,
    ci_invasive_fkssvm=ci_invasive_fkssvm,
    ci_invasive_st=ci_invasive_st,
    ci_invasive_gbs=ci_invasive_gbs,
)

## 전체 - 투명도 , 스타일 조정

In [None]:
# Combined non-invasive plot that highlights the best model
def plot_combined_results_non_invasive(times_non_invasive, cox_auc_non_invasive_cph, cox_conf_int_non_invasive_cph,
                                       coxnet_auc_non_invasive, coxnet_conf_int_non_invasive,
                                       rsf_auc_non_invasive, rsf_conf_int_non_invasive,
                                       fkssvm_auc_non_invasive, fkssvm_conf_int_non_invasive,
                                       st_auc_non_invasive, st_conf_int_non_invasive,
                                       gbs_auc_non_invasive, gbs_conf_int_non_invasive,
                                       cox_mean_auc_non_invasive_cph, coxnet_mean_auc_non_invasive,
                                       rsf_mean_auc_non_invasive, fkssvm_mean_auc_non_invasive,
                                       st_mean_auc_non_invasive, gbs_mean_auc_non_invasive,
                                       ci_non_invasive_cph, ci_non_invasive_coxnet, ci_non_invasive_rsf,
                                       ci_non_invasive_fkssvm, ci_non_invasive_st, ci_non_invasive_gbs):
    """Overlay six non-invasive AUC curves, emphasising the highest mean AUC.

    The model with the largest mean AUC (first one on ties) is drawn as a
    solid, fully opaque line with a band at alpha 0.2; every other model is
    drawn dashed at alpha 0.4 with a band at alpha 0.1. All curves share
    ``times_non_invasive``; ``*_conf_int_*`` elements 0 and 1 are the band
    edges and ``ci_*`` elements 0 and 1 are printed in the legend. The figure
    is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 8))
    plt.title("Non Invasive - Time dependent AUC with 95% Confidence Interval")

    # Parallel per-model sequences, all in the same model order.
    model_labels = ['Cox PH', 'Cox Net', 'RSF', 'FKSSVM', 'Survival Tree', 'GBS']
    colors = ['blue', 'green', 'red', 'purple', 'cyan', 'orange']
    aucs = [cox_auc_non_invasive_cph, coxnet_auc_non_invasive, rsf_auc_non_invasive,
            fkssvm_auc_non_invasive, st_auc_non_invasive, gbs_auc_non_invasive]
    conf_ints = [cox_conf_int_non_invasive_cph, coxnet_conf_int_non_invasive, rsf_conf_int_non_invasive,
                 fkssvm_conf_int_non_invasive, st_conf_int_non_invasive, gbs_conf_int_non_invasive]
    mean_aucs = [cox_mean_auc_non_invasive_cph, coxnet_mean_auc_non_invasive, rsf_mean_auc_non_invasive,
                 fkssvm_mean_auc_non_invasive, st_mean_auc_non_invasive, gbs_mean_auc_non_invasive]
    cis = [ci_non_invasive_cph, ci_non_invasive_coxnet, ci_non_invasive_rsf,
           ci_non_invasive_fkssvm, ci_non_invasive_st, ci_non_invasive_gbs]

    # Position of the model with the highest mean AUC.
    best_position = mean_aucs.index(max(mean_aucs))

    per_model = zip(model_labels, aucs, conf_ints, mean_aucs, cis, colors)
    for position, (name, auc_curve, band, mean_auc, ci, color) in enumerate(per_model):
        is_best = position == best_position
        plt.plot(times_non_invasive, auc_curve,
                 label=f"{name} (Mean AUC={mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))",
                 color=color,
                 alpha=1.0 if is_best else 0.4,
                 linestyle='-' if is_best else '--')
        plt.fill_between(times_non_invasive, band[0], band[1], color=color,
                         alpha=0.2 if is_best else 0.1)

    plt.xlabel("Time (Years)")
    plt.ylabel("Time dependent AUC")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Combined invasive plot that highlights the best model
def plot_combined_results_invasive(times_invasive, cox_auc_invasive_cph, cox_conf_int_invasive_cph,
                                   coxnet_auc_invasive, coxnet_conf_int_invasive,
                                   rsf_auc_invasive, rsf_conf_int_invasive,
                                   fkssvm_auc_invasive, fkssvm_conf_int_invasive,
                                   st_auc_invasive, st_conf_int_invasive,
                                   gbs_auc_invasive, gbs_conf_int_invasive,
                                   cox_mean_auc_invasive_cph, coxnet_mean_auc_invasive,
                                   rsf_mean_auc_invasive, fkssvm_mean_auc_invasive,
                                   st_mean_auc_invasive, gbs_mean_auc_invasive,
                                   ci_invasive_cph, ci_invasive_coxnet, ci_invasive_rsf,
                                   ci_invasive_fkssvm, ci_invasive_st, ci_invasive_gbs):
    """Overlay six invasive AUC curves, emphasising the highest mean AUC.

    The model with the largest mean AUC (first one on ties) is drawn as a
    solid, fully opaque line with a band at alpha 0.2; every other model is
    drawn dashed at alpha 0.4 with a band at alpha 0.1. All curves share
    ``times_invasive``; ``*_conf_int_*`` elements 0 and 1 are the band edges
    and ``ci_*`` elements 0 and 1 are printed in the legend. The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 8))
    plt.title("Invasive - Time dependent AUC with 95% Confidence Interval")

    # Parallel per-model sequences, all in the same model order.
    model_labels = ['Cox PH', 'Cox Net', 'RSF', 'FKSSVM', 'Survival Tree', 'GBS']
    colors = ['blue', 'green', 'red', 'purple', 'cyan', 'orange']
    aucs = [cox_auc_invasive_cph, coxnet_auc_invasive, rsf_auc_invasive,
            fkssvm_auc_invasive, st_auc_invasive, gbs_auc_invasive]
    conf_ints = [cox_conf_int_invasive_cph, coxnet_conf_int_invasive, rsf_conf_int_invasive,
                 fkssvm_conf_int_invasive, st_conf_int_invasive, gbs_conf_int_invasive]
    mean_aucs = [cox_mean_auc_invasive_cph, coxnet_mean_auc_invasive, rsf_mean_auc_invasive,
                 fkssvm_mean_auc_invasive, st_mean_auc_invasive, gbs_mean_auc_invasive]
    cis = [ci_invasive_cph, ci_invasive_coxnet, ci_invasive_rsf,
           ci_invasive_fkssvm, ci_invasive_st, ci_invasive_gbs]

    # Position of the model with the highest mean AUC.
    best_position = mean_aucs.index(max(mean_aucs))

    per_model = zip(model_labels, aucs, conf_ints, mean_aucs, cis, colors)
    for position, (name, auc_curve, band, mean_auc, ci, color) in enumerate(per_model):
        is_best = position == best_position
        plt.plot(times_invasive, auc_curve,
                 label=f"{name} (Mean AUC={mean_auc:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f}))",
                 color=color,
                 alpha=1.0 if is_best else 0.4,
                 linestyle='-' if is_best else '--')
        plt.fill_between(times_invasive, band[0], band[1], color=color,
                         alpha=0.2 if is_best else 0.1)

    plt.xlabel("Time (Years)")
    plt.ylabel("Time dependent AUC")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Draw the non-invasive comparison with the best model highlighted.
# Arguments are passed by keyword so the 25 values cannot be mis-ordered.
plot_combined_results_non_invasive(
    times_non_invasive=times_non_invasive,
    # per-model AUC curve and band
    cox_auc_non_invasive_cph=cox_auc_non_invasive_cph,
    cox_conf_int_non_invasive_cph=cox_conf_int_non_invasive_cph,
    coxnet_auc_non_invasive=coxnet_auc_non_invasive,
    coxnet_conf_int_non_invasive=coxnet_conf_int_non_invasive,
    rsf_auc_non_invasive=rsf_auc_non_invasive,
    rsf_conf_int_non_invasive=rsf_conf_int_non_invasive,
    fkssvm_auc_non_invasive=fkssvm_auc_non_invasive,
    fkssvm_conf_int_non_invasive=fkssvm_conf_int_non_invasive,
    st_auc_non_invasive=st_auc_non_invasive,
    st_conf_int_non_invasive=st_conf_int_non_invasive,
    gbs_auc_non_invasive=gbs_auc_non_invasive,
    gbs_conf_int_non_invasive=gbs_conf_int_non_invasive,
    # per-model mean AUC
    cox_mean_auc_non_invasive_cph=cox_mean_auc_non_invasive_cph,
    coxnet_mean_auc_non_invasive=coxnet_mean_auc_non_invasive,
    rsf_mean_auc_non_invasive=rsf_mean_auc_non_invasive,
    fkssvm_mean_auc_non_invasive=fkssvm_mean_auc_non_invasive,
    st_mean_auc_non_invasive=st_mean_auc_non_invasive,
    gbs_mean_auc_non_invasive=gbs_mean_auc_non_invasive,
    # per-model CI shown in the legend
    ci_non_invasive_cph=ci_non_invasive_cph,
    ci_non_invasive_coxnet=ci_non_invasive_coxnet,
    ci_non_invasive_rsf=ci_non_invasive_rsf,
    ci_non_invasive_fkssvm=ci_non_invasive_fkssvm,
    ci_non_invasive_st=ci_non_invasive_st,
    ci_non_invasive_gbs=ci_non_invasive_gbs,
)

# Draw the invasive comparison with the best model highlighted.
plot_combined_results_invasive(
    times_invasive=times_invasive,
    # per-model AUC curve and band
    cox_auc_invasive_cph=cox_auc_invasive_cph,
    cox_conf_int_invasive_cph=cox_conf_int_invasive_cph,
    coxnet_auc_invasive=coxnet_auc_invasive,
    coxnet_conf_int_invasive=coxnet_conf_int_invasive,
    rsf_auc_invasive=rsf_auc_invasive,
    rsf_conf_int_invasive=rsf_conf_int_invasive,
    fkssvm_auc_invasive=fkssvm_auc_invasive,
    fkssvm_conf_int_invasive=fkssvm_conf_int_invasive,
    st_auc_invasive=st_auc_invasive,
    st_conf_int_invasive=st_conf_int_invasive,
    gbs_auc_invasive=gbs_auc_invasive,
    gbs_conf_int_invasive=gbs_conf_int_invasive,
    # per-model mean AUC
    cox_mean_auc_invasive_cph=cox_mean_auc_invasive_cph,
    coxnet_mean_auc_invasive=coxnet_mean_auc_invasive,
    rsf_mean_auc_invasive=rsf_mean_auc_invasive,
    fkssvm_mean_auc_invasive=fkssvm_mean_auc_invasive,
    st_mean_auc_invasive=st_mean_auc_invasive,
    gbs_mean_auc_invasive=gbs_mean_auc_invasive,
    # per-model CI shown in the legend
    ci_invasive_cph=ci_invasive_cph,
    ci_invasive_coxnet=ci_invasive_coxnet,
    ci_invasive_rsf=ci_invasive_rsf,
    ci_invasive_fkssvm=ci_invasive_fkssvm,
    ci_invasive_st=ci_invasive_st,
    ci_invasive_gbs=ci_invasive_gbs,
)

# 이전 코드

In [None]:
# Auto ML
#from autogluon.tabular import TabularPredictor
# 결측치 처리
from missingpy import MissForest #sklearn 버전 변경
from lifelines import CoxPHFitter
from sksurv.preprocessing import OneHotEncoder

# 데이터 전처리
#from dataprep.eda import *
import numpy as np
import pandas as pd
import statsmodels.api as sm
import pylab as pl
from sklearn.preprocessing import StandardScaler

# 분류기 , 평가모델
from sklearn.svm import LinearSVC , SVC
from sklearn.linear_model import LogisticRegression
from statsmodels.formula.api import logit
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
#from lightgbm import LGBMClassifier
#from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, classification_report , log_loss
from sklearn.metrics import accuracy_score,recall_score , precision_score , f1_score , roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# 통계분석을 하기 위한 도구 불러오기.
import scipy as sp
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 성능평가를 위한 도구 불러오기
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVR , SVR
from sklearn.linear_model import  LinearRegression
from sklearn.ensemble import RandomForestRegressor , ExtraTreesRegressor
#import xgboost , lightgbm

# 하이퍼 파라미터 튜닝
#import optuna.integration.lightgbm as lgb
from sklearn.model_selection import StratifiedKFold , KFold , cross_val_score , cross_validate
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
from scipy.stats import randint , uniform

# Feature Selection 모듈 -> sklearn.impute 모듈 없다고 나옴.

# 시각화
#import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Render the minus sign as an ASCII hyphen so it still displays with fonts
# that lack the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus']=False
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names, so use the new name whenever the
# installed Matplotlib offers it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

import collections

# Korean font setup
# NOTE(review): 'Malgun Gothic' is presumably only installed on Windows; the
# notebook's first cell imports koreanize_matplotlib for Korean text, so
# confirm this override is still wanted when running on Colab.
plt.rc('font', family='Malgun Gothic')
import io
#import pydot
from IPython.core.display import Image
from sklearn.tree import export_graphviz

#import optuna
#from optuna import Trial, visualization
#from optuna.samplers import TPESampler

from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.nonparametric import kaplan_meier_estimator
import pyreadstat
import sksurv

In [None]:
# Display the column names of the training frame
Data_train.columns

In [None]:
# Keep only the variables that are used downstream
# (predictors, age-group encodings, follow-up time and the CVD-death flag)
feature = ['성별', '나이', 'BMI', 'WHtR', '허벅지둘레', 'SBP', 'DBP',
           '공복혈당', 'LDL', 'HDL', 'TG', '대사증후군여부', '당뇨병여부', '고혈압여부', '고강도mets',
           'TotalMets', '중강도mets', '걷기mets', '나이_5그룹', '나이_4그룹', '나이_3그룹', '나이_2그룹',
           '추적기간_연', 'CVD사망','PA_3group','PA_2group']
# Re-bind each frame to its column subset (raises KeyError if a column is missing)
Data_total = Data_total[feature]
Data_train = Data_train[feature]
Data_test = Data_test[feature]
# NOTE(review): the line that loads Data_rsf is commented out in the notebook's
# first cell — confirm Data_rsf is defined before running this cell.
Data_rsf = Data_rsf[feature]

In [None]:
# Summary statistics of the training frame, shown with two decimals
pd.options.display.float_format = '{:.2f}'.format # fixed-point notation
#pd.options.display.float_format  = '{:.2e}'.format # scientific notation
Data_train.describe()

# 범주형 변수 지정

In [None]:
# Cast column dtypes in place on the three working frames:
# age becomes an integer, and every coded/grouping variable becomes a pandas
# categorical so the one-hot encoder treats it as a factor.
for _frame in (Data_train, Data_test, Data_rsf):
    _frame['나이'] = _frame['나이'].astype('int')
    for _column in ('성별', '대사증후군여부', '당뇨병여부', '고혈압여부',
                    'PA_3group', 'PA_2group',
                    '나이_5그룹', '나이_4그룹', '나이_3그룹', '나이_2그룹'):
        _frame[_column] = _frame[_column].astype('category')

In [None]:
# One-hot encode the categorical columns.
# The encoder is fitted on the development (train) frame only and then applied
# to the validation (test) frame, so both encoded frames are built from the
# same category levels and end up with identical dummy columns. Fitting a
# separate encoder on the test frame can yield different columns whenever a
# level is missing from (or only present in) one of the two frames.
_train_encoder = OneHotEncoder().fit(Data_train)
Data_train_encoding = _train_encoder.transform(Data_train)
Data_test_encoding = _train_encoder.transform(Data_test)
# Data_rsf keeps its own encoder, as before.
# NOTE(review): if Data_rsf is a subsample of the development data it could
# reuse _train_encoder as well — confirm.
Data_rsf_encoding = OneHotEncoder().fit_transform(Data_rsf)

In [None]:
# Display the one-hot encoded test frame
Data_test_encoding

# 결측치 확인

In [None]:
# Missing-value report: one row per column of Data_total with its number and
# share of missing entries, sorted by share (largest first).
missing_data = (
    Data_total.isna().sum()
    .rename_axis('예측변수 이름')
    .reset_index(name='결측치 개수')
)
missing_data['결측치 비율'] = missing_data['결측치 개수'] / len(Data_total)
missing_data = missing_data.sort_values(by='결측치 비율', ascending=False)
# Show only the columns that actually have missing values.
missing_data[missing_data['결측치 비율'] != 0]

# 결측치 대체

In [None]:
# Columns handed to the imputer. Note that this re-binds `feature` from a
# list of column names to a DataFrame.
# NOTE(review): several of these columns ('신장', '체중', '허리둘레',
# '고강도신체활동', ...) are not in the column list that an earlier cell selects
# into Data_total — presumably this cell expects the unreduced data; confirm.
feature = Data_total[['성별', '나이', '신장', '체중', 'BMI', '허리둘레', 'WHtR', '허벅지둘레',
                     'SBP', 'DBP','공복혈당', 'LDL', 'HDL', 'TG',
                     '대사증후군여부', '당뇨병여부', '고혈압여부',
                     '고강도신체활동', '중강도신체활동', '걷기신체활동', '추적기간_연','CVD사망', ]]

In [None]:
# Impute missing values with MissForest using its default settings.
# NOTE(review): no categorical columns are declared to the imputer, so coded
# variables such as '성별' are presumably treated as numeric — confirm.
imputer = MissForest()
# NOTE(review): the result name is spelled `feautre_impute` (sic); it is kept
# unchanged in case other cells refer to it.
feautre_impute = imputer.fit_transform(feature)

# 변수 Log , Sqrt 변환
- 2땀 - 1땀 변화량은 음수도 있으므로 로그와 sqrt 변환이 불가능함.

In [None]:
# Add a log(1 + x) transformed copy of TotalMets to the train and test frames;
# log1p keeps zero values finite.
for _frame in (Data_train, Data_test):
    _frame['TotalMets_log'] = np.log1p(_frame['TotalMets'])

In [None]:
# Histograms of the training frame's columns; the trailing semicolon keeps the
# notebook from echoing the returned axes array
Data_train.hist(figsize=(15,15));

In [None]:
# Compare the distribution of TotalMets before and after the log1p transform
Data_train[['TotalMets_log','TotalMets']].hist(figsize=(10,5));

## 이상치 제거 후 상관관계 다시 분석

In [None]:
# Per-column quartiles, inter-quartile range and the 1.5 * IQR outlier fences.
# NOTE(review): these SW_* / BG_Pre columns do not appear in any of the column
# lists used elsewhere in this notebook — confirm they exist in Data_train.
Q3, Q1 = (
    Data_train[['SW_10min', 'SW_15min', 'SW_20min', 'SW_25min', 'SW_Post',
                'BG_Pre']].quantile(_q)
    for _q in (0.75, 0.25)
)
IQR = Q3 - Q1
print(IQR)

# Values above `upper` or below `lower` count as outliers.
upper = Q3 + 1.5 * IQR
lower = Q1 - 1.5 * IQR
print(upper, lower)

## 다중공선성 확인

In [None]:
import seaborn as sns

# Variables inspected for multicollinearity.
columns = [
    # demographics and anthropometry
    '성별', '나이', '신장', '체중', 'BMI', '허리둘레', 'WHtR', '허벅지둘레',
    # blood pressure and blood tests
    'SBP', 'DBP', '공복혈당', 'LDL', 'HDL', 'TG',
    # diagnoses
    '대사증후군여부', '당뇨병여부', '고혈압여부',
    # physical activity
    '고강도mets', 'TotalMets', '중강도mets', '걷기mets', 'IPAQ_4group_mets_days',
    # outcome
    '추적기간_연', 'CVD사망',
]
Data_corr = Data_total[columns]
corr = Data_corr.corr()

# Annotated heatmap of the pairwise correlation matrix.
plt.figure(figsize=(15, 15))
sns.heatmap(data=corr, annot=True);

# 데이터 성별 분할

In [None]:
# Split the train frame by sex: collect the index labels of the rows coded 1
# and 2, then build each subset by dropping the other group's labels.
# NOTE(review): presumably 1 = male and 2 = female — confirm against the
# codebook. Rows whose '성별' is neither 1 nor 2 end up in both subsets, and
# drop-by-label assumes the index labels are unique.
man_idx = Data_train[Data_train['성별']==1].index
woman_idx = Data_train[Data_train['성별']==2].index
Data_train_man = Data_train.drop(woman_idx)
Data_train_woman = Data_train.drop(man_idx)

# Same split for the test frame
man_idx_t = Data_test[Data_test['성별']==1].index
woman_idx_t = Data_test[Data_test['성별']==2].index
Data_test_man = Data_test.drop(woman_idx_t)
Data_test_woman = Data_test.drop(man_idx_t)

# Train / Validation / Test Data 분할

In [None]:
# Event / non-event counts in the female training subset
Data_train_woman['CVD사망'].value_counts()

In [None]:
# Event / non-event counts in the male training subset
Data_train_man['CVD사망'].value_counts()

In [None]:
# Event / non-event counts in the full data set
Data_total['CVD사망'].value_counts()

In [None]:
# Stratified 70 / 30 split of the complete cases into train and test.
# The frame that is split (X1) still contains the 'CVD사망' column, so the
# resulting X_train / X_test carry the outcome alongside the predictors.
X1 = Data_total.dropna(axis=0)
y1 = X1['CVD사망']
Input1 = X1.drop(columns=['CVD사망'])
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.3, stratify=y1, random_state=42)

# Row counts of each part.
print(f'전체 데이터: {Input1.shape[0]} , 레이블 :{y1.shape[0]}')
print(f'Train: {X_train.shape[0]}  ㅣ 레이블 :{y_train.shape[0]}')
print(f'Test: {X_test.shape[0]}  ㅣ 레이블 :{y_test.shape[0]}')
print()

# Label frequencies per part (0 is printed as survivors, 1 as deaths).
y_total_count, y_train_count, y_test_count = (
    collections.Counter(_labels) for _labels in (y1, y_train, y_test))

# Survivors, deaths and death rate (%) per part.
for _part, _tally in (('전체', y_total_count), ('Train', y_train_count), ('Test', y_test_count)):
    print(f'{_part} 데이터 중 생존 :', _tally[0], ' l ', '사망자 :', _tally[1], ' ㅣ ',
          '사망률 :', _tally[1]/(_tally[0]+_tally[1])*100, '%')

In [None]:
import pyreadstat

# Persist the 70/30 split as SPSS files in the current working directory.
for _frame, _path in ((X_test, 'Test data.sav'), (X_train, 'Model building data.sav')):
    pyreadstat.write_sav(_frame, _path)

In [None]:
import collections  # imported in-cell: the label counts below need it and the cell should run on its own

# Second stratified split: carve a validation set (12.5 %) out of the current
# X_train. X_train / y_train are overwritten with the reduced training part.
X2 = X_train.dropna(axis=0)
y2 = X2.CVD사망
Input2 = X2.drop(['CVD사망'], axis=1)  # features only; used here just for the row count
X_train, X_val, y_train, y_val = train_test_split(
    X2, y2, test_size=0.125, stratify=y2, random_state=42)

print('전체 데이터: {} , 레이블 :{}'.format(Input2.shape[0], y2.shape[0]))
print('Train: {}  ㅣ 레이블 :{}'.format(X_train.shape[0], y_train.shape[0]))
print('Validation: {}  ㅣ 레이블 :{}'.format(X_val.shape[0], y_val.shape[0]))
print()

y_train_count = collections.Counter(y_train)
y_val_count = collections.Counter(y_val)
y_total_count = collections.Counter(y2)

# Survivors (label 0), deaths (label 1) and death rate in percent for each subset.
for _name, _count in (('전체', y_total_count), ('Train', y_train_count), ('Validation', y_val_count)):
    _rate = _count[1] / (_count[0] + _count[1]) * 100
    print(f'{_name} 데이터 중 생존 :', _count[0], ' l ', '사망자 :', _count[1], ' ㅣ ', '사망률 :', _rate, '%')

In [None]:
# Persist the train/validation split as SPSS files in the current working directory.
for _frame, _path in ((X_train, 'Train data.sav'), (X_val, 'Validation data.sav')):
    pyreadstat.write_sav(_frame, _path)

In [None]:
# Give the four data sets uniform names and show their (rows, columns) shapes.
Total_data, Train_data, Val_data, Test_data = Data_total, X_train, X_val, X_test
for _frame in (Total_data, Train_data, Val_data, Test_data):
    print(_frame.shape)

In [None]:
# Separate the CVD-death label from the predictor columns for each split.
_label = 'CVD사망'
_splits = (Train_data, Val_data, Test_data)

y_train, y_val, y_test = (frame[_label] for frame in _splits)
Input_train, Input_val, Input_test = (frame.drop(columns=[_label]) for frame in _splits)

Input_train.head()

# Kaplan meier estimator - 생존률 추정

In [None]:
# Global matplotlib font sizes used by the survival plots below.
plt.rcParams.update({
    'font.size': 20,         # base font size
    'axes.labelsize': 15,    # x/y axis labels
    'xtick.labelsize': 13,   # x tick labels
    'ytick.labelsize': 13,   # y tick labels
    'legend.fontsize': 20,   # legend entries
    'figure.titlesize': 30,  # figure-level title
})

In [None]:
import sksurv

# Structured (event, time) arrays in the format scikit-survival expects,
# built from the development and validation frames.
y2 = sksurv.util.Surv.from_arrays(event=Data_train['CVD사망'], time=Data_train['추적기간_연'])
y2_test = sksurv.util.Surv.from_arrays(event=Data_test['CVD사망'], time=Data_test['추적기간_연'])

In [None]:
from lifelines import KaplanMeierFitter
from lifelines.plotting import add_at_risk_counts

# Kaplan-Meier survival curve for the whole development cohort.
T = Data_train['추적기간_연']  # follow-up time in years
E = Data_train['CVD사망']      # event indicator (CVD death)
kmf = KaplanMeierFitter().fit(T, event_observed=E)

plt.figure(figsize=(11, 10))
kmf.plot_survival_function()
# Raw string: '\h' in '\hat' is not a valid Python escape sequence and triggers
# a warning in a normal string literal; the text passed to matplotlib is unchanged.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.title('전체대상자의 Kaplan-Meier 생존곡선')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf)

In [None]:
from lifelines.plotting import add_at_risk_counts

# Kaplan-Meier survival curves stratified by sex (성별 == 1 is plotted as male,
# every other row as female).
T = Data_train['추적기간_연']
E = Data_train['CVD사망']

plt.figure(figsize=(11, 10))
성별 = (Data_train['성별'] == 1)
ax = plt.subplot(111)

kmf1 = KaplanMeierFitter().fit(T[~성별], event_observed=E[~성별], label='여성')
kmf2 = KaplanMeierFitter().fit(T[성별], event_observed=E[성별], label='남성')
for _fitter in (kmf1, kmf2):
    _fitter.plot_survival_function(ax=ax)

plt.title('성별에 따른 Kaplan-Meier 생존곡선')
# Raw string: '\h' in '\hat' is not a valid Python escape sequence in a normal literal.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.tight_layout()

In [None]:
from lifelines.plotting import add_at_risk_counts

# Kaplan-Meier survival curves stratified by metabolic-syndrome status.
T = Data_train['추적기간_연']
E = Data_train['CVD사망']

plt.figure(figsize=(11, 10))
대사증후군 = (Data_train['대사증후군여부'] == 1)
ax = plt.subplot(111)

kmf1 = KaplanMeierFitter().fit(T[~대사증후군], event_observed=E[~대사증후군], label='대사증후군 없음')
kmf2 = KaplanMeierFitter().fit(T[대사증후군], event_observed=E[대사증후군], label='대사증후군')
for _fitter in (kmf1, kmf2):
    _fitter.plot_survival_function(ax=ax)

plt.title('대사증후군 유무에 따른 Kaplan-Meier 생존곡선')
# Raw string: '\h' in '\hat' is not a valid Python escape sequence in a normal literal.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.tight_layout()

In [None]:
from lifelines.plotting import add_at_risk_counts

# Kaplan-Meier survival curves stratified by diabetes status.
# T (follow-up time) and E (event indicator) are reused from the preceding KM cells.
plt.figure(figsize=(11, 10))
당뇨병 = (Data_train['당뇨병여부'] == 1)
ax = plt.subplot(111)

kmf1 = KaplanMeierFitter().fit(T[~당뇨병], event_observed=E[~당뇨병], label='당뇨병 없음')
kmf2 = KaplanMeierFitter().fit(T[당뇨병], event_observed=E[당뇨병], label='당뇨병')
for _fitter in (kmf1, kmf2):
    _fitter.plot_survival_function(ax=ax)

plt.title('당뇨병 유무에 따른 Kaplan-Meier 생존곡선')
# Raw string: '\h' in '\hat' is not a valid Python escape sequence in a normal literal.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.tight_layout()

In [None]:
from lifelines.plotting import add_at_risk_counts

# Kaplan-Meier survival curves stratified by hypertension status.
# T (follow-up time) and E (event indicator) are reused from the preceding KM cells.
plt.figure(figsize=(11, 10))
고혈압 = (Data_train['고혈압여부'] == 1)
ax = plt.subplot(111)

kmf1 = KaplanMeierFitter().fit(T[~고혈압], event_observed=E[~고혈압], label='고혈압 없음')
kmf2 = KaplanMeierFitter().fit(T[고혈압], event_observed=E[고혈압], label='고혈압 있음')
for _fitter in (kmf1, kmf2):
    _fitter.plot_survival_function(ax=ax)

plt.title('고혈압 유무에 따른 Kaplan-Meier 생존곡선')
# Raw string: '\h' in '\hat' is not a valid Python escape sequence in a normal literal.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.tight_layout()

In [None]:
from lifelines.plotting import add_at_risk_counts

# Kaplan-Meier survival curves for the three physical-activity groups
# (PA_3group == 1, 2, 3), drawn with confidence bands.
# T (follow-up time) and E (event indicator) are reused from the preceding KM cells.
plt.figure(figsize=(11, 11))
ax = plt.subplot(111)

PA1 = (Data_train['PA_3group'] == 1)
PA2 = (Data_train['PA_3group'] == 2)
PA3 = (Data_train['PA_3group'] == 3)
kmf1 = KaplanMeierFitter().fit(T[PA1], event_observed=E[PA1], label='비활동군(Inactivity)')
kmf2 = KaplanMeierFitter().fit(T[PA2], event_observed=E[PA2], label='최소활동군(Minimally active)')
kmf3 = KaplanMeierFitter().fit(T[PA3], event_observed=E[PA3], label='활동군(Active)')
for _fitter in (kmf1, kmf2, kmf3):
    _fitter.plot_survival_function(ax=ax)

plt.title('신체활동수준에 따른 Kaplan-Meier 생존곡선')
# Raw string: '\h' in '\hat' is not a valid Python escape sequence in a normal literal.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf1, kmf2, kmf3, ax=ax)
plt.tight_layout()

In [None]:
from lifelines.plotting import add_at_risk_counts

# Same three-group physical-activity comparison as the previous cell, but with
# the confidence bands hidden (ci_show=False) so the curves are easier to tell apart.
# T (follow-up time) and E (event indicator) are reused from the preceding KM cells.
plt.figure(figsize=(11, 11))
ax = plt.subplot(111)

PA1 = (Data_train['PA_3group'] == 1)
PA2 = (Data_train['PA_3group'] == 2)
PA3 = (Data_train['PA_3group'] == 3)
kmf1 = KaplanMeierFitter().fit(T[PA1], event_observed=E[PA1], label='비활동군(Inactivity)')
kmf2 = KaplanMeierFitter().fit(T[PA2], event_observed=E[PA2], label='최소활동군(Minimally active)')
kmf3 = KaplanMeierFitter().fit(T[PA3], event_observed=E[PA3], label='활동군(Active)')
for _fitter in (kmf1, kmf2, kmf3):
    _fitter.plot_survival_function(ax=ax, ci_show=False)

plt.title('신체활동수준에 따른 Kaplan-Meier 생존곡선')
# Raw string: '\h' in '\hat' is not a valid Python escape sequence in a normal literal.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf1, kmf2, kmf3, ax=ax)
plt.tight_layout()

생존률에 영향을 미치는 변인은 다양함 (개인의 나이, 질병 고위험군 여부, 비만도 등).
Kaplan-Meier 추정에서는 이러한 변인을 공변량으로 사용하지 못함.
-> Cox PH 모델을 사용하여 교란변인을 통제한 뒤, 사망예측 요인으로서 신체활동 변수의 중요도를 파악하고자 함.

In [None]:
from lifelines.plotting import add_at_risk_counts

# Kaplan-Meier survival curves for the two-level physical-activity grouping
# (PA_2group == 0 vs. == 1), without confidence bands.
# T (follow-up time) and E (event indicator) are reused from the preceding KM cells.
plt.figure(figsize=(11, 11))
ax = plt.subplot(111)

PA1 = (Data_train['PA_2group'] == 0)
PA2 = (Data_train['PA_2group'] == 1)
kmf1 = KaplanMeierFitter().fit(T[PA1], event_observed=E[PA1], label='비활동군(Inactivity)')
kmf2 = KaplanMeierFitter().fit(
    T[PA2], event_observed=E[PA2], label='최소활동군+활동군(Minimally active + Active)')
for _fitter in (kmf1, kmf2):
    _fitter.plot_survival_function(ax=ax, ci_show=False)

plt.title('신체활동수준에 따른 Kaplan-Meier 생존곡선')
# Raw string: '\h' in '\hat' is not a valid Python escape sequence in a normal literal.
plt.ylabel(r'est. probability of survival $\hat{S}(t)$')
plt.xlabel('time t (year)')
plt.xticks(list(range(1, 11)))
add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.tight_layout()

# Log Rank test

In [None]:
pd.options.display.float_format = '{:.3f}'.format  # fixed-point display instead of scientific notation
from lifelines.statistics import logrank_test

# Two-group subsets of the development data for each binary factor (0 vs. 1).
대사1 = Data_train[Data_train['대사증후군여부'] == 0]
대사2 = Data_train[Data_train['대사증후군여부'] == 1]
당뇨1 = Data_train[Data_train['당뇨병여부'] == 0]
당뇨2 = Data_train[Data_train['당뇨병여부'] == 1]
혈압1 = Data_train[Data_train['고혈압여부'] == 0]
혈압2 = Data_train[Data_train['고혈압여부'] == 1]
신체1 = Data_train[Data_train['PA_2group'] == 0]
신체2 = Data_train[Data_train['PA_2group'] == 1]

# Log-rank test p-value for each factor, comparing the two groups' survival.
for _name, _grp0, _grp1 in (
    ('대사증후군', 대사1, 대사2),
    ('당뇨병', 당뇨1, 당뇨2),
    ('혈압', 혈압1, 혈압2),
    ('신체활동', 신체1, 신체2),
):
    _result = logrank_test(
        _grp0['추적기간_연'], _grp1['추적기간_연'],
        event_observed_A=_grp0['CVD사망'], event_observed_B=_grp1['CVD사망'])
    print(_name, _result.p_value)

In [None]:
# List the columns available in the development data.
Data_train.columns

In [None]:
from lifelines import NelsonAalenFitter

# Nelson-Aalen estimate of the cumulative hazard for the whole development cohort.
# T (follow-up time) and E (event indicator) come from the Kaplan-Meier cells above.
naf = NelsonAalenFitter().fit(T, event_observed=E)
naf.plot_cumulative_hazard(ci_alpha=0.05, ci_force_lines=True)
# The plotted quantity is a cumulative hazard, not a survival curve, so the
# title says so (it previously read "survival curve").
plt.title('전체대상자의 누적위험함수 (Nelson-Aalen)');

# scikit-survival (sksurv) Cox 회귀

## 시간종속 변수 생성

In [None]:
# Add age x follow-up-time interaction columns to each data set.
# NOTE(review): these columns are computed from each row's own follow-up time
# (the outcome), so using them as predictors may leak the target — confirm intent.
# NOTE(review): np.log yields -inf for a follow-up time of 0 — confirm none occur.
# NOTE(review): Data_rsf is built in a later cell of this notebook; that cell
# has to be run before this one.
for _frame in (Data_rsf, Data_test, Data_train):
    _follow_up = _frame['추적기간_연']
    _frame['나이_time'] = _follow_up * _frame['나이']
    _frame['나이_log_time'] = np.log(_follow_up) * _frame['나이']

In [None]:
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sksurv.linear_model import CoxPHSurvivalAnalysis

## Sksurv에 맞는 형태로 변환
- sksurv.util.Surv.from_arrays(Data_train['CVD사망'] , Data_train['추적기간_연'])

In [None]:
# Structured (event, time) survival arrays for the sex-specific subsets.
import sksurv

_to_surv = sksurv.util.Surv.from_arrays

y2_man = _to_surv(event=Data_train_man['CVD사망'], time=Data_train_man['추적기간_연'])
y2_woman = _to_surv(event=Data_train_woman['CVD사망'], time=Data_train_woman['추적기간_연'])

y2_man_test = _to_surv(event=Data_test_man['CVD사망'], time=Data_test_man['추적기간_연'])
y2_woman_test = _to_surv(event=Data_test_woman['CVD사망'], time=Data_test_woman['추적기간_연'])

In [None]:
import sksurv

# Structured (event, time) survival arrays for the development, validation and
# down-sampled (Data_rsf) data sets.
y2, y2_test, y2_rsf = (
    sksurv.util.Surv.from_arrays(event=frame['CVD사망'], time=frame['추적기간_연'])
    for frame in (Data_train, Data_test, Data_rsf)
)

In [None]:
# List the columns available in the male development subset.
Data_train_man.columns

In [None]:
# Nested predictor sets for the Cox models, using the 4-level age group.
# Each set extends the demographic base with adiposity (WHtR), chronic
# conditions and/or physical-activity dummies.
_conditions = ['대사증후군여부', '당뇨병여부', '고혈압여부']
_activity = ['신체활동량moderate', '신체활동량high']

feature1 = ['성별', '나이_4그룹']
feature2 = feature1 + ['WHtR']
feature3 = feature2 + _conditions
feature4 = feature2 + _activity
feature5 = feature4 + _conditions

In [None]:
# Alternative predictor sets that use the age x time interaction column in
# place of the age group. Running this cell overwrites feature1..feature5
# defined with '나이_4그룹'.
_conditions = ['대사증후군여부', '당뇨병여부', '고혈압여부']
_activity = ['신체활동량moderate', '신체활동량high']

feature1 = ['성별', '나이_time']
feature2 = feature1 + ['WHtR']
feature3 = feature2 + _conditions
feature4 = feature2 + _activity
feature5 = feature4 + _conditions

In [None]:
# Fit one scikit-survival Cox PH model per predictor set on the development data.
estimator1, estimator2, estimator3, estimator4, estimator5 = [
    CoxPHSurvivalAnalysis().fit(Data_train[_features], y2)
    for _features in (feature1, feature2, feature3, feature4, feature5)
]

In [None]:
# Fit the same five Cox PH specifications on the down-sampled Data_rsf set and
# compute their risk scores on the validation data.
_cox_feature_sets = (feature1, feature2, feature3, feature4, feature5)

estimator_cox1, estimator_cox2, estimator_cox3, estimator_cox4, estimator_cox5 = [
    CoxPHSurvivalAnalysis().fit(Data_rsf[_features], y2_rsf)
    for _features in _cox_feature_sets
]
_cox_estimators = (estimator_cox1, estimator_cox2, estimator_cox3, estimator_cox4, estimator_cox5)

risk_score1_cox, risk_score2_cox, risk_score3_cox, risk_score4_cox, risk_score5_cox = [
    _estimator.predict(Data_test[_features])
    for _estimator, _features in zip(_cox_estimators, _cox_feature_sets)
]

In [None]:
from sksurv.metrics import concordance_index_censored


def _predict_and_score(estimator, features, frame, surv):
    """Return (risk scores, concordance_index_censored result) for one model on one data set."""
    prediction = estimator.predict(frame[features])
    result = concordance_index_censored(surv["event"], surv["time"], prediction)
    return prediction, result


# Harrell's C-index of each Cox model on the development and validation data.
prediction1_train, result1_train = _predict_and_score(estimator1, feature1, Data_train, y2)
prediction1_test, result1_test = _predict_and_score(estimator1, feature1, Data_test, y2_test)
prediction2_train, result2_train = _predict_and_score(estimator2, feature2, Data_train, y2)
prediction2_test, result2_test = _predict_and_score(estimator2, feature2, Data_test, y2_test)
prediction3_train, result3_train = _predict_and_score(estimator3, feature3, Data_train, y2)
prediction3_test, result3_test = _predict_and_score(estimator3, feature3, Data_test, y2_test)
prediction4_train, result4_train = _predict_and_score(estimator4, feature4, Data_train, y2)
prediction4_test, result4_test = _predict_and_score(estimator4, feature4, Data_test, y2_test)
prediction5_train, result5_train = _predict_and_score(estimator5, feature5, Data_train, y2)
prediction5_test, result5_test = _predict_and_score(estimator5, feature5, Data_test, y2_test)

# Element 0 of each result tuple is the concordance index. Only the five models
# evaluated above are printed; the former result6/result7 prints referred to
# values this cell never computes and raised NameError.
for _train_result, _test_result in (
    (result1_train, result1_test),
    (result2_train, result2_test),
    (result3_train, result3_test),
    (result4_train, result4_test),
    (result5_train, result5_test),
):
    print(_train_result[0], _test_result[0])

## 시간종속 AUC plot

In [None]:
# Cumulative/dynamic AUC of each Cox model's validation risk scores, evaluated
# every quarter of a year from 0.25 up to (but excluding) 10.
times = np.arange(0.25, 10, 0.25)
_auc_results = [
    cumulative_dynamic_auc(y2, y2_test, _scores, times)
    for _scores in (risk_score1_cox, risk_score2_cox, risk_score3_cox,
                    risk_score4_cox, risk_score5_cox)
]
(
    (auc1_cox, mean_auc1_cox),
    (auc2_cox, mean_auc2_cox),
    (auc3_cox, mean_auc3_cox),
    (auc4_cox, mean_auc4_cox),
    (auc5_cox, mean_auc5_cox),
) = _auc_results

In [None]:
# Plot the time-dependent AUC curves of the five Cox models on one figure.
plt.rcParams.update({
    'legend.fontsize': 15,  # legend entries
    'axes.labelsize': 15,   # x/y axis labels
    'xtick.labelsize': 13,  # x tick labels
    'ytick.labelsize': 13,  # y tick labels
})

plt.figure(figsize=(12, 10))
plt.xticks(list(range(1, 11)))

_curves = (
    (auc1_cox, mean_auc1_cox),
    (auc2_cox, mean_auc2_cox),
    (auc3_cox, mean_auc3_cox),
    (auc4_cox, mean_auc4_cox),
    (auc5_cox, mean_auc5_cox),
)
for _idx, (_auc, _mean_auc) in enumerate(_curves, start=1):
    # The legend entry shows the model number and its mean AUC over `times`.
    plt.plot(times, _auc, marker="o",
             label="Cox{} (mean AUC = {:.3f})".format(_idx, _mean_auc))

plt.xlabel("Years from enrollment")
plt.ylabel("Time-dependent AUC")
plt.legend(loc="lower center")
plt.grid(True)

In [None]:
# Time-dependent AUC of a single set of risk scores, evaluated once per year
# starting at 0.4, with the mean AUC drawn as a dashed horizontal line.
# NOTE(review): `risk_score` is not assigned anywhere in the visible part of
# this notebook — confirm which model's scores are intended here.
times = np.arange(0.4, 9.9, 1)
auc, mean_auc = cumulative_dynamic_auc(y2, y2_test, risk_score, times)

_legend_label = "CoxPH (mean AUC = {:.3f})".format(mean_auc)
plt.plot(times, auc, marker="o", label=_legend_label)
plt.axhline(mean_auc, linestyle="--")
plt.xlabel("Years from enrollment")
plt.ylabel("time-dependent AUC")
plt.legend(loc="lower center")
plt.grid(True)

![image.png](attachment:image.png)

In [None]:
# IPCW concordance index of the first Cox model on the validation data;
# the development outcomes (y2) are passed as the training survival data.
_test_risk = estimator1.predict(Data_test[feature1])
ci_ipcw = concordance_index_ipcw(y2, y2_test, _test_risk)
ci_ipcw

In [None]:
# NOTE(review): this looks like a pasted call template rather than runnable code —
# survival_train, survival_test, 추정 and 시간 are not assigned anywhere in the
# visible part of the notebook. Confirm, then replace with real arguments or remove.
sksurv.metrics.cumulative_dynamic_auc(survival_train , survival_test , 추정 , 시간 , tie_tol = 1e-08 )

In [None]:
# Predict survival functions for new samples.
# NOTE(review): `estimator` and `x_new` are not assigned in the visible part of
# the notebook (the fitted models are named estimator1..estimator7) — confirm.
pred_surv = estimator.predict_survival_function(x_new)

# Survival Forest

데이터를 분할하여 학습, 평가는 그대로
- 학습데이터의 사망자 모두 추출
- 학습데이터의 생존자 중 일부 추출하여 concat

# Survival Forest 용 데이터 만들기

In [None]:
# Shape of the survivor subset.
# NOTE(review): 생존_train is first assigned in the next cell, so this cell only
# works after that one has been run.
생존_train.shape

In [None]:
# Down-sample the survivors (CVD사망 == 0) of the development data to 1 % so the
# survival-forest models fit in memory.
생존_train = Data_train[Data_train['CVD사망'] == 0]
# Fixed random_state so the sample is reproducible regardless of cell execution
# order, matching the other random steps in this notebook (all seeded with 42).
Train_rsf = 생존_train.sample(frac=0.01, random_state=42)
Train_rsf

In [None]:
# All CVD deaths from the development and validation data, stacked for inspection.
사망_train = Data_train[Data_train['CVD사망'] == 1]
# The event column is coded 0/1 everywhere else in this notebook; the former
# comparison with 2 never matched, which left this frame empty.
사망_test = Data_test[Data_test['CVD사망'] == 1]
pd.concat([사망_train, 사망_test])

In [None]:
import pyreadstat

# Build the reduced training set for the survival-forest models: every CVD death
# in the development data plus the 1 % sample of survivors (Train_rsf).
사망_train = Data_train[Data_train['CVD사망'] == 1]
# The event column is coded 0/1 everywhere else in this notebook; the former
# comparison with 2 never matched. 사망_test is not part of Data_rsf.
사망_test = Data_test[Data_test['CVD사망'] == 1]
Data_rsf = pd.concat([사망_train, Train_rsf])

# Persist the reduced set as an SPSS file in the current working directory.
pyreadstat.write_sav(Data_rsf, 'Data_rsf.sav')

In [None]:
import sksurv

# Structured (event, time) survival array for the down-sampled training set.
y2_rsf = sksurv.util.Surv.from_arrays(event=Data_rsf['CVD사망'], time=Data_rsf['추적기간_연'])

In [None]:
# Predictor sets using continuous age, and one Cox PH model per set fitted on
# the development data. feature7 holds the same columns as feature5.
_conditions = ['대사증후군여부', '당뇨병여부', '고혈압여부']
_activity = ['신체활동량moderate', '신체활동량high']

feature1 = ['성별', '나이']
feature2 = feature1 + ['WHtR']
feature3 = feature2 + _conditions
feature4 = feature2 + ['신체활동량active']
feature5 = feature2 + _activity
feature6 = feature5 + _conditions
feature7 = feature2 + _activity

(estimator1, estimator2, estimator3, estimator4,
 estimator5, estimator6, estimator7) = [
    CoxPHSurvivalAnalysis().fit(Data_train[_features], y2)
    for _features in (feature1, feature2, feature3, feature4,
                      feature5, feature6, feature7)
]

In [None]:
# Columns kept for the random-survival-forest experiment.
# NOTE(review): the list contains the outcome columns '추적기간_연' and 'CVD사망';
# if it is also used as the model's predictor matrix, the outcome leaks into the
# features — confirm and split predictors from outcome columns.
feature_rsf = ['성별', '나이', 'WHtR',
           '고강도mets','중강도mets', '걷기mets',
           '추적기간_연', 'CVD사망','PA_3group']
#Data_train = Data_build[feature]
# Data_test is replaced by this column subset.
# NOTE(review): Data_validation is not assigned in the visible part of the
# notebook — confirm where it is defined.
Data_test = Data_validation[feature_rsf]
# Rebuild the validation survival array from the replaced Data_test.
y2_test = sksurv.util.Surv.from_arrays(Data_test['CVD사망'] , Data_test['추적기간_연'])

In [None]:
from sksurv.ensemble import RandomSurvivalForest

# Random survival forest with fixed hyper-parameters and a fixed seed.
_rsf_params = {
    'n_estimators': 1000,
    'min_samples_split': 10,
    'min_samples_leaf': 15,
    'max_depth': 40,
    'n_jobs': -1,
    'random_state': 42,
}
rsf = RandomSurvivalForest(**_rsf_params)
# NOTE(review): the features come from Data_build while the target y2_rsf is
# built from Data_rsf; Data_build is not assigned in the visible part of the
# notebook — confirm both refer to the same rows.
# NOTE(review): feature_rsf, as defined in this notebook, includes the outcome
# columns ('추적기간_연', 'CVD사망') — confirm they should be predictors.
rsf.fit(Data_build[feature_rsf], y2_rsf)

In [None]:
# Concordance index of the forest on its (down-sampled) training data.
# Uses feature_rsf, the same column list the model was fitted with; the former
# name `feature_` is not defined in the visible part of this notebook.
rsf.score(Data_rsf[feature_rsf], y2_rsf)

In [None]:
# Score of the fitted forest on the validation data, using the same columns as in training.
rsf.score(Data_test[feature_rsf], y2_test)

# Lifelines Cox

In [None]:
from lifelines import CoxPHFitter

## 비례위험가정 확인

In [None]:
# Cut age into four equal-width bins and show the resulting intervals.
cat = pd.cut(Data_rsf['나이'], bins=4)
cat

In [None]:
# Covariate x f(follow-up time) interaction columns, added to all three data
# sets in the same order as before: time, time (WHtR), ln(time), log10(time).
# NOTE(review): '나이_log_time' is defined here with log10, whereas the earlier
# sksurv cell defined the same column with the natural log — confirm which is intended.
# NOTE(review): these columns are derived from each row's own follow-up time
# (the outcome); using them as predictors may leak the target — confirm intent.
_interactions = (
    ('나이_time', '나이', lambda t: t),       # time
    ('WHtR_time', 'WHtR', lambda t: t),       # time
    ('나이_ln_time', '나이', np.log),         # ln(time)
    ('나이_log_time', '나이', np.log10),      # log10(time)
)
for _new_col, _base_col, _transform in _interactions:
    for _frame in (Data_rsf, Data_test, Data_train):
        _frame[_new_col] = _transform(_frame['추적기간_연']) * _frame[_base_col]

In [None]:
from lifelines.statistics import proportional_hazard_test

# Fit a lifelines Cox model on the down-sampled data and run the
# proportional-hazards assumption checks (with diagnostic plots) at p < 0.05.
cox = CoxPHFitter()
cox.fit(
    Data_rsf,
    duration_col='추적기간_연',
    event_col='CVD사망',
    formula='나이_4그룹+성별+WHtR+대사증후군여부+당뇨병여부+고혈압여부',
    show_progress=True,
)
cox.check_assumptions(Data_rsf, p_value_threshold=0.05, show_plots=True)

In [None]:
# Print the fitted model's summary for the assumption-check model.
cox.print_summary()

## COX 회귀식

In [None]:
# Formula strings for the nested lifelines Cox models: a demographic base,
# then WHtR, chronic conditions and/or physical-activity terms.
_conditions = '+대사증후군여부+당뇨병여부+고혈압여부'
_activity = '+신체활동량moderate+신체활동량high'

formula1 = '성별+나이'
formula2 = formula1 + '+WHtR'
formula3 = formula2 + _conditions
formula4 = formula2 + '+신체활동량active'
formula5 = formula2 + _activity
formula6 = formula5 + _conditions

In [None]:
# Cox model with sex, age and the physical-activity dummies only (no WHtR).
formula7 = '성별+나이+신체활동량moderate+신체활동량high'
cph7 = CoxPHFitter().fit(
    Data_train,
    duration_col='추적기간_연',
    event_col='CVD사망',
    formula=formula7,
    show_progress=True,
)

In [None]:
# Print the summary of model 7 (sex + age + physical activity).
cph7.print_summary()

In [None]:
# Fit the six nested lifelines Cox models (formula1..formula6) on the development data.
_fit_kwargs = {
    'duration_col': '추적기간_연',
    'event_col': 'CVD사망',
    'show_progress': True,
}
cph1 = CoxPHFitter().fit(Data_train, formula=formula1, **_fit_kwargs)
cph2 = CoxPHFitter().fit(Data_train, formula=formula2, **_fit_kwargs)
cph3 = CoxPHFitter().fit(Data_train, formula=formula3, **_fit_kwargs)
cph4 = CoxPHFitter().fit(Data_train, formula=formula4, **_fit_kwargs)
cph5 = CoxPHFitter().fit(Data_train, formula=formula5, **_fit_kwargs)
cph6 = CoxPHFitter().fit(Data_train, formula=formula6, **_fit_kwargs)

In [None]:
# Summary of model 1 (formula1).
cph1.print_summary()

In [None]:
# Summary of model 2 (formula2).
cph2.print_summary()

In [None]:
# Summary of model 3 (formula3).
cph3.print_summary()

In [None]:
# Summary of model 4 (formula4).
cph4.print_summary()

In [None]:
# Summary of model 5 (formula5).
cph5.print_summary()

In [None]:
# Summary of model 6 (formula6).
cph6.print_summary()

In [None]:
# Plot the fitted model via its plot() method.
# NOTE(review): `cph` is not assigned in the visible part of the notebook
# (the fitted models are cph1..cph7) — confirm which model is meant.
cph.plot()

In [None]:
# Show the model's baseline_hazard_ attribute.
# NOTE(review): `cph` is not assigned in the visible part of the notebook — confirm.
cph.baseline_hazard_

In [None]:
# Plot predicted survival functions for validation rows 5-9.
# NOTE(review): `cph` is not assigned in the visible part of the notebook — confirm.
cph.predict_survival_function(Data_test.iloc[5:10,:]).plot()

In [None]:
# Plot predicted survival functions for rows 40-59 of Data_death.
# NOTE(review): Data_death is assigned in a later cell and `cph` is not assigned
# in the visible part of the notebook — confirm execution order and model.
cph.predict_survival_function(Data_death.iloc[40:60,:]).plot(figsize=(10,10))

In [None]:
# Predicted survival functions for the first 20 rows of Data_death.
# NOTE(review): Data_death is assigned in a later cell and `cph` is not assigned
# in the visible part of the notebook — confirm execution order and model.
cph.predict_survival_function(Data_death.iloc[:20,:])

In [None]:
# Validation rows whose event indicator marks a CVD death.
_is_cvd_death = Data_test['CVD사망'] == 1
Data_death = Data_test.loc[_is_cvd_death]
Data_death

In [None]:
# Partial-effects plot, varying IPAQ_4group_mets_days over the values 0-3.
# NOTE(review): `cph` is not assigned in the visible part of the notebook, and
# none of the visible formulas include IPAQ_4group_mets_days — confirm the model.
cph.plot_partial_effects_on_outcome(covariates='IPAQ_4group_mets_days', values=[0, 1, 2, 3],
                                     figsize=(15,15))

In [None]:
# Proportional-hazards assumption check with diagnostic plots.
# NOTE(review): neither `cph` nor `Data` is assigned in the visible part of the
# notebook — confirm which model and data frame are meant.
cph.check_assumptions(Data, show_plots=True)

In [None]:
import statsmodels.formula.api as smf

# Multinomial-logit model of the CVD-death label on the full predictor set,
# reported as odds ratios (exponentiated coefficients, 3 decimals).
# NOTE(review): this reassigns formula1/formula2, which the lifelines Cox cells
# also use (there without the 'CVD사망 ~' prefix) — re-run the formula cell
# before refitting the Cox models.
_base_terms = ['성별', '나이', 'WHtR', '신체활동량moderate', '신체활동량high']
_condition_terms = ['대사증후군여부', '당뇨병여부', '고혈압여부']

formula1 = 'CVD사망 ~ ' + '+'.join(_base_terms)
formula2 = 'CVD사망 ~ ' + '+'.join(_base_terms + _condition_terms)

result1 = smf.mnlogit(formula=formula2, data=X1).fit()
odds_ratio1 = np.exp(result1.params).round(3)
print('Odds :\n' , odds_ratio1)
result1.summary()