In [None]:
import pandas as pd
import shutil
import os
import numpy as np
import matplotlib.pyplot as plt
import onekey_algo.custom.components as okcomp
from onekey_algo import get_param_in_cwd

# Raise the default figure DPI and fix the display order of the signatures
# compared throughout the notebook.
plt.rcParams['figure.dpi'] = 300
model_names = ['Clinical', 'DLRadiomics', 'Pathomics', 'Nomogram']

# Settings are looked up with get_param_in_cwd; a falsy result falls back to
# the literal default on the right-hand side.
labelf = 'clinic_sel.csv'
task = get_param_in_cwd('task_column') or 'label'
bst_model = get_param_in_cwd('sel_model') or 'LR'
group_info = get_param_in_cwd('dataset_column') or 'group'

# Load the label file and discard every row that contains a missing value.
labels = [task]
label_data_ = pd.read_csv(labelf).dropna(axis=0)

ids = label_data_['ID']
print(label_data_.columns)

# Keep only the sample ID and the task column(s) for the merges done later.
label_data = label_data_[['ID'] + labels]

label_data

# 验证集-汇总

In [None]:
import pandas as pd
from onekey_algo.custom.components import metrics
from onekey_algo.custom.components.metrics import analysis_pred_binary
from onekey_algo.custom.components.comp1 import plot_DCA
from onekey_algo.custom.components.comp1 import draw_calibration
from onekey_algo.custom.components.delong import delong_roc_test
from onekey_algo.custom.components.comp1 import draw_matrix

# Per-cohort evaluation of the four signatures: ROC curves, a metrics table,
# pairwise DeLong comparison, decision-curve analysis and calibration plots.
#
# Reads:  data/clinical.csv, clinic_sel.csv and the per-model prediction CSVs
#         under ./results and ./Pathology/results.
# Writes: SVG figures under img/.
# Leaves `gt`, `pred_train`, `ALL_results` and `subset` set to the values of
# the LAST cohort processed.

# Accumulators: `metric` receives one row per (signature, cohort); `hosmer`
# is only initialised here.
hosmer = []
metric = []

# plt.savefig does not create missing directories, so make sure img/ exists.
os.makedirs('img', exist_ok=True)

c = pd.read_csv('data/clinical.csv')

# Loop-invariant inputs, read once instead of once per cohort.
Clinic = pd.read_csv('clinic_sel.csv')[['ID', 'Sex', 'Smoking_History']]
cnames = list(Clinic.columns[1:])  # not used in this cell; kept as a notebook global
pred_column = [f'{task}-0', f'{task}-1']  # not used in this cell; kept as a notebook global

# (display name, Histological_type code) pairs to analyse.
# The original iterated zip(['Adenocarcinoma', 'Squamous'], [1]); zip stops at
# the shorter input, so only ('Adenocarcinoma', 1) was ever processed. The pair
# is spelled out here to make that explicit. To analyse 'Squamous' as well,
# add ('Squamous', <its code>) -- the code is not visible in this notebook.
histology_cohorts = [('Adenocarcinoma', 1)]

for Hname, Htype in histology_cohorts:
    # A name containing 'None' selects every patient whose type differs from
    # Htype; any other name selects the patients of exactly that type.
    if 'None' in Hname:
        cids = set(c[c['Histological_type'] != Htype]['ID'])
    else:
        cids = set(c[c['Histological_type'] == Htype]['ID'])

    for subset in ['train', 'test']:
        # Load each model's predictions and attach the ground-truth label.
        # Pathology IDs get a ".nii.gz" suffix appended while reading,
        # presumably so they match the IDs used by the other tables.
        Clinic_results = pd.merge(pd.read_csv(f'./results/Clinic_RandomForest_{subset}.csv', header=0),
                                  label_data, on='ID', how='inner')
        DLR_results = pd.merge(pd.read_csv(f'./results/Rad_3DL_ShuffleNet_{subset}.csv', header=0),
                               label_data, on='ID', how='inner')
        Path_results = pd.merge(pd.read_csv(f'./Pathology/results/Path_XGBoost_{subset}.csv', header=0,
                                            converters={'ID': lambda x: f"{x}.nii.gz"}),
                                label_data, on='ID', how='inner')
        ALL_results = pd.merge(pd.merge(Clinic_results, DLR_results, on='ID', how='inner'),
                               Path_results, on='ID', how='inner')
        # Positional rename: this requires the merged frame to have exactly
        # ten columns. The third column of each model block becomes that
        # model's score; '-…' names are placeholders for unused columns.
        # NOTE(review): assumes each prediction CSV is (ID, <col>, <score>) -- confirm.
        ALL_results.columns = ['ID', '-0', model_names[0], task,
                               '-00', model_names[1], '-l',
                               '-000', model_names[2], '-ll',]

        # Attach the clinical covariates, then drop every COLUMN that contains
        # a missing value (axis=1).
        ALL_results = pd.merge(ALL_results, Clinic, on='ID', how='inner')
        ALL_results = ALL_results.dropna(axis=1)
        test_ids = set(ALL_results['ID'])  # IDs before the histology filter; not used in this cell
        ALL_results = ALL_results[ALL_results['ID'].isin(cids)]
        print(ALL_results.shape)

        # Attach the nomogram score (third column of the Nomo CSV).
        Nomo_results = pd.read_csv(f'./results/Nomo_{subset}.csv', header=0)
        Nomo_results.columns = ['ID', 'label-9', model_names[-1]]
        ALL_results = pd.merge(ALL_results, Nomo_results, on='ID', how='inner')

        # One ground-truth array and one score array per signature.
        gt = [np.array(ALL_results[task]) for _ in model_names]
        pred_train = [np.array(ALL_results[d]) for d in model_names]

        # ROC curves of all signatures on one figure.
        okcomp.comp1.draw_roc(gt, pred_train, labels=model_names, title=f'Cohort {subset} ROC')
        plt.savefig(f'img/{Hname}_{subset}_auc.svg')
        plt.show()

        # Metrics table; it accumulates across cohorts and is re-displayed
        # after every cohort.
        for mname, y, score in zip(model_names, gt, pred_train):
            acc, auc, ci, tpr, tnr, ppv, npv, precision, recall, f1, thres = analysis_pred_binary(y, score)
            ci = f"{ci[0]:.4f} - {ci[1]:.4f}"
            metric.append((mname, acc, auc, ci, tpr, tnr, ppv, npv, precision, recall, f1, thres, f"{Hname}-{subset}"))
        m_ = pd.DataFrame(metric, index=None, columns=['Signature', 'Accuracy', 'AUC', '95% CI',
                                                       'Sensitivity', 'Specificity', 'PPV', 'NPV', 'Precision', 'Recall', 'F1',
                                                       'Threshold', 'Cohort'])
        display(m_)

        # Pairwise DeLong comparison. Only the strict lower triangle is
        # computed; every other entry stays NaN.
        # NOTE(review): [0][0] is presumably the test's p-value -- confirm
        # against delong_roc_test.
        plt.figure(figsize=(8, 6))
        n_models = len(model_names)
        cm = np.full((n_models, n_models), np.nan)
        for i, mni in enumerate(model_names):
            for j, mnj in enumerate(model_names[:i]):
                cm[i][j] = delong_roc_test(ALL_results[task], ALL_results[mni], ALL_results[mnj])[0][0]
        # Trim the first row and last column, which contain no comparisons.
        cm = pd.DataFrame(cm[1:, :-1], index=model_names[1:], columns=model_names[:-1])
        draw_matrix(cm, annot=True, cmap='jet_r', cbar=True)
        plt.title(f'Cohort {subset} Delong')
        plt.savefig(f'img/{Hname}_delong_each_cohort_{subset}.svg', bbox_inches = 'tight')
        plt.show()

        # Decision-curve analysis.
        plot_DCA([ALL_results[model_name] for model_name in model_names],
                 ALL_results[task], title=f'Cohort {subset} DCA', labels=model_names, y_min=-0.15, remap=False)
        plt.savefig(f'img/{Hname}_{subset}_dca.svg')
        plt.show()

        # Calibration curves.
        draw_calibration(pred_scores=pred_train, n_bins=5, remap=True,
                         y_test=gt, model_names=model_names)
        plt.savefig(f'img/{Hname}_{subset}_cali.svg')
        plt.show()

In [None]:
from onekey_algo.custom.components import stats

# Hosmer-Lemeshow test for every signature.
#
# `gt` and `pred_train` are whatever the evaluation loop left behind, i.e. the
# arrays of the LAST cohort it processed, so exactly one row is produced here.
# The original concatenated that single row with a fixed ['Train', 'Test']
# column, which labelled the last cohort's statistics as 'Train' and added an
# all-NaN 'Test' row. The row is now labelled with the cohort it actually
# belongs to (`subset`, the evaluation loop's final value).
# To report every cohort, the test has to be computed inside that loop.
hl_row = [stats.hosmer_lemeshow_test(y_true, y_pred, bins=5, remap=False)
          for y_true, y_pred in zip(gt, pred_train)]
hosmer.append(hl_row)

hl_table = pd.DataFrame(hosmer, columns=model_names)
hl_table['Cohort'] = subset.capitalize()
hl_table

In [None]:
import matplotlib.pyplot as plt
from onekey_algo.custom.components.stats import clinic_stats

import seaborn as sns

def get_subgroup(x):
    """Map a raw label value to its EGFR subgroup name.

    The markers are tested in order with ``in`` and the first one found in
    ``x`` decides the result; when none is found the result is 'Non-EGFR'.
    """
    # Order matters: '19' is checked before '21', and both before the generic
    # '有' marker.
    marker_to_subgroup = (
        ('19', 'EGFR-19'),
        ('21', 'EGFR-21'),
        ('有', 'EGFR-Others'),
    )
    for marker, subgroup in marker_to_subgroup:
        if marker in x:
            return subgroup
    return 'Non-EGFR'

# Distribution of every signature's score across EGFR subgroups, per cohort:
# one violin + strip plot per (signature, cohort), annotated with the p-value
# reported by clinic_stats, plus a combined statistics table at the end.
#
# Reads:  data/label.csv, clinic_sel.csv and the per-model prediction CSVs.
# Writes: img/{model_name}_results_stats_{subset}.svg

# Raw labels -> EGFR subgroup. IDs get a ".nii.gz" suffix appended while
# reading, presumably so they match the IDs of the prediction tables.
label_ = pd.read_csv('data/label.csv', converters={'ID': lambda x: f"{x}.nii.gz"}).drop_duplicates('ID')
label_['group'] = label_['label'].map(get_subgroup)

# plt.savefig does not create missing directories, so make sure img/ exists.
os.makedirs('img', exist_ok=True)

# Loop-invariant inputs, set up once instead of once per cohort.
Clinic = pd.read_csv('clinic_sel.csv')[['ID', 'Sex', 'Smoking_History']]
cnames = list(Clinic.columns[1:])  # not used in this cell; kept as a notebook global
pred_column = [f'{task}-0', f'{task}-1']  # not used in this cell; kept as a notebook global
order = ['Non-EGFR', 'EGFR-19', 'EGFR-21', 'EGFR-Others']

statss = []
for subset in ['train', 'test']:
    # Load each model's predictions and attach the ground-truth label.
    Clinic_results = pd.merge(pd.read_csv(f'./results/Clinic_RandomForest_{subset}.csv', header=0),
                              label_data, on='ID', how='inner')
    DLR_results = pd.merge(pd.read_csv(f'./results/Rad_3DL_ShuffleNet_{subset}.csv', header=0),
                           label_data, on='ID', how='inner')
    Path_results = pd.merge(pd.read_csv(f'./Pathology/results/Path_XGBoost_{subset}.csv', header=0,
                                        converters={'ID': lambda x: f"{x}.nii.gz"}),
                            label_data, on='ID', how='inner')
    ALL_results = pd.merge(pd.merge(Clinic_results, DLR_results, on='ID', how='inner'),
                           Path_results, on='ID', how='inner')
    # Positional rename: this requires the merged frame to have exactly ten
    # columns. The third column of each model block becomes that model's score.
    # NOTE(review): assumes each prediction CSV is (ID, <col>, <score>) -- confirm.
    ALL_results.columns = ['ID', '-0', model_names[0], task,
                           '-00', model_names[1], '-l',
                           '-000', model_names[2], '-ll',]

    # Attach the clinical covariates, then drop every COLUMN that contains a
    # missing value (axis=1).
    ALL_results = pd.merge(ALL_results, Clinic, on='ID', how='inner')
    ALL_results = ALL_results.dropna(axis=1)

    # Attach the nomogram score and the EGFR subgroup of every sample.
    Nomo_results = pd.read_csv(f'./results/Nomo_{subset}.csv', header=0)
    Nomo_results.columns = ['ID', 'label-9', model_names[-1]]
    ALL_results = pd.merge(ALL_results, Nomo_results, on='ID', how='inner')
    ALL_results = pd.merge(ALL_results, label_[['ID', 'group']], on='ID', how='inner')
    print(ALL_results.shape)

    # NOTE(review): every cohort other than 'train' is plotted without
    # 'EGFR-Others' -- presumably because that subgroup does not occur there.
    # Confirm, since samples of a subgroup missing from `order` are left out
    # of the plot.
    violin_order = order if subset == 'train' else order[:-1]

    for model_name in model_names:
        # Stored under its own name: the original assigned the result to
        # `stats`, which clobbered the `stats` module imported by the
        # Hosmer-Lemeshow cell.
        model_stats = clinic_stats(ALL_results,
                                   stats_columns= model_name,
                                   label_column='group',
                                   group_column=None,
                                   continuous_columns= [model_name],
                                   pretty=True, verbose=False)
        model_stats['group'] = subset
        statss.append(model_stats)

        g = sns.catplot(x="group", y=model_name, data=ALL_results, kind="violin",
                        order=violin_order)
        g.fig.set_size_inches(15,10)
        # Overlay the individual samples on the violins.
        sns.stripplot(x="group", y=model_name, data=ALL_results, ax=g.ax, color='black',
                      order=violin_order)
        plt.title(f'Cohort {subset}')

        # The p-value is taken from the first row of the stats table; floats
        # are shown with three decimals, anything else is shown as returned.
        pvalue = model_stats.iloc[0]['pvalue']
        if isinstance(pvalue, float):
            pvalue = f"{pvalue:.3f}"
        plt.legend([f"p_value {pvalue}"], loc='lower right')
        plt.savefig(f'img/{model_name}_results_stats_{subset}.svg', bbox_inches = 'tight')
        plt.show()

pd.concat(statss, axis=0)