In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score, confusion_matrix
import seaborn as sns
from random import sample, seed 
from scipy import stats

import matplotlib.pyplot as plt
from gseapy.plot import barplot, dotplot
from mycolorpy import colorlist as mcp
from matplotlib.lines import Line2D




In [2]:
omics_prot_path = '/data/sharedData/UK_BIOBANK_DATA/Download_Data/ProteomicsData/SecondPhase/Olink_proteomics_data_2ndPhase_transposed_decoded2UNIportID.txt'

omics_prot = pd.read_csv(omics_prot_path, sep = '\t', index_col = 0)

In [3]:
data_statistics = ({'Disease':[],
                    'Data':[],
                    'Tissue':[],
                    'Count':[],
                    'Age_median':[],
                    'Age_IQR_low': [],
                    'Age_IQR_high': [],
                    'Female_proportion': [],
                    'N proteins': [],
                    'N proteins shared with UKBB': []
                   })
data_statistics = pd.DataFrame(data_statistics)

data_type = 'tissue'

for disease in ['Breast PDC000120', 'Colon PDC000116', 'HeadNeck PDC000221', 'Kidney PDC000127', 'Liver PDC000198', 'Lung PDC000153',
'Ovary PDC000110', 'Pancreas PDC000270', 'UCEC PDC000125']:
#for disease in ['Lung PDC000153']:
    for response in ['Primary Tumor', 'Solid Tissue Normal']:

        omics = pd.read_csv('../data/CPTAC for copycat/' + disease + '/normalize_clinic.txt', sep = '\t')
        
        omics_subset = omics[omics['Sample.Type'] == response]

        omics_subset['Age.at.Diagnosis'].dropna()//365
        
        N_orig = omics_subset.shape[1]
        
        N_shared = omics_subset[np.intersect1d(omics_subset.columns, omics_prot.columns)].shape[1]

        
        
        if disease == 'Colon PDC000116':            
            new_row = {'Disease': disease,
                        'Data': data_type,
                        'Tissue': response,
                        'Count': len(omics_subset),
                        'Age_median': 'NA',
                        'Age_IQR_low': 'NA',
                        'Age_IQR_high': 'NA',
                        'Female_proportion': sum(omics_subset['Gender'] == 'Female')/len(omics_subset['Gender'].dropna()),
                        'N proteins': N_orig,
                        'N proteins shared with UKBB': N_shared
                        }

        else:
            new_row = {'Disease': disease,
                        'Data': data_type,
                        'Tissue': response,
                        'Count': len(omics_subset),
                        'Age_median': np.median(omics_subset['Age.at.Diagnosis'].dropna()//365),
                        'Age_IQR_low': np.quantile(omics_subset['Age.at.Diagnosis'].dropna()//365, 0.25),
                        'Age_IQR_high': np.quantile(omics_subset['Age.at.Diagnosis'].dropna()//365, 0.75),
                        'Female_proportion': sum(omics_subset['Gender'] == 'Female')/len(omics_subset['Gender'].dropna()),
                        'N proteins': N_orig,
                        'N proteins shared with UKBB': N_shared
                      }

        data_statistics = data_statistics.append(new_row, ignore_index=True)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
data_type = 'tissue'
for response in ['Primary Tumor', 'Solid Tissue Normal']:
    omics_all = list()
    for disease in ['Breast PDC000120', 'Colon PDC000116', 'HeadNeck PDC000221', 'Kidney PDC000127', 'Liver PDC000198', 'Lung PDC000153',
    'Ovary PDC000110', 'Pancreas PDC000270', 'UCEC PDC000125']:
    #for disease in ['Lung PDC000153']:

        omics = pd.read_csv('../data/CPTAC for copycat/' + disease + '/normalize_clinic.txt', sep = '\t')
        
        omics_subset = omics[omics['Sample.Type'] == response]
        
        omics_all.append(omics_subset)

    omics = pd.concat(omics_all)
    
    N_orig = omics.shape[1]
        
    N_shared = omics[np.intersect1d(omics.columns, omics_prot.columns)].shape[1]


    new_row = {'Disease': 'all_patients',
                'Data': data_type,
                'Tissue': response,
                'Count': len(omics),
                'Age_median': np.median(omics['Age.at.Diagnosis'].dropna()//365),
                'Age_IQR_low': np.quantile(omics['Age.at.Diagnosis'].dropna()//365, 0.25),
                'Age_IQR_high': np.quantile(omics['Age.at.Diagnosis'].dropna()//365,  0.75),
                'Female_proportion': sum(omics['Gender'] == 'Female')/len(omics['Gender'].dropna()),
                'N proteins': N_orig,
                'N proteins shared with UKBB': N_shared 
              }

    data_statistics = data_statistics.append(new_row, ignore_index=True)

In [5]:
data_statistics

Unnamed: 0,Disease,Data,Tissue,Count,Age_median,Age_IQR_low,Age_IQR_high,Female_proportion,N proteins,N proteins shared with UKBB
0,Breast PDC000120,tissue,Primary Tumor,118.0,59.0,47.0,69.0,0.889831,9833.0,1911.0
1,Breast PDC000120,tissue,Solid Tissue Normal,18.0,62.5,55.75,70.75,1.0,9833.0,1911.0
2,Colon PDC000116,tissue,Primary Tumor,97.0,,,,0.57732,6686.0,1473.0
3,Colon PDC000116,tissue,Solid Tissue Normal,100.0,,,,0.57,6686.0,1473.0
4,HeadNeck PDC000221,tissue,Primary Tumor,110.0,62.0,55.25,67.0,0.127273,9633.0,1930.0
5,HeadNeck PDC000221,tissue,Solid Tissue Normal,68.0,62.0,56.0,65.0,0.102941,9633.0,1930.0
6,Kidney PDC000127,tissue,Primary Tumor,110.0,60.5,52.0,69.0,0.272727,9241.0,1856.0
7,Kidney PDC000127,tissue,Solid Tissue Normal,84.0,60.5,52.0,69.0,0.22619,9241.0,1856.0
8,Liver PDC000198,tissue,Primary Tumor,165.0,54.0,46.0,61.0,0.193939,9423.0,1811.0
9,Liver PDC000198,tissue,Solid Tissue Normal,165.0,54.0,46.0,61.0,0.193939,9423.0,1811.0


In [6]:
data_statistics.to_csv('Summary_statistics_CPTAC.csv', index = False)

In [18]:
omics_prot_path = '/data/sharedData/UK_BIOBANK_DATA/Download_Data/ProteomicsData/SecondPhase/Olink_proteomics_data_2ndPhase_transposed_decoded2UNIportID.txt'

omics_prot = pd.read_csv(omics_prot_path, sep = '\t', index_col = 0)

In [19]:
# Set up the data formats for the output
boxplot_data_full = ({'prob':[],
                'reference':[],
                'disease':[],
                'data_type':[],
                'N': []
                })
boxplot_data_full = pd.DataFrame(boxplot_data_full)

Shiny_data = ({'prob':[],
               'reference':[],
               'Disease':[],
               'Data': [],
               'N':[],
               'AUC_train':[],
               'AUC_test':[],
               'Features':[],
               'Coef':[]
              })
Shiny_data = pd.DataFrame(Shiny_data)

for disease in ['Breast PDC000120', 'Colon PDC000116', 'HeadNeck PDC000221', 'Kidney PDC000127', 'Liver PDC000198',
                'Lung PDC000153', 'Ovary PDC000110', 'Pancreas PDC000270', 'UCEC PDC000125']:
#for disease in ['Lung PDC000153']:
    data_type = 'tissue'
   
    omics = pd.read_csv('../data/CPTAC for copycat/' + disease + '/normalize_clinic.txt', sep = '\t')
    response = omics.iloc[:,-18]
    omics = omics.iloc[:,:-19]
    print(omics.shape)
    omics = omics[np.intersect1d(omics.columns, omics_prot.columns)]
    print(omics.shape)
        
    #devide between healthy and sick
    X_train, X_test, y_train, y_test = train_test_split(omics,
                                                        response,
                                                        test_size=0.3,
                                                        random_state=42)

    #KNN imputer(default parameters): train on train data, apply on both train and test
    imp = KNNImputer(n_neighbors=5, weights="uniform")
    imp.fit(X_train)
    X_train = pd.DataFrame(imp.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(imp.transform(X_test), index = X_test.index, columns = X_test.columns)



    # ExtraTreesClassifier(default parameters): Choose top 5 proteins/metabolites
    clf = ExtraTreesClassifier(n_estimators=10000,
                                random_state=42,
                                verbose = 1).fit(X_train, y_train)

    best_AUC = 0
    for N in range(1,16):
    #for N in range(4,6):
        # Take only top N proteins/metabolites
        model = SelectFromModel(clf, prefit=True, max_features = N)
        X_train_subset = X_train[X_train.columns[model.get_support()]]
        X_test_subset = X_test[X_test.columns[model.get_support()]]


        #train ridge regression model

        clf_ridge = LogisticRegressionCV(cv=10, Cs = 10, random_state=42, max_iter = 10000,
                                    penalty='l2').fit(X_train_subset, y_train)



        #compute feature importance 
        coefficients = clf_ridge.coef_[0]
        feature_importance = pd.DataFrame({'Feature': X_train_subset.columns,
                                            'Importance': np.abs(coefficients),
                                            'Coef': coefficients})
        feature_importance = feature_importance.sort_values('Importance', ascending = True)


        #compute metrics
        auc_train = roc_auc_score(y_train, clf_ridge.predict_proba(X_train_subset)[:, 1])
        auc = roc_auc_score(y_test, clf_ridge.predict_proba(X_test_subset)[:, 1])
        conf_matrix = confusion_matrix(y_test, clf_ridge.predict(X_test_subset), labels = np.unique(y_test))


        #print summary
        print('Disease: ' + disease)
        print(conf_matrix)
        print('AUC train: ' + str(auc_train))
        print('AUC: ' + str(auc))
        print('\n')
        
         # Prediction for test dataset
        X_test_summary = pd.DataFrame(clf_ridge.predict_proba(X_test_subset)[:, 1], columns = ['prob'])
        X_test_summary['reference'] = y_test.values


        # Data for boxplot
                            
        boxplot_data = X_test_summary[['prob', 'reference']]
        boxplot_data['disease'] = disease
        boxplot_data['data_type'] = data_type
        boxplot_data['N'] = N
                
        if N == 1:
            boxplot_data_full = boxplot_data_full.append(boxplot_data, ignore_index=True)


        # Data for ShinyApp
        new_row = {'prob': list(np.round(boxplot_data['prob'].astype(float), 3)),
                    'reference': list(boxplot_data['reference']),
                    'Disease': disease,
                    'Data': data_type,
                    'N': N,
                    'AUC_train': auc_train,
                    'AUC_test': auc,
                    'Features': list(feature_importance['Feature'].astype(str)),
                    'Coef': list(np.round(feature_importance['Coef'], 3))
            }

        Shiny_data = Shiny_data.append(new_row, ignore_index=True)


(136, 9814)
(136, 1911)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    6.4s finished


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 0.9919678714859438
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 0.9969879518072289
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AUC train: 1.0
AUC: 1.0


Disease: Breast PDC000120
[[35  0]
 [ 0  6]]
AU

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    6.6s finished


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[30  0]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[30  0]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[30  0]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 0.9988888888888889


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disease: Colon PDC000116
[[29  1]
 [ 0 30]]
AUC train: 1.0
AUC: 1.0


Disea

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    7.0s finished


Disease: HeadNeck PDC000221
[[29  3]
 [ 3 19]]
AUC train: 0.9832775919732442
AUC: 0.9119318181818182


Disease: HeadNeck PDC000221
[[30  2]
 [ 3 19]]
AUC train: 0.984392419175028
AUC: 0.9119318181818181


Disease: HeadNeck PDC000221
[[30  2]
 [ 3 19]]
AUC train: 0.9849498327759197
AUC: 0.9062499999999999


Disease: HeadNeck PDC000221
[[31  1]
 [ 2 20]]
AUC train: 0.9910813823857303
AUC: 0.9119318181818181


Disease: HeadNeck PDC000221
[[31  1]
 [ 2 20]]
AUC train: 0.9930323299888517
AUC: 0.9147727272727272


Disease: HeadNeck PDC000221
[[31  1]
 [ 3 19]]
AUC train: 1.0
AUC: 0.9673295454545454


Disease: HeadNeck PDC000221
[[31  1]
 [ 3 19]]
AUC train: 1.0
AUC: 0.9630681818181819


Disease: HeadNeck PDC000221
[[31  1]
 [ 3 19]]
AUC train: 0.9986064659977704
AUC: 0.9403409090909091


Disease: HeadNeck PDC000221
[[31  1]
 [ 3 19]]
AUC train: 0.9986064659977704
AUC: 0.9417613636363636


Disease: HeadNeck PDC000221
[[31  1]
 [ 3 19]]
AUC train: 0.9986064659977704
AUC: 0.9417613636363636


D

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    6.4s finished


Disease: Kidney PDC000127
[[32  0]
 [ 2 25]]
AUC train: 0.9986504723346828
AUC: 1.0


Disease: Kidney PDC000127
[[32  0]
 [ 1 26]]
AUC train: 0.9995501574448943
AUC: 0.9988425925925926


Disease: Kidney PDC000127
[[32  0]
 [ 2 25]]
AUC train: 0.9995501574448943
AUC: 0.9988425925925926


Disease: Kidney PDC000127
[[32  0]
 [ 2 25]]
AUC train: 0.9995501574448943
AUC: 0.9988425925925926


Disease: Kidney PDC000127
[[32  0]
 [ 1 26]]
AUC train: 0.9997750787224472
AUC: 1.0


Disease: Kidney PDC000127
[[32  0]
 [ 1 26]]
AUC train: 0.9995501574448943
AUC: 0.9988425925925926


Disease: Kidney PDC000127
[[32  0]
 [ 0 27]]
AUC train: 1.0
AUC: 1.0


Disease: Kidney PDC000127
[[32  0]
 [ 0 27]]
AUC train: 1.0
AUC: 1.0


Disease: Kidney PDC000127
[[32  0]
 [ 1 26]]
AUC train: 1.0
AUC: 1.0


Disease: Kidney PDC000127
[[32  0]
 [ 0 27]]
AUC train: 1.0
AUC: 1.0


Disease: Kidney PDC000127
[[32  0]
 [ 0 27]]
AUC train: 1.0
AUC: 1.0


Disease: Kidney PDC000127
[[32  0]
 [ 0 27]]
AUC train: 1.0
AUC: 1.0


  exec(code_obj, self.user_global_ns, self.user_ns)


(330, 9404)
(330, 1811)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    8.9s finished


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9854620586025544
AUC: 0.9958677685950413


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9909842223891812
AUC: 1.0


Disease: Liver PDC000198
[[55  0]
 [ 0 44]]
AUC train: 0.9904583020285499
AUC: 1.0


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9903080390683696
AUC: 1.0


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9873027798647634
AUC: 1.0


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9856498873027799
AUC: 1.0


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9850488354620587
AUC: 1.0


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9845980465815176
AUC: 1.0


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9846731780616078
AUC: 0.9991735537190083


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.984748309541698
AUC: 0.9991735537190083


Disease: Liver PDC000198
[[54  1]
 [ 0 44]]
AUC train: 0.9842223891810667
AUC: 0.9991735537190083


Diseas

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    7.0s finished


Disease: Lung PDC000153
[[29  5]
 [ 0 30]]
AUC train: 0.9977777777777778
AUC: 0.996078431372549


Disease: Lung PDC000153
[[32  2]
 [ 0 30]]
AUC train: 0.9979629629629629
AUC: 0.9980392156862745


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.9924074074074075
AUC: 0.996078431372549


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.9912962962962963
AUC: 0.9970588235294118


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.9911111111111112
AUC: 0.996078431372549


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.9911111111111112
AUC: 0.9950980392156863


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.9916666666666667
AUC: 0.9941176470588236


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.9912962962962963
AUC: 0.9941176470588236


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.992037037037037
AUC: 0.9862745098039216


Disease: Lung PDC000153
[[33  1]
 [ 0 30]]
AUC train: 0.9916666666666667
AUC: 0.9901960784313726


Disease: Lung 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    6.5s finished


Disease: Ovary PDC000110
[[24  4]
 [ 0  5]]
AUC train: 0.9970760233918129
AUC: 0.9500000000000001


Disease: Ovary PDC000110
[[26  2]
 [ 0  5]]
AUC train: 1.0
AUC: 0.9428571428571428


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 0.9980506822612085
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 0.9970760233918128
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary PDC000110
[[28  0]
 [ 0  5]]
AUC train: 1.0
AUC: 1.0


Disease: Ovary 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    8.1s finished


Disease: Pancreas PDC000270
[[41  2]
 [ 4 20]]
AUC train: 0.9502310700319943
AUC: 0.9525193798449613


Disease: Pancreas PDC000270
[[43  0]
 [ 4 20]]
AUC train: 0.9580519018841095
AUC: 0.9651162790697674


Disease: Pancreas PDC000270
[[42  1]
 [ 3 21]]
AUC train: 0.9587628865979381
AUC: 0.9699612403100775


Disease: Pancreas PDC000270
[[43  0]
 [ 4 20]]
AUC train: 0.9637397795947387
AUC: 0.9786821705426357


Disease: Pancreas PDC000270
[[43  0]
 [ 4 20]]
AUC train: 0.9642730181301102
AUC: 0.9777131782945736


Disease: Pancreas PDC000270
[[41  2]
 [ 3 21]]
AUC train: 0.9690721649484536
AUC: 0.9786821705426357


Disease: Pancreas PDC000270
[[42  1]
 [ 3 21]]
AUC train: 0.9712051190899396
AUC: 0.9825581395348837


Disease: Pancreas PDC000270
[[39  4]
 [ 1 23]]
AUC train: 0.9809811589050835
AUC: 0.99515503875969


Disease: Pancreas PDC000270
[[39  4]
 [ 1 23]]
AUC train: 0.9818698897973692
AUC: 0.993217054263566


Disease: Pancreas PDC000270
[[39  4]
 [ 1 23]]
AUC train: 0.9861357980803412

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:    6.2s finished


Disease: UCEC PDC000125
[[30  1]
 [ 1  9]]
AUC train: 0.9791258969341161
AUC: 0.9903225806451613


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.9915198956294847
AUC: 0.9741935483870968


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.990215264187867
AUC: 0.9903225806451613


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.9856490541422049
AUC: 0.9903225806451613


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.9882583170254404
AUC: 0.9935483870967742


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.9882583170254404
AUC: 1.0


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.9928245270711025
AUC: 0.9903225806451613


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.9915198956294846
AUC: 0.9967741935483871


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.9928245270711025
AUC: 0.9935483870967742


Disease: UCEC PDC000125
[[29  2]
 [ 0 10]]
AUC train: 0.990215264187867
AUC: 0.9935483870967742


Disease: UCEC PDC000125
[[2

In [22]:
Shiny_data = Shiny_data[['Disease', 'Data', 'N', 'AUC_train', 'AUC_test', 'Features', 'Coef', 'prob', 'reference']]
Shiny_data

Unnamed: 0,Disease,Data,N,AUC_train,AUC_test,Features,Coef,prob,reference
0,Breast PDC000120,tissue,1.0,0.991968,1.000000,[CDCP1],[-3.214],"[0.006, 0.003, 0.008, 0.021, 0.831, 0.009, 0.0...","[Primary Tumor, Primary Tumor, Primary Tumor, ..."
1,Breast PDC000120,tissue,2.0,0.996988,1.000000,"[IGHMBP2, CDCP1]","[-2.244, -2.392]","[0.002, 0.003, 0.012, 0.019, 0.842, 0.007, 0.0...","[Primary Tumor, Primary Tumor, Primary Tumor, ..."
2,Breast PDC000120,tissue,3.0,1.000000,1.000000,"[PRDX6, IGHMBP2, CDCP1]","[1.733, -1.988, -2.118]","[0.002, 0.006, 0.013, 0.016, 0.937, 0.008, 0.0...","[Primary Tumor, Primary Tumor, Primary Tumor, ..."
3,Breast PDC000120,tissue,4.0,1.000000,1.000000,"[PRDX6, IGHMBP2, MYOM1, CDCP1]","[0.364, -0.588, 1.039, -1.04]","[0.006, 0.009, 0.008, 0.013, 0.919, 0.012, 0.1...","[Primary Tumor, Primary Tumor, Primary Tumor, ..."
4,Breast PDC000120,tissue,5.0,1.000000,1.000000,"[ATXN2L, PRDX6, IGHMBP2, CDCP1, MYOM1]","[-0.32, 0.356, -0.568, -1.009, 1.021]","[0.005, 0.01, 0.007, 0.013, 0.917, 0.012, 0.16...","[Primary Tumor, Primary Tumor, Primary Tumor, ..."
...,...,...,...,...,...,...,...,...,...
130,UCEC PDC000125,tissue,11.0,0.990868,0.993548,"[PXN, TBC1D17, LMNB2, AKT3, AHNAK, STXBP1, CGN...","[0.112, 0.136, 0.162, 0.185, 0.187, 0.221, -0....","[0.024, 0.021, 0.732, 0.022, 0.031, 0.027, 0.0...","[Primary Tumor, Primary Tumor, Solid Tissue No..."
131,UCEC PDC000125,tissue,12.0,0.992825,0.990323,"[PXN, TBC1D17, LMNB2, AKT3, AHNAK, DPT, STXBP1...","[0.106, 0.131, 0.156, 0.177, 0.179, 0.206, 0.2...","[0.023, 0.019, 0.735, 0.022, 0.028, 0.026, 0.0...","[Primary Tumor, Primary Tumor, Solid Tissue No..."
132,UCEC PDC000125,tissue,13.0,1.000000,0.977419,"[DMD, AHNAK, LMNB2, PXN, STXBP1, PRUNE2, SERPI...","[-0.891, -1.326, -1.601, 2.123, -2.27, 2.291, ...","[0.0, 0.0, 0.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7...","[Primary Tumor, Primary Tumor, Solid Tissue No..."
133,UCEC PDC000125,tissue,14.0,1.000000,0.974194,"[DMD, AHNAK, DCN, LMNB2, PXN, PRUNE2, STXBP1, ...","[-0.56, -1.288, -1.586, -1.725, 1.96, 1.998, -...","[0.0, 0.0, 0.999, 0.0, 0.0, 0.0, 0.0, 0.001, 0...","[Primary Tumor, Primary Tumor, Solid Tissue No..."


In [23]:
boxplot_data_full

Unnamed: 0,prob,reference,disease,data_type,N
0,0.006356,Primary Tumor,Breast PDC000120,tissue,1.0
1,0.003083,Primary Tumor,Breast PDC000120,tissue,1.0
2,0.007710,Primary Tumor,Breast PDC000120,tissue,1.0
3,0.020856,Primary Tumor,Breast PDC000120,tissue,1.0
4,0.831065,Solid Tissue Normal,Breast PDC000120,tissue,1.0
...,...,...,...,...,...
513,0.032658,Primary Tumor,UCEC PDC000125,tissue,1.0
514,0.746922,Solid Tissue Normal,UCEC PDC000125,tissue,1.0
515,0.694100,Solid Tissue Normal,UCEC PDC000125,tissue,1.0
516,0.080216,Primary Tumor,UCEC PDC000125,tissue,1.0


In [24]:
Shiny_data.to_csv('ShinyData_cancer_diagnosis_CPTAC.csv') #Supplementary table 2 (proteomics + metabolomics)
boxplot_data_full.to_csv('boxplot_data_cancer_diagnosis_CPTAC.csv')