In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score, confusion_matrix
import seaborn as sns
from random import sample, seed 

from scipy import stats

import matplotlib.pyplot as plt


# Read the data

In [None]:
# Load the PRS matrix and show it for a quick visual check.
# The file is semicolon-delimited; its first column becomes the index
# (presumably participant ids -- verify against the file).
omics_gen_path = '/data/sharedData/Dina_genetics/PRS_directory/Martin_pan_cancer_PRS/Martin_Cancer_PRSs/Matrix_scaled_CombinedPRS_EUROS_Martin_pancamcer_PRSs.csv'
omics = pd.read_csv(
    omics_gen_path,
    index_col=0,
    sep=';',
)
omics

In [None]:
# Fit one ridge (L2) logistic regression per disease / model type / control
# group on age, sex, the genetic principal components and the PRS columns
# matching the disease, then collect the test-set probabilities and the
# train/test AUCs into `boxplot_data_full` and `Shiny_data`.

# Read omics data
omics_gen_path = '/data/sharedData/Dina_genetics/PRS_directory/Martin_pan_cancer_PRS/Martin_Cancer_PRSs/Matrix_scaled_CombinedPRS_EUROS_Martin_pancamcer_PRSs.csv'
PCA_dataset_path = '/data/sharedData/Euros_genetic_principal_components.csv'

omics = pd.read_csv(omics_gen_path, sep = ';', index_col = 0)
PCA_dataset = pd.read_csv(PCA_dataset_path, sep = ';', index_col = 0)

# Column layout of the two result tables (also used when nothing was fitted).
boxplot_columns = ['prob', 'reference', 'disease', 'data_type',
                   'Model', 'control_data', 'N']
shiny_columns = ['prob', 'reference', 'Disease', 'Data', 'AUC_train',
                 'AUC_test', 'Model', 'control_data', 'N']

# Results are accumulated in plain lists and converted to DataFrames once,
# after the loops: DataFrame.append no longer exists in pandas >= 2.0 and
# growing a frame row by row is quadratic anyway.
boxplot_frames = []
shiny_rows = []

# Substring identifying the PRS columns of a disease when it differs from
# the disease name itself; diseases not listed here use their own name.
prs_column_key = {
    'AllCancers': 'combined',
    'Breast': 'breast_new',
    'Prostate': 'prostate_new',
    'Colorectal': 'CRC_new',
    'Malignant_melanoma': 'Melanoma_new',
    'Leukaemia': 'Leukemia',
    'Lymphoma': 'NH_cancer',
}

diseases = ['AllCancers', 'Bladder', 'Breast', 'Colorectal', 'Kidney', 'Leukaemia',
            'Lung', 'Lymphoma', 'Malignant_melanoma', 'Ovarian', 'Prostate',
            'Thyroid', 'Uterine']
# Only prevalent models are run; 'incident' is also handled by the path
# logic below and can be added to this list.
model_types = ['prevalent']
control_groups = ['HC', 'NonCancer']

data_type = 'all'

for disease in diseases:

    colname_disease = prs_column_key.get(disease, disease)

    # Substring match: every PRS column whose name contains the key is used.
    PRS = [col for col in omics.columns if colname_disease in col]
    omics_specificPRS = omics[PRS]

    for model_type in model_types:

        # Any model type other than 'incident' reads the prevalent files.
        if model_type == 'incident':
            sick_suffix, paired_suffix = 'incident', 'PairedToIncident'
        else:
            sick_suffix, paired_suffix = 'prevalent', 'PairedToPrevalent'

        for control_data in control_groups:

            sick_path = '../data/Patients_final/' + disease + '/ukb_' + \
                        disease + '_' + data_type + '_' + sick_suffix + '.csv'
            clinical_sick = pd.read_csv(sick_path, index_col = 0)
            HC_path = '../data/Patients_final/' + disease + '/ukb_' + \
                        disease + '_' + data_type + '_' + control_data + \
                        '_' + paired_suffix + '.csv'
            clinical_healthy = pd.read_csv(HC_path, index_col = 0)

            # clinical -> clinical data from both cases and controls
            # omics_clinical -> clinical data joined (on the index) with the
            #                   principal components and the disease PRS columns
            clinical = pd.concat([clinical_sick, clinical_healthy])
            clinical = clinical[['age_at_recruitment_f21022_0_0', 'sex_f31_0_0', 'group']]

            # NOTE(review): these are left joins, so a participant missing from
            # the PCA or PRS table would get NaN features -- confirm coverage.
            omics_clinical = clinical.join(PCA_dataset).join(omics_specificPRS)
            omics_clinical['PC3'] = pd.to_numeric(omics_clinical['PC3'])

            # Split into train (70%) and test (30%) sets; 'group' is the target.
            X_train, X_test, y_train, y_test = train_test_split(omics_clinical.drop(columns=['group']),
                                                                omics_clinical['group'],
                                                                test_size=0.3,
                                                                random_state=42)

            # One-hot encode, then force the test matrix onto the training
            # columns: a category absent from one split would otherwise give
            # the two matrices different columns and break prediction.
            X_train = pd.get_dummies(X_train)
            X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

            clf_ridge = LogisticRegressionCV(cv=10, Cs = 10, random_state=42, max_iter = 10000,
                                        penalty='l2').fit(X_train, y_train)

            # Feature importance: absolute value of the fitted coefficients.
            coefficients = clf_ridge.coef_[0]
            feature_importance = pd.DataFrame({'Feature': X_train.columns,
                                                'Importance': np.abs(coefficients),
                                                'Coef': coefficients})
            feature_importance = feature_importance.sort_values('Importance', ascending = True)

            # Metrics (probability of the second class in clf_ridge.classes_).
            proba_train = clf_ridge.predict_proba(X_train)[:, 1]
            proba_test = clf_ridge.predict_proba(X_test)[:, 1]
            auc_train = roc_auc_score(y_train, proba_train)
            auc = roc_auc_score(y_test, proba_test)
            conf_matrix = confusion_matrix(y_test, clf_ridge.predict(X_test), labels = np.unique(y_test))

            # Print summary
            print('Disease: ' + disease)
            print(conf_matrix)
            print('AUC train: ' + str(auc_train))
            print('AUC: ' + str(auc))
            print('\n')

            # Per-participant test predictions next to the true label.
            X_test_summary = pd.DataFrame(proba_test, columns = ['prob'])
            X_test_summary['reference'] = y_test.values

            # Explicit copy so the columns added below do not write into a
            # slice of X_test_summary (SettingWithCopyWarning).
            boxplot_data = X_test_summary[['prob', 'reference']].copy()
            boxplot_data['disease'] = disease
            boxplot_data['data_type'] = data_type
            boxplot_data['Model'] = model_type
            boxplot_data['control_data'] = control_data
            boxplot_data['N'] = 1

            boxplot_frames.append(boxplot_data)

            # One summary row per fitted model; 'prob' and 'reference' hold
            # the whole test-set vectors as lists.
            new_row = {'prob': list(np.round(boxplot_data['prob'].astype(float), 3)),
                        'reference': list(boxplot_data['reference']),
                        'Disease': disease,
                        'Data': data_type,
                        'AUC_train': auc_train,
                        'AUC_test': auc,
                        'Model': model_type,
                        'control_data': control_data,
                        'N': 1
                }

            shiny_rows.append(new_row)

# Build the result tables once from everything collected above.
if boxplot_frames:
    boxplot_data_full = pd.concat(boxplot_frames, ignore_index=True)[boxplot_columns]
else:
    boxplot_data_full = pd.DataFrame(columns=boxplot_columns)

Shiny_data = pd.DataFrame(shiny_rows, columns=shiny_columns)



In [None]:
# Restrict Shiny_data to the listed columns, in this order; any column not
# listed here is dropped. The resulting table is displayed below.
shiny_export_columns = [
    'Disease', 'Data', 'Model', 'control_data',
    'AUC_train', 'AUC_test',
    'prob', 'reference',
]
Shiny_data = Shiny_data.loc[:, shiny_export_columns]
Shiny_data

In [None]:
# Supplementary data 2 (genomics): write the per-model summary table to CSV.
Shiny_data.to_csv(path_or_buf='ShinyData_cancer_diagnosis_genomics.csv')

In [None]:
# Write the per-participant test-set predictions to CSV.
boxplot_data_full.to_csv(path_or_buf='graph_cancer_diagnosis_genomics.csv')