In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import methylcheck

## Evaluation of the workflows  
The performance of the Python and federated implementations of the workflow provided by the x group at the University of Exeter is compared to the original R version of this workflow.  
Additionally, the performance of the federated workflow is tested under several scenarios of sample size and class-label imbalance.

In [47]:
# Load the reduced phenotype table of each dataset; the first CSV column becomes the row index.
# NOTE(review): the paths are absolute and machine-specific, so this cell only runs on the
# original author's drive layout — consider a configurable data directory.
GSE66351_pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv", index_col=0)
GSE105109_pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Reduced_Pheno_Info.csv", index_col=0)
GSE134379_pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Reduced_Pheno_Info.csv", index_col=0)
# Make sure all phenotype tables use the same diagnosis codes (" CTRL" / " AD").
# The codes are compared and written with a leading space, exactly as spelled below.
GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"] == " Control", "Diagnosis"] = " CTRL"
GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"] == " Alzheimer's disease", "Diagnosis"] = " AD"

GSE134379_pheno.loc[GSE134379_pheno["Diagnosis"] == " ND", "Diagnosis"] = " CTRL"
# NOTE(review): the next line replaces " AD" with " AD", i.e. it changes nothing as written.
GSE134379_pheno.loc[GSE134379_pheno["Diagnosis"] == " AD", "Diagnosis"] = " AD"

# Recode the sex column so that all datasets share the same codes for females and males.
# NOTE(review): the original comment names "gender: F" / "gender: M" as the target codes and
# says only GSE66351 needs recoding, but both assignments below map a value onto itself
# (" F" -> " F", " M" -> " M") and are therefore no-ops — confirm the intended source and
# target codes.

GSE66351_pheno.loc[GSE66351_pheno["Sex"] == " F", "Sex"] = " F"
GSE66351_pheno.loc[GSE66351_pheno["Sex"] == " M", "Sex"] = " M"

In [3]:
# Read in the EWAS result tables produced by the original R workflow, one per dataset.
# The first CSV column is used as the row index.
# NOTE(review): absolute, machine-specific paths — this cell fails on any other machine.
original_r_GSE66351_E = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\EWAS\\Results_small_dataset.csv", index_col=0)
# Disabled: loading of the normalised betas of the R workflow for GSE66351.
#original_r_GSE66351_B = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Normalised_Betas.csv", index_col=0)


original_r_GSE105109_E = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\EWAS\\Results_small_dataset.csv", index_col=0)
# Disabled: loading of the normalised betas of the R workflow for GSE105109.
#original_r_GSE105109_B = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Normalised_Betas.csv", index_col=0)


original_r_GSE134379_E = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\EWAS\\Results_small_dataset.csv", index_col=0)
# Disabled: loading of the normalised betas of the R workflow for GSE134379.
#original_r_GSE134379_B = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Normalised_Betas.csv", index_col=0)


# Placeholder: no combined-dataset R result is loaded yet (TODO).
original_r_combined = None

In [2]:
# Results of the centralised Python implementation for GSE66351:
# *_E holds the diagnosis regression (EWAS) results, *_B the normalised betas.
# The first CSV column is used as the row index.
# NOTE(review): absolute, machine-specific paths — this cell fails on any other machine.
python_central_GSE66351_E = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\QC_Python\\GSE66351results_diagnosis_regression_python.csv", index_col=0)
python_central_GSE66351_B = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\QC_Python\\GSE66351_normalised_betas_python.csv", index_col=0)

# Placeholders: Python results for the remaining datasets and the combined run are not loaded yet (TODO).
python_central_GSE105109 = None
python_central_GSE134379 = None
python_combined = None

In [None]:
# Placeholder: results of the federated workflow are not loaded yet (TODO).
federated = None


In [None]:
# Placeholder: results of the federated imbalance experiments are not loaded yet (TODO).
federated_imbalance = None

In [28]:
def calculate_EWAS_metrics_r(EWAS_results, effect_threshold=0.05, alpha=0.05):
    """Compute confusion-matrix metrics for an EWAS result table with R-style columns.

    Each row (probe) is classified twice, both times from ``EWAS_results`` itself:

    * reference label: the probe counts as differentially methylated when
      ``abs(Diagnosis_Beta) >= effect_threshold``;
    * prediction: the probe counts as an EWAS hit when ``corr_pval <= alpha``.

    Note that the reference label is derived from the effect size in the same
    table, not from an independent ground truth.

    Rows are counted one by one, so a probe ID that occurs more than once in the
    index contributes one count per row (comparing sets of index labels would let
    a repeated ID land in several confusion-matrix cells at once). Rows with a
    missing effect size or a missing corrected p-value satisfy none of the
    comparisons and are left out of all four counts.

    Parameters
    ----------
    EWAS_results : pandas.DataFrame
        Table with one row per probe and the columns ``"Diagnosis_Beta"`` and
        ``"corr_pval"``.
    effect_threshold : float, default 0.05
        Minimum absolute ``Diagnosis_Beta`` for a probe to count as
        differentially methylated.
    alpha : float, default 0.05
        Maximum corrected p-value for a probe to count as significant.

    Returns
    -------
    dict
        Keys ``"TP"``, ``"TN"``, ``"FP"``, ``"FN"`` (row counts) and
        ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``. A metric whose
        denominator is zero is reported as 0.
    """
    effect_size = EWAS_results.loc[:, "Diagnosis_Beta"].abs()
    corrected_p = EWAS_results.loc[:, "corr_pval"]

    # Plain boolean arrays: row-wise logic without any index alignment.
    # The complementary masks are computed with their own comparison (not with ~)
    # so that NaN rows end up in neither mask.
    is_dm = (effect_size >= effect_threshold).to_numpy()
    is_not_dm = (effect_size < effect_threshold).to_numpy()
    is_sig = (corrected_p <= alpha).to_numpy()
    is_not_sig = (corrected_p > alpha).to_numpy()

    # true positives - EWAS significant and differentially methylated
    true_positive = int((is_dm & is_sig).sum())
    # true negatives - EWAS not significant and not differentially methylated
    true_negative = int((is_not_dm & is_not_sig).sum())
    # false positives - EWAS significant and not differentially methylated
    false_positive = int((is_not_dm & is_sig).sum())
    # false negatives - EWAS not significant and differentially methylated
    false_negative = int((is_dm & is_not_sig).sum())

    total = true_positive + true_negative + false_positive + false_negative
    Acc = (true_positive + true_negative) / total if total > 0 else 0

    predicted_positive = true_positive + false_positive
    Pre = true_positive / predicted_positive if predicted_positive > 0 else 0

    actual_positive = true_positive + false_negative
    Rec = true_positive / actual_positive if actual_positive > 0 else 0

    # F1 is only defined when both precision and recall are non-zero.
    F1 = 2 * (Rec * Pre) / (Rec + Pre) if Pre and Rec else 0

    return {"TP": true_positive, "TN": true_negative, "FP": false_positive, "FN": false_negative,
            "Accuracy": Acc, "Precision": Pre, "Recall": Rec, "F1": F1}

    

In [26]:
def calculate_EWAS_metrics_p(EWAS_results, effect_threshold=0.05, alpha=0.05):
    """Compute confusion-matrix metrics for an EWAS result table with Python-style columns.

    Each row (probe) is classified twice, both times from ``EWAS_results`` itself:

    * reference label: the probe counts as differentially methylated when
      ``abs(Methylation Change) >= effect_threshold``;
    * prediction: the probe counts as an EWAS hit when
      ``Corrected P-value <= alpha``.

    Note that the reference label is derived from the effect size in the same
    table, not from an independent ground truth.

    Rows are counted one by one, so a probe ID that occurs more than once in the
    index contributes one count per row. Rows with a missing effect size or a
    missing corrected p-value satisfy none of the comparisons and are left out
    of all four counts. An empty table (or one in which no row can be
    classified) yields 0 for every metric instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    EWAS_results : pandas.DataFrame
        Table with one row per probe and the columns ``"Methylation Change"``
        and ``"Corrected P-value"``.
    effect_threshold : float, default 0.05
        Minimum absolute ``Methylation Change`` for a probe to count as
        differentially methylated.
    alpha : float, default 0.05
        Maximum corrected p-value for a probe to count as significant.

    Returns
    -------
    dict
        Keys ``"TP"``, ``"TN"``, ``"FP"``, ``"FN"`` (row counts) and
        ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``. A metric whose
        denominator is zero is reported as 0.
    """
    effect_size = EWAS_results.loc[:, "Methylation Change"].abs()
    corrected_p = EWAS_results.loc[:, "Corrected P-value"]

    # Plain boolean arrays: row-wise logic without any index alignment.
    # The complementary masks are computed with their own comparison (not with ~)
    # so that NaN rows end up in neither mask.
    is_dm = (effect_size >= effect_threshold).to_numpy()
    is_not_dm = (effect_size < effect_threshold).to_numpy()
    is_sig = (corrected_p <= alpha).to_numpy()
    is_not_sig = (corrected_p > alpha).to_numpy()

    # true positives - EWAS significant and differentially methylated
    true_positive = int((is_dm & is_sig).sum())
    # true negatives - EWAS not significant and not differentially methylated
    true_negative = int((is_not_dm & is_not_sig).sum())
    # false positives - EWAS significant and not differentially methylated
    false_positive = int((is_not_dm & is_sig).sum())
    # false negatives - EWAS not significant and differentially methylated
    false_negative = int((is_dm & is_not_sig).sum())

    # Guard the accuracy denominator as well: nothing to classify -> accuracy 0.
    total = true_positive + true_negative + false_positive + false_negative
    Acc = (true_positive + true_negative) / total if total > 0 else 0

    predicted_positive = true_positive + false_positive
    Pre = true_positive / predicted_positive if predicted_positive > 0 else 0

    actual_positive = true_positive + false_negative
    Rec = true_positive / actual_positive if actual_positive > 0 else 0

    # F1 is only defined when both precision and recall are non-zero.
    F1 = 2 * (Rec * Pre) / (Rec + Pre) if Pre and Rec else 0

    return {"TP": true_positive, "TN": true_negative, "FP": false_positive, "FN": false_negative,
            "Accuracy": Acc, "Precision": Pre, "Recall": Rec, "F1": F1}


In [29]:
# Score the original R EWAS results of all three datasets with the R-column metric helper;
# the results are unpacked in the same order as the input tables.
GSE66351_metrics_r, GSE105109_metrics_r, GSE134379_metrics_r = map(
    calculate_EWAS_metrics_r,
    (original_r_GSE66351_E, original_r_GSE105109_E, original_r_GSE134379_E),
)

In [30]:
# Score the centralised Python EWAS results for GSE66351 with the Python-column metric helper.
GSE66351_metrics_p = calculate_EWAS_metrics_p(python_central_GSE66351_E)

In [31]:
# Combine the metric dictionaries into one table: one row per implementation,
# one column per metric.
# A list of records belongs in the plain DataFrame constructor (DataFrame.from_dict
# is documented for dict input); the constructor also takes the row labels directly.
# NOTE(review): "Pyhton" in the second row label looks like a typo for "Python";
# it is kept unchanged here in case other cells select rows by this label.
performance_results = pd.DataFrame(
    [GSE66351_metrics_r, GSE66351_metrics_p],
    index=["GSE66351 R original", "GSE66351 Pyhton central"],
)

In [32]:
# Display the combined metrics table (one row per implementation).
performance_results


Unnamed: 0,TP,TN,FP,FN,Accuracy,Precision,Recall,F1
GSE66351 R original,10,450517,24,286,0.999312,0.294118,0.033784,0.060606
GSE66351 Pyhton central,40,450541,0,256,0.999432,1.0,0.135135,0.238095
