In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import methylcheck

## Evaluation of the workflows  
The performance of the Python and federated implementations of the workflow provided by the x group at Exeter University is compared to the original R version of this workflow.  
Additionally, the performance of the federated workflow is tested for several cases of sample size and class label imbalance.

In [47]:
# Phenotype tables of the three cohorts; the first CSV column becomes the row index.
GSE66351_pheno = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv",
    index_col=0,
)
GSE105109_pheno = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Reduced_Pheno_Info.csv",
    index_col=0,
)
GSE134379_pheno = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Reduced_Pheno_Info.csv",
    index_col=0,
)

# Harmonise the diagnosis codes to " CTRL" / " AD" (the labels carry a leading space).
for original_label, unified_label in ((" Control", " CTRL"), (" Alzheimer's disease", " AD")):
    GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"] == original_label, "Diagnosis"] = unified_label

# NOTE(review): the " AD" -> " AD" pair leaves the values unchanged.
for original_label, unified_label in ((" ND", " CTRL"), (" AD", " AD")):
    GSE134379_pheno.loc[GSE134379_pheno["Diagnosis"] == original_label, "Diagnosis"] = unified_label

# Sex recoding for GSE66351.
# NOTE(review): both pairs map a value onto itself, so this currently changes
# nothing -- confirm which source labels were meant to be recoded here.
for original_label, unified_label in ((" F", " F"), (" M", " M")):
    GSE66351_pheno.loc[GSE66351_pheno["Sex"] == original_label, "Sex"] = unified_label

In [2]:
# EWAS result tables produced by the original R workflow, one per cohort.
# The matching normalised-beta tables are not loaded (their reads stay commented out).
original_r_GSE66351_E = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\EWAS\\Results_small_dataset.csv",
    index_col=0,
)
#original_r_GSE66351_B = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Normalised_Betas.csv", index_col=0)

original_r_GSE105109_E = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\EWAS\\Results_small_dataset.csv",
    index_col=0,
)
#original_r_GSE105109_B = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Normalised_Betas.csv", index_col=0)

original_r_GSE134379_E = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\EWAS\\Results_small_dataset.csv",
    index_col=0,
)
#original_r_GSE134379_B = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Normalised_Betas.csv", index_col=0)

# Placeholder: no combined R result is loaded in this cell.
original_r_combined = None

In [3]:
# Outputs of the central (non-federated) Python workflow for GSE66351:
# the diagnosis regression results and the normalised betas.
python_central_GSE66351_E = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\QC_Python\\GSE66351results_diagnosis_regression_python.csv",
    index_col=0,
)
python_central_GSE66351_B = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\QC_Python\\GSE66351_normalised_betas_python.csv",
    index_col=0,
)

# Placeholders: the remaining central Python results are not loaded in this cell.
python_central_GSE105109 = None
python_central_GSE134379 = None
python_combined = None

In [4]:
# Federated EWAS results; the CSV has two header rows, so the columns form a MultiIndex.
federated = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_Fed\\Results_EWAS1.csv",
    index_col=0,
    header=[0,1],
)


In [5]:
# Federated EWAS results of the mild and strong imbalance runs (two header rows
# -> MultiIndex columns). The backslashes are escaped like in every other path of
# this notebook: unescaped sequences such as "\M" or "\P" are invalid escapes that
# only work by accident and raise a SyntaxWarning on recent Python versions.
federated_mild_imbalance = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_Fed\\mild_Results_EWAS1.csv",
    index_col=0,
    header=[0,1],
)
federated_strong_imbalance = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_Fed\\stron_Results_EWAS1.csv",
    index_col=0,
    header=[0,1],
)

In [6]:
def calculate_GroudTruth(R_EWAS_results, diff_meth_thresh = 0.05):
    """Split the probes of an R EWAS result table into four confusion classes.

    A probe counts as differentially methylated when the absolute value of its
    "Diagnosis_Beta" entry is at least ``diff_meth_thresh``, and as significant
    when its "corr_pval" entry is at most 0.05.

    Parameters
    ----------
    R_EWAS_results : pandas.DataFrame
        Table indexed by probe, with the columns "Diagnosis_Beta" and "corr_pval".
    diff_meth_thresh : float, default 0.05
        Minimum absolute "Diagnosis_Beta" for a probe to count as
        differentially methylated.

    Returns
    -------
    dict
        Sets of probe labels under the keys "TP" (significant and
        differentially methylated), "TN" (neither), "FP" (significant only)
        and "FN" (differentially methylated only).
    """
    abs_beta = R_EWAS_results["Diagnosis_Beta"].abs()
    corrected_p = R_EWAS_results["corr_pval"]

    # probe labels on either side of the methylation-difference cut-off
    changed = set(abs_beta[abs_beta >= diff_meth_thresh].index)
    unchanged = set(abs_beta[abs_beta < diff_meth_thresh].index)

    # probe labels on either side of the significance cut-off
    significant = set(corrected_p[corrected_p <= 0.05].index)
    not_significant = set(corrected_p[corrected_p > 0.05].index)

    return {
        "TP": changed & significant,
        "TN": unchanged & not_significant,
        "FP": unchanged & significant,
        "FN": changed & not_significant,
    }

    

In [25]:
def calculate_EWAS_metrics_p(EWAS_results, ground_truth, diff_meth_thresh = 0.05,  multiIndex = False):
    """Score an EWAS result table against a ground-truth probe classification.

    Every probe in ``EWAS_results`` is classified the same way as in the ground
    truth: differentially methylated when its absolute methylation change is at
    least ``diff_meth_thresh``, significant when its corrected p-value is at
    most 0.05. Each reported count is the number of probes that fall into a
    class here AND into the class of the same name in ``ground_truth``.

    Parameters
    ----------
    EWAS_results : pandas.DataFrame
        Table indexed by probe. With ``multiIndex=False`` the columns
        "Methylation Change" and "Corrected P-value" are read; with
        ``multiIndex=True`` the two-level columns
        ("Methylation change", "Diagnosis") and
        ("Corrected P-value", "Diagnosis") are read. Note the different
        capitalisation of "change" between the two layouts.
    ground_truth : dict
        Mapping with the keys "TP", "TN", "FP" and "FN", each holding a
        collection of probe labels.
    diff_meth_thresh : float, default 0.05
        Minimum absolute methylation change for a probe to count as
        differentially methylated.
    multiIndex : bool, default False
        Whether ``EWAS_results`` uses the two-level column layout.

    Returns
    -------
    dict
        The counts "TP", "TN", "FP", "FN" and the derived "Accuracy",
        "Precision", "Recall" and "F1". A metric whose denominator is zero is
        reported as 0.
    """
    # The two table layouts only differ in the keys of the columns that are read.
    if multiIndex:
        change_col = ("Methylation change", "Diagnosis")
        pval_col = ("Corrected P-value", "Diagnosis")
    else:
        change_col = "Methylation Change"
        pval_col = "Corrected P-value"

    probe_difference = EWAS_results.loc[:, change_col].abs()
    DM_probes = set(probe_difference[probe_difference >= diff_meth_thresh].index)
    not_DM_probes = set(probe_difference[probe_difference < diff_meth_thresh].index)

    # selecting the significant probes from the EWAS results
    corrected_p = EWAS_results[pval_col]
    EWAS_sig = set(corrected_p[corrected_p <= 0.05].index)
    EWAS_notsig = set(corrected_p[corrected_p > 0.05].index)

    # true positives - EWAS significant and differentially methylated in python/fed and in original R
    true_positive = len((DM_probes & EWAS_sig).intersection(ground_truth["TP"]))
    # true negatives - EWAS not significant and not differentially methylated in python/fed and in original R
    true_negative = len((not_DM_probes & EWAS_notsig).intersection(ground_truth["TN"]))
    # false positives - EWAS significant and not differentially methylated in python/fed and in original R
    false_positive = len((not_DM_probes & EWAS_sig).intersection(ground_truth["FP"]))
    # false negatives - EWAS not significant and differentially methylated in python/fed and in original R
    false_negative = len((DM_probes & EWAS_notsig).intersection(ground_truth["FN"]))

    # Guard the accuracy like precision and recall: when no probe matches the
    # ground truth at all, the total is zero and the division would raise.
    total = true_positive + true_negative + false_positive + false_negative
    if total > 0:
        Acc = (true_positive + true_negative)/total
    else:
        Acc = 0
    if (true_positive + false_positive) > 0:
        Pre = true_positive/(true_positive + false_positive)
    else:
        Pre = 0
    if (true_positive + false_negative) > 0:
        Rec = true_positive/(true_positive + false_negative)
    else:
        Rec = 0
    if Pre and Rec:
        F1 = 2*(Rec*Pre)/(Rec + Pre)
    else:
        F1 = 0

    return {"TP":true_positive, "TN":true_negative, "FP":false_positive, "FN":false_negative, "Accuracy":Acc, "Precision":Pre, "Recall":Rec, "F1":F1}


In [8]:
# Ground-truth probe classes for GSE66351, derived from the original R EWAS
# results with the default methylation-difference threshold.
GSE66351_groundTruth = calculate_GroudTruth(R_EWAS_results=original_r_GSE66351_E)

In [26]:
# Loop through different methylation cut-offs to compare the performance of the
# R workflow and the federated runs, and store one metrics table per federated
# dataset as a sheet of the thesis workbook.
thresholds = {"0.01":0.01, "0.05":0.05, "0.1":0.1, "0.15":0.15, "0.2":0.20}
datasets = {"No Inbalance":federated, "Mild Inbalance":federated_mild_imbalance, "Strong Inbalance":federated_strong_imbalance}
thesis_workbook = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\ThesisTable.xlsx"
results = {}
for data in datasets:
    inner = {}
    for i in thresholds:
        metrics = calculate_EWAS_metrics_p(datasets[data], diff_meth_thresh=thresholds[i], ground_truth=GSE66351_groundTruth, multiIndex=True)
        inner[i] = metrics
    # one row per threshold, one column per metric
    results[data] = pd.DataFrame.from_dict(inner).T
    # save the metric tables to excel
    # Append mode can only open a workbook that already exists, and
    # if_sheet_exists is only accepted in append mode, so create the workbook
    # in write mode the first time it is needed.
    if os.path.exists(thesis_workbook):
        writer_options = {"mode": "a", "if_sheet_exists": "replace"}
    else:
        writer_options = {"mode": "w"}
    with pd.ExcelWriter(thesis_workbook, engine="openpyxl", **writer_options) as writer:
        results[data].to_excel(writer, sheet_name = (data + "EWASMetrics"))

    

In [9]:
# Score the central Python run and the federated run against the R ground
# truth at a methylation-difference threshold of 0.05.
GSE66351_metrics_p = calculate_EWAS_metrics_p(
    python_central_GSE66351_E,
    ground_truth=GSE66351_groundTruth,
    diff_meth_thresh=0.05,
    multiIndex=False,
)
federated_metrics = calculate_EWAS_metrics_p(
    federated,
    ground_truth=GSE66351_groundTruth,
    diff_meth_thresh=0.05,
    multiIndex=True,
)

# One row per implementation, one column per metric.
performance_results = pd.DataFrame(
    [GSE66351_metrics_p, federated_metrics],
    index=["GSE66351 Pyhton central", "GSE66351 Federated"],
)
# Export to the thesis workbook is currently disabled:
#performance_results.to_excel("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\ThesisTables.xlsx")

performance_results

In [23]:
# Largest absolute "Diagnosis_Beta" in the R results for GSE66351.
test = original_r_GSE66351_E["Diagnosis_Beta"].abs()
test.max()

0.253622674128926

In [36]:
# Check how many of the ground-truth FP probes fall below the 0.05
# methylation-difference cut-off in the mildly imbalanced federated run.
probe_difference = federated_mild_imbalance.loc[:, ("Methylation change", "Diagnosis")].abs()
DM_probes = probe_difference[probe_difference >= 0.05]
no = probe_difference[probe_difference < 0.05]
# Compare probe labels (the index), not the methylation values: set(no) would
# collect the float values of the Series, which never match a probe ID, so the
# previous difference() always returned the complete FP set.
fp_not_dm = GSE66351_groundTruth["FP"].intersection(set(no.index))
print("There are %s probes in ground truth that are not differentially methylated in fed mild"%(len(fp_not_dm)))
print("There are %s probes in ground truth FP"%(len(GSE66351_groundTruth["FP"])))

There are 24 probes in ground truth that are not differentially methylated in fed mild
There are 24 probes in ground truth FP


In [40]:
# Absolute methylation change of the ground-truth FP probes that lie below the
# cut-off in the mildly imbalanced federated run.
fp_below_cutoff = GSE66351_groundTruth["FP"].intersection(set(no.index))
probe_difference.loc[list(fp_below_cutoff)]

cg13024624    0.027695
cg14091713    0.035844
cg00270291    0.022981
cg15858894    0.037446
cg25846190    0.047121
cg01211424    0.015661
cg26516741    0.009751
cg27554973    0.037428
cg00796364    0.012414
cg19435724    0.010890
cg18626282    0.006315
cg17519650    0.007102
cg05036100    0.010889
cg23289794    0.023096
cg09830308    0.032548
cg27313572    0.009798
cg25172682    0.013954
cg01832266    0.013471
cg07594247    0.049003
cg25744450    0.043815
cg09091181    0.022087
cg01894498    0.034574
cg24577417    0.011816
cg09303159    0.020531
Name: (Methylation change, Diagnosis), dtype: float64