In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import methylcheck

## Evaluation of the workflows  
The performance of the Python and federated implementations of the workflow provided by the x group at the University of Exeter is compared to the original R version of this workflow.  
Additionally, the performance of the federated workflow is tested for several cases of sample size and class-label imbalance.

In [47]:
# Root folder that holds one sub-folder per GEO dataset. It can be overridden
# through the EWAS_DATA_DIR environment variable, so the notebook can run on
# another machine without editing every path below.
DATA_DIR = os.environ.get(
    "EWAS_DATA_DIR",
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets",
)

# Reduced phenotype tables, one per dataset; the first CSV column is the index.
GSE66351_pheno = pd.read_csv(os.path.join(DATA_DIR, "GSE66351", "Reduced_Pheno_Info.csv"), index_col=0)
GSE105109_pheno = pd.read_csv(os.path.join(DATA_DIR, "GSE105109", "GSE105109_Reduced_Pheno_Info.csv"), index_col=0)
GSE134379_pheno = pd.read_csv(os.path.join(DATA_DIR, "GSE134379", "Reduced_Pheno_Info.csv"), index_col=0)

# make sure all the phenotype files use the same codes for the diagnosis
# (the leading space is part of the values being matched and written)
GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"] == " Control", "Diagnosis"] = " CTRL"
GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"] == " Alzheimer's disease", "Diagnosis"] = " AD"

# GSE134379 labels controls " ND"; its " AD" label already matches the shared code.
GSE134379_pheno.loc[GSE134379_pheno["Diagnosis"] == " ND", "Diagnosis"] = " CTRL"

# NOTE(review): the sex recoding for GSE66351 that used to follow here replaced
# " F" with " F" and " M" with " M", i.e. it changed nothing, so it was removed.
# If the raw GSE66351 file uses different sex codes than the other datasets,
# add the real mapping here, e.g.
#   GSE66351_pheno.loc[GSE66351_pheno["Sex"] == <raw code>, "Sex"] = " F"

In [35]:
# read in the results from the different implementations

# Root folder that holds one sub-folder per GEO dataset. It can be overridden
# through the EWAS_DATA_DIR environment variable, so the absolute path is
# written once instead of being repeated in every read_csv call.
DATA_DIR = os.environ.get(
    "EWAS_DATA_DIR",
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets",
)

# For each dataset: *_E is the EWAS result table, *_B the normalised betas.
# The first CSV column is used as the index in every table.
original_r_GSE66351_E = pd.read_csv(os.path.join(DATA_DIR, "GSE66351", "EWAS", "Results_small_dataset.csv"), index_col=0)
original_r_GSE66351_B = pd.read_csv(os.path.join(DATA_DIR, "GSE66351", "Normalised_Betas.csv"), index_col=0)


original_r_GSE105109_E = pd.read_csv(os.path.join(DATA_DIR, "GSE105109", "EWAS", "Results_small_dataset.csv"), index_col=0)
original_r_GSE105109_B = pd.read_csv(os.path.join(DATA_DIR, "GSE105109", "GSE105109_Normalised_Betas.csv"), index_col=0)


original_r_GSE134379_E = pd.read_csv(os.path.join(DATA_DIR, "GSE134379", "EWAS", "Results_small_dataset.csv"), index_col=0)
original_r_GSE134379_B = pd.read_csv(os.path.join(DATA_DIR, "GSE134379", "Normalised_Betas.csv"), index_col=0)


# Placeholder: no combined result table is loaded yet.
original_r_combined = None

In [None]:
# Placeholders (presumably for the results of the centralised python
# implementation - TODO load them); nothing is read yet, so every name is None.
python_central_GSE66351 = python_central_GSE105109 = python_central_GSE134379 = python_combined = None

In [None]:
# Placeholder, currently None - presumably meant to hold the results of the
# federated workflow; TODO load them here.
federated = None


In [None]:
# Placeholder, currently None - presumably meant to hold the federated results
# for the sample-size / class-imbalance experiments; TODO load them here.
federated_imbalance = None

In [51]:
def calculate_EWAS_metrics_r(EWAS_results, Normalised_betas, phenotype,
                             effect_col="Diagnosis_Beta", pval_col="corr_pval",
                             effect_threshold=0.05, alpha=0.05):
    """Sort the probes of an EWAS result table into TP / TN / FP / FN sets.

    A probe counts as differentially methylated (the reference label) when the
    absolute value in ``effect_col`` is at least ``effect_threshold``, and as
    EWAS-significant (the prediction) when the value in ``pval_col`` is at most
    ``alpha``.

    Parameters
    ----------
    EWAS_results : pandas.DataFrame
        Result table indexed by probe id; must contain ``effect_col`` and
        ``pval_col``.
    Normalised_betas, phenotype
        Not used by the calculation; kept so existing calls keep working.
    effect_col : str, default "Diagnosis_Beta"
        Column holding the methylation difference.
    pval_col : str, default "corr_pval"
        Column holding the corrected p-value.
    effect_threshold : float, default 0.05
        Minimum absolute methylation difference for a probe to count as
        differentially methylated.
    alpha : float, default 0.05
        Largest corrected p-value that is still called significant.

    Returns
    -------
    dict
        Keys ``"TP"``, ``"TN"``, ``"FP"``, ``"FN"``; each value is a ``set`` of
        probe ids (index labels of ``EWAS_results``).

    Notes
    -----
    A probe whose effect or p-value is NaN fails every comparison and is
    therefore missing from all four sets.
    """
    abs_effect = EWAS_results[effect_col].abs()
    corrected_p = EWAS_results[pval_col]

    # reference label: differentially methylated or not
    dm_ids = set(abs_effect.index[abs_effect >= effect_threshold])
    not_dm_ids = set(abs_effect.index[abs_effect < effect_threshold])

    # prediction: significant in the EWAS or not
    sig_ids = set(corrected_p.index[corrected_p <= alpha])
    not_sig_ids = set(corrected_p.index[corrected_p > alpha])

    return {
        "TP": dm_ids & sig_ids,          # significant and differentially methylated
        "TN": not_dm_ids & not_sig_ids,  # not significant and not differentially methylated
        "FP": not_dm_ids & sig_ids,      # significant but not differentially methylated
        "FN": dm_ids & not_sig_ids,      # not significant but differentially methylated
    }

    

In [None]:
def calculate_EWAS_metrics_p(EWAS_results, Normalised_betas, phenotype,
                             effect_col="Methylation Change", pval_col="Corrected P-value",
                             effect_threshold=0.05, alpha=0.05):
    """Sort the probes of an EWAS result table into TP / TN / FP / FN sets.

    Same calculation as ``calculate_EWAS_metrics_r``; only the default column
    names differ ("Methylation Change" and "Corrected P-value").

    A probe counts as differentially methylated (the reference label) when the
    absolute value in ``effect_col`` is at least ``effect_threshold``, and as
    EWAS-significant (the prediction) when the value in ``pval_col`` is at most
    ``alpha``.

    Parameters
    ----------
    EWAS_results : pandas.DataFrame
        Result table indexed by probe id; must contain ``effect_col`` and
        ``pval_col``.
    Normalised_betas, phenotype
        Not used by the calculation; kept so existing calls keep working.
    effect_col : str, default "Methylation Change"
        Column holding the methylation difference.
    pval_col : str, default "Corrected P-value"
        Column holding the corrected p-value.
    effect_threshold : float, default 0.05
        Minimum absolute methylation difference for a probe to count as
        differentially methylated.
    alpha : float, default 0.05
        Largest corrected p-value that is still called significant.

    Returns
    -------
    dict
        Keys ``"TP"``, ``"TN"``, ``"FP"``, ``"FN"``; each value is a ``set`` of
        probe ids (index labels of ``EWAS_results``).

    Notes
    -----
    A probe whose effect or p-value is NaN fails every comparison and is
    therefore missing from all four sets.
    """
    abs_effect = EWAS_results[effect_col].abs()
    corrected_p = EWAS_results[pval_col]

    # reference label: differentially methylated or not
    dm_ids = set(abs_effect.index[abs_effect >= effect_threshold])
    not_dm_ids = set(abs_effect.index[abs_effect < effect_threshold])

    # prediction: significant in the EWAS or not
    sig_ids = set(corrected_p.index[corrected_p <= alpha])
    not_sig_ids = set(corrected_p.index[corrected_p > alpha])

    return {
        "TP": dm_ids & sig_ids,          # significant and differentially methylated
        "TN": not_dm_ids & not_sig_ids,  # not significant and not differentially methylated
        "FP": not_dm_ids & sig_ids,      # significant but not differentially methylated
        "FN": dm_ids & not_sig_ids,      # not significant but differentially methylated
    }


In [52]:
# TP / TN / FP / FN probe sets for each dataset, computed from the
# original_r_* EWAS tables loaded earlier in the notebook.
GSE66351_metrics_r = calculate_EWAS_metrics_r(
    EWAS_results=original_r_GSE66351_E,
    Normalised_betas=original_r_GSE66351_B,
    phenotype=GSE66351_pheno,
)
GSE105109_metrics_r = calculate_EWAS_metrics_r(
    EWAS_results=original_r_GSE105109_E,
    Normalised_betas=original_r_GSE105109_B,
    phenotype=GSE105109_pheno,
)
GSE134379_metrics_r = calculate_EWAS_metrics_r(
    EWAS_results=original_r_GSE134379_E,
    Normalised_betas=original_r_GSE134379_B,
    phenotype=GSE134379_pheno,
)