In [1]:
import pandas as pd
import numpy as np
import math
import os

In [5]:
testdata_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Fed_test"
output = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_Fed"

## Dasen normalisation results evaluation  
Comparing the betas calculated based on the dasen normalised methylated and unmethylated values obtained from the original R code (dasen() in the wateRmelon package), the centrally implemented pythonic translation of the dasen normalisation and the federated implementation of the pythonic translation of the dasen normalisation

In [19]:
R_dasen = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Normalised_Betas.csv", index_col=0)

Python_dasen = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\all_probes_normalised_betas.csv", index_col=0)
Federated_dasen = pd.read_csv(os.path.join(output, "even_split1_betas.csv"), index_col=0)
R_dasen_fed_compare = R_dasen.loc[Federated_dasen.index.values,set(R_dasen.columns.values).intersection(set(Federated_dasen.columns.values))]

In [33]:
import dasen_normalisation
unmet = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\test_normalisation\\unmethylation.csv", index_col=0)
met = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\test_normalisation\\methylation.csv", index_col=0)
probe_type_data = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\test_normalisation\\probe_annotation.csv", index_col=0)
Python_dasen = dasen_normalisation.dasen_normalisation(unmet, met, probe_type_data.squeeze())

In [3]:
# absolute difference between the beta values
if np.all(R_dasen.index == Python_dasen.index):
    dasen_diff_python = pd.Series.abs((R_dasen) - Python_dasen) # the (* -1) negates the sign difference between python and r calculated
    #regression coefficients
else: print("different probes in the two dataframes")

average_absolute_diff_python_dasen = dasen_diff_python.mean(axis = 1)
print(average_absolute_diff_python_dasen.mean())

0.02387727695187572


In [26]:
# absolute difference between the beta values
if np.all(R_dasen_fed_compare.index == Federated_dasen.index):
    dasen_diff_fed = pd.Series.abs(R_dasen_fed_compare - Federated_dasen) 
else: print("different probes in the two dataframes")

average_absolute_diff_fed_dasen = dasen_diff_fed.mean(axis = 1)
print("the absolute mean difference between r dasen and fed dasen is ", average_absolute_diff_fed_dasen.mean())
relative_fed_das_diff = dasen_diff_fed/len(R_dasen_fed_compare)
print("the max relative difference between r dasen and fed dasen is ", relative_fed_das_diff.mean(axis=1).max())

the absolute mean difference between r dasen and fed dasen is  2.424658631712162e-16
the relative mean difference between r dasen and fed dasen is  7.42683422515883e-22


In [None]:
# root mean square error between the pythonic/federated beta values and the r beta values (used as ground truth values)

## EWAS results evaluation

In [8]:

R_EWAS = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\EWAS_GSE66351\\Results_dataset_test.csv", index_col=0)
R_EWAS_test = R_EWAS.iloc[0:21,:]
Python_EWAS = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\QC_GSE66351_PythonShell\\results_diagnosis_regression_test_python.csv", index_col=0)
Federated_EWAS = pd.read_csv(os.path.join(testdata_dir, "Results_EWASa.csv"), header=[0,1], index_col=0)
Federated_EWAS_test = Federated_EWAS.iloc[0:21,:]

In [18]:
# check if the same probes are included in the test subset
if np.all(R_EWAS_test.index == Python_EWAS.index):
    coef_diff_python = pd.Series.abs((-1* R_EWAS_test["Diagnosis_Beta"]) - Python_EWAS["Diagnosis_Coef"]) # the (* -1) negates the sign difference between python and r calculated
    #regression coefficients
else: print("different probes in the two dataframes")

average_absolute_diff_python = coef_diff_python.mean()

if np.all(R_EWAS_test.index == Federated_EWAS_test.index):
    coef_diff_federated = pd.Series.abs((-1* R_EWAS_test["Diagnosis_Beta"]) - Federated_EWAS_test[("Coefficient", "Diagnosis")]) # the (* -1) negates the sign difference between python and r calculated
    #regression coefficients
else: print("different probes in the two dataframes")

average_absolute_diff_federated = coef_diff_federated.mean()

print(average_absolute_diff_python, average_absolute_diff_federated)

3.533879925419243e-14 0.05228027048083186


In [23]:
pd.Series.abs(sum(((R_EWAS_test["Diagnosis_Beta"]*-1) - Federated_EWAS_test[("Coefficient", "Diagnosis")]))/len(R_EWAS_test["Diagnosis_Beta"]))

0.0241536102793927

In [24]:
#check root mean square error between the r (ground truth) and python generated regression coefficients
RMSE_python = math.sqrt(pd.Series.abs(sum(((R_EWAS_test["Diagnosis_Beta"] * -1) - Python_EWAS["Diagnosis_Coef"]))/len(R_EWAS_test["Diagnosis_Beta"])))
RMSE_federated = math.sqrt(pd.Series.abs(sum(((R_EWAS_test["Diagnosis_Beta"]*-1) - Federated_EWAS_test[("Coefficient", "Diagnosis")]))/len(R_EWAS_test["Diagnosis_Beta"])))
print(RMSE_python, RMSE_federated)

1.879861677203736e-07 0.1554143181286483


## Federated results evaluation  
Test how well it performs with imbalanced splits:  
* Sample size imbalance  
* Class lable imbalance

In [None]:
# load in the test data