In [1]:
import pandas as pd
import numpy as np
import sys
import re
import os

import server
from server import Server 
import client
from client import Client

In [22]:
testdata_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Fed_test"

## Create test data and split across 3 "labs"

In [8]:
# load in some data and create a test data object to write these functions
test_methylated = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Methylated.csv", index_col=0)
test_methylatedA = test_methylated.iloc[:, 0:21]
test_methylatedB = test_methylated.iloc[:, 21:41]
test_methylatedC = test_methylated.iloc[: , 41:61]

test_unmethylated = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Unmethylated.csv", index_col=0)
test_unmethylatedA = test_unmethylated.iloc[:, 0:21]
test_unmethylatedB = test_unmethylated.iloc[:, 21:41]
test_unmethylatedC = test_unmethylated.iloc[:, 41:61]
# attach the probe type information to the (test) data so it can be used by the normalisation functions
annotation_data = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GPL13534_HumanMethylation450_15017482_v.1.1.csv", skiprows=7, low_memory=False)
annotation_data.set_index(annotation_data["IlmnID"], inplace=True)
probe_type_data = annotation_data.loc[:, "Infinium_Design_Type"]
test_probe_annotation = pd.merge(test_methylated, probe_type_data, how = "inner", left_index=True, right_index=True, indicator = True)
test_probe_annotation = test_probe_annotation.loc[:,"Infinium_Design_Type"]

# create test design matrix
pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
x = pheno.loc[:,["Diagnosis", "Age", "Sex", "Sentrix_ID"]] # design matrix with the dependent/explainatory variables to be included in the model
x["Diagnosis"] = (x["Diagnosis"] == " AD").astype(int) #create binary diagnosis with 1 = AD and 0 = CTR
x["Sex"] = (x["Sex"] == " F").astype(int) #create binary sex with 1 = F and 0 = M
# create dummy variables for the unique sentrix_ids present in the dataset - this code can be reused to create center number dummies in the federated version of the code
""" unique_ids = x["Sentrix_ID"].unique()
for id in unique_ids:
    x[id] = (x["Sentrix_ID"] == id).astype(int)
x.drop(columns="Sentrix_ID", inplace = True) """
# turn the age variable into a continuous numerical variable without any leftover text
#x["Age"].replace("^[^:]*:", "", regex=True, inplace=True)
x["Age"] = pd.to_numeric(x["Age"])

xA = x.iloc[0:21, :]
xB = x.iloc[21:41, :]
xC = x.iloc[41:61, :]

# save the test data for easy reference later + to add to github ?
test_methylatedA.to_csv(os.path.join(testdata_dir, "methylatedA.csv"))
test_methylatedB.to_csv(os.path.join(testdata_dir, "methylatedB.csv"))
test_methylatedC.to_csv(os.path.join(testdata_dir, "methylatedC.csv"))

test_unmethylatedA.to_csv(os.path.join(testdata_dir, "unmethylatedA.csv"))
test_unmethylatedB.to_csv(os.path.join(testdata_dir, "unmethylatedB.csv"))
test_unmethylatedC.to_csv(os.path.join(testdata_dir, "unmethylatedC.csv"))

xA.to_csv(os.path.join(testdata_dir, "designA.csv"))
xB.to_csv(os.path.join(testdata_dir, "designB.csv"))
xC.to_csv(os.path.join(testdata_dir, "designC.csv"))

test_probe_annotation.to_csv(os.path.join(testdata_dir, "probe_annotation.csv"))

In [2]:
split_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_splits"

## Initialising the clients

In [3]:
# create client
lab_a = Client("Lab_A", os.path.join(split_dir, "Split_1_design.csv"), os.path.join(split_dir, "Split_1_methylated.csv"), os.path.join(split_dir, "Split_1_unmethylated.csv"), os.path.join(split_dir, "probe_annotation.csv"))
lab_b = Client("Lab_B", os.path.join(split_dir, "Split_2_design.csv"), os.path.join(split_dir, "Split_2_methylated.csv"), os.path.join(split_dir, "Split_2_unmethylated.csv"), os.path.join(split_dir, "probe_annotation.csv"))
lab_c = Client("Lab_C", os.path.join(split_dir, "Split_3_design.csv"), os.path.join(split_dir, "Split_3_methylated.csv"), os.path.join(split_dir, "Split_3_unmethylated.csv"), os.path.join(split_dir, "probe_annotation.csv"))

## Initialising the server

In [4]:
serv = Server(["Diagnosis"], ["Age", "Sex", "Sentrix_ID"])
global_conditions = serv.return_global_conditions()

In [5]:
global_conditions

['Age', 'Sex', 'Sentrix_ID', 'Diagnosis']

## Joining clients to the server

In [5]:
# join the clients
serv.get_clients(lab_a.cohort_name, lab_a.probes, lab_a.designmatrix.index)
serv.get_clients(lab_b.cohort_name, lab_b.probes, lab_b.designmatrix.index)
serv.get_clients(lab_c.cohort_name, lab_c.probes, lab_c.designmatrix.index)

In [6]:
global_probes = serv.find_global_probes()
cohort_effect = serv.find_cohort_effects()

In [7]:
#check client input
lab_a.input_validation(global_conditions, global_probes)
lab_b.input_validation(global_conditions, global_probes)
lab_c.input_validation(global_conditions, global_probes)

In [8]:
lab_a.cohort_effects(serv.client_names)
lab_b.cohort_effects(serv.client_names)
lab_c.cohort_effects(serv.client_names)

In [9]:
if "Sentrix_ID" in global_conditions:
    lab_a.find_unique_SentrixIDS()
    lab_b.find_unique_SentrixIDS()
    lab_c.find_unique_SentrixIDS()
    global_sentrix = serv.return_global_SentrixID(lab_a.unique_SentrixIDS,
                                lab_b.unique_SentrixIDS,
                                lab_c.unique_SentrixIDS)
    lab_a.SentrixID_effects(global_sentrix)
    lab_b.SentrixID_effects(global_sentrix)
    lab_c.SentrixID_effects(global_sentrix)
    



In [11]:
print(len(lab_a.designcolumns))
print(len(lab_b.designcolumns))
print(len(lab_c.designcolumns))

26
26
26


## Dasen normalisation

Client side

In [10]:
lab_a.intensity_distributions()
lab_b.intensity_distributions()
lab_c.intensity_distributions()

(                GSM2808877_8918692108_R02C02  GSM2808879_8918692108_R03C02  \
 cg00000029                            3087.0                        3873.0   
 cg00000108                            6121.0                        6523.0   
 cg00000109                            1922.0                        1745.0   
 cg00000165                             983.0                        1092.0   
 cg00000236                            3356.0                        3794.0   
 ...                                      ...                           ...   
 ch.X.97129969R                         395.0                         409.0   
 ch.X.97133160R                        1359.0                        1328.0   
 ch.X.97651759F                        1856.0                        1548.0   
 ch.X.97737721F                        2355.0                        2190.0   
 ch.X.98007042R                         718.0                         641.0   
 
                 GSM2808880_8918692108_R03C01  GSM

In [11]:
local_dasen_paramA = lab_a.local_normalisation_parameters()
local_dasen_paramB = lab_b.local_normalisation_parameters()
local_dasen_paramC = lab_c.local_normalisation_parameters()

Server side

In [12]:
probe_type_means = serv.aggregate_QN_means(local_dasen_paramA, local_dasen_paramB, local_dasen_paramC)

Client side

In [13]:
lab_a.final_normalisation(probe_type_means)
lab_b.final_normalisation(probe_type_means)
lab_c.final_normalisation(probe_type_means)

Unnamed: 0,GSM2808877_8918692108_R02C02,GSM2808879_8918692108_R03C02,GSM2808880_8918692108_R03C01,GSM2808883_8918692108_R05C02,GSM2808888_8918692120_R04C01,GSM2808889_8918692120_R05C02,GSM2808890_8918692120_R05C01,GSM2808893_8221932039_R04C01,GSM2808899_8221932039_R04C02,GSM2808900_8221932039_R03C02,...,GSM2809030_5900438023_R01C02,GSM2809031_5900438023_R02C02,GSM2809040_5900438003_R05C01,GSM2809044_5900438003_R03C02,GSM2809045_5900438003_R04C02,GSM2809048_5854945005_R04C01,GSM2809050_5854945005_R06C01,GSM2809052_5854945005_R04C02,GSM2809053_5854945005_R05C02,GSM2809057_5854945011_R03C01
cg00000029,0.645753,0.676544,0.586785,0.599437,0.555095,0.678419,0.602605,0.630883,0.602725,0.531689,...,0.613029,0.598163,0.642460,0.565499,0.566134,0.670904,0.607883,0.595667,0.610707,0.591322
cg00000108,0.885150,0.895378,0.905651,0.910837,0.905099,0.894724,0.903664,0.898865,0.877397,0.908008,...,0.881922,0.864616,0.894795,0.880975,0.855840,0.889166,0.908866,0.887042,0.880245,0.874415
cg00000109,0.686668,0.662039,0.740010,0.701507,0.788453,0.634554,0.754802,0.722652,0.732983,0.819840,...,0.733072,0.793031,0.714965,0.785609,0.757803,0.771876,0.772755,0.768522,0.768949,0.731248
cg00000165,0.351852,0.368762,0.327747,0.388608,0.295425,0.353539,0.332663,0.363169,0.388573,0.295979,...,0.362062,0.347784,0.351862,0.371013,0.298814,0.343835,0.373131,0.344934,0.351976,0.316026
cg00000236,0.812750,0.818738,0.771500,0.837046,0.777129,0.788290,0.772109,0.799445,0.808600,0.799761,...,0.809632,0.823554,0.795647,0.801339,0.807298,0.798221,0.812166,0.809719,0.820847,0.788946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch.X.97129969R,0.299612,0.216691,0.333599,0.230539,0.309394,0.257442,0.364019,0.194135,0.215614,0.324906,...,0.292830,0.375453,0.280861,0.286944,0.283163,0.416997,0.293111,0.364770,0.291119,0.315134
ch.X.97133160R,0.491542,0.350835,0.574544,0.401637,0.542554,0.391686,0.591472,0.318958,0.262889,0.577463,...,0.303743,0.491453,0.343868,0.366992,0.372724,0.503507,0.309546,0.498871,0.330280,0.487551
ch.X.97651759F,0.355892,0.224582,0.557608,0.283942,0.580222,0.268584,0.605756,0.264238,0.221564,0.622953,...,0.231544,0.437221,0.279688,0.340114,0.337564,0.453277,0.266676,0.486391,0.280557,0.442271
ch.X.97737721F,0.241596,0.232693,0.375735,0.195063,0.361287,0.276326,0.437649,0.282022,0.243418,0.393942,...,0.276989,0.308753,0.291934,0.291059,0.337026,0.329076,0.263860,0.319120,0.284369,0.350168


In [14]:
output = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_Fed"

In [17]:
# save the betas for testing
lab_a.betas.to_csv(os.path.join(output, "strong_split1_betas.csv"))
lab_b.betas.to_csv(os.path.join(output, "strong_split2_betas.csv"))
lab_c.betas.to_csv(os.path.join(output, "strong_split3_betas.csv"))

## EWAS - Linear regression model

Client side

In [15]:
local_rega = lab_a.local_xtx_xty()
local_regb = lab_b.local_xtx_xty()
local_regc = lab_c.local_xtx_xty()

Server side

In [16]:
global_xtx, global_xty = serv.global_regression_parameter(local_rega, local_regb, local_regc)

In [17]:
serv.calculate_EWAS_results()

Unnamed: 0,Coefficient,Standard Error,P-value,Corrected P-value,Methylation Change
cg15978565,0.005387,0.039375,0.446159,0.5,0.005387
cg20909078,0.001655,0.039375,0.483407,0.5,0.001655
cg16266895,-0.003544,0.039375,0.464515,0.5,-0.003544
cg10116893,0.005610,0.039375,0.443942,0.5,0.005610
cg13748552,0.001155,0.039375,0.488421,0.5,0.001155
...,...,...,...,...,...
cg06185817,0.003125,0.039375,0.468695,0.5,0.003125
cg15313617,-0.007908,0.039375,0.421259,0.5,-0.007908
cg20220255,-0.021281,0.039375,0.296928,0.5,-0.021281
cg01384686,-0.010432,0.039375,0.396659,0.5,-0.010432


In [18]:
serv.EWAS_results.sort_values(by = "P-value")

Unnamed: 0,Coefficient,Standard Error,P-value,Corrected P-value,Methylation Change
cg08383526,-2.551160e-01,0.039375,5.308382e-07,0.239322,-2.551160e-01
cg25488288,1.790800e-01,0.039375,6.552479e-05,0.500000,1.790800e-01
cg12516504,1.707097e-01,0.039375,1.125213e-04,0.500000,1.707097e-01
cg14654629,-1.631961e-01,0.039375,1.826607e-04,0.500000,-1.631961e-01
cg07284994,1.600583e-01,0.039375,2.235185e-04,0.500000,1.600583e-01
...,...,...,...,...,...
cg21609640,-1.015260e-07,0.039375,4.999990e-01,0.500000,-1.015260e-07
cg14236430,5.972897e-08,0.039375,4.999994e-01,0.500000,5.972897e-08
cg23511982,-3.040680e-08,0.039375,4.999997e-01,0.500000,-3.040680e-08
cg20758929,2.340335e-08,0.039375,4.999998e-01,0.500000,2.340335e-08


Client side

In [19]:
serv.EWAS_results.to_csv(os.path.join(output, "_even_Results_EWAS.csv"))
