In [1]:
import pandas as pd
import numpy as np
import sys
import re
import os

import server
from server import Server 
import client
from client import Client

In [22]:
# Folder that receives the three-"lab" test dataset written by the data preparation cell.
testdata_dir = r"E:\Msc Systems Biology\MSB5000_Master_Thesis\Practical work\Federated_Differential_Methylation_Analysis\Fed_test"

## Create test data and split across 3 "labs"

In [8]:
# Build the three-"lab" test dataset from GSE66351: load the methylated and
# unmethylated intensities, attach the probe design type, derive the design
# matrix, split everything by sample position and save the pieces to testdata_dir.

# Sample ranges of the three simulated labs, defined once so the methylated,
# unmethylated and design-matrix splits cannot drift apart.
# NOTE(review): the design matrix is split by row position, so this assumes the rows of
# Reduced_Pheno_Info.csv are in the same order as the intensity columns - confirm.
lab_slices = {"A": slice(0, 21), "B": slice(21, 41), "C": slice(41, 61)}

# load in some data and create a test data object to write these functions
test_methylated = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Methylated.csv", index_col=0)
test_methylatedA = test_methylated.iloc[:, lab_slices["A"]]
test_methylatedB = test_methylated.iloc[:, lab_slices["B"]]
test_methylatedC = test_methylated.iloc[:, lab_slices["C"]]

test_unmethylated = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Unmethylated.csv", index_col=0)
test_unmethylatedA = test_unmethylated.iloc[:, lab_slices["A"]]
test_unmethylatedB = test_unmethylated.iloc[:, lab_slices["B"]]
test_unmethylatedC = test_unmethylated.iloc[:, lab_slices["C"]]

# attach the probe type information to the (test) data so it can be used by the normalisation functions
annotation_data = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GPL13534_HumanMethylation450_15017482_v.1.1.csv", skiprows=7, low_memory=False)
# index the annotation by probe ID while keeping the IlmnID column itself (drop=False)
annotation_data = annotation_data.set_index("IlmnID", drop=False)
probe_type_data = annotation_data.loc[:, "Infinium_Design_Type"]
# the inner join on the probe IDs keeps only probes present in both the intensity data and
# the annotation; no merge indicator is requested because only the design type column is kept
test_probe_annotation = pd.merge(test_methylated, probe_type_data, how="inner", left_index=True, right_index=True)
test_probe_annotation = test_probe_annotation.loc[:, "Infinium_Design_Type"]

# create test design matrix
pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv", index_col="Sample_ID")
# design matrix with the independent/explanatory variables to be included in the model;
# .copy() makes it an independent frame so the column assignments below are unambiguous
# and cannot trigger a SettingWithCopyWarning
x = pheno.loc[:, ["Diagnosis", "Age", "Sex", "Sentrix_ID"]].copy()
# NOTE(review): both comparisons use a leading space (" AD", " F"); presumably the raw
# pheno values carry it - confirm against Reduced_Pheno_Info.csv
x["Diagnosis"] = (x["Diagnosis"] == " AD").astype(int)  # binary diagnosis with 1 = AD and 0 = CTR
x["Sex"] = (x["Sex"] == " F").astype(int)  # binary sex with 1 = F and 0 = M
# Sentrix_ID is kept as a single column here. Dummy variables for the unique Sentrix IDs
# could be created as follows (disabled; this code can be reused to create center number
# dummies in the federated version of the code):
#     unique_ids = x["Sentrix_ID"].unique()
#     for id in unique_ids:
#         x[id] = (x["Sentrix_ID"] == id).astype(int)
#     x.drop(columns="Sentrix_ID", inplace=True)
# turn the age variable into a continuous numerical variable without any leftover text
# (disabled prefix stripping: x["Age"].replace("^[^:]*:", "", regex=True, inplace=True))
x["Age"] = pd.to_numeric(x["Age"])

xA = x.iloc[lab_slices["A"], :]
xB = x.iloc[lab_slices["B"], :]
xC = x.iloc[lab_slices["C"], :]

# save the test data for easy reference later + to add to github ?
# files are written as <prefix><lab>.csv, e.g. methylatedA.csv, unmethylatedB.csv, designC.csv
splits_to_save = {
    "methylated": (test_methylatedA, test_methylatedB, test_methylatedC),
    "unmethylated": (test_unmethylatedA, test_unmethylatedB, test_unmethylatedC),
    "design": (xA, xB, xC),
}
for prefix, frames in splits_to_save.items():
    for lab, frame in zip(lab_slices, frames):
        frame.to_csv(os.path.join(testdata_dir, f"{prefix}{lab}.csv"))

test_probe_annotation.to_csv(os.path.join(testdata_dir, "probe_annotation.csv"))

In [3]:
# Folder holding the per-split input files that the clients are created from.
# NOTE(review): the folder name reads "GSE66315" while the dataset is referred to as
# GSE66351 elsewhere in this notebook - confirm it matches the directory on disk.
split_dir = r"E:\Msc Systems Biology\MSB5000_Master_Thesis\Practical work\Data\GSE66315_splits"

## Initialising the clients

In [4]:
# create client
def _split_files(split_number):
    """Return the design, methylated, unmethylated and probe-annotation CSV paths of one split."""
    per_split = [
        os.path.join(split_dir, f"Split_{split_number}_{part}.csv")
        for part in ("design", "methylated", "unmethylated")
    ]
    # the probe annotation file is shared by all splits
    return per_split + [os.path.join(split_dir, "probe_annotation.csv")]


lab_a = Client("Lab_A", *_split_files(1))
lab_b = Client("Lab_B", *_split_files(2))
lab_c = Client("Lab_C", *_split_files(3))

## Initialising the server

In [5]:
# Set up the server with the model terms.
# NOTE(review): the first list presumably holds the variable(s) of interest and the
# second the covariates to adjust for - confirm against the Server constructor.
interest_terms = ["Diagnosis"]
covariate_terms = ["Age", "Sex", "Sentrix_ID"]
serv = Server(interest_terms, covariate_terms)
# conditions reported back by the server; later cells validate the clients against
# them and check them for "Sentrix_ID"
global_conditions = serv.return_global_conditions()

## Joining clients to the server

In [6]:
# join the clients: each lab registers its cohort name, probes and design-matrix index with the server
for lab in (lab_a, lab_b, lab_c):
    serv.get_clients(lab.cohort_name, lab.probes, lab.designmatrix.index)

In [7]:
# Ask the server for the global probe set; it is handed to every client's input_validation in the next cell.
# presumably derived from the probe lists registered via get_clients - confirm in Server.find_global_probes
global_probes = serv.find_global_probes()

In [8]:
# check client input: every lab validates its data against the global conditions and probes
for lab in (lab_a, lab_b, lab_c):
    lab.input_validation(global_conditions, global_probes)

In [9]:
# every lab receives the full list of client names known to the server
for lab in (lab_a, lab_b, lab_c):
    lab.cohort_effects(serv.client_names)

In [10]:
# Only runs when Sentrix_ID is one of the global conditions.
if "Sentrix_ID" in global_conditions:
    labs = (lab_a, lab_b, lab_c)
    # client side: each lab determines its own unique Sentrix IDs
    for lab in labs:
        lab.find_unique_SentrixIDS()
    # server side: combine the per-lab IDs (passed in the order A, B, C)
    global_sentrix = serv.return_global_SentrixID(*(lab.unique_SentrixIDS for lab in labs))
    # client side: each lab applies the combined IDs
    for lab in labs:
        lab.SentrixID_effects(global_sentrix)
    



## Dasen normalisation

Client side

In [11]:
# Run the intensity distribution step on every client.
for lab in (lab_a, lab_b):
    lab.intensity_distributions()
# the last call stays a bare expression so the notebook still displays lab C's result
lab_c.intensity_distributions()

(                GSM2808875_8918692108_R01C02  GSM2808876_8918692108_R01C01  \
 cg00000029                            2508.0                        2499.0   
 cg00000108                            5354.0                        5679.0   
 cg00000109                            1779.0                        1760.0   
 cg00000165                             966.0                         855.0   
 cg00000236                            3171.0                        2710.0   
 ...                                      ...                           ...   
 ch.X.97129969R                         340.0                         704.0   
 ch.X.97133160R                        1030.0                        2407.0   
 ch.X.97651759F                        1275.0                        3973.0   
 ch.X.97737721F                        2143.0                        3893.0   
 ch.X.98007042R                         544.0                        1514.0   
 
                 GSM2808877_8918692108_R02C02  GSM

In [12]:
# Collect the local normalisation parameters of each lab (evaluated in the order A, B, C);
# they are sent to the server for aggregation in the next cell.
local_dasen_paramA, local_dasen_paramB, local_dasen_paramC = (
    lab.local_normalisation_parameters() for lab in (lab_a, lab_b, lab_c)
)

Server side

In [13]:
# Server-side aggregation of the three labs' local normalisation parameters;
# the result is passed to every client's final_normalisation in the next cell.
probe_type_means = serv.aggregate_QN_means(local_dasen_paramA, local_dasen_paramB, local_dasen_paramC)

Client side

In [14]:
# Final normalisation step on every client, using the probe-type means aggregated by the server.
# The returned frames are kept as betas_a / betas_b / betas_c because the
# "save the betas" cell further down writes exactly these names to disk; without these
# assignments that cell raises a NameError on a fresh kernel (Restart & Run All).
# NOTE(review): assumes final_normalisation returns the normalised beta values, as the
# recorded output of this cell suggests - confirm in Client.final_normalisation.
betas_a = lab_a.final_normalisation(probe_type_means)
betas_b = lab_b.final_normalisation(probe_type_means)
betas_c = lab_c.final_normalisation(probe_type_means)
# bare last expression so the cell still displays lab C's result
betas_c

Unnamed: 0,GSM2808875_8918692108_R01C02,GSM2808876_8918692108_R01C01,GSM2808877_8918692108_R02C02,GSM2808878_8918692108_R02C01,GSM2808879_8918692108_R03C02,GSM2808880_8918692108_R03C01,GSM2808881_8918692108_R04C02,GSM2808882_8918692108_R04C01,GSM2808883_8918692108_R05C02,GSM2808884_8918692108_R05C01,...,GSM2809055_5854945011_R01C01,GSM2809056_5854945011_R02C01,GSM2809057_5854945011_R03C01,GSM2809058_5854945011_R04C01,GSM2809059_5854945011_R05C01,GSM2809060_5854945011_R06C01,GSM2809061_5854945011_R01C02,GSM2809062_5854945011_R03C02,GSM2809063_5854945011_R04C02,GSM2809064_5854945011_R05C02
cg00000029,0.651772,0.612893,0.645753,0.594738,0.676544,0.586785,0.685034,0.665869,0.599437,0.631644,...,0.589649,0.653951,0.591322,0.593586,0.544863,0.615214,0.595921,0.627015,0.595071,0.604785
cg00000108,0.886961,0.894513,0.885150,0.895567,0.895378,0.905651,0.892796,0.895054,0.910837,0.905866,...,0.886876,0.864715,0.874415,0.887306,0.890384,0.895237,0.894619,0.885518,0.881948,0.873194
cg00000109,0.683035,0.765342,0.686668,0.767077,0.662039,0.740010,0.666553,0.775664,0.701507,0.784446,...,0.765047,0.801284,0.731248,0.719772,0.756501,0.708711,0.780224,0.785353,0.742880,0.749155
cg00000165,0.363254,0.305253,0.351852,0.283040,0.368762,0.327747,0.378875,0.279547,0.388608,0.268668,...,0.352004,0.330318,0.316026,0.372187,0.346962,0.321125,0.325860,0.323726,0.348148,0.304639
cg00000236,0.838525,0.789144,0.812750,0.788310,0.818738,0.771500,0.815253,0.803938,0.837046,0.822737,...,0.800473,0.823407,0.788946,0.819893,0.803959,0.833001,0.823150,0.791650,0.806551,0.794142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch.X.97129969R,0.226861,0.332673,0.299612,0.450387,0.216691,0.333599,0.283016,0.329563,0.230539,0.399877,...,0.374168,0.338459,0.315134,0.336106,0.347352,0.370116,0.387089,0.237799,0.302188,0.286885
ch.X.97133160R,0.322142,0.597740,0.491542,0.696986,0.350835,0.574544,0.399061,0.579815,0.401637,0.725559,...,0.467309,0.527641,0.487551,0.361825,0.565140,0.413496,0.494044,0.334567,0.395927,0.370618
ch.X.97651759F,0.226885,0.582474,0.355892,0.755088,0.224582,0.557608,0.366986,0.592153,0.283942,0.783406,...,0.447091,0.346343,0.442271,0.343252,0.495237,0.355323,0.393035,0.281672,0.370430,0.320815
ch.X.97737721F,0.236354,0.410953,0.241596,0.423455,0.232693,0.375735,0.299413,0.414539,0.195063,0.422111,...,0.300656,0.295588,0.350168,0.290463,0.335341,0.323936,0.305950,0.279807,0.326334,0.275812


In [19]:
# Folder that the federated results (betas and EWAS tables) are written to.
output = r"E:\Msc Systems Biology\MSB5000_Master_Thesis\Practical work\Data\GSE66351_Fed"

In [21]:
# save the betas for testing
# Expects betas_a, betas_b and betas_c to already be defined in the kernel before this cell runs.
# Files are written as split1_betas.csv, split2_betas.csv and split3_betas.csv.
for split_number, betas in enumerate((betas_a, betas_b, betas_c), start=1):
    betas.to_csv(os.path.join(output, f"split{split_number}_betas.csv"))

## EWAS - Linear regression model

Client side

In [15]:
# Collect each lab's local regression inputs (evaluated in the order A, B, C);
# they are combined on the server in the next cell.
local_rega, local_regb, local_regc = (
    lab.local_xtx_xty() for lab in (lab_a, lab_b, lab_c)
)

Server side

In [16]:
# Combine the three labs' local regression inputs on the server; the two returned objects
# are passed back to every client's calculate_EWAS_results in the next cell.
# presumably the aggregated X^T X and X^T y of the linear model - confirm in Server.global_regression_parameter
global_xtx, global_xty = serv.global_regression_parameter(local_rega, local_regb, local_regc)

Client side

In [17]:
# Every client computes its EWAS results from the same global regression parameters.
for lab in (lab_a, lab_b):
    lab.calculate_EWAS_results(global_xtx, global_xty)
# the last call stays a bare expression so the notebook still displays lab C's result
lab_c.calculate_EWAS_results(global_xtx, global_xty)

Unnamed: 0_level_0,Coefficient,Coefficient,Coefficient,Coefficient,Coefficient,Coefficient,Coefficient,Coefficient,Coefficient,Coefficient,...,Methylation change,Methylation change,Methylation change,Methylation change,Methylation change,Methylation change,Methylation change,Methylation change,Methylation change,Methylation change
Unnamed: 0_level_1,Diagnosis,Age,Sex,Sentrix_ID,Lab_A,Lab_B,Lab_C,5900438023,8221932039,9247377036,...,5854945057,3998920130,3998919115,3998919116,5854945005,5854945006,5854945010,5900438003,5854945011,5854945020
cg00000029,0.004137,-0.000106,0.007957,7.989583e-27,0.661112,0.661112,0.661112,-0.052887,-0.077533,-0.116169,...,-0.022029,-0.098167,-0.079209,-0.094916,-0.061341,-0.045956,-0.003389,-0.062362,-0.059642,-0.039670
cg00000108,0.001798,-0.000052,-0.000930,-6.271760e-26,0.881216,0.881216,0.881216,0.001624,0.018807,0.037513,...,0.001001,0.024491,0.023247,0.020185,0.004967,-0.000833,-0.005484,0.003902,0.005396,0.007198
cg00000109,-0.002106,0.000267,-0.007049,-2.524355e-26,0.709980,0.709980,0.709980,0.029093,0.030523,0.059060,...,0.023396,0.021825,0.034581,0.025716,0.032320,0.044949,0.016739,0.014548,0.029633,0.030741
cg00000165,0.005052,0.000274,0.000739,-4.102077e-29,0.317928,0.317928,0.317928,0.002132,-0.004342,0.002371,...,-0.003504,-0.014457,-0.030561,-0.019661,-0.014283,-0.008775,0.007699,-0.009546,-0.008487,-0.019706
cg00000236,0.000773,-0.000106,-0.002496,2.256773e-26,0.818604,0.818604,0.818604,-0.008018,-0.021931,-0.045933,...,-0.018008,-0.029781,-0.016333,-0.018326,-0.005957,0.003724,0.011251,-0.005221,-0.001115,-0.008548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch.X.97129969R,0.004186,-0.000137,-0.075870,1.556265e-26,0.380370,0.380370,0.380370,0.025190,-0.026375,0.112856,...,-0.019562,-0.004299,-0.008351,-0.014583,0.007308,-0.023278,0.011654,-0.007634,0.002119,0.020058
ch.X.97133160R,-0.006443,-0.000030,-0.118821,-7.879143e-27,0.509088,0.509088,0.509088,0.001526,0.069864,0.216171,...,0.028660,0.074374,0.090540,0.073155,0.000443,-0.012632,0.000663,-0.007857,0.010779,0.005692
ch.X.97651759F,-0.018257,0.000386,-0.132725,-5.774462e-28,0.431506,0.431506,0.431506,0.005676,0.085122,0.320394,...,0.043323,0.079445,0.088772,0.078632,0.029073,-0.042810,0.008353,0.012621,0.011322,0.013503
ch.X.97737721F,-0.009316,0.000044,-0.007498,6.951442e-27,0.323559,0.323559,0.323559,-0.005643,0.013182,0.048790,...,0.022793,-0.009017,0.001896,-0.022376,0.002085,-0.034933,-0.000059,-0.003544,-0.007475,-0.013272


In [None]:
# Write each lab's EWAS results to Results_EWAS1.csv, Results_EWAS2.csv and Results_EWAS3.csv.
for result_number, lab in enumerate((lab_a, lab_b, lab_c), start=1):
    lab.EWAS_results.to_csv(os.path.join(output, f"Results_EWAS{result_number}.csv"))