In [1]:
import pandas as pd
import numpy as np
import sys
import re
import os

import server
from server import Server 
import client
from client import Client

In [2]:
# Folder that receives the generated test inputs and, later on, the
# normalised betas and EWAS result tables written by this notebook.
testdata_dir = (
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work"
    "\\Federated_Differential_Methylation_Analysis\\Fed_test"
)

## Create test data and split across 3 "labs"

In [8]:
# Build a three-site test set from the GSE66351 files: load the filtered
# methylated / unmethylated tables, cut each into three column blocks (one per
# simulated "lab"), derive the probe-type annotation and the design matrix,
# and write every piece to `testdata_dir`.
# NOTE(review): the column blocks (0:21, 21:41, 41:61) of the intensity tables
# and the row blocks of the design matrix use the same bounds, so this assumes
# the pheno rows are in the same sample order as the intensity columns —
# TODO confirm.

# load in some data and create a test data object to write these functions
test_methylated = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Methylated.csv", index_col=0)
test_methylatedA = test_methylated.iloc[:, 0:21]
test_methylatedB = test_methylated.iloc[:, 21:41]
test_methylatedC = test_methylated.iloc[:, 41:61]

test_unmethylated = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Unmethylated.csv", index_col=0)
test_unmethylatedA = test_unmethylated.iloc[:, 0:21]
test_unmethylatedB = test_unmethylated.iloc[:, 21:41]
test_unmethylatedC = test_unmethylated.iloc[:, 41:61]

# attach the probe type information to the (test) data so it can be used by the normalisation functions
annotation_data = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GPL13534_HumanMethylation450_15017482_v.1.1.csv", skiprows=7, low_memory=False)
# index the annotation by probe ID; drop=False keeps the "IlmnID" column too,
# exactly as the previous in-place set_index on the column Series did
annotation_data = annotation_data.set_index("IlmnID", drop=False)
probe_type_data = annotation_data.loc[:, "Infinium_Design_Type"]
# Only the probe index of the methylated table is needed for the inner join,
# so merge a zero-column view of it instead of the full matrix (and without
# the unused `_merge` indicator column). The result is the design type of every
# probe present in both the test data and the annotation.
test_probe_annotation = pd.merge(
    test_methylated.iloc[:, :0],
    probe_type_data,
    how="inner",
    left_index=True,
    right_index=True,
).loc[:, "Infinium_Design_Type"]

# create test design matrix
pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
# design matrix with the dependent/explanatory variables to be included in the
# model; .copy() makes it an independent frame so the column assignments below
# never write into (or warn about) a selection of `pheno`
x = pheno.loc[:, ["Diagnosis", "Age", "Sex", "Sentrix_ID"]].copy()
# NOTE(review): both comparison strings carry a leading space (" AD", " F");
# presumably that matches the raw labels in the pheno file — any label without
# the space would silently be coded 0. Verify against the file.
x["Diagnosis"] = (x["Diagnosis"] == " AD").astype(int)  # binary diagnosis: 1 = AD, 0 = CTR
x["Sex"] = (x["Sex"] == " F").astype(int)  # binary sex: 1 = F, 0 = M
# create dummy variables for the unique sentrix_ids present in the dataset - this code can be reused to create center number dummies in the federated version of the code
# (kept for reference only, not executed)
# unique_ids = x["Sentrix_ID"].unique()
# for id in unique_ids:
#     x[id] = (x["Sentrix_ID"] == id).astype(int)
# x.drop(columns="Sentrix_ID", inplace = True)
# turn the age variable into a continuous numerical variable without any leftover text
#x["Age"].replace("^[^:]*:", "", regex=True, inplace=True)
x["Age"] = pd.to_numeric(x["Age"])

xA = x.iloc[0:21, :]
xB = x.iloc[21:41, :]
xC = x.iloc[41:61, :]

# save the test data for easy reference later + to add to github ?
# (file name -> table; written in the listed order)
test_outputs = {
    "methylatedA.csv": test_methylatedA,
    "methylatedB.csv": test_methylatedB,
    "methylatedC.csv": test_methylatedC,
    "unmethylatedA.csv": test_unmethylatedA,
    "unmethylatedB.csv": test_unmethylatedB,
    "unmethylatedC.csv": test_unmethylatedC,
    "designA.csv": xA,
    "designB.csv": xB,
    "designC.csv": xC,
    "probe_annotation.csv": test_probe_annotation,
}
for output_name, output_table in test_outputs.items():
    output_table.to_csv(os.path.join(testdata_dir, output_name))

In [2]:
# Folder holding the pre-made per-lab split files (design, methylated,
# unmethylated) and the shared probe annotation read by the clients.
# NOTE(review): the folder is named "GSE66315" while every other path in this
# notebook refers to "GSE66351" — confirm the directory name on disk.
split_dir = (
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work"
    "\\Data\\GSE66315_splits"
)

## Initialising the clients

In [3]:
# create one client per lab from its own split files; all three labs read the
# same probe annotation file
# NOTE(review): argument order (name, design, methylated, unmethylated,
# probe annotation) is inferred from the file names — confirm against Client.
shared_probe_annotation = os.path.join(split_dir, "probe_annotation.csv")

lab_a = Client(
    "Lab_A",
    os.path.join(split_dir, "Split_1_design.csv"),
    os.path.join(split_dir, "Split_1_methylated.csv"),
    os.path.join(split_dir, "Split_1_unmethylated.csv"),
    shared_probe_annotation,
)
lab_b = Client(
    "Lab_B",
    os.path.join(split_dir, "Split_2_design.csv"),
    os.path.join(split_dir, "Split_2_methylated.csv"),
    os.path.join(split_dir, "Split_2_unmethylated.csv"),
    shared_probe_annotation,
)
lab_c = Client(
    "Lab_C",
    os.path.join(split_dir, "Split_3_design.csv"),
    os.path.join(split_dir, "Split_3_methylated.csv"),
    os.path.join(split_dir, "Split_3_unmethylated.csv"),
    shared_probe_annotation,
)

## Initialising the server

In [4]:
# Set up the server with two lists of design-matrix column names.
# NOTE(review): presumably the first list holds the variable(s) of interest and
# the second the covariates/confounders — confirm against Server.__init__.
first_condition_list = ["Diagnosis"]
second_condition_list = ["Age", "Sex", "Sentrix_ID"]
serv = Server(first_condition_list, second_condition_list)

# conditions the server hands out; later used for client input validation and
# for the "Sentrix_ID" check
global_conditions = serv.return_global_conditions()

## Joining clients to the server

In [5]:
# join the clients: every lab passes its cohort name, its probes and the index
# of its design matrix to the server, in the order Lab A, Lab B, Lab C
for joining_lab in (lab_a, lab_b, lab_c):
    serv.get_clients(joining_lab.cohort_name, joining_lab.probes, joining_lab.designmatrix.index)

In [6]:
# Ask the server for the global probe set; it is passed to each client's
# input_validation further down.
# NOTE(review): presumably the probes shared by all joined clients — confirm
# in Server.find_global_probes.
global_probes = serv.find_global_probes()

In [7]:
# check each client's input against the server's global conditions and probes
# NOTE(review): the recorded run of this cell stopped with "SystemExit: 1" after
# printing "1 global conditions are not present in the local design matrix",
# so at least one split design file appears to lack a global condition column.
for validated_lab in (lab_a, lab_b, lab_c):
    validated_lab.input_validation(global_conditions, global_probes)

1 global conditions are not present in the local design matrix


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [8]:
# let every lab process the list of client names known to the server
# NOTE(review): presumably adds per-cohort indicator columns to the local
# design matrix — confirm in Client.cohort_effects.
for cohort_lab in (lab_a, lab_b, lab_c):
    cohort_lab.cohort_effects(serv.client_names)

In [9]:
# Only when "Sentrix_ID" is one of the global conditions: each lab determines
# its own unique Sentrix IDs, the server combines the three sets, and the
# combined result is handed back to every lab.
if "Sentrix_ID" in global_conditions:
    sentrix_labs = (lab_a, lab_b, lab_c)
    for sentrix_lab in sentrix_labs:
        sentrix_lab.find_unique_SentrixIDS()
    # same three positional arguments as before, in Lab A, B, C order
    global_sentrix = serv.return_global_SentrixID(
        *[sentrix_lab.unique_SentrixIDS for sentrix_lab in sentrix_labs]
    )
    for sentrix_lab in sentrix_labs:
        sentrix_lab.SentrixID_effects(global_sentrix)
    



## Dasen normalisation

Client side

In [10]:
# client side: every lab computes its intensity distributions (Lab A, B, C order)
dist_a, dist_b, dist_c = [
    lab.intensity_distributions() for lab in (lab_a, lab_b, lab_c)
]

In [11]:
# client side: every lab derives its local normalisation parameters, which the
# server aggregates in the next step
local_dasen_paramA, local_dasen_paramB, local_dasen_paramC = [
    lab.local_normalisation_parameters() for lab in (lab_a, lab_b, lab_c)
]

Server side

In [12]:
# Server side: combine the three labs' local normalisation parameters; the
# result is passed to each client's final_normalisation.
# NOTE(review): presumably per-probe-type quantile-normalisation means, going
# by the names — confirm in Server.aggregate_QN_means.
probe_type_means = serv.aggregate_QN_means(local_dasen_paramA, local_dasen_paramB, local_dasen_paramC)

Client side

In [13]:
# client side: every lab finishes its normalisation with the server-side
# aggregate and returns its betas
betas_a, betas_b, betas_c = [
    lab.final_normalisation(probe_type_means) for lab in (lab_a, lab_b, lab_c)
]

In [14]:
# save the betas for testing (one CSV per split, written to testdata_dir)
for betas_file, betas_table in (
    ("splita_betas.csv", betas_a),
    ("splitb_betas.csv", betas_b),
    ("splitc_betas.csv", betas_c),
):
    betas_table.to_csv(os.path.join(testdata_dir, betas_file))

## EWAS - Linear regression model

Client side

In [15]:
# client side: every lab computes its local regression inputs, which the
# server combines in the next step
# NOTE(review): presumably the X^T X and X^T Y terms, going by the method name.
local_rega, local_regb, local_regc = [
    lab.local_xtx_xty() for lab in (lab_a, lab_b, lab_c)
]

Server side

In [16]:
# Server side: combine the three labs' local regression inputs into a global
# pair, which every client then uses to calculate its EWAS results.
# NOTE(review): presumably the summed X^T X and X^T Y terms — confirm in
# Server.global_regression_parameter.
global_xtx, global_xty = serv.global_regression_parameter(local_rega, local_regb, local_regc)

Client side

In [17]:
# client side: every lab calculates its EWAS results from the same global
# regression parameters
EWAS_a, EWAS_b, EWAS_c = [
    lab.calculate_EWAS_results(global_xtx, global_xty)
    for lab in (lab_a, lab_b, lab_c)
]

  t = self.coef[i,:]/self.stnd_err[i,:]
  t = self.coef[i,:]/self.stnd_err[i,:]


In [19]:
# Preview the first rows of the "P-value" part of Lab A's EWAS results.
EWAS_a.loc[:,"P-value"].head()

Unnamed: 0,Diagnosis,Age,Sex,Sentrix_ID,Lab_A,Lab_B,Lab_C,9247377057,3998920130,8221932039,3998919115,8918692108,9247377036,3998919116,8918692120
cg00000029,1.316132e-34,1.0,1.781386e-31,1.960312e-138,0.5,0.5,1.0,0.5,,0.5,0.5,1.0,0.0,0.5,
cg00000108,4.314596e-37,1.0,5.307609e-34,7.963297e-141,0.5,0.5,1.0,0.5,,0.5,0.5,1.0,0.0,0.5,
cg00000109,5.4958179999999995e-36,1.0,7.342617000000001e-33,1.090961e-139,0.5,0.5,1.0,0.5,,0.5,0.5,1.0,0.0,0.5,
cg00000165,1.8653e-30,1.0,2.5795670000000002e-27,2.834724e-134,0.5,0.5,1.0,0.5,0.0,0.5,0.5,1.0,0.0,0.5,
cg00000236,1.95181e-36,1.0,2.4352780000000003e-33,3.510242e-140,0.5,0.5,1.0,0.5,1.0,0.5,0.5,1.0,0.0,0.5,


In [18]:
# save the EWAS output (one CSV per lab, written to testdata_dir)
for ewas_file, ewas_table in (
    ("Results_EWASa.csv", EWAS_a),
    ("Results_EWASb.csv", EWAS_b),
    ("Results_EWASc.csv", EWAS_c),
):
    ewas_table.to_csv(os.path.join(testdata_dir, ewas_file))