In [1]:
import pandas as pd
import numpy as np
import sys
import re
import os

import server
from server import Server 
import client
from client import Client

In [2]:
# Folder that holds the per-lab test files written and read by this notebook.
# Set the FED_TEST_DIR environment variable to use another location; when it is
# not set, the original local path is used so existing setups keep working.
testdata_dir = os.environ.get(
    "FED_TEST_DIR",
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Fed_test",
)

## Create test data and split across 3 "labs"

In [5]:
# Build the test dataset: load the GSE66351 intensities, probe annotation and
# phenotype data, split the samples across three simulated labs (A, B, C) and
# write every piece to testdata_dir as CSV.

# locations of the raw input files, defined once instead of repeated per read
dataset_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351"
annotation_file = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GPL13534_HumanMethylation450_15017482_v.1.1.csv"

# positional sample ranges of the three labs (A: 21 samples, B and C: 20 each);
# the same ranges are used for the intensity columns and the design matrix rows
lab_a_samples = slice(0, 21)
lab_b_samples = slice(21, 41)
lab_c_samples = slice(41, 61)

# load in some data and create a test data object to write these functions
test_methylated = pd.read_csv(os.path.join(dataset_dir, "Filtered_Methylated.csv"), index_col=0)
test_methylatedA = test_methylated.iloc[:, lab_a_samples]
test_methylatedB = test_methylated.iloc[:, lab_b_samples]
test_methylatedC = test_methylated.iloc[:, lab_c_samples]

test_unmethylated = pd.read_csv(os.path.join(dataset_dir, "Filtered_Unmethylated.csv"), index_col=0)
test_unmethylatedA = test_unmethylated.iloc[:, lab_a_samples]
test_unmethylatedB = test_unmethylated.iloc[:, lab_b_samples]
test_unmethylatedC = test_unmethylated.iloc[:, lab_c_samples]

# attach the probe type information to the (test) data so it can be used by the normalisation functions
annotation_data = pd.read_csv(annotation_file, skiprows=7, low_memory=False)
# index by probe id; drop=False keeps the "IlmnID" column available as before
annotation_data = annotation_data.set_index("IlmnID", drop=False)
probe_type_data = annotation_data.loc[:, "Infinium_Design_Type"]
# inner merge on the index keeps only the probes present in both the data and the annotation
test_probe_annotation = pd.merge(test_methylated, probe_type_data, how="inner", left_index=True, right_index=True)
test_probe_annotation = test_probe_annotation.loc[:, "Infinium_Design_Type"]

# create test design matrix
pheno = pd.read_csv(os.path.join(dataset_dir, "Reduced_Pheno_Info.csv"), index_col="Sample_ID")
# design matrix with the dependent/explanatory variables to be included in the model;
# .copy() makes x an independent frame so the column assignments below never touch pheno
x = pheno.loc[:, ["Diagnosis", "Age", "Sex", "Sentrix_ID"]].copy()
# NOTE(review): the labels compared against start with a space (" AD", " F"),
# presumably matching how the values are stored in the pheno file - verify
x["Diagnosis"] = (x["Diagnosis"] == " AD").astype(int)  # binary diagnosis with 1 = AD and 0 = CTR
x["Sex"] = (x["Sex"] == " F").astype(int)  # binary sex with 1 = F and 0 = M
# create dummy variables for the unique sentrix_ids present in the dataset - this code can be
# reused to create center number dummies in the federated version of the code
unique_ids = x["Sentrix_ID"].unique()
for sentrix_id in unique_ids:
    x[sentrix_id] = (x["Sentrix_ID"] == sentrix_id).astype(int)
x = x.drop(columns="Sentrix_ID")
# turn the age variable into a continuous numerical variable without any leftover text
# (disabled text clean-up step, kept for reference)
#x["Age"].replace("^[^:]*:", "", regex=True, inplace=True)
x["Age"] = pd.to_numeric(x["Age"])

# NOTE(review): the design rows are split by position, which assumes the pheno file lists
# the samples in the same order as the columns of the intensity files - TODO confirm
xA = x.iloc[lab_a_samples, :]
xB = x.iloc[lab_b_samples, :]
xC = x.iloc[lab_c_samples, :]

# save the test data for easy reference later + to add to github ?
test_outputs = {
    "methylatedA.csv": test_methylatedA,
    "methylatedB.csv": test_methylatedB,
    "methylatedC.csv": test_methylatedC,
    "unmethylatedA.csv": test_unmethylatedA,
    "unmethylatedB.csv": test_unmethylatedB,
    "unmethylatedC.csv": test_unmethylatedC,
    "designA.csv": xA,
    "designB.csv": xB,
    "designC.csv": xC,
    "probe_annotation.csv": test_probe_annotation,
}
for output_name, output_table in test_outputs.items():
    output_table.to_csv(os.path.join(testdata_dir, output_name))

## Initialising the clients

In [3]:
# create one client per lab from the CSV files stored in testdata_dir
def _in_testdata(filename):
    """Return the path of `filename` inside testdata_dir."""
    return os.path.join(testdata_dir, filename)

# the probe annotation file is shared by all three labs
probe_annotation_csv = _in_testdata("probe_annotation.csv")

lab_a = Client(
    "Lab_A",
    _in_testdata("designA.csv"),
    _in_testdata("methylatedA.csv"),
    _in_testdata("unmethylatedA.csv"),
    probe_annotation_csv,
)
lab_b = Client(
    "Lab_B",
    _in_testdata("designB.csv"),
    _in_testdata("methylatedB.csv"),
    _in_testdata("unmethylatedB.csv"),
    probe_annotation_csv,
)
lab_c = Client(
    "Lab_C",
    _in_testdata("designC.csv"),
    _in_testdata("methylatedC.csv"),
    _in_testdata("unmethylatedC.csv"),
    probe_annotation_csv,
)

## Initialising the server

In [4]:
# set up the server with its two lists of design-matrix column names
# (presumably the variable(s) of interest followed by the covariates - confirm against Server)
first_condition_list = ["Diagnosis"]
second_condition_list = ["Age", "Sex"]
serv = Server(first_condition_list, second_condition_list)

# conditions the server reports back; the clients validate their input against these
global_conditions = serv.return_global_conditions()

## Joining clients to the server

In [5]:
# register every lab with the server, in the order A, B, C, passing its
# cohort name, its probes and the index of its design matrix
for joining_lab in (lab_a, lab_b, lab_c):
    serv.get_clients(joining_lab.cohort_name, joining_lab.probes, joining_lab.designmatrix.index)

In [6]:
# validate the input of each lab against the global conditions from the server
for validated_lab in (lab_a, lab_b, lab_c):
    validated_lab.input_validation(global_conditions)

21 conditions in the design matrix are not specified in the global conditions and will be removed
21 conditions in the design matrix are not specified in the global conditions and will be removed
21 conditions in the design matrix are not specified in the global conditions and will be removed


In [7]:
# call cohort_effects on every lab with the client names known to the server
for cohort_lab in (lab_a, lab_b, lab_c):
    cohort_lab.cohort_effects(serv.client_names)

## Dasen normalisation

Client side

In [8]:
# call intensity_distributions() on each lab, in the order A, B, C
dist_a, dist_b, dist_c = [
    lab.intensity_distributions() for lab in (lab_a, lab_b, lab_c)
]

In [9]:
# collect the local normalisation parameters of each lab, in the order A, B, C;
# these are what the server aggregates in the next step
local_dasen_paramA, local_dasen_paramB, local_dasen_paramC = [
    lab.local_normalisation_parameters() for lab in (lab_a, lab_b, lab_c)
]

Server side

In [10]:
# the server aggregates the local parameters of all three labs (order A, B, C)
all_local_dasen_params = (local_dasen_paramA, local_dasen_paramB, local_dasen_paramC)
probe_type_means = serv.aggregate_QN_means(*all_local_dasen_params)

Client side

In [11]:
# each lab runs its final normalisation with the aggregated result from the server
betas_a, betas_b, betas_c = [
    lab.final_normalisation(probe_type_means) for lab in (lab_a, lab_b, lab_c)
]

In [12]:
# save the betas of every lab in testdata_dir for testing
betas_outputs = (
    ("splita_betas.csv", betas_a),
    ("splitb_betas.csv", betas_b),
    ("splitc_betas.csv", betas_c),
)
for betas_filename, lab_betas in betas_outputs:
    lab_betas.to_csv(os.path.join(testdata_dir, betas_filename))

## EWAS - Linear regression model

Client side

In [13]:
# collect the result of local_xtx_xty() from each lab, in the order A, B, C
local_rega, local_regb, local_regc = [
    lab.local_xtx_xty() for lab in (lab_a, lab_b, lab_c)
]

Server side

In [14]:
# the server combines the local regression results of all three labs (order A, B, C)
all_local_regressions = (local_rega, local_regb, local_regc)
global_xtx, global_xty = serv.global_regression_parameter(*all_local_regressions)

Client side

In [15]:
# every lab calculates its EWAS results from the same global xtx and xty
EWAS_a, EWAS_b, EWAS_c = [
    lab.calculate_EWAS_results(global_xtx, global_xty) for lab in (lab_a, lab_b, lab_c)
]

In [15]:
# save the EWAS output of every lab in testdata_dir
ewas_outputs = (
    ("Results_EWASa.csv", EWAS_a),
    ("Results_EWASb.csv", EWAS_b),
    ("Results_EWASc.csv", EWAS_c),
)
for ewas_filename, lab_ewas in ewas_outputs:
    lab_ewas.to_csv(os.path.join(testdata_dir, ewas_filename))