In [1]:
import pandas as pd
import numpy as np
import os

### Create the design matrix for GSE134379 to be used in local and federated EWAS 

In [None]:
# GSE134379 - Small
pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
x = pheno.loc[:,["Diagnosis", "Age", "Sex", "Sentrix_ID"]] # design matrix with the dependent/explainatory variables to be included in the model

# The design matrix needs to consist of numeric representations of the covariates to be included in the model, i.e. binary diagnosis, binary sex, dummy sentrix etc.
x["Diagnosis"] = (x["Diagnosis"] == " AD").astype(int) #create binary diagnosis with 1 = AD and 0 = CTR
x["Sex"] = (x["Sex"] == " F").astype(int) #create binary sex with 1 = F and 0 = M

x["Sentrix_ID"] = x["Sentrix_ID"].astype(int)
# turn the age variable into a continuous numerical variable without any leftover text
x["Age"].replace("^[^:]*:", "", regex=True, inplace=True)
x["Age"] = pd.to_numeric(x["Age"])
x.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Small_EWAS_design.csv")

# now on to the full design matrix
x_large = x.copy()
x_large["Cell_Type1"] = pd.to_numeric(pheno["Cell_Type.CT1"])
x_large["Cell_Type2"] = pd.to_numeric(pheno["Cell_Type.CT2"])
x_large["Cell_Type3"] = pd.to_numeric(pheno["Cell_Type.CT3"])
x_large["Cell_Type4"] = pd.to_numeric(pheno["Cell_Type.CT4"])
x_large["Cell_Type5"] = pd.to_numeric(pheno["Cell_Type.CT5"])
x_large.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Full_EWAS_design.csv")

# create dummy variables for the unique sentrix_ids present in the dataset - this code can be reused to create center number dummies in the federated version of the code
unique_ids = x["Sentrix_ID"].unique()
for id in unique_ids:
    x[id] = (x["Sentrix_ID"] == id).astype(int)
x.drop(columns="Sentrix_ID", inplace = True)
x.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Small_EWAS_design_local.csv")

unique_ids = x_large["Sentrix_ID"].unique()
for id in unique_ids:
    x_large[id] = (x_large["Sentrix_ID"] == id).astype(int)
x_large.drop(columns="Sentrix_ID", inplace = True)
x_large.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Full_EWAS_design_local.csv")

### Create even splits of the three main datasets so they can be run locally (be it federated)


In [2]:
from random import seed, sample

In [10]:
#GSE134379

meth = pd.read_csv("/home/silke/Documents/Fed_EWAS/Data/QC_GSE134379_half/Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv("/home/silke/Documents/Fed_EWAS/Data/QC_GSE134379_half/Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv("/home/silke/Documents/Fed_EWAS/Data/QC_GSE134379_half/Filtered_Betas.csv", index_col=0)
design = pd.read_csv("/home/silke/Documents/Fed_EWAS/Data/GSE134379_RAW/Small_EWAS_design.csv", index_col=0)
#design_full = pd.read_csv("/home/silke/Documents/Fed_EWAS/Data/GSE134379_RAW/Full_EWAS_design.csv", index_col=0)

GSE134379_pheno = pd.read_csv("/home/silke/Documents/Fed_EWAS/Data/QC_GSE134379_half/Reduced_Pheno_Info.csv", index_col= "Sample_ID")
GSE134379_splits_pheno = GSE134379_pheno.copy(deep=True)

GSE134379_pheno = GSE134379_pheno.loc[set(meth.columns.values).intersection(set(GSE134379_pheno.index.values)), :]
GSE134379_splits_pheno = GSE134379_splits_pheno.loc[set(meth.columns.values).intersection(set(GSE134379_splits_pheno.index.values)), :]

GSE134379_splits_pheno["split"] = "Split_3"

N_total = GSE134379_splits_pheno.shape[0]*1.0*1.0
N_ad = GSE134379_splits_pheno.loc[GSE134379_splits_pheno["Diagnosis"] == "diagnosis: AD", :].shape[0]*1.0*1.0
random_state = 42
seed(random_state)
n_splits = 3
sizes = [1,1,1]
se_freqs = [0.55,0.55,0.55]

Sizes = []
n_ad = []
for i in range(0,n_splits-1):
    s  = int(N_total*sizes[i]/sum(sizes))
    Sizes.append(s)
    n_ad.append(int(s*se_freqs[i]))

Sizes.append(int(N_total-sum(Sizes)))
n_ad.append(int(N_ad-sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))



splits = {}
ad = set(GSE134379_pheno.loc[GSE134379_pheno["Diagnosis"]== "diagnosis: AD",:].index.values)
other = set(GSE134379_pheno.index.values).difference(ad)#.difference(fem)
for i in range(0,n_splits-1):
    b = set(sample(ad,n_ad[i]))  
    ad =  ad.difference(b)
    o = set(sample(other,Sizes[i]-n_ad[i]))
    other1 = other.difference(o)
    sele_samples = b |o 
    GSE134379_splits_pheno.loc[sele_samples,"split"] = "Split_"+str(i+1)
    GSE134379_splits_pheno["Split_"+str(i+1)] = 0
    GSE134379_splits_pheno.loc[sele_samples,"Split_"+str(i+1)]  =1
print(GSE134379_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())

# save the dataset splits
#meth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Filtered_Methylated.csv", index_col=0)
#unmeth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Filtered_Unmethylated.csv", index_col=0)
#beta = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Filtered_Betas.csv", index_col=0)
#design = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE134379\\Small_EWAS_design.csv", index_col=0)

output_base = "/home/silke/Documents/Fed_EWAS/Data/QC_GSE134379_half"
output_dir = os.path.join(output_base, "GSE134379_splits")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
for i in range(n_splits):
    s = "Split_"+str(i+1)
    GSE134379_splits_pheno_i = GSE134379_splits_pheno.loc[GSE134379_splits_pheno["split"]==s,:]
    samples = sorted(GSE134379_splits_pheno_i.index.values)
    print(samples)
    GSE134379_splits_pheno.loc[samples,:].to_csv(output_dir+"/"+s+"_pheno.csv")
    #print(meth.loc[:, samples].head())
    meth.loc[:, samples].to_csv(output_dir+"/"+s+"_methylated.csv")
    #print(unmeth.loc[:, samples].head())
    unmeth.loc[:, samples].to_csv(output_dir+"/"+s+"_unmethylated.csv")
    beta.loc[:, samples].to_csv(output_dir+"/"+s+"_betas.csv")
    design.loc[samples, :].to_csv(output_dir+"/"+s+"_design.csv")
    #design_full.loc[samples, :].to_csv(output_dir+"/"+s+"_Full_design.csv")

[16, 16, 18] 50
[8, 8, 8] 24
split    Diagnosis    
Split_1  diagnosis: AD     8
         diagnosis: ND     5
Split_2  diagnosis: AD     8
         diagnosis: ND     8
Split_3  diagnosis: ND    13
         diagnosis: AD     8
Name: Diagnosis, dtype: int64
['GSM3944749_6057833029_R04C02', 'GSM3944770_6057833029_R03C01', 'GSM3944795_6042324136_R01C01', 'GSM3944904_6055432167_R04C01', 'GSM3944972_6057833008_R03C01', 'GSM3945043_6057833013_R04C01', 'GSM3945118_6264488087_R01C01', 'GSM3945136_6042324060_R05C02', 'GSM3945152_6057833019_R06C01', 'GSM3945156_6057833010_R01C02', 'GSM3945263_6057825168_R01C02', 'GSM3945278_6057833018_R01C01', 'GSM3945394_6042324134_R04C02']
['GSM3944820_6057833015_R01C01', 'GSM3944874_6057825109_R03C02', 'GSM3944876_6057833040_R03C02', 'GSM3944943_6164647017_R06C01', 'GSM3945008_6042324160_R04C01', 'GSM3945049_6057833036_R06C01', 'GSM3945060_6057833038_R06C02', 'GSM3945101_6057833018_R05C02', 'GSM3945250_6057833007_R05C01', 'GSM3945307_6055432164_R06C02', 'GSM39