In [None]:
import pandas as pd
import numpy as np
import os

### Create the design matrix for GSE105109 to be used in the local and federated EWAS

In [None]:
# GSE105109 - Small
# Build the "small" EWAS design matrix from the reduced phenotype table and write it to CSV.
pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Reduced_Pheno_Info.csv", index_col= "Sample_ID")

# Design matrix with the explanatory variables to be included in the model.
# .copy() makes x an independent frame, so the column assignments below never
# write into (or warn about) a slice of pheno.
x = pheno.loc[:, ["Diagnosis", "Age", "Sex", "Sentrix_ID"]].copy()

# The design matrix needs numeric representations of the covariates to be included
# in the model, i.e. binary diagnosis, binary sex, dummy sentrix etc.
# Diagnosis indicators: 1 where the label matches, 0 otherwise.
x["AD"] = (x["Diagnosis"] == " Alzheimer's disease").astype(int)
x["CTRL"] = (x["Diagnosis"] == " Control").astype(int)

# Sex indicators. The labels are compared after stripping surrounding whitespace:
# the original code matched " F" (leading space) but "M" (no space), so one of the
# two indicator columns could silently end up all zeros.
sex_label = x["Sex"].astype(str).str.strip()
x["F"] = (sex_label == "F").astype(int)
x["M"] = (sex_label == "M").astype(int)

# Turn the age variable into a continuous numerical variable without any leftover
# text: drop everything up to and including the first ":" and parse the rest.
# Assigning the result back (instead of a chained inplace replace) guarantees that
# x itself is updated, also when pandas Copy-on-Write is active.
x["Age"] = pd.to_numeric(x["Age"].replace("^[^:]*:", "", regex=True))

x.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Small_EWAS_design.csv")

# Full design matrix: a copy of the small design (x) extended with the
# Cell_Type.CT1 ... Cell_Type.CT5 columns of pheno, each coerced to numeric
# and stored as Cell_Type1 ... Cell_Type5.
x_large = x.copy()
for ct_number in range(1, 6):
    ct_suffix = str(ct_number)
    x_large["Cell_Type" + ct_suffix] = pd.to_numeric(pheno["Cell_Type.CT" + ct_suffix])
x_large.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Full_EWAS_design.csv")

# Create dummy variables for the unique sentrix_ids present in the dataset - this
# helper can be reused to create center number dummies in the federated version
# of the code.
def _add_id_dummies(frame, id_col="Sentrix_ID"):
    """Replace `id_col` in `frame` by one 0/1 indicator column per unique value.

    Each new column is named after the value it encodes. `frame` is modified in
    place (indicator columns are appended, then `id_col` is dropped); nothing is
    returned.
    """
    # `level` rather than `id` as loop variable: `id` would shadow the builtin
    # for the rest of the kernel session.
    for level in frame[id_col].unique():
        frame[level] = (frame[id_col] == level).astype(int)
    frame.drop(columns=id_col, inplace=True)


# Small design matrix with Sentrix_ID dummies.
_add_id_dummies(x)
x.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Small_EWAS_design_local.csv")

# Full design matrix with Sentrix_ID dummies.
_add_id_dummies(x_large)
x_large.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Full_EWAS_design_local.csv")

### Create even and imbalanced splits of GSE105109 


In [None]:
from random import seed, sample

In [None]:
#GSE105109
# Phenotype table, indexed by sample id. `pheno` is (re)bound to the same unfiltered frame.
GSE105109_pheno = pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Reduced_Pheno_Info.csv", index_col= "Sample_ID")
# load in the data that needs to be split
meth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\GSE105109_Filtered_Betas.csv", index_col=0)
design = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Small_EWAS_design.csv", index_col=0)
design_full = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Full_EWAS_design.csv", index_col=0)
design_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Small_EWAS_design_local.csv", index_col=0)
design_full_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE105109\\Full_EWAS_design_local.csv", index_col=0)

# Match the samples in the phenotype file to the samples in the data files:
# keep only phenotype rows whose sample id is also a column of `meth`.
# A boolean mask is used instead of indexing .loc with a set: set indexers raise
# a TypeError in pandas >= 2.0, and a set has no defined order, so the resulting
# row order was not reproducible. The mask keeps the original row order.
present_in_data = GSE105109_pheno.index.isin(meth.columns)
GSE105109_pheno = GSE105109_pheno.loc[present_in_data, :]
# Independent working copy that receives the split labels later on.
GSE105109_splits_pheno = GSE105109_pheno.copy()


# --- Split configuration -----------------------------------------------------
random_state = 42
seed(random_state)            # seeds the stdlib `random` generator
n_splits = 3
sizes = [1,1,1]               # relative size weight of each split
se_freqs = [0.70,0.70,0.70]   # fraction of a split's samples drawn from the AD group

# sizes is normalised by sum(sizes), so it needs exactly one weight per split;
# se_freqs is only read for the first n_splits-1 splits.
if len(sizes) != n_splits:
    raise ValueError("sizes must contain one entry per split")
if len(se_freqs) < n_splits - 1:
    raise ValueError("se_freqs must contain an entry for every split but the last")

# Every sample starts in the last split; the first n_splits-1 splits are filled
# explicitly and whatever remains keeps this default label. The label is derived
# from n_splits instead of being hard-coded to "Split_3".
GSE105109_splits_pheno["split"] = "Split_" + str(n_splits)

# Total number of samples and number of Alzheimer's disease samples (kept as floats).
N_total = float(GSE105109_splits_pheno.shape[0])
N_ad = float((GSE105109_splits_pheno["Diagnosis"] == " Alzheimer's disease").sum())

# Sizes[i]: number of samples in split i; n_ad[i]: number of AD samples in split i.
Sizes = []
n_ad = []
for i in range(n_splits - 1):
    s = int(N_total*sizes[i]/sum(sizes))
    Sizes.append(s)
    n_ad.append(int(s*se_freqs[i]))

# The last split takes the remainder of both counts.
Sizes.append(int(N_total-sum(Sizes)))
n_ad.append(int(N_ad-sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))


# Assign samples to the first n_splits-1 splits; samples never selected keep the
# label already stored in the "split" column.
splits = {}
# Pools of sample ids still available for drawing: AD samples and all others.
ad = set(GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"]== " Alzheimer's disease",:].index.values)
other = set(GSE105109_pheno.index.values).difference(ad)
for i in range(0,n_splits-1):
    split_name = "Split_"+str(i+1)
    # Draw from a sorted list rather than from the set itself: random.sample no
    # longer accepts sets (TypeError since Python 3.11), and a sorted sequence
    # makes the draw reproducible for a given seed.
    b = set(sample(sorted(ad),n_ad[i]))
    ad = ad.difference(b)
    o = set(sample(sorted(other),Sizes[i]-n_ad[i]))
    # Remove the drawn non-AD samples from the pool so they cannot be drawn again
    # for a later split (the original stored this in an unused variable, leaving
    # the pool unchanged).
    other = other.difference(o)
    # .loc needs a list-like indexer; sets are rejected by pandas >= 2.0.
    sele_samples = sorted(b | o)
    GSE105109_splits_pheno.loc[sele_samples,"split"] = split_name
    # 0/1 membership indicator column for this split.
    GSE105109_splits_pheno[split_name] = 0
    GSE105109_splits_pheno.loc[sele_samples,split_name] = 1
# Diagnosis counts per split, as a sanity check of the resulting composition.
print(GSE105109_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())



# Write one set of files (phenotype, methylated, unmethylated, betas and both
# design matrices) per split into <output_base>/GSE105109_splits.
output_base = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_dir = os.path.join(output_base, "GSE105109_splits")

os.makedirs(output_dir, exist_ok=True)
for i in range(n_splits):
    s = "Split_"+str(i+1)
    # Restrict to the samples labelled with this split. The original computed this
    # subset but discarded it and took the sample ids from the whole table, so
    # every split's files contained all samples.
    split_pheno = GSE105109_splits_pheno.loc[GSE105109_splits_pheno["split"]==s,:]
    samples = sorted(split_pheno.index.values)
    file_prefix = os.path.join(output_dir, s)
    split_pheno.loc[samples,:].to_csv(file_prefix+"_pheno.csv")
    # Data matrices have samples as columns, design matrices have samples as rows.
    meth.loc[:,samples].to_csv(file_prefix+"_methylated.csv")
    unmeth.loc[:,samples].to_csv(file_prefix+"_unmethylated.csv")
    beta.loc[:,samples].to_csv(file_prefix+"_betas.csv")
    design.loc[samples, :].to_csv(file_prefix+"_design.csv")
    design_full.loc[samples, :].to_csv(file_prefix+"_Full_design.csv")

# Central design matrices: the "local" design matrices plus a "Cohort_effect"
# column holding each sample's split label (values are aligned on the index).
split_labels = GSE105109_splits_pheno["split"]

central_design = design_local.assign(Cohort_effect=split_labels)
central_design.to_csv(output_dir + "/" + "central_design_matrix.csv")

central_full_design = design_full_local.assign(Cohort_effect=split_labels)
central_full_design.to_csv(output_dir + "/" + "full_central_design_matrix.csv")
