In [1]:
import pandas as pd
import numpy as np
import os

### Create the design matrix for local and federated EWAS for GSE66351

In [25]:
# GSE66351 - design matrices (small and full model, each with and without Sentrix_ID dummies)
gse66351_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351"
pheno = pd.read_csv(gse66351_dir + "\\Reduced_Pheno_Info.csv", index_col="Sample_ID")


def _add_reference_coded_dummies(design, column):
    """Return a copy of `design` with a 0/1 indicator column per unique value of `column`, except the last.

    Each new column is labelled with the value it indicates. The last unique
    value (in order of first appearance) gets no column, so it acts as the
    reference level. The same helper can be reused to create centre-number
    dummies in the federated version of the code.
    """
    design = design.copy()
    levels = design[column].unique()
    for level in levels[:-1]:
        design[level] = (design[column] == level).astype(int)
    return design


# Design matrix with the independent/explanatory variables to be included in the model.
# .copy() gives x its own data, so the assignments below modify x itself rather than a slice of pheno.
x = pheno.loc[:, ["Diagnosis", "Age", "Sex", "Sentrix_ID"]].copy()

# The design matrix needs to consist of numeric representations of the covariates to be included
# in the model, i.e. binary diagnosis, binary sex, dummy sentrix etc.
# The labels compared against carry a leading space (" AD", " CTRL", " F", " M").
x["AD"] = (x["Diagnosis"] == " AD").astype("int64")      # 1 = AD, 0 = anything else
x["CTRL"] = (x["Diagnosis"] == " CTRL").astype("int64")  # 1 = CTRL, 0 = anything else
x["F"] = (x["Sex"] == " F").astype("int64")              # 1 = F, 0 = anything else
x["M"] = (x["Sex"] == " M").astype("int64")              # 1 = M, 0 = anything else

# Turn the age variable into a continuous numerical variable without any leftover text:
# drop everything up to and including the first ":" and convert what remains.
# The result is assigned back on purpose: calling replace(..., inplace=True) on x["Age"] acts on a
# temporary Series and leaves x unchanged when pandas Copy-on-Write is active.
x["Age"] = pd.to_numeric(x["Age"].replace("^[^:]*:", "", regex=True))
x.loc[:, ["AD", "CTRL", "Age", "F", "M", "Sentrix_ID"]].to_csv(gse66351_dir + "\\Small_EWAS_design.csv")

# Now on to the full design matrix: the small one plus the five cell-type columns.
x_large = x.copy()
for ct in range(1, 6):
    x_large["Cell_Type%d" % ct] = pd.to_numeric(pheno["Cell_Type.CT%d" % ct])
x_large.loc[:, ["AD", "CTRL", "Age", "F", "M", "Sentrix_ID",
                "Cell_Type1", "Cell_Type2", "Cell_Type3", "Cell_Type4", "Cell_Type5"]].to_csv(gse66351_dir + "\\Full_EWAS_design.csv")

# "Local" variants: replace Sentrix_ID by dummy variables for the unique Sentrix IDs present in the
# dataset and drop the text columns that are already encoded numerically.
unique_ids = x["Sentrix_ID"].unique()  # kept at cell level so the IDs can be inspected afterwards
x = _add_reference_coded_dummies(x, "Sentrix_ID").drop(columns=["Diagnosis", "Sex", "Sentrix_ID"])
x.to_csv(gse66351_dir + "\\Small_EWAS_design_local.csv")

x_large = _add_reference_coded_dummies(x_large, "Sentrix_ID").drop(columns=["Diagnosis", "Sex", "Sentrix_ID"])
x_large.to_csv(gse66351_dir + "\\Full_EWAS_design_local.csv")

In [12]:
# Sentrix IDs that get a dummy column in the design-matrix cell: every unique ID except the last one.
unique_ids[:-1]

array([8918692108, 8918692120, 8221932039, 9297953052, 9247377057,
       9247377036, 3998919115, 3998919116, 3998920130, 5854945043,
       5854945045, 5854945056, 5854945057, 5854945010, 5854945021,
       5854945020, 5854945006, 5900438023, 5900438003, 5854945005],
      dtype=int64)

### Create an overview table of dataset descriptives

In [32]:
# Load the complete phenotype table and derive the per-variable summaries for the descriptives overview.
full_pheno = pd.read_csv(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_Pheno_Info.csv",
    index_col="Sample_ID",
    low_memory=False,
)

# Number of samples per level of sex and diagnosis.
sex, diagnosis = (full_pheno[column].value_counts() for column in ("Sex", "Diagnosis"))

# Mean age within each sex.
# NOTE(review): assumes "Age" is already numeric in this file — confirm; the reduced phenotype
# table needs a text prefix stripped from Age before it can be converted.
age = full_pheno["Age"].groupby(full_pheno["Sex"]).mean()

# Number of samples per brain region.
brain_region = full_pheno["Brain_region"].value_counts()



In [31]:
# List the column names of the full phenotype table held in full_pheno.
full_pheno.columns

Index(['Unnamed: 0', 'Sample_title', 'Sample_geo_accession', 'Sample_type',
       'Sample_channel_count', 'Sample_source_name_ch1', 'Sample_organism_ch1',
       'Cell_tpye', 'Diagnosis', 'Braak_stage', 'Brain_region', 'Age', 'Sex',
       'Donor_id', 'sentrix_id', 'position', 'Sample_molecule_ch1',
       'Sample_extract_protocol_ch1', 'Sample_label_ch1',
       'Sample_label_protocol_ch1', 'Sample_taxid_ch1', 'Sample_hyb_protocol',
       'Sample_scan_protocol', 'Sample_description', 'Sample_data_processing',
       'Sample_platform_id', 'M.median', 'U.median', 'strict_outliers', 'bs',
       'predSex1', 'source', 'target', 'maxcorrelation', 'pFilterPass',
       'Cell_Type.CT1', 'Cell_Type.CT2', 'Cell_Type.CT3', 'Cell_Type.CT4',
       'Cell_Type.CT5'],
      dtype='object')

### Create even splits of the three main datasets so they can be run locally (albeit in a federated setup)


In [2]:
from random import sample, seed

In [None]:
#GSE66351 - even
# Divide the GSE66351 samples over three equally sized splits with (approximately) the same AD
# frequency, write the per-split input files, and write central design matrices that carry
# one 0/1 indicator column per split.
gse66351_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351"
GSE66351_pheno = pd.read_csv(gse66351_dir + "\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
GSE66351_splits_pheno = GSE66351_pheno.copy()
# Every sample starts out in the last split; the sampling loop below moves samples into Split_1 and Split_2.
GSE66351_splits_pheno["split"] = "Split_3"

is_ad = GSE66351_pheno["Diagnosis"] == " AD"  # the diagnosis labels carry a leading space
N_total = GSE66351_splits_pheno.shape[0]
N_ad = int(is_ad.sum())
random_state = 42
seed(random_state)
n_splits = 3
sizes = [1,1,1]               # relative split sizes
ad_freqs = [0.53,0.53,0.53]   # requested AD fraction per split

# Target number of samples (Sizes) and of AD samples (n_ad) per split.
# Only the first n_splits-1 entries of sizes/ad_freqs are read: the last split receives whatever
# is left over, so its size and AD fraction follow from the others.
Sizes = []
n_ad = []
for i in range(n_splits - 1):
    split_size = int(N_total * sizes[i] / sum(sizes))
    Sizes.append(split_size)
    n_ad.append(int(split_size * ad_freqs[i]))

Sizes.append(int(N_total - sum(Sizes)))
n_ad.append(int(N_ad - sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))

# Draw the samples for every split but the last, without replacement.
ad = set(GSE66351_pheno.index[is_ad])
other = set(GSE66351_pheno.index).difference(ad)
for i in range(n_splits - 1):
    split_name = "Split_" + str(i + 1)
    # random.sample needs a sequence (sets raise TypeError from Python 3.11 on); sorting first also
    # makes the seeded draw reproducible, because set iteration order is not stable between sessions.
    b = set(sample(sorted(ad), n_ad[i]))
    ad = ad.difference(b)
    o = set(sample(sorted(other), Sizes[i] - n_ad[i]))
    other = other.difference(o)
    # .loc needs a list-like indexer; pandas >= 2.0 rejects a set.
    sele_samples = sorted(b | o)
    GSE66351_splits_pheno.loc[sele_samples, "split"] = split_name
    GSE66351_splits_pheno[split_name] = 0
    GSE66351_splits_pheno.loc[sele_samples, split_name] = 1
print(GSE66351_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())


# save the dataset splits
meth = pd.read_csv(gse66351_dir + "\\Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv(gse66351_dir + "\\Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv(gse66351_dir + "\\Filtered_Betas.csv", index_col=0)
design = pd.read_csv(gse66351_dir + "\\Small_EWAS_design.csv", index_col=0)
design_full = pd.read_csv(gse66351_dir + "\\Full_EWAS_design.csv", index_col=0)
# NOTE: the two "local" designs are taken from the in-memory frames x and x_large built in the
# design-matrix cell, not from Small_EWAS_design_local.csv / Full_EWAS_design_local.csv, so that
# cell has to be run first.
design_local = x.copy()
design_full_local = x_large.copy()

output_base = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_dir = os.path.join(output_base, "GSE66351_splits")
os.makedirs(output_dir, exist_ok=True)

for i in range(n_splits):
    split_name = "Split_" + str(i + 1)
    # A separate name is used for the selection so the full GSE66351_pheno table is not overwritten.
    in_split = GSE66351_splits_pheno["split"] == split_name
    samples = sorted(GSE66351_splits_pheno.index[in_split])
    print("these samples: %s are included in split %s"%(len(samples), i+1))
    prefix = os.path.join(output_dir, split_name)
    GSE66351_splits_pheno.loc[samples, :].to_csv(prefix + "_pheno.csv")
    # Samples are selected as columns in the methylation matrices and as rows in the design matrices.
    meth.loc[:, samples].to_csv(prefix + "_methylated.csv")
    unmeth.loc[:, samples].to_csv(prefix + "_unmethylated.csv")
    beta.loc[:, samples].to_csv(prefix + "_betas.csv")
    design.loc[samples, :].to_csv(prefix + "_design.csv")
    design_local.loc[samples, :].to_csv(prefix + "_design_local.csv")
    design_full.loc[samples, :].to_csv(prefix + "_Full_design.csv")
    design_full_local.loc[samples, :].to_csv(prefix + "_Full_design_local.csv")

# Central design matrices: the local designs plus one 0/1 column per split (Split1, Split2, ...).
central_design = design_local.copy()
central_full_design = design_full_local.copy()
for central in (central_design, central_full_design):
    # Align the split labels with the rows of the design; samples without a label get 0 everywhere.
    cohort_effect = GSE66351_splits_pheno["split"].reindex(central.index)
    for k in range(1, n_splits + 1):
        central["Split%d" % k] = (cohort_effect == "Split_%d" % k).astype("int64")

central_dir = os.path.join(output_base, "Data_Full_Datasets", "GSE66351")
central_design.to_csv(os.path.join(central_dir, "central_design_matrix.csv"))
central_full_design.to_csv(os.path.join(central_dir, "full_central_design_matrix.csv"))

### GSE66351 - mild imbalance

In [11]:
# GSE66351 - mild imbalance
# Divide the GSE66351 samples over three splits of unequal size and unequal AD frequency and
# write the per-split input files.
gse66351_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351"
GSE66351_pheno = pd.read_csv(gse66351_dir + "\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
GSE66351_splits_pheno = GSE66351_pheno.copy()
# Every sample starts out in the last split; the sampling loop below moves samples into Split_1 and Split_2.
GSE66351_splits_pheno["split"] = "Split_3"

is_ad = GSE66351_pheno["Diagnosis"] == " AD"  # the diagnosis labels carry a leading space
N_total = GSE66351_splits_pheno.shape[0]
N_ad = int(is_ad.sum())
random_state = 42
seed(random_state)
n_splits = 3
sizes = [0.15,0.35,0.5]     # relative split sizes
ad_freqs = [0.2,0.3,0.5]    # requested AD fraction per split

# Target number of samples (Sizes) and of AD samples (n_ad) per split.
# Only the first n_splits-1 entries of sizes/ad_freqs are read: the last split receives whatever
# is left over, so its AD fraction can differ from ad_freqs[-1].
# NOTE(review): the recorded outputs give split sizes 28/66/96 here versus 47/66/77 for the
# "strong" variant, i.e. this setting is at least as imbalanced — confirm the labels are not swapped.
Sizes = []
n_ad = []
for i in range(n_splits - 1):
    split_size = int(N_total * sizes[i] / sum(sizes))
    Sizes.append(split_size)
    n_ad.append(int(split_size * ad_freqs[i]))

Sizes.append(int(N_total - sum(Sizes)))
n_ad.append(int(N_ad - sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))

# Draw the samples for every split but the last, without replacement.
ad = set(GSE66351_pheno.index[is_ad])
other = set(GSE66351_pheno.index).difference(ad)
for i in range(n_splits - 1):
    split_name = "Split_" + str(i + 1)
    # random.sample needs a sequence (sets raise TypeError from Python 3.11 on); sorting first also
    # makes the seeded draw reproducible, because set iteration order is not stable between sessions.
    b = set(sample(sorted(ad), n_ad[i]))
    ad = ad.difference(b)
    o = set(sample(sorted(other), Sizes[i] - n_ad[i]))
    other = other.difference(o)
    # .loc needs a list-like indexer; pandas >= 2.0 rejects a set.
    sele_samples = sorted(b | o)
    GSE66351_splits_pheno.loc[sele_samples, "split"] = split_name
    GSE66351_splits_pheno[split_name] = 0
    GSE66351_splits_pheno.loc[sele_samples, split_name] = 1
print(GSE66351_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())


# save the dataset splits
meth = pd.read_csv(gse66351_dir + "\\Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv(gse66351_dir + "\\Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv(gse66351_dir + "\\Filtered_Betas.csv", index_col=0)
design = pd.read_csv(gse66351_dir + "\\Small_EWAS_design.csv", index_col=0)
design_local = pd.read_csv(gse66351_dir + "\\Small_EWAS_design_local.csv", index_col=0)
design_full = pd.read_csv(gse66351_dir + "\\Full_EWAS_design.csv", index_col=0)
design_full_local = pd.read_csv(gse66351_dir + "\\Full_EWAS_design_local.csv", index_col=0)

output_base = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_dir = os.path.join(output_base, "GSE66351_mild_splits")
os.makedirs(output_dir, exist_ok=True)

for i in range(n_splits):
    split_name = "Split_" + str(i + 1)
    # A separate name is used for the selection so the full GSE66351_pheno table is not overwritten.
    in_split = GSE66351_splits_pheno["split"] == split_name
    samples = sorted(GSE66351_splits_pheno.index[in_split])
    prefix = os.path.join(output_dir, split_name)
    GSE66351_splits_pheno.loc[samples, :].to_csv(prefix + "_pheno.csv")
    # Samples are selected as columns in the methylation matrices and as rows in the design matrices.
    meth.loc[:, samples].to_csv(prefix + "_methylated.csv")
    unmeth.loc[:, samples].to_csv(prefix + "_unmethylated.csv")
    beta.loc[:, samples].to_csv(prefix + "_betas.csv")
    design.loc[samples, :].to_csv(prefix + "_design.csv")
    design_local.loc[samples, :].to_csv(prefix + "_design_local.csv")
    design_full.loc[samples, :].to_csv(prefix + "_Full_design.csv")
    design_full_local.loc[samples, :].to_csv(prefix + "_Full_design_local.csv")

[28, 66, 96] 190
[5, 19, 82] 106
split    Diagnosis
Split_1   CTRL        23
          AD           5
Split_2   CTRL        47
          AD          19
Split_3   AD          82
          CTRL        14
Name: Diagnosis, dtype: int64


### GSE66351 - strong imbalance

In [12]:
# GSE66351 - strong imbalance
# Divide the GSE66351 samples over three splits of unequal size and unequal AD frequency and
# write the per-split input files.
gse66351_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351"
GSE66351_pheno = pd.read_csv(gse66351_dir + "\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
GSE66351_splits_pheno = GSE66351_pheno.copy()
# Every sample starts out in the last split; the sampling loop below moves samples into Split_1 and Split_2.
GSE66351_splits_pheno["split"] = "Split_3"

is_ad = GSE66351_pheno["Diagnosis"] == " AD"  # the diagnosis labels carry a leading space
N_total = GSE66351_splits_pheno.shape[0]
N_ad = int(is_ad.sum())
random_state = 42
seed(random_state)
n_splits = 3
sizes = [0.25,0.35,0.40]      # relative split sizes
ad_freqs = [0.4,0.35,0.25]    # requested AD fraction per split

# Target number of samples (Sizes) and of AD samples (n_ad) per split.
# Only the first n_splits-1 entries of sizes/ad_freqs are read: the last split receives whatever
# is left over, so its AD fraction can differ from ad_freqs[-1].
Sizes = []
n_ad = []
for i in range(n_splits - 1):
    split_size = int(N_total * sizes[i] / sum(sizes))
    Sizes.append(split_size)
    n_ad.append(int(split_size * ad_freqs[i]))

Sizes.append(int(N_total - sum(Sizes)))
n_ad.append(int(N_ad - sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))

# Draw the samples for every split but the last, without replacement.
ad = set(GSE66351_pheno.index[is_ad])
other = set(GSE66351_pheno.index).difference(ad)
for i in range(n_splits - 1):
    split_name = "Split_" + str(i + 1)
    # random.sample needs a sequence (sets raise TypeError from Python 3.11 on); sorting first also
    # makes the seeded draw reproducible, because set iteration order is not stable between sessions.
    b = set(sample(sorted(ad), n_ad[i]))
    ad = ad.difference(b)
    o = set(sample(sorted(other), Sizes[i] - n_ad[i]))
    other = other.difference(o)
    # .loc needs a list-like indexer; pandas >= 2.0 rejects a set.
    sele_samples = sorted(b | o)
    GSE66351_splits_pheno.loc[sele_samples, "split"] = split_name
    GSE66351_splits_pheno[split_name] = 0
    GSE66351_splits_pheno.loc[sele_samples, split_name] = 1
print(GSE66351_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())


# save the dataset splits
meth = pd.read_csv(gse66351_dir + "\\Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv(gse66351_dir + "\\Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv(gse66351_dir + "\\Filtered_Betas.csv", index_col=0)
design = pd.read_csv(gse66351_dir + "\\Small_EWAS_design.csv", index_col=0)
design_local = pd.read_csv(gse66351_dir + "\\Small_EWAS_design_local.csv", index_col=0)
design_full = pd.read_csv(gse66351_dir + "\\Full_EWAS_design.csv", index_col=0)
design_full_local = pd.read_csv(gse66351_dir + "\\Full_EWAS_design_local.csv", index_col=0)

output_base = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_dir = os.path.join(output_base, "GSE66351_strong_splits")
os.makedirs(output_dir, exist_ok=True)

for i in range(n_splits):
    split_name = "Split_" + str(i + 1)
    # A separate name is used for the selection so the full GSE66351_pheno table is not overwritten.
    in_split = GSE66351_splits_pheno["split"] == split_name
    samples = sorted(GSE66351_splits_pheno.index[in_split])
    prefix = os.path.join(output_dir, split_name)
    GSE66351_splits_pheno.loc[samples, :].to_csv(prefix + "_pheno.csv")
    # Samples are selected as columns in the methylation matrices and as rows in the design matrices.
    meth.loc[:, samples].to_csv(prefix + "_methylated.csv")
    unmeth.loc[:, samples].to_csv(prefix + "_unmethylated.csv")
    beta.loc[:, samples].to_csv(prefix + "_betas.csv")
    design.loc[samples, :].to_csv(prefix + "_design.csv")
    design_local.loc[samples, :].to_csv(prefix + "_design_local.csv")
    design_full.loc[samples, :].to_csv(prefix + "_Full_design.csv")
    design_full_local.loc[samples, :].to_csv(prefix + "_Full_design_local.csv")

[47, 66, 77] 190
[18, 23, 65] 106
split    Diagnosis
Split_1   CTRL        29
          AD          18
Split_2   CTRL        43
          AD          23
Split_3   AD          65
          CTRL        12
Name: Diagnosis, dtype: int64
