In [1]:
import pandas as pd
#import numpy as np
import os

In [29]:
import subprocess
subprocess.check_output("which Rscript", shell=True).decode("utf-8").strip()

'/home/silke/miniconda3/envs/FedEWAS/bin/Rscript'

### Create the design matrix for local and federated EWAS for GSE66351

In [25]:
def createDesignMatrix(pheno_df_path:str, small:bool=True, federated:bool=False, per_region:bool=False, output_path:str=None):
    pheno = pd.read_csv(pheno_df_path, index_col= "Sample_ID",low_memory=False)
    pheno["Diagnosis"] = pheno.loc[:,"Diagnosis"].str.strip()
    pheno["Sex"] = pheno.loc[:,"Sex"].str.strip()
    pheno["Brain_region"] = pheno.loc[:, "Brain_region"].str.strip()
    pheno["Brain_region"] = pheno.loc[:,"Brain_region"].str.replace(" ", "")
    x = pheno.loc[:,["Diagnosis", "Age", "Sex", "sentrix_id", "Brain_region"]] # design matrix with the dependent/explainatory variables to be included in the model
    # The design matrix needs to consist of numeric representations of the covariates to be included in the model, i.e. binary diagnosis, binary sex, dummy sentrix etc.
    x["AD"] = 0
    x.loc[x["Diagnosis"] == "AD", "AD"] = 1 #create binary diagnosis with 1 = AD and 0 = CTR
    x["CTRL"] = 0
    x.loc[x["Diagnosis"] == "CTRL", "CTRL"] = 1
    x["Sex"]= x.loc[:,"Sex"].replace("^[^:]*:", "", regex=True)
    x.loc[x["Sex"] == " F", "Sex"] = 1
    x.loc[x["Sex"] == " M", "Sex"] = 0
    #x.loc[x["Sex"] == " F", "Sex"] = 1 #create binary sex with 1 = F and 0 = M
    
    # turn the age variable into a continuous numerical variable without any leftover text
    x["Age"]= x.loc[:,"Age"].replace("^[^:]*:", "", regex=True)
    x["Age"] = pd.to_numeric(x["Age"])
    tissues = pheno["Brain_region"].unique()
    if small:
        if federated:
            if per_region:
                for tis in tissues:
                    #x.loc[pheno["Brain_region"] == (" " + tis), :]
                    t = x.drop(columns = ["Brain_region"])
                    if output_path:
                        # use output path
                        t.to_csv(os.path.join(output_path, "Small_%s_EWAS_design.csv"%(tis)))
                    else:
                        # otherwise save in current working dir
                        t.to_csv("Small_%s_EWAS_design.csv"%(tis))
                    
            else:
                if output_path:
                    # use output path
                    x.loc[:, ["AD", "CTRL", "Age", "Sex", "sentrix_id"]].to_csv(os.path.join(output_path, "Small_EWAS_design.csv"))
                else:
                    x.loc[:, ["AD", "CTRL", "Age", "Sex", "sentrix_id"]].to_csv("Small_EWAS_design.csv")
        else:
            if per_region:
                for tis in tissues:
                    x_Tcen = x.copy()
                    x_Tcen = x_Tcen.loc[pheno["Brain_region"] == tis, :]
                    unique_ids = x_Tcen["sentrix_id"].unique()
                    for id in unique_ids[:-1]:
                        x_Tcen[id] = (x_Tcen["sentrix_id"] == id).astype(int)
                    t_cen = x_Tcen.drop(columns = ["Diagnosis","Brain_region", "sentrix_id"])
                    if output_path:
                        t_cen.to_csv(os.path.join(output_path, "Small_EWAS_design.csv"))
                    else:
                        t_cen.to_csv("Small_%s_EWAS_design_local.csv"%(tis))
            else:
                x_cen = x.copy()
                unique_ids = x_cen["sentrix_id"].unique()
                include = unique_ids[:-1]
                for id in unique_ids[:-1]:
                    x_cen[id] = (x_cen["sentrix_id"] == id).astype(int)
                x_cen.drop(columns=["Diagnosis", "sentrix_id", "Brain_region"], inplace = True)
                if output_path:
                    x_cen.to_csv(os.path.join(output_path, "Small_EWAS_design.csv"))
                else:
                    x_cen.to_csv("Small_EWAS_design_local.csv")
    else:
        # now on to the full design matrix
        x_large = x.copy()
        x_large["Cell_Type1"] = pd.to_numeric(pheno["Cell_Type.CT1"])
        x_large["Cell_Type2"] = pd.to_numeric(pheno["Cell_Type.CT2"])
        x_large["Cell_Type3"] = pd.to_numeric(pheno["Cell_Type.CT3"])
        x_large["Cell_Type4"] = pd.to_numeric(pheno["Cell_Type.CT4"])
        x_large["Cell_Type5"] = pd.to_numeric(pheno["Cell_Type.CT5"])
        if federated:
            if per_region:
                for tis in tissues:
                    #x.loc[pheno["Brain_region"] == (" " + tis), :]
                    t_large = x_large.drop(columns = ["Brain_region"])
                    if output_path:
                        t_large.to_csv(os.path.join(output_path, "Small_EWAS_design.csv"))
                    else:
                        t_large.to_csv("Full_%s_EWAS_design.csv"%(tis))
                    
            else:
                if output_path:
                    x_large.loc[:, ["AD", "CTRL", "Age", "Sex", "sentrix_id",
                    "Cell_Type1", "Cell_Type2", "Cell_Type3", "Cell_Type4", "Cell_Type5"]].to_csv(os.path.join(output_path, "Small_EWAS_design.csv"))
                else:
                    x_large.loc[:, ["AD", "CTRL", "Age", "Sex", "sentrix_id",
                    "Cell_Type1", "Cell_Type2", "Cell_Type3", "Cell_Type4", "Cell_Type5"]].to_csv("Full_EWAS_design.csv")
        else:
            if per_region:
                for tis in tissues:
                    x_large_Tcen = x_large.copy()
                    x_large_Tcen = x_large_Tcen.loc[pheno["Brain_region"] == tis, :]
                    unique_ids = x_large_Tcen["sentrix_id"].unique()
                    for id in unique_ids[:-1]:    
                        x_large_Tcen[id] = (x_large_Tcen["sentrix_id"] == id).astype(int)
                    t_large_cen = x_large_Tcen.drop(columns = ["Diagnosis","Brain_region", "sentrix_id"])
                    if output_path:
                        t_large_cen.to_csv(os.path.join(output_path, "Full_%s_EWAS_design_local.csv"%(tis)))
                    else:
                        t_large_cen.to_csv("Full_%s_EWAS_design_local.csv"%(tis))
            else:
                x_large_cen = x_large.copy()
                unique_ids = x_large_cen["sentrix_id"].unique()
                for id in unique_ids[:-1]:
                    x_large_cen[id] = (x_large_cen["sentrix_id"] == id).astype(int)
                x_large_cen.drop(columns=["Diagnosis", "sentrix_id", "Brain_region"], inplace = True)
                if output_path:
                    x_large_cen.to_csv(os.path.join(output_path, "Full_EWAS_design_local.csv"))
                else:
                    x_large_cen.to_csv("Full_EWAS_design_local.csv")



In [26]:
createDesignMatrix(pheno_df_path='/home/silke/Documents/Fed_EWAS/Data/QC_GSE66351_half/Reduced_Pheno_Info.csv', small=True, federated=False, per_region=False,   output_path='/home/silke/Documents/Fed_EWAS/Data/QC_GSE66351_half')

In [2]:
# GSE66351 - Small
pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_Pheno_Info.csv", index_col= "Sample_ID",low_memory=False)
pheno["Diagnosis"] = pheno.loc[:,"Diagnosis"].str.strip()
pheno["Sex"] = pheno.loc[:,"Sex"].str.strip()
pheno["Brain_region"] = pheno.loc[:, "Brain_region"].str.strip()
pheno["Brain_region"] = pheno.loc[:,"Brain_region"].str.replace(" ", "")
x = pheno.loc[:,["Diagnosis", "Age", "Sex", "sentrix_id", "Brain_region"]] # design matrix with the dependent/explainatory variables to be included in the model

# The design matrix needs to consist of numeric representations of the covariates to be included in the model, i.e. binary diagnosis, binary sex, dummy sentrix etc.
x["AD"] = 0
x.loc[x["Diagnosis"] == "AD", "AD"] = 1 #create binary diagnosis with 1 = AD and 0 = CTR
x["CTRL"] = 0
x.loc[x["Diagnosis"] == "CTRL", "CTRL"] = 1

x.loc[x["Sex"] == "F", "Sex"] = 1
x.loc[x["Sex"] == "M", "Sex"] = 0
#x.loc[x["Sex"] == " F", "Sex"] = 1 #create binary sex with 1 = F and 0 = M

# turn the age variable into a continuous numerical variable without any leftover text
x["Age"].replace("^[^:]*:", "", regex=True, inplace=True)
x["Age"] = pd.to_numeric(x["Age"])
x.loc[:, ["AD", "CTRL", "Age", "Sex", "sentrix_id"]].to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Small_EWAS_design.csv")

# now on to the full design matrix
x_large = x.copy()
x_large["Cell_Type1"] = pd.to_numeric(pheno["Cell_Type.CT1"])
x_large["Cell_Type2"] = pd.to_numeric(pheno["Cell_Type.CT2"])
x_large["Cell_Type3"] = pd.to_numeric(pheno["Cell_Type.CT3"])
x_large["Cell_Type4"] = pd.to_numeric(pheno["Cell_Type.CT4"])
x_large["Cell_Type5"] = pd.to_numeric(pheno["Cell_Type.CT5"])
x_large.loc[:, ["AD", "CTRL", "Age", "Sex", "sentrix_id",
"Cell_Type1", "Cell_Type2", "Cell_Type3", "Cell_Type4", "Cell_Type5"]].to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Full_EWAS_design.csv")

# create dummy variables for the unique sentrix_ids present in the dataset for the design matrices used in the central analysis - so they match with the way this variable
# is treated in the federated workflow
x_cen = x.copy()
unique_ids = x_cen["sentrix_id"].unique()
include = unique_ids -1
for id in unique_ids[:-1]:
    x_cen[id] = (x_cen["sentrix_id"] == id).astype(int)
x_cen.drop(columns=["Diagnosis", "sentrix_id", "Brain_region"], inplace = True)
x_cen.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Small_EWAS_design_local.csv")

x_large_cen = x_large.copy()
unique_ids = x_large_cen["sentrix_id"].unique()
for id in unique_ids[:-1]:
    x_large_cen[id] = (x_large_cen["sentrix_id"] == id).astype(int)
x_large_cen.drop(columns=["Diagnosis", "sentrix_id", "Brain_region"], inplace = True)
x_large_cen.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Full_EWAS_design_local.csv")

# create the design matrices per source tissue - used in federated
tissues = pheno["Brain_region"].unique()
for tis in tissues:
    x.loc[pheno["Brain_region"] == (" " + tis), :]
    t = x.drop(columns = ["Brain_region"])
    t.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Small_%s_EWAS_design.csv"%(tis))
    
    x_large.loc[pheno["Brain_region"] == (" " + tis), :]
    t_large = x_large.drop(columns = ["Brain_region"])
    t_large.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Full_%s_EWAS_design.csv"%(tis))

# create the design matrices per source tissue - used in centralised workflow
tissues = pheno["Brain_region"].unique()
for tis in tissues:
    x_Tcen = x.copy()
    x_Tcen = x_Tcen.loc[pheno["Brain_region"] == tis, :]
    unique_ids = x_Tcen["sentrix_id"].unique()
    for id in unique_ids[:-1]:
        x_Tcen[id] = (x_Tcen["sentrix_id"] == id).astype(int)
    t_cen = x_Tcen.drop(columns = ["Diagnosis","Brain_region", "sentrix_id"])
    t_cen.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Small_%s_EWAS_design_local.csv"%(tis))
    
    x_large_Tcen = x_large.copy()
    x_large_Tcen = x_large_Tcen.loc[pheno["Brain_region"] == tis, :]
    unique_ids = x_large_Tcen["sentrix_id"].unique()
    for id in unique_ids[:-1]:    
        x_large_Tcen[id] = (x_large_Tcen["sentrix_id"] == id).astype(int)
    t_large_cen = x_large_Tcen.drop(columns = ["Diagnosis","Brain_region", "sentrix_id"])
    t_large_cen.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\design_mat\\Full_%s_EWAS_design_local.csv"%(tis))
 


### Create an overview dataset descriptives table

In [2]:
full_pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_Pheno_Info.csv", index_col= "Sample_ID", low_memory=False)
sex = full_pheno["Sex"].value_counts()
diagnosis = full_pheno["Diagnosis"].value_counts()
age = full_pheno.groupby("Sex")["Age"].mean()
brain_region = full_pheno["Brain_region"].value_counts()



In [21]:
sex = full_pheno.groupby(["Diagnosis"])["Sex"].value_counts()
sex_percent = full_pheno.groupby(["Diagnosis"])["Sex"].value_counts(normalize=True) *100
brain_region = full_pheno.groupby("Diagnosis")["Brain_region"].value_counts()
age_mean = full_pheno.groupby(["Diagnosis"])["Age"].mean()
age_sd = full_pheno.groupby(["Diagnosis"])["Age"].std()
print(sex)
print(sex_percent)
print(brain_region)
print(age_mean)
print(age_sd)

Diagnosis  Sex
 AD         F     67
            M     39
 CTRL       M     50
            F     34
Name: Sex, dtype: int64
Diagnosis  Sex
 AD         F     63.207547
            M     36.792453
 CTRL       M     59.523810
            F     40.476190
Name: Sex, dtype: float64
Diagnosis  Brain_region     
 AD         Temporal cortex     39
            Frontal cortex      37
            Occipital cortex    30
 CTRL       Occipital cortex    32
            Frontal cortex      26
            Temporal cortex     26
Name: Brain_region, dtype: int64
Diagnosis
 AD      80.264151
 CTRL    66.690476
Name: Age, dtype: float64
Diagnosis
 AD       7.881626
 CTRL    18.376895
Name: Age, dtype: float64


In [19]:
sex = full_pheno.groupby(["Brain_region", "Diagnosis"])["Sex"].value_counts()
sex_percent = full_pheno.groupby(["Brain_region", "Diagnosis"])["Sex"].value_counts(normalize=True) *100
brain_region = full_pheno.groupby("Diagnosis")["Brain_region"].value_counts()
age_mean = full_pheno.groupby(["Brain_region", "Diagnosis"])["Age"].mean()
age_sd = full_pheno.groupby(["Brain_region", "Diagnosis"])["Age"].std()
print(sex)
print(sex_percent)
#print(brain_region)
#print(age_mean)
#print(age_sd)

Brain_region       Diagnosis  Sex
 Frontal cortex     AD         F     23
                               M     14
                    CTRL       M     16
                               F     10
 Occipital cortex   AD         F     20
                               M     10
                    CTRL       M     18
                               F     14
 Temporal cortex    AD         F     24
                               M     15
                    CTRL       M     16
                               F     10
Name: Sex, dtype: int64
Brain_region       Diagnosis  Sex
 Frontal cortex     AD         F     62.162162
                               M     37.837838
                    CTRL       M     61.538462
                               F     38.461538
 Occipital cortex   AD         F     66.666667
                               M     33.333333
                    CTRL       M     56.250000
                               F     43.750000
 Temporal cortex    AD         F     61.538462
     

### Create even splits of the three main datasets so they can be run locally (be it federated)


In [2]:
from random import sample, seed

In [3]:
meth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Methylated.csv", index_col=0)

In [4]:
#GSE66351 - even
GSE66351_pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
GSE66351_splits_pheno = GSE66351_pheno.copy()
GSE66351_splits_pheno["split"] = "Split_3"

N_total = GSE66351_splits_pheno.shape[0]*1.0*1.0
N_ad = GSE66351_splits_pheno.loc[GSE66351_splits_pheno["Diagnosis"] == "diagnosis: AD", :].shape[0]*1.0*1.0
random_state = 42
seed(random_state)
n_splits = 3
sizes = [1,1,1]
ad_freqs = [0.53,0.53,0.53]

Sizes = []
n_ad = []
for i in range(0,n_splits-1):
    s  = int(N_total*sizes[i]/sum(sizes))
    Sizes.append(s)
    n_ad.append(int(s*ad_freqs[i]))

Sizes.append(int(N_total-sum(Sizes)))
n_ad.append(int(N_ad-sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))

splits = {}
ad = set(GSE66351_pheno.loc[GSE66351_pheno["Diagnosis"]== "diagnosis: AD",:].index.values)
other = set(GSE66351_pheno.index.values).difference(ad)#.difference(fem)
for i in range(0,n_splits-1):
    b = set(sample(ad,n_ad[i]))  
    ad =  ad.difference(b)
    o = set(sample(other,Sizes[i]-n_ad[i]))
    other = other.difference(o)
    sele_samples = b |o 
    GSE66351_splits_pheno.loc[sele_samples,"split"] = "Split_"+str(i+1)
    GSE66351_splits_pheno["Split_"+str(i+1)] = 0
    GSE66351_splits_pheno.loc[sele_samples,"Split_"+str(i+1)]  =1
print(GSE66351_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())


# save the dataset splits
meth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Betas.csv", index_col=0)
design = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Small_EWAS_design.csv", index_col=0)
design_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Small_EWAS_design_local.csv", index_col=0)
design_full = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_EWAS_design.csv", index_col=0)
design_full_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_EWAS_design_local.csv", index_col=0)

output_base = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_dir = os.path.join(output_base, "GSE66351_splits")



if not os.path.exists(output_dir):
    os.makedirs(output_dir)
for i in range(n_splits):
    s = "Split_"+str(i+1)
    GSE66351_pheno = GSE66351_splits_pheno.loc[GSE66351_splits_pheno["split"]==s,:]
    samples = sorted(GSE66351_pheno.index.values)
    print("these samples: %s are included in split %s"%(len(samples), i+1))
    GSE66351_splits_pheno.loc[samples,:].to_csv(output_dir+"/"+s+"_pheno.csv")
    meth.loc[:,samples ].to_csv(output_dir+"/"+s+"_methylated.csv")
    unmeth.loc[:,samples ].to_csv(output_dir+"/"+s+"_unmethylated.csv")
    beta.loc[:,samples ].to_csv(output_dir+"/"+s+"_betas.csv")
    design.loc[samples, :].to_csv(output_dir+"/"+s+"_design.csv")
    #design_local.loc[samples, :].to_csv(output_dir+"/"+s+"_design_local.csv")
    design_full.loc[samples, :].to_csv(output_dir+"/"+s+"_Full_design.csv")
    #design_full_local.loc[samples, :].to_csv(output_dir+"/"+s+"_Full_design_local.csv")

central_design = design_local.copy()
central_design["Cohort_effect"] = GSE66351_splits_pheno["split"]
central_design["Split1"] = 0
central_design.loc[central_design["Cohort_effect"] == "Split_1", "Split1"] = 1
central_design["Split2"] = 0
central_design.loc[central_design["Cohort_effect"] == "Split_2", "Split2"] = 1
central_design.drop(columns="Cohort_effect", inplace=True)
central_design.to_csv(os.path.join(output_dir + "/" +"central_design_matrix.csv"))

central_full_design = design_full_local.copy()
central_full_design["Cohort_effect"] = GSE66351_splits_pheno["split"]
central_full_design["Split1"] = 0
central_full_design.loc[central_full_design["Cohort_effect"] == "Split_1", "Split1"] = 1
central_full_design["Split2"] = 0
central_full_design.loc[central_full_design["Cohort_effect"] == "Split_2", "Split2"] = 1
central_full_design.drop(columns="Cohort_effect", inplace=True)
central_full_design.to_csv(os.path.join(output_dir + "/" +"full_central_design_matrix.csv"))

[63, 63, 64] 190
[33, 33, 40] 106
split    Diagnosis
Split_1   AD          33
          CTRL        30
Split_2   AD          33
          CTRL        30
Split_3   AD          40
          CTRL        24
Name: Diagnosis, dtype: int64
these samples: 63 are included in split 1
these samples: 63 are included in split 2
these samples: 64 are included in split 3


### GSE66351 - mild imbalance

In [3]:
GSE66351_pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
GSE66351_splits_pheno = GSE66351_pheno.copy()
GSE66351_splits_pheno["split"] = "Split_3"

N_total = GSE66351_splits_pheno.shape[0]*1.0*1.0
N_ad = GSE66351_splits_pheno.loc[GSE66351_splits_pheno["Diagnosis"] == " AD", :].shape[0]*1.0*1.0
random_state = 42
seed(random_state)
n_splits = 3
sizes = [0.15,0.35,0.5]
ad_freqs = [0.2,0.3,0.5]

Sizes = []
n_ad = []
for i in range(0,n_splits-1):
    s  = int(N_total*sizes[i]/sum(sizes))
    Sizes.append(s)
    n_ad.append(int(s*ad_freqs[i]))

Sizes.append(int(N_total-sum(Sizes)))
n_ad.append(int(N_ad-sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))

splits = {}
ad = set(GSE66351_pheno.loc[GSE66351_pheno["Diagnosis"]== " AD",:].index.values)
other = set(GSE66351_pheno.index.values).difference(ad)#.difference(fem)
for i in range(0,n_splits-1):
    b = set(sample(ad,n_ad[i]))  
    ad =  ad.difference(b)
    o = set(sample(other,Sizes[i]-n_ad[i]))
    other = other.difference(o)
    sele_samples = b |o 
    GSE66351_splits_pheno.loc[sele_samples,"split"] = "Split_"+str(i+1)
    GSE66351_splits_pheno["Split_"+str(i+1)] = 0
    GSE66351_splits_pheno.loc[sele_samples,"Split_"+str(i+1)]  =1
print(GSE66351_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())


# save the dataset splits
meth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Betas.csv", index_col=0)
design = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Small_EWAS_design.csv", index_col=0)
design_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Small_EWAS_design_local.csv", index_col=0)
design_full = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_EWAS_design.csv", index_col=0)
design_full_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_EWAS_design_local.csv", index_col=0)

output_base = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_dir = os.path.join(output_base, "GSE66351_mild_splits")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
for i in range(n_splits):
    s = "Split_"+str(i+1)
    GSE66351_pheno = GSE66351_splits_pheno.loc[GSE66351_splits_pheno["split"]==s,:]
    samples = sorted(GSE66351_pheno.index.values)
    GSE66351_splits_pheno.loc[samples,:].to_csv(output_dir+"/"+s+"_pheno.csv")
    meth.loc[:,samples ].to_csv(output_dir+"/"+s+"_methylated.csv")
    unmeth.loc[:,samples ].to_csv(output_dir+"/"+s+"_unmethylated.csv")
    beta.loc[:,samples ].to_csv(output_dir+"/"+s+"_betas.csv")
    design.loc[samples, :].to_csv(output_dir+"/"+s+"_design.csv")
    #design_local.loc[samples, :].to_csv(output_dir+"/"+s+"_design_local.csv")
    design_full.loc[samples, :].to_csv(output_dir+"/"+s+"_Full_design.csv")
    #design_full_local.loc[samples, :].to_csv(output_dir+"/"+s+"_Full_design_local.csv")

central_design = design_local.copy()
central_design["Cohort_effect"] = GSE66351_splits_pheno["split"]
central_design["Split1"] = 0
central_design.loc[central_design["Cohort_effect"] == "Split_1", "Split1"] = 1
central_design["Split2"] = 0
central_design.loc[central_design["Cohort_effect"] == "Split_2", "Split2"] = 1
central_design.drop(columns="Cohort_effect", inplace=True)
central_design.to_csv(os.path.join(output_dir + "/" +"central_design_matrix.csv"))

central_full_design = design_full_local.copy()
central_full_design["Cohort_effect"] = GSE66351_splits_pheno["split"]
central_full_design["Split1"] = 0
central_full_design.loc[central_full_design["Cohort_effect"] == "Split_1", "Split1"] = 1
central_full_design["Split2"] = 0
central_full_design.loc[central_full_design["Cohort_effect"] == "Split_2", "Split2"] = 1
central_full_design.drop(columns="Cohort_effect", inplace=True)
central_full_design.to_csv(os.path.join(output_dir + "/" +"full_central_design_matrix.csv"))

[28, 66, 96] 190
[5, 19, 82] 106
split    Diagnosis
Split_1   CTRL        23
          AD           5
Split_2   CTRL        47
          AD          19
Split_3   AD          82
          CTRL        14
Name: Diagnosis, dtype: int64


### GSE66351 - strong imbalance

In [3]:
GSE66351_pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Reduced_Pheno_Info.csv", index_col= "Sample_ID")
GSE66351_splits_pheno = GSE66351_pheno.copy()
GSE66351_splits_pheno["split"] = "Split_3"

N_total = GSE66351_splits_pheno.shape[0]*1.0*1.0
N_ad = GSE66351_splits_pheno.loc[GSE66351_splits_pheno["Diagnosis"] == " AD", :].shape[0]*1.0*1.0
random_state = 42
seed(random_state)
n_splits = 3
sizes = [0.25,0.35,0.40]
ad_freqs = [0.4,0.35,0.25]

Sizes = []
n_ad = []
for i in range(0,n_splits-1):
    s  = int(N_total*sizes[i]/sum(sizes))
    Sizes.append(s)
    n_ad.append(int(s*ad_freqs[i]))

Sizes.append(int(N_total-sum(Sizes)))
n_ad.append(int(N_ad-sum(n_ad)))
print(Sizes, sum(Sizes))
print(n_ad, sum(n_ad))

splits = {}
ad = set(GSE66351_pheno.loc[GSE66351_pheno["Diagnosis"]== " AD",:].index.values)
other = set(GSE66351_pheno.index.values).difference(ad)#.difference(fem)
for i in range(0,n_splits-1):
    b = set(sample(ad,n_ad[i]))  
    ad =  ad.difference(b)
    o = set(sample(other,Sizes[i]-n_ad[i]))
    other = other.difference(o)
    sele_samples = b |o 
    GSE66351_splits_pheno.loc[sele_samples,"split"] = "Split_"+str(i+1)
    GSE66351_splits_pheno["Split_"+str(i+1)] = 0
    GSE66351_splits_pheno.loc[sele_samples,"Split_"+str(i+1)]  =1
print(GSE66351_splits_pheno[["split","Diagnosis"]].groupby("split")["Diagnosis"].value_counts())


# save the dataset splits
meth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Methylated.csv", index_col=0)
unmeth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Unmethylated.csv", index_col=0)
beta = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Filtered_Betas.csv", index_col=0)
design = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Small_EWAS_design.csv", index_col=0)
design_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Small_EWAS_design_local.csv", index_col=0)
design_full = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_EWAS_design.csv", index_col=0)
design_full_local = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Data_Full_Datasets\\GSE66351\\Full_EWAS_design_local.csv", index_col=0)

output_base = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_dir = os.path.join(output_base, "GSE66351_strong_splits")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
for i in range(n_splits):
    s = "Split_"+str(i+1)
    GSE66351_pheno = GSE66351_splits_pheno.loc[GSE66351_splits_pheno["split"]==s,:]
    samples = sorted(GSE66351_pheno.index.values)
    GSE66351_splits_pheno.loc[samples,:].to_csv(output_dir+"/"+s+"_pheno.csv")
    meth.loc[:,samples ].to_csv(output_dir+"/"+s+"_methylated.csv")
    unmeth.loc[:,samples ].to_csv(output_dir+"/"+s+"_unmethylated.csv")
    beta.loc[:,samples ].to_csv(output_dir+"/"+s+"_betas.csv")
    design.loc[samples, :].to_csv(output_dir+"/"+s+"_design.csv")
    design_local.loc[samples, :].to_csv(output_dir+"/"+s+"_design_local.csv")
    design_full.loc[samples, :].to_csv(output_dir+"/"+s+"_Full_design.csv")
    design_full_local.loc[samples, :].to_csv(output_dir+"/"+s+"_Full_design_local.csv")

central_design = design_local.copy()
central_design["Cohort_effect"] = GSE66351_splits_pheno["split"]
central_design["Split1"] = 0
central_design.loc[central_design["Cohort_effect"] == "Split_1", "Split1"] = 1
central_design["Split2"] = 0
central_design.loc[central_design["Cohort_effect"] == "Split_2", "Split2"] = 1
central_design.drop(columns="Cohort_effect", inplace=True)
central_design.to_csv(os.path.join(output_dir + "/" +"central_design_matrix.csv"))

central_full_design = design_full_local.copy()
central_full_design["Cohort_effect"] = GSE66351_splits_pheno["split"]
central_full_design["Split1"] = 0
central_full_design.loc[central_full_design["Cohort_effect"] == "Split_1", "Split1"] = 1
central_full_design["Split2"] = 0
central_full_design.loc[central_full_design["Cohort_effect"] == "Split_2", "Split2"] = 1
central_full_design.drop(columns="Cohort_effect", inplace=True)
central_full_design.to_csv(os.path.join(output_dir + "/" +"full_central_design_matrix.csv"))

[47, 66, 77] 190
[18, 23, 65] 106
split    Diagnosis
Split_1   CTRL        29
          AD          18
Split_2   CTRL        43
          AD          23
Split_3   AD          65
          CTRL        12
Name: Diagnosis, dtype: int64


In [4]:
split_dir = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_strong_splits"
strong1 = pd.read_csv(os.path.join(split_dir, "Split_1_design.csv"), index_col=0)
strong2 = pd.read_csv(os.path.join(split_dir, "Split_2_design.csv"), index_col=0)
strong3 = pd.read_csv(os.path.join(split_dir, "Split_3_design.csv"), index_col=0)

In [7]:
strong1 = strong1.join(full_pheno["Brain_region"], how="inner")
strong2 = strong2.join(full_pheno["Brain_region"], how="inner")
strong3 = strong3.join(full_pheno["Brain_region"], how="inner")

In [11]:
print(strong1["Brain_region"].value_counts()) #(normalize = True) * 100)
print(strong2["Brain_region"].value_counts()) #(normalize = True) * 100)
print(strong3["Brain_region"].value_counts()) #(normalize = True) * 100)

 Temporal cortex     18
 Frontal cortex      16
 Occipital cortex    13
Name: Brain_region, dtype: int64
 Occipital cortex    24
 Frontal cortex      21
 Temporal cortex     21
Name: Brain_region, dtype: int64
 Frontal cortex      26
 Temporal cortex     26
 Occipital cortex    25
Name: Brain_region, dtype: int64


In [10]:
full_pheno["Brain_region"].value_counts()

 Temporal cortex     65
 Frontal cortex      63
 Occipital cortex    62
Name: Brain_region, dtype: int64