In [2]:
import pandas as pd
import numpy as np
import os
import re
import fnmatch


### Create a unified phenotype file to use when running the combined datasets on the server (or locally, if memory allows)

In [8]:
def _load_pheno(pheno_path, columns):
    """Load one phenotype table and keep only the requested columns.

    The file is read as tab-separated text without a header row and then
    transposed, so that what was the first column of the file becomes the
    column names and every remaining column of the file becomes one row.

    Parameters
    ----------
    pheno_path : str
        Path to the tab-separated phenotype .txt file.
    columns : list of str
        Column names to keep, in the order they should appear in the result.

    Returns
    -------
    pandas.DataFrame
        One row per remaining file column (row labels start at 1 because the
        header row, label 0, is dropped), restricted to ``columns``.

    Raises
    ------
    KeyError
        If any of ``columns`` is missing from the file.
    """
    raw = pd.read_csv(pheno_path, header=None, sep="\t")
    samples = raw.T
    # after transposing, row 0 holds the field names -> promote it to the header
    samples.columns = samples.iloc[0, :]
    samples = samples.drop(index=0)
    return samples.loc[:, columns]


# get the relevant columns from each dataset
# diagnosis, age, sex, sample_id/sample_title
# NOTE: the column order of GSE66351 (Age before Sex) differs from the other two
# datasets; it is kept as-is because it determines the column order of any frame
# these tables are later concatenated into.
GSE66351_pheno = _load_pheno(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GSE66351_pheno.txt",
    ["Sample_title",  "Diagnosis", "Age", "Sex"],
)
GSE105109_pheno = _load_pheno(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE105109_RAW\\GSE105109_pheno.txt",
    ["Sample_title",  "Diagnosis", "Sex", "Age"],
)
GSE134379_pheno = _load_pheno(
    "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE134379_RAW\\GSE134379_pheno.txt",
    ["Sample_title", "Diagnosis", "Sex", "Age"],
)

# check for each dataset how the binary phenotypes diagnosis and sex are coded
# (all three Diagnosis codings are printed first, then all three Sex codings)
for phenotype_column in ("Diagnosis", "Sex"):
    for pheno in (GSE66351_pheno, GSE105109_pheno, GSE134379_pheno):
        print(pheno[phenotype_column].unique())

['diagnosis: AD' 'diagnosis: CTRL']
['post-mortem diagnosis: Control'
 "post-mortem diagnosis: Alzheimer's disease"]
['diagnosis: AD' 'diagnosis: ND']
['Sex: F' 'Sex: M']
['gender: F' 'gender: M']
['gender: F' 'gender: M']


In [9]:
# recode the diagnosis column in each dataset to "diagnosis: AD" for alzheimers diagnosis and "diagnosis: CTRL" for control
# GSE66351_pheno stays the way it is because this is the basis for the change
GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"] == "post-mortem diagnosis: Control", "Diagnosis"] = "diagnosis: CTRL"
GSE105109_pheno.loc[GSE105109_pheno["Diagnosis"] == "post-mortem diagnosis: Alzheimer's disease", "Diagnosis"] = "diagnosis: AD"

# GSE134379 already uses "diagnosis: AD", so only its "diagnosis: ND" label is recoded
GSE134379_pheno.loc[GSE134379_pheno["Diagnosis"] == "diagnosis: ND", "Diagnosis"] = "diagnosis: CTRL"

# recode the sex column so in all datasets "gender: F" is females and "gender: M" is males
# only GSE66351 needs to be recoded that way
GSE66351_pheno.loc[GSE66351_pheno["Sex"] == "Sex: F", "Sex"] = "gender: F"
GSE66351_pheno.loc[GSE66351_pheno["Sex"] == "Sex: M", "Sex"] = "gender: M"

# Number of trailing characters stripped from an .idat file name to obtain the
# sample ID (presumably the "_Grn.idat" / "_Red.idat" channel suffix - TODO confirm
# against the actual file names).
_IDAT_SUFFIX_LEN = 9


def _list_idat_files(idat_dir):
    """Return the names of all ``*.idat`` files in ``idat_dir``, sorted by name.

    ``os.listdir`` makes no promise about the order of its result, so the
    listing is sorted to make the derived sample-ID order reproducible on every
    machine and file system.
    """
    return sorted(name for name in os.listdir(idat_dir) if fnmatch.fnmatch(name, "*.idat"))


def _unique_sample_ids(idat_files):
    """Strip the trailing ``_IDAT_SUFFIX_LEN`` characters from each file name and
    return the distinct results in first-seen order.
    """
    # dict.fromkeys de-duplicates while preserving order (no quadratic list scan)
    return list(dict.fromkeys(name[0:len(name) - _IDAT_SUFFIX_LEN] for name in idat_files))


def _attach_sample_ids(pheno, sample_ids, dataset_name):
    """Add ``sample_ids`` to ``pheno`` (in place) as the ``Sample_ID`` column.

    Raises
    ------
    ValueError
        If the number of sample IDs differs from the number of phenotype rows.
    """
    if len(sample_ids) != len(pheno):
        raise ValueError(
            "%s: found %s sample IDs in the idat folder but the phenotype table has %s rows"
            % (dataset_name, len(sample_ids), len(pheno))
        )
    pheno["Sample_ID"] = sample_ids


# add the sample IDs (based on the names of the .idat files) to the phenotype frames
# NOTE(review): IDs are matched to phenotype rows purely by position, i.e. this
# assumes the phenotype rows are in the same order as the sorted .idat file
# names - verify, e.g. against Sample_title.
GSE66351_sampleIDs = _list_idat_files("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\idat")
GSE66351_sampleID = _unique_sample_ids(GSE66351_sampleIDs)
_attach_sample_ids(GSE66351_pheno, GSE66351_sampleID, "GSE66351")

GSE105109_sampleIDs = _list_idat_files("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE105109_RAW\\idat")
GSE105109_sampleID = _unique_sample_ids(GSE105109_sampleIDs)
_attach_sample_ids(GSE105109_pheno, GSE105109_sampleID, "GSE105109")

GSE134379_sampleIDs = _list_idat_files("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE134379_RAW\\idat")
GSE134379_sampleID = _unique_sample_ids(GSE134379_sampleIDs)
_attach_sample_ids(GSE134379_pheno, GSE134379_sampleID, "GSE134379")

# combine the datasets; `keys` adds an outer index level recording which dataset each sample comes from
unified_phenotype = pd.concat([GSE66351_pheno, GSE105109_pheno, GSE134379_pheno], keys = ["GSE66351", "GSE105109", "GSE134379"])
# keep only the dataset key as index (drop the per-dataset row labels) and name it
unified_phenotype.index = unified_phenotype.index.droplevel(1)
unified_phenotype.index.name = "Dataset_ID"
# plain Python attribute on the frame object; it is not a column and is not written to the CSV
unified_phenotype.Name = "Combined Dataset"
# save the unified phenotype file
unified_phenotype.to_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\Unified_dataset_pheno.csv")



### Generate balanced and imbalanced splits of the unified dataset  
These splits are used to test the performance of the federated method in situations where the client datasets are imbalanced in terms of sample size or class-label distribution.

In [14]:
# check for balance of class labels (AD vs. Control) in each of the datasets and the unified one
datasets = {"GSE66351":GSE66351_pheno, "GSE105109":GSE105109_pheno, "GSE134379":GSE134379_pheno, "combined":unified_phenotype}

# The counts are looked up by label. Series.value_counts() orders its result by
# descending frequency, so unpacking it positionally would report the larger
# class as "AD" regardless of which label it actually belongs to.
AD_LABEL = "diagnosis: AD"
CTRL_LABEL = "diagnosis: CTRL"

for dataset_name, pheno in datasets.items():
    diagnosis = pheno.loc[:, "Diagnosis"]
    n_ad = int((diagnosis == AD_LABEL).sum())
    n_ctrl = int((diagnosis == CTRL_LABEL).sum())
    # avoid a division error when a dataset has no control samples
    ad_ctrl_ratio = n_ad / n_ctrl if n_ctrl else float("nan")
    print("In %s there are: %s AD samples and %s control samples. AD/CTRL = %s"%(dataset_name, n_ad, n_ctrl, ad_ctrl_ratio))

    # make unexpected or missing labels visible instead of silently ignoring them
    n_other = len(diagnosis) - n_ad - n_ctrl
    if n_other:
        print("  note: %s sample(s) in %s have a missing diagnosis or a label other than AD/CTRL"%(n_other, dataset_name))

In [None]:
# create splits from the unified dataset to test the performance of the 