#### Second attempt at the Python version of the centralised part of the microarray methylation analysis workflow (quality control up to normalisation)
Using Python as a shell to string together the specialised R functions used in the Exeter workflow

Loading in the required modules/packages

In [1]:
import pandas as pd
import numpy as np
import subprocess
import csv
import glob
import os
import re
import seaborn as sns
from matplotlib import pyplot as plt

# stuff needed for some specific analysis - maybe not needed in this version of the code
#from sklearn.decomposition import PCA 
#from scipy.stats import pearsonr
#from sklearn.cluster import KMeans

In [2]:
# Root folders used by the rest of the notebook.
# Every backslash is doubled so that no path relies on Python silently keeping
# an unrecognised escape sequence (the original output_path contained "\P").
working_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis"
data_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data"
output_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output"


Use subprocess to read the data contained in the .idat files into a dataframe using the readEPIC function from the wateRmelon package in R

### Creating an output file structure and loading in the idat files

The input arguments of this script are: 
1. file_path to the folder containing the .idat files 
2. file_path to the phenotype information sheet (.txt) 
3. the directory where the output should be saved 
4. OPTIONAL the data identifier to be used in the creation of the output folders - this still needs to be fixed

In [7]:
# Run the idat-loading R script through Rscript and keep its stdout/stderr.
# Arguments after the script path follow the order listed in the markdown above:
# idat folder, phenotype sheet, output directory, data identifier.
# All backslashes are doubled; the original pheno/output paths relied on
# unrecognised escape sequences such as "\M" and "\P" being kept verbatim.
load_with_option = subprocess.run(
    [
        "C:\\Program Files\\R\\R-4.1.2\\bin\\Rscript.exe",
        "--vanilla",
        "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Loading_idats_code_saveOutput_python_shell.R",
        "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\idat",
        "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GSE66351_pheno.txt",
        "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output",
        "GSE66351a",
    ],
    capture_output=True,
)

In [None]:
# The subprocess output was captured as bytes; decode it so the R messages are
# printed as readable multi-line text instead of a single b'...' repr.
print(load_with_option.stderr.decode(errors="replace"))

Using subprocess to perform the complete preprocessing workflow up to the normalisation  
The whole preprocessing workflow is run as one function in the R script. The script takes 5 input arguments:  
1. The file path of the folder containing the .idat files
2. The phenotype information file
3. The working directory where the output folder should be created
4. The filepath to the illumina manifest file that contains the column "CHR" with the chromosome each probe is located on
5. The identifier that should be included in the name of the output folder


In [None]:
# Inputs for the centralised preprocessing R script.
# All backslashes are doubled; pheno_path, output_dump and manifest_path
# previously relied on unrecognised escape sequences ("\M", "\P", "\D", "\G").
script_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Preprocessing_Rscripts\\centralised_preprocessing_half.r"
idat_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\idat"
pheno_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GSE66351_pheno.txt"
output_dump = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output"
# NOTE(review): this manifest sits in GSE66351_RAW\GSE66351, while later cells
# read the same file name directly from GSE66351_RAW - confirm which is correct.
manifest_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GSE66351\\GPL13534_HumanMethylation450_15017482_v.1.1.csv"
identifier = "GSE66351"

In [3]:
# Run the complete centralised preprocessing R script (QC up to normalisation).
# The arguments come from the path variables defined in the cell directly above
# instead of being typed out a second time; the values are identical.
# pheno_path and manifest_path are rebound by the RefFreeEWAS cells further
# down, so run this cell right after the path cell (top-to-bottom order).
central_preprocessing = subprocess.run(
    [
        "C:\\Program Files\\R\\R-4.1.2\\bin\\Rscript.exe",
        "--vanilla",
        script_path,    # R script to execute
        idat_path,      # 1. folder containing the .idat files
        pheno_path,     # 2. phenotype information file
        output_dump,    # 3. directory in which the output folder is created
        manifest_path,  # 4. Illumina manifest file
        identifier,     # 5. identifier used in the output folder name
    ],
    capture_output=True,
)

In [4]:
# check what happened in the subprocess
# stderr was captured as bytes; decode it so the R messages print as readable
# multi-line text instead of a single b'...' repr.
print(central_preprocessing.stderr.decode(errors="replace"))



The next step is to normalise the data; this step will be offered both centrally and distributed/federated, to be flexible to the researcher's needs  
Below, an implementation of the normalisation algorithm behind the dasen function in the wateRmelon package is provided

Dasen normalisation is a form of quantile normalisation that is performed for the two probe types separately. The normalised data (betas), per probe type, are calculated using the normalised methylated and unmethylated intensities of each probe type.  
    betas (per probe) = quantile normalised methylated intensities / (quantile normalised methylated intensities + quantile normalised unmethylated intensities + 100)  
The first step is to write the quantile normalisation function  
  
The python version of the dasen normalisation used in r can be found in the dasen_normalisation module

In [3]:
# Creating and loading the data to test the dasen normalisation translation

# Create the probe-type annotation object: read the manifest (skipping its first
# 7 lines), index it by probe ID and keep the Infinium design type per probe.
annotation_data = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GPL13534_HumanMethylation450_15017482_v.1.1.csv", skiprows=7, low_memory=False)
# drop=False keeps the IlmnID column as well as using it as the index
annotation_data = annotation_data.set_index("IlmnID", drop=False)
probe_type_data = annotation_data.loc[:, "Infinium_Design_Type"]

# Load the preprocessed methylated intensities (first column is the index)
meth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\QC_GSE66351\\Preprocessed_methylated_intensities.csv", index_col=0)

# Keep only the probes that are also present in the intensity data.
# A boolean mask preserves the manifest order, so the result is identical on
# every run; going through a Python set made the probe order hash-dependent.
probe_type_data = probe_type_data.loc[probe_type_data.index.isin(meth.index)]

# Load the preprocessed unmethylated intensities
unmeth = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\QC_GSE66351\\Preprocessed_unmethylated_intensities.csv", index_col =0)

In [4]:
import dasen_normalisation

# Normalise the intensities with the Python dasen implementation, passing the
# unmethylated and methylated tables plus the per-probe design type.
test_normalised_betas_module = dasen_normalisation.dasen_normalisation(
    unmeth,
    meth,
    probe_type_data,
)

# Store the normalised betas as a CSV file in the output folder
test_normalised_betas_module.to_csv(
    os.path.join(output_path, "all_probes_normalised_betas.csv")
)

R script containing the RefFreeEWAS cell type decomposition, which will be run in a subprocess; its output is saved and added to the phenotype information that will be used in the EWAS further down in this file

In [18]:
# Inputs handed to the RefFreeEWAS R script (this variant needs .RData input).
# Note that data_path, manifest_path and pheno_path are rebound here and keep
# these values for any cell that runs afterwards.
output_dir = os.path.join(output_path, "QC_GSE66351")
pheno_path = os.path.join(output_path, "QC_GSE66351", "pre_norm_pheno_information.csv")
manifest_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GPL13534_HumanMethylation450_15017482_v.1.1.csv"
data_path = os.path.join(output_path, "QC_GSE66351", "preprocessed_MethyLumiSet.RData")  # try again with different input data
file_path = os.path.join(working_path, "RefFreeEWAS_local.r")

# Launch the script with Rscript; argument order after the script itself is
# input data, phenotype file, manifest file, output directory.
RefFreeEWAS_RData = subprocess.run(
    [
        "C:\\Program Files\\R\\R-4.1.2\\bin\\Rscript.exe",
        '--vanilla',
        file_path,
        data_path,
        pheno_path,
        manifest_path,
        output_dir,
    ],
    capture_output=True,
)
# show what the R script wrote to stdout as the cell output
RefFreeEWAS_RData.stdout

RefFreeEWAS for local input (.csv file with normalised betas)

In [34]:
# Inputs handed to the CSV-input variant of the RefFreeEWAS R script.
# Note that data_path, manifest_path and pheno_path are rebound here and keep
# these values for any cell that runs afterwards.
output_dir = os.path.join(output_path, "QC_GSE66351")
pheno_path = os.path.join(output_path, "QC_GSE66351", "pre_norm_pheno_information.csv")
manifest_path = "E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\GSE66351_RAW\\GPL13534_HumanMethylation450_15017482_v.1.1.csv"
data_path = os.path.join(output_path, "QC_GSE66351", "Preprocessed_betas.csv")
file_path = os.path.join(working_path, "RefFreeEWAS_local_csvinput.r")

# Launch the script with Rscript; argument order after the script itself is
# input data, phenotype file, manifest file, output directory.
RefFreeEWAS = subprocess.run(
    [
        "C:\\Program Files\\R\\R-4.1.2\\bin\\Rscript.exe",
        '--vanilla',
        file_path,
        data_path,
        pheno_path,
        manifest_path,
        output_dir,
    ],
    capture_output=True,
)
# show what the R script wrote to stdout as the cell output
RefFreeEWAS.stdout

EWAS code, based on the least squares linear algebra as used in the fortran code at the foundation of the lm() function in r

In [3]:
#EWAS

import numpy as np
import pandas as pd

# Phenotype table (indexed by Sample_ID) and preprocessed betas (first column is the index)
pheno = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\QC_GSE66351_PythonShell\\post_norm_pheno_information.csv", index_col= "Sample_ID")
betas = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Federated_Differential_Methylation_Analysis\\Output\\QC_GSE66351_PythonShell\\Preprocessed_betas.csv", index_col=0)

# design matrix with the independent/explanatory variables to be included in the model;
# .copy() makes it an independent frame so the edits below never touch `pheno`
x = pheno.loc[:, ["Sample_diagnosis", "Sample_age", "Sample_sex", "Sample_sentrix_id"]].copy()
y = betas.iloc[0:21, :] # keeping it small now to test if everything works the way it should
# NOTE(review): the regression pairs row i of x with column i of y by position -
# confirm that the betas columns are in the same sample order as the pheno rows.

# The design matrix needs to consist of numeric representations of the covariates to be included in the model, i.e. binary diagnosis, binary sex, dummy sentrix etc.
x["Sample_diagnosis"] = (x["Sample_diagnosis"] == "diagnosis: AD").astype(int) #create binary diagnosis with 1 = AD and 0 = CTR
x["Sample_sex"] = (x["Sample_sex"] == "Sex: F").astype(int) #create binary sex with 1 = F and 0 = M

# create dummy variables for the unique sentrix_ids present in the dataset - this code can be reused to create center number dummies in the federated version of the code
# Every unique id gets its own 0/1 column and no separate intercept column is added.
unique_ids = x["Sample_sentrix_id"].unique()
for sentrix_id in unique_ids:  # named sentrix_id so the builtin id() is not shadowed
    x[sentrix_id] = (x["Sample_sentrix_id"] == sentrix_id).astype(int)
x = x.drop(columns="Sample_sentrix_id")

# turn the age variable into a continuous numerical variable without any leftover text
# (strip everything up to and including the first ':'). The result is assigned
# back explicitly: an in-place replace on the selected column does not update
# the frame when pandas Copy-on-Write is active.
x["Sample_age"] = pd.to_numeric(x["Sample_age"].replace("^[^:]*:", "", regex=True))

def EWAS_central(design_matrix, beta_values, output_dir=None):
    """Fit an ordinary least squares model per probe and report the diagnosis effect.

    For every row (probe) of ``beta_values`` the model ``beta ~ design_matrix`` is
    solved through the normal equations. Standard errors, t statistics and
    two-sided p-values follow the usual linear-model definitions, i.e. what
    ``summary(lm(...))`` reports in R.

    Parameters
    ----------
    design_matrix : pandas.DataFrame
        Numeric design matrix, one row per sample and one column per covariate.
        Must contain a column named "Sample_diagnosis". No intercept is added here.
    beta_values : pandas.DataFrame
        One row per probe and one column per sample; the columns must be in the
        same order as the rows of ``design_matrix``.
    output_dir : str, optional
        Folder the result CSV is written to. Defaults to the notebook-level
        ``output_path``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``beta_values`` with the columns "Diagnosis_Coef",
        "Diagnosis_StanErr" and "Diagnosis_Pvalue".

    Raises
    ------
    ValueError
        If the number of samples differs between the two inputs, or if there
        are not more samples than covariates.
    numpy.linalg.LinAlgError
        If X'X is singular (collinear design matrix).
    """
    # function-scope import: scipy is not part of the notebook's import cell
    import scipy.stats

    x_matrix = design_matrix.to_numpy(dtype=float)
    y_matrix = beta_values.to_numpy(dtype=float)

    n_samples, n_covariates = x_matrix.shape
    if y_matrix.shape[1] != n_samples:
        raise ValueError(
            "beta_values has %d sample columns but design_matrix has %d rows"
            % (y_matrix.shape[1], n_samples)
        )
    # residual degrees of freedom: observations minus estimated coefficients
    residual_df = n_samples - n_covariates
    if residual_df <= 0:
        raise ValueError("the model needs more samples than covariates to estimate standard errors")

    # (X'X)^-1 only depends on the design matrix, so it is computed once for all probes
    x_t_inv = np.linalg.inv(x_matrix.T @ x_matrix)

    # one row of coefficients per probe: ((X'X)^-1 X' y)' == y' X (X'X)^-1
    coefficient = y_matrix @ x_matrix @ x_t_inv
    residuals = y_matrix - coefficient @ x_matrix.T
    residual_variance = (residuals ** 2).sum(axis=1) / residual_df

    # SE_j = sqrt(sigma^2 * [(X'X)^-1]_jj); a perfectly fitted probe has SE 0,
    # which yields inf/nan t statistics without raising a warning
    with np.errstate(divide="ignore", invalid="ignore"):
        standard_error = np.sqrt(np.outer(residual_variance, np.diag(x_t_inv)))
        t_stat = coefficient / standard_error
    # two-sided p-value of the t statistic
    p_value = 2 * scipy.stats.t.sf(np.abs(t_stat), residual_df)

    # turn the results into one dataframe per statistic, with the probe ids as
    # index and the covariates as columns (taken from the function arguments)
    probe_ids = beta_values.index
    covariates = design_matrix.columns
    result_coef = pd.DataFrame(coefficient, index=probe_ids, columns=covariates)
    result_staner = pd.DataFrame(standard_error, index=probe_ids, columns=covariates)
    result_pvalue = pd.DataFrame(p_value, index=probe_ids, columns=covariates)

    # final results: coefficient, standard error and p-value of the diagnosis covariate
    results_diagnosis = pd.DataFrame(
        {
            "Diagnosis_Coef": result_coef["Sample_diagnosis"],
            "Diagnosis_StanErr": result_staner["Sample_diagnosis"],
            "Diagnosis_Pvalue": result_pvalue["Sample_diagnosis"],
        },
        index=probe_ids,
    )
    target_dir = output_path if output_dir is None else output_dir
    results_diagnosis.to_csv(os.path.join(target_dir, "results_diagnosis_regression_test_python.csv"))
    return results_diagnosis

  


Create a .bed structured text file with the regression output to be used as input into the differentially methylated region analysis

In [63]:
# Run the regression on the test subset so that `results_diagnosis` exists at
# notebook scope; EWAS_central only returns it, no earlier cell assigns it.
results_diagnosis = EWAS_central(x, y)

# start with importing the probe information from the .bed file
# (presumably obtained from the ENCODE project - the original note was unsure)
bed_annotation = pd.read_csv("E:\\Msc Systems Biology\\MSB5000_Master_Thesis\\Practical work\\Data\\HAIB.A549.EtOH.Rep.3.bed", sep="\t", header=None)
# select the first four columns (chr, start, stop and the probe ID) from the annotation file;
# the probe ID is what the regression output is matched on
bed_annotation = bed_annotation.iloc[:,0:4]
bed_annotation.columns = ["chr", "ChromStart", "ChromEnd", "Illumina_ID"]
# merge the regression output onto the .bed standard columns based on the probe ID
results_bed = pd.merge(bed_annotation, results_diagnosis, left_on="Illumina_ID", right_index=True, how="inner") #using inner join since this preserves the order of the keys and
# only keeps the entries that are present in both dataframes
results_bed = results_bed.set_index("Illumina_ID", drop=False)
# write the dataframe as a tab separated .bed file
# index=False keeps chr / ChromStart / ChromEnd as the first three columns;
# writing the index would put a duplicate Illumina_ID column in front of chr.
results_bed.to_csv(os.path.join(output_path, "results_diagnosis_regression_test.bed"), sep="\t", index=False)