## Data Preparation for Hyper- and Hypo-methylation Split
This notebook leverages the pre-computed mean signal value of each feature from normal patients to split the features of each fold's patient-sample data into hyper- and hypo-methylation sets.

In [17]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import trange

In [9]:
# Folder holding the per-fold input CSVs, and the two folders that receive
# the hyper / hypo splits written by the loop below.
datafolder = './cv_data_pvalue_fixed/'
hyper_out = './cv_data_pvalue_fixed_hyper/'
hypo_out = './cv_data_pvalue_fixed_hypo/'

# Modality names; each one is used as a file-name prefix inside `datafolder`.
modalities = [
    'ChIP_CTCF',
    'ChIP_H3K27ac',
    'ChIP_H3K27me3',
    'ChIP_H3K4me3',
    'DNase',
]

# Pre-computed average signals. The first two columns are dropped by position
# (presumably index/bookkeeping columns -- TODO confirm against the CSV header).
average = pd.read_csv(f'{datafolder}average_signals.csv').iloc[:, 2:]

In [11]:
# Split every (modality, fold) data file into a "hyper" and a "hypo" feature set.
#
# For each feature (row of `<modality>_fold_<i>_data.csv`) the mean over the
# selected sample columns is compared with the pre-computed normal-sample
# average (`value_scaled`) that is joined onto the fold's frame_info by
# (chrom, start, end). Rows with normal_avg >= patient_avg are written to
# `*_data_hyper.csv`; all remaining rows are written to `*_data_hypo.csv`.
#
# NOTE(review): with `normal_avg >= patient_avg` a feature lands in "hyper" when
# the normal average is the *higher* one. That reads as inverted relative to the
# usual meaning of "hyper" -- confirm the intended direction. The comparison is
# kept exactly as it was so existing outputs do not change.

# Make sure the output folders exist so `to_csv` does not fail on a fresh checkout.
os.makedirs(hyper_out, exist_ok=True)
os.makedirs(hypo_out, exist_ok=True)

for modality in modalities:
    for i in range(5):  # fold indices 0-4
        fold_prefix = f'{datafolder}{modality}_fold_{i}'

        # Get identification numbers of patient samples.
        # NOTE(review): this selects AD_label == 0; confirm that 0 really is the
        # encoding for the samples this notebook calls "patients".
        labels = pd.read_csv(fold_prefix + '_labels.csv', index_col=0)
        patient_names = list(labels.index[labels['AD_label'] == 0])

        # Merge frame_info with the average value of normal samples. A left join
        # keeps every frame_info row; intervals missing from `average` get NaN.
        frame_info = (
            pd.read_csv(fold_prefix + '_frame_info.csv')
            .rename(columns={'Unnamed: 0': 'identifier'})
        )
        new_frame_info = pd.merge(frame_info, average, how='left',
                                  on=['chrom', 'start', 'end'])

        # Read in sample data (rows = features, columns = samples).
        df = pd.read_csv(fold_prefix + '_data.csv', index_col=0)

        # One identifier-indexed lookup instead of a full boolean-mask scan of
        # new_frame_info per row. When an identifier occurs more than once, the
        # first occurrence wins, matching the former `list(...)[0]`.
        normal_by_id = (
            new_frame_info
            .drop_duplicates(subset='identifier')
            .set_index('identifier')['value_scaled']
        )
        unknown = df.index[~df.index.isin(normal_by_id.index)]
        if len(unknown) > 0:
            # Previously surfaced as a bare IndexError from `[0]` on an empty list.
            raise KeyError(
                f'{modality} fold {i}: {len(unknown)} feature(s) in the data file '
                f'have no row in frame_info, e.g. {list(unknown[:5])}'
            )
        normal_avg = normal_by_id.reindex(df.index).to_numpy()

        # Row-wise mean over the selected sample columns. skipna=False keeps
        # np.average's behaviour of yielding NaN when any value is NaN.
        patient_avg = df[patient_names].mean(axis=1, skipna=False).to_numpy()

        # Any comparison involving NaN is False, so such rows fall into "hypo",
        # exactly as in the former row-by-row if/else.
        is_hyper = normal_avg >= patient_avg
        hypers = list(df.index[is_hyper])
        hypos = list(df.index[~is_hyper])

        df.loc[is_hyper].to_csv(f'{hyper_out}{modality}_fold_{i}_data_hyper.csv')
        df.loc[~is_hyper].to_csv(f'{hypo_out}{modality}_fold_{i}_data_hypo.csv')

        print(f'{modality} | Fold: {i} | Hyper: {len(hypers)} | Hypo: {len(hypos)}')

ChIP_CTCF | Fold: 0 | Hyper: 2647 | Hypo: 7353
ChIP_CTCF | Fold: 1 | Hyper: 2095 | Hypo: 7905
ChIP_CTCF | Fold: 2 | Hyper: 2328 | Hypo: 7672
ChIP_CTCF | Fold: 3 | Hyper: 2965 | Hypo: 7035
ChIP_CTCF | Fold: 4 | Hyper: 1554 | Hypo: 8446
ChIP_H3K27ac | Fold: 0 | Hyper: 2213 | Hypo: 7787
ChIP_H3K27ac | Fold: 1 | Hyper: 4969 | Hypo: 5031
ChIP_H3K27ac | Fold: 2 | Hyper: 6230 | Hypo: 3770
ChIP_H3K27ac | Fold: 3 | Hyper: 4345 | Hypo: 5655
ChIP_H3K27ac | Fold: 4 | Hyper: 3628 | Hypo: 6372
ChIP_H3K27me3 | Fold: 0 | Hyper: 2588 | Hypo: 7412
ChIP_H3K27me3 | Fold: 1 | Hyper: 1688 | Hypo: 8312
ChIP_H3K27me3 | Fold: 2 | Hyper: 3751 | Hypo: 6249
ChIP_H3K27me3 | Fold: 3 | Hyper: 3948 | Hypo: 6052
ChIP_H3K27me3 | Fold: 4 | Hyper: 3276 | Hypo: 6724
ChIP_H3K4me3 | Fold: 0 | Hyper: 6320 | Hypo: 3680
ChIP_H3K4me3 | Fold: 1 | Hyper: 6635 | Hypo: 3365
ChIP_H3K4me3 | Fold: 2 | Hyper: 5647 | Hypo: 4353
ChIP_H3K4me3 | Fold: 3 | Hyper: 5406 | Hypo: 4594
ChIP_H3K4me3 | Fold: 4 | Hyper: 4310 | Hypo: 5690
DNase | Fo