## Get autosomal A1 / A2 Cscores, compute their difference, and normalise it to a Z-score

In [1]:
import sys
sys.path.append('..')
from utilities import readDiffTAD, processBlacklist
import glob
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
from pybedtools import BedTool

In [2]:
# Cell lines to process; each name is substituted into the Cscore file paths.
cells = ['GM12878', 'IMR90', 'H1hESC']
# Bin size, used as a path component when locating the Cscore files.
binSize = 20000
# Absolute Z-score cutoff applied to the mean Cscore difference when
# filtering TADs.
threshold = 3
# Autosomes only: chr1..chr22 (range(23) would also whitelist a
# non-existent 'chr0').
validChroms = [f'chr{c}' for c in range(1, 23)]

In [3]:
def readCscore(x):
    """Read a tab-separated, header-less Cscore file into a DataFrame.

    Only columns 0, 1, 2 and 4 of the file are loaded; they are returned as
    'chrom' (str), 'start' (int), 'end' (int) and 'cscore' (float).
    """
    columnTypes = {
        'chrom': str,
        'start': int,
        'end': int,
        'cscore': float,
    }
    columnNames = list(columnTypes)
    return pd.read_csv(
        x, sep='\t', usecols=[0, 1, 2, 4],
        names=columnNames, dtype=columnTypes)

In [4]:
# Build, for every cell line, a table of TADs annotated with the mean Cscore
# statistics of the Cscore bins they overlap, then stack all cell lines.
allIntersects = {}

# Column names for the TAD x Cscore intersection: the six TAD BED fields
# followed by the seven fields of the Cscore BedTool built below.
names = ([
    'chrom', 'start', 'end', 'type', 'Z (TAD)', 'CNV',
    'cscoreChrom', 'cscoreStart', 'cscoreEnd',
    'cscoreA1', 'cscoreA2', 'Cscore Sign Switch', 'Z (Cscore Switch)'])
# Columns removed before averaging. 'CNV' is dropped explicitly rather than
# relying on pandas silently discarding non-numeric columns in
# groupby().mean(); it is recovered from the original TAD table afterwards.
drop = ([
    'cscoreChrom', 'cscoreStart', 'cscoreEnd',
    'cscoreA1', 'cscoreA2', 'CNV'])

for cell in cells:
    cscores = []
    for a in [1, 2]:
        pattern = (
            f'../../{cell}/alleleGRCh37/dat/Cscore/chr*/{binSize}/'
            f'{cell}_a{a}-chr*-{binSize}-SNPsplit-Cscore_cscore.bed')
        files = glob.glob(pattern)
        if not files:
            # Fail with the offending pattern instead of the opaque
            # "No objects to concatenate" raised by pd.concat([]).
            raise FileNotFoundError(f'No Cscore files match: {pattern}')
        cscore = pd.concat([readCscore(x) for x in files])
        cscore = cscore.loc[cscore['chrom'].isin(validChroms)]
        cscore = (
            cscore.set_index(['chrom', 'start', 'end'])
            .rename({'cscore': f'a{a}'}, axis=1))
        cscores.append(cscore)
    # Inner join on (chrom, start, end): keep bins scored in both alleles.
    cscores = pd.merge(
        cscores[0], cscores[1], left_index=True, right_index=True)

    # 1 where the two allele Cscores have opposite signs, else 0.
    cscores['switchSign'] = ((cscores['a1'] * cscores['a2']) < 0).astype(int)

    # Z-score of the allelic Cscore difference across all retained bins.
    # NOTE(review): scipy's zscore propagates NaN by default - confirm the
    # Cscore files contain no missing values.
    cscores['z'] = zscore(cscores['a1'] - cscores['a2'])
    cscores = BedTool.from_dataframe(cscores.reset_index())

    # NOTE(review): readDiffTAD is a project helper; it is used here as a
    # six-column BedTool (chrom, start, end, type, Z, CNV) - confirm.
    diffTAD = readDiffTAD(cell, diffOnly=False, X=False, pyBed=True)

    # Average the per-bin Cscore statistics over each TAD.
    allIntersects[cell] = (
        diffTAD.intersect(cscores, wa=True, wb=True)
        .to_dataframe(names=names)
        .drop(drop, axis=1)
        .groupby(['chrom', 'start', 'end', 'type'])
        .mean(numeric_only=True)
        .reset_index())

    # Remerge with original TAD to recover CNV status which is removed before
    # averaging Cscore. With default BED column names the TAD type is 'name'
    # and the CNV status is 'strand'.
    allIntersects[cell] = (
        pd.merge(
            allIntersects[cell], diffTAD.to_dataframe(),
            left_on=['chrom', 'start', 'end', 'type'],
            right_on=['chrom', 'start', 'end', 'name'])
        .drop(['name', 'score'], axis=1)
        .rename({'strand': 'CNV'}, axis=1))
    allIntersects[cell].insert(0, 'cell', cell)

# Stack the per-cell tables into a single DataFrame.
allIntersects = pd.concat(allIntersects.values())

In [7]:
# Keep rows typed 'ASTAD' whose mean Cscore-difference Z-score exceeds the
# threshold in absolute value, then pass them through processBlacklist.
df = processBlacklist(
    allIntersects[
        allIntersects['Z (Cscore Switch)'].abs().gt(threshold)
        & allIntersects['type'].eq('ASTAD')
    ]
)

In [8]:
# Export the filtered TADs: one BED file and one Excel sheet per cell line,
# plus an 'allData' sheet holding the complete, unfiltered table.
with pd.ExcelWriter('ASTADcompartmentSwitch.xlsx') as writer:
    for cell, data in df.groupby('cell'):
        # NOTE(review): `data` still carries the 'cell' column; if it is the
        # leading column the file will not start with chrom/start/end.
        # Confirm the column order returned by processBlacklist.
        BedTool.from_dataframe(data).moveto(
            f'{cell}-compartmentSwitchASTADs.bed')
        data.to_excel(writer, sheet_name=cell, index=False)
    allIntersects.to_excel(writer, sheet_name='allData', index=False)