In [1]:
import glob
import numpy as np
import pandas as pd
from scipy import stats
from scipy.sparse import triu
from collections import defaultdict
from hicmatrix import HiCMatrix as hm
from statsmodels.stats.multitest import fdrcorrection
from matplotlib import cm
from matplotlib.colors import to_hex

In [2]:
def directionPreference(matrix, chrom: str, binSize: int):
    """Test every bin for a consistent sign among its scores.

    Rows of ``matrix`` are grouped by their ``start`` value. For each group a
    two-sided Wilcoxon signed-rank test is run on the ``score`` column and the
    sign of the median of the non-zero scores is recorded.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Must contain the columns ``start`` and ``score``.
    chrom : str
        Value written to the ``chrom`` column of every output row.
    binSize : int
        Added to ``start`` to produce the ``end`` column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``start`` with the columns ``start``, ``p``,
        ``direction``, ``chrom`` and ``end`` (in that order). ``p`` is NaN
        when the Wilcoxon test raises ``ValueError``; ``direction`` is 1 when
        the median non-zero score is positive, -1 otherwise, and NaN when the
        group has no non-zero score. An empty ``matrix`` yields an empty frame
        that still carries all five columns.
    """
    starts = []
    pValues = []
    directions = []
    for start, df in matrix.groupby('start'):
        scores = df['score']
        try:
            _, p = stats.wilcoxon(scores, alternative='two-sided')
        except ValueError:
            # The test cannot be computed for this group; keep the bin but
            # mark the p-value as missing.
            p = np.nan

        # Zero scores carry no sign, so they are ignored for the direction.
        nonZeroScores = scores[scores != 0]
        if nonZeroScores.empty:
            direction = np.nan
        else:
            direction = 1 if nonZeroScores.median() > 0 else -1

        starts.append(start)
        pValues.append(p)
        directions.append(direction)

    # Building from explicit lists guarantees the columns exist even when
    # ``matrix`` has no rows (a dict filled lazily would leave them undefined).
    diffChange = pd.DataFrame(
        {'start': starts, 'p': pValues, 'direction': directions})
    diffChange['chrom'] = chrom
    diffChange['end'] = diffChange['start'] + binSize

    return diffChange

In [3]:
def _resampledColourmap(name, lut):
    """Return the registered colourmap ``name`` resampled to ``lut`` entries."""
    try:
        import matplotlib
        # ``cm.get_cmap`` was deprecated in Matplotlib 3.7 and removed in 3.9;
        # the colourmap registry plus ``resampled`` is its replacement.
        return matplotlib.colormaps[name].resampled(lut)
    except AttributeError:
        # Older Matplotlib without ``matplotlib.colormaps`` / ``resampled``.
        return cm.get_cmap(name, lut)


def getColour(x, fdr, colourmap):
    """Return the colour of one row as an ``'R,G,B'`` string.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``'p(adj)'``, ``'direction'`` and ``'quantScore'``.
    fdr : float
        Rows with ``x['p(adj)'] <= fdr`` are coloured with ``colourmap``;
        all other rows (including a NaN ``p(adj)``) use the ``'binary'`` map.
    colourmap : str
        Name of the Matplotlib colourmap used for rows passing ``fdr``.

    Returns
    -------
    str
        Three comma-separated decimal channel values, e.g. ``'255,0,0'``.
    """
    if x['p(adj)'] <= fdr:
        if x['direction'] == 1:
            # Direction 1 is placed in the upper half of the colourmap.
            i = (x['quantScore'] * 0.5)  + 0.5
        else:
            # Any other direction is placed in the lower half, with larger
            # quantScore further from the midpoint.
            i = (1 - x['quantScore']) * 0.5
        colour = to_hex(_resampledColourmap(colourmap, 40)(i))[1:]
    else:
        i = x['quantScore']
        colour = to_hex(_resampledColourmap('binary', 20)(i))[1:]
    # Convert the 'rrggbb' hex string into decimal 'R,G,B'.
    colour = f'{int(colour[:2], 16)},{int(colour[2:4], 16)},{int(colour[4:], 16)}'
    return colour

In [4]:
# Threshold applied to the adjusted p-value: getColour uses `colourmap` for
# rows with p(adj) <= fdr and the 'binary' colourmap for all other rows.
fdr = 0.05
# Name of the Matplotlib colourmap passed to getColour.
colourmap = 'bwr'

In [5]:
# For every cell line: load the per-chromosome difference matrices, summarise
# each bin (summed |score|, Wilcoxon p-value, direction), FDR-adjust the
# p-values across all bins, colour each bin and write a nine-column,
# tab-separated BED file.
for cell in ['GM12878', 'H1hESC', 'IMR90']:
    # Awaiting H1 results
    if cell == 'H1hESC':
        continue

    pattern = f'../../../HiCimages/{cell}/alleleGRCh37/dat/HiCsubtract/chr*/20000/{cell}_a1-vs-{cell}_a2-LOESSdiff-noFilter-SNPsplit.h5'
    # Sorted so the matrices are processed in a reproducible order.
    matrices = sorted(glob.glob(pattern))
    if not matrices:
        # Without this guard the failure surfaces later as pandas'
        # "No objects to concatenate", which hides the real cause.
        raise FileNotFoundError(f'No matrices found for {cell}: {pattern}')

    allRegions = []
    allDirection = []
    for matrix in matrices:
        hic = hm.hiCMatrix(matrix)
        binSize = hic.getBinSize()
        chrom = hic.getChrNames()[0]

        nonzeroIdx = hic.matrix.nonzero()
        nonzeroValues = hic.matrix[nonzeroIdx].tolist()[0]
        rowIdx = nonzeroIdx[0].tolist()
        colIdx = nonzeroIdx[1].tolist()
        # Resolve each distinct bin index to its start coordinate once,
        # instead of calling getBinPos for every non-zero matrix entry.
        binStart = {
            idx: hic.getBinPos(idx)[1] for idx in set(rowIdx) | set(colIdx)}
        mat = pd.DataFrame({
            'start': [binStart[idx] for idx in rowIdx],
            'start2': [binStart[idx] for idx in colIdx],
            'score': nonzeroValues})
        binDirection = directionPreference(mat, chrom, binSize)
        allDirection.append(binDirection)

        mat['abs(score)'] = mat['score'].abs()
        summed = mat.groupby('start')[['abs(score)']].sum().reset_index()
        summed['chrom'] = chrom
        # Quantiles are computed per chromosome; ranking with method='first'
        # makes the values unique so qcut can always form 20 bins.
        summed['quantScore'] = pd.qcut(
            summed['abs(score)'].rank(method='first'), 20, labels=np.linspace(0, 1, 20))
        allRegions.append(summed)

    allRegions = pd.concat(allRegions).set_index(['chrom', 'start'])
    allDirection = pd.concat(allDirection).set_index(['chrom', 'start'])

    # The merge brings in 'p', 'direction' and the per-chromosome 'end'
    # computed by directionPreference. Rows are then sorted so the output file
    # is identical between runs (previously they were shuffled without a seed).
    allRegions = (
        allRegions
        .merge(allDirection, left_index=True, right_index=True)
        .reset_index()
        .sort_values(['chrom', 'start'])
        .reset_index(drop=True))

    # Only bins with a defined p-value take part in the FDR correction;
    # the remaining bins keep p(adj) = NaN.
    validP = allRegions['p'].notna()
    allRegions['p(adj)'] = np.nan
    allRegions.loc[validP, 'p(adj)'] = fdrcorrection(allRegions.loc[validP, 'p'])[1]

    allRegions['colour'] = allRegions.apply(getColour, args=(fdr, colourmap), axis=1)

    allRegions['name'] = '.'
    allRegions['strand'] = '.'
    allRegions['thickStart'] = allRegions['start']
    allRegions['thickEnd'] = allRegions['end']
    columns = ([
        'chrom', 'start', 'end', 'name', 'abs(score)', 'strand',
        'thickStart', 'thickEnd', 'colour'])
    # NOTE(review): absolute, machine-specific output path while the input
    # pattern is relative — consider deriving both from one configurable root.
    allRegions[columns].to_csv(
        f'/media/stephen/MyPassport1/PhD/HiCimages/{cell}/alleleGRCh37/{cell}-newComputeScore.bed', 
        index=False, header=False, sep='\t')