In [1]:
import sys
sys.path.append("..")
import pandas as pd
from pybedtools import BedTool
from utilities import readDiffTAD, formatCell

In [2]:
def jaccard(x):
    """Return the Jaccard index of the two intervals described by ``x``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide numeric 'start'/'end' bounds for the first interval and
        'thickStart'/'thickEnd' bounds for the second.

    Returns
    -------
    float
        Overlap length divided by the length of the union. For well-formed
        intervals (end >= start) the value lies in [0, 1]. 0.0 is returned
        when the intervals do not overlap or when the union has no length.

    Notes
    -----
    Bounds are treated as half-open, so an interval's length is
    ``end - start``. The overlap is measured the same way; counting it
    inclusively while the lengths are exclusive would let identical
    intervals score above 1 and adjacent intervals score above 0.
    """
    overlapStart = max(x['start'], x['thickStart'])
    overlapEnd = min(x['end'], x['thickEnd'])
    # Disjoint intervals give a negative difference; clamp it to "no overlap".
    intersection = max(0, overlapEnd - overlapStart)
    union = (x['end'] - x['start']) + (x['thickEnd'] - x['thickStart']) - intersection
    # Two zero-length intervals have an empty union; avoid dividing by zero.
    if union <= 0:
        return 0.0
    return float(intersection) / union

In [3]:
# Load the gene table that the rest of the notebook annotates. Later cells
# read its 'chrom', 'start', 'end' and 'ID' columns.
# NOTE: unpickling can execute arbitrary code; only load a pickle that was
# produced by a trusted source.
gencodeV38 = pd.read_pickle('gencode.v38lift37-geneStatus.pkl')

In [4]:
# Map each cell-line name to the result of readDiffTAD for that cell line,
# always called with X=True and pyBed=True.
# NOTE(review): the result is used below as a BedTool (``.intersect``);
# confirm that this is what pyBed=True returns.
allTads = {
    cellLine: readDiffTAD(cellLine, X=True, pyBed=True)
    for cellLine in ('GM12878', 'IMR90', 'H1hESC')
}

In [5]:
# Build a four-column BedTool (coordinates + gene ID) from the gene table so
# it can be intersected with the TAD intervals.
geneBedColumns = ['chrom', 'start', 'end', 'ID']
allGenes = BedTool.from_dataframe(gencodeV38.loc[:, geneBedColumns])

In [17]:
# For every cell line, record the highest TAD score among the TADs that
# overlap each gene, producing one (ID, score) frame per cell line.
allData = []
for cell, tads in allTads.items():
    cell = formatCell(cell)
    # wa/wb ask bedtools to report both the TAD record and the gene record
    # on every overlap row.
    tadGeneOverlap = tads.intersect(allGenes, wa=True, wb=True).to_dataframe()
    # NOTE(review): to_dataframe() falls back to the default BED column
    # names, so the gene ID is read from 'blockStarts' and the TAD score
    # from 'score'. That presumes the TAD records have 8 columns ahead of
    # the 4 gene columns; confirm against readDiffTAD.
    topScore = (
        tadGeneOverlap.groupby('blockStarts')['score']
        .max()
        .reset_index()
        .rename(columns={'blockStarts': 'ID', 'score': f'TAD Score ({cell})'})
    )
    allData.append(topScore)

# Combine the per-cell-line frames on gene ID, however many there are.
# NOTE(review): this is an inner join, so a gene missing a score in any one
# cell line is dropped here and ends up with NaN in every score column after
# the left merge below. Confirm that this is intended.
geneScores = allData[0]
for cellScores in allData[1:]:
    geneScores = pd.merge(geneScores, cellScores, on='ID')

# This cell rebinds gencodeV38. Drop score columns left behind by an earlier
# run first, so re-running the cell does not create '_x'/'_y' duplicates.
scoreColumns = [col for col in geneScores.columns if col != 'ID']
gencodeV38 = pd.merge(
    gencodeV38.drop(columns=scoreColumns, errors='ignore'),
    geneScores,
    on='ID',
    how='left',
)

In [18]:
# Keep autosomal genes only: rows on chrM, chrX or chrY are discarded.
isNonAutosomal = gencodeV38['chrom'].isin(['chrM', 'chrX', 'chrY'])
gencodeV38 = gencodeV38.loc[~isNonAutosomal]

# Write the filtered table twice: a spreadsheet without the index column,
# and a pickle.
gencodeV38.to_excel('allGeneStats.xlsx', index=False)
gencodeV38.to_pickle('allGeneStats.pkl')