In [1]:
import sys
sys.path.append("..")
import pandas as pd
from pybedtools import BedTool
from utilities import readDiffTAD, formatCell

In [2]:
def jaccard(x):
    """Return the Jaccard index of the two intervals stored in one row.

    The first interval is ``[x['start'], x['end'])`` and the second is
    ``[x['thickStart'], x['thickEnd'])``. Both are measured as half-open
    intervals so that the overlap and the interval lengths use the same
    convention and the result always lies in ``[0, 1]``.

    Parameters
    ----------
    x : mapping
        Row-like object (e.g. a dict or a pandas Series) providing numeric
        ``start``, ``end``, ``thickStart`` and ``thickEnd`` entries.
        NOTE(review): presumably the thick* fields hold the coordinates of
        the second feature of an overlap record -- confirm against caller.

    Returns
    -------
    float
        ``overlap / union``; ``0.0`` when the intervals do not overlap or
        when both intervals are empty (which would otherwise divide by zero).
    """
    # Overlap length; clamp at zero so disjoint intervals never go negative.
    overlap = max(0, min(x['end'], x['thickEnd']) - max(x['start'], x['thickStart']))
    # Inclusion-exclusion: |A| + |B| - |A ∩ B|.
    union = (x['end'] - x['start']) + (x['thickEnd'] - x['thickStart']) - overlap
    if union <= 0:
        return 0.0
    return float(overlap) / union

In [3]:
# Load the pickled gene table; later cells read its 'chrom', 'start', 'end'
# and 'ID' columns and merge per-gene TAD scores into it.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
gencodeV38 = pd.read_pickle('gencode.v38lift37-geneStatus.pkl')

In [4]:
# TAD data for each cell line, keyed by the label handed to readDiffTAD.
# NOTE(review): with pyBed=True the values are presumably pybedtools BedTool
# objects (they are intersected with the gene BedTool later) -- confirm.
allTads = {
    cellLine: readDiffTAD(cellLine, X=True, pyBed=True)
    for cellLine in ('GM12878', 'IMR90', 'H1hESC')
}

In [5]:
# Build a 4-column BedTool (chrom, start, end, gene ID) from the gene table so
# the genes can be intersected with each cell line's TADs.
allGenes = BedTool.from_dataframe(gencodeV38[['chrom', 'start', 'end', 'ID']])

In [6]:
# For every cell line, record the highest TAD score among the TADs that
# overlap each gene.
allData = []
for cell, tads in allTads.items():
    cell = formatCell(cell)
    # One row per (TAD, gene) overlap, carrying the fields of both records.
    # to_dataframe() labels columns with pybedtools' default BED field names;
    # this code treats 'blockStarts' as the gene ID and 'score' as the TAD
    # score. NOTE(review): that mapping depends on the number of columns in
    # the TAD records -- confirm against readDiffTAD's output.
    tadGeneOverlap = tads.intersect(allGenes, wa=True, wb=True).to_dataframe()
    topJaccard = (
        tadGeneOverlap.groupby('blockStarts')['score']
        .max()
        .reset_index()
        .rename({'blockStarts': 'ID', 'score': f'TAD Score ({cell})'}, axis=1)
    )
    allData.append(topJaccard)

# Inner-join the per-cell-line tables on gene ID, so only genes scored in
# every cell line are kept. Folding over the list avoids hard-coding the
# number of cell lines.
geneScores = allData[0]
for cellScores in allData[1:]:
    geneScores = pd.merge(geneScores, cellScores, on='ID')

# Attach the scores to the gene table. Any score columns left over from an
# earlier execution of this cell are dropped first; otherwise re-running the
# cell would produce duplicated '_x'/'_y' columns.
gencodeV38 = pd.merge(
    gencodeV38.drop(columns=geneScores.columns.difference(['ID']), errors='ignore'),
    geneScores,
    on='ID',
    how='left',
)

In [7]:
# Exclude genes on chrM, chrX and chrY so that (assuming no other
# non-autosomal contigs are present) only autosomal genes remain.
gencodeV38 = gencodeV38[~gencodeV38['chrom'].isin(['chrM', 'chrX', 'chrY'])]

# Save the filtered table as a spreadsheet (without the index) and as a pickle.
gencodeV38.to_excel('allGeneStats.xlsx', index=False)
gencodeV38.to_pickle('allGeneStats.pkl')

In [11]:
# Display only the rows with no missing value in any column. The result is not
# assigned, so gencodeV38 itself is left unchanged.
gencodeV38.dropna()

Unnamed: 0,chrom,start,end,ID,score,strand,geneType,symbol,Imprinting Status,ASE (GM12878),ASE (IMR-90),ASE (H1-hESC),TAD Score (GM12878),TAD Score (IMR-90),TAD Score (H1-hESC)
28,chr1,521451,763176,ENSG00000230021,0,-,transcribed_processed_pseudogene,RP11-206L10.17,Not Imprinted,Unknown,Unknown,Unknown,-0.619352,-0.350143,-0.403675
37,chr1,610222,610645,ENSG00000268663,0,+,processed_pseudogene,WBP1LP6,Not Imprinted,Unknown,Unknown,Unknown,-0.619352,-0.350143,-0.403675
38,chr1,621059,622053,ENSG00000284662,0,-,protein_coding,OR4F16,Not Imprinted,Unknown,Unknown,Unknown,-0.619352,-0.350143,-0.403675
39,chr1,657472,660283,ENSG00000229376,0,+,processed_pseudogene,CICP3,Not Imprinted,Unknown,Unknown,Unknown,-0.619352,-0.350143,-0.403675
40,chr1,661265,714006,ENSG00000228327,0,-,transcribed_unprocessed_pseudogene,RP11-206L10.2,Not Imprinted,Unknown,Unknown,Unknown,-0.619352,-0.350143,-0.403675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57240,chr22,50713408,50746075,ENSG00000196576,0,-,protein_coding,PLXNB2,Not Imprinted,Unassigned,Unknown,Unknown,0.129504,-0.057252,0.300977
57241,chr22,50747459,50765441,ENSG00000205593,0,-,protein_coding,DENND6B,Not Imprinted,Unassigned,Unknown,Unknown,0.129504,-0.057252,0.300977
57242,chr22,50753060,50754437,ENSG00000227484,0,+,lncRNA,XX-C283C717.1,Not Imprinted,Unknown,Unknown,Unknown,0.129504,-0.057252,0.300977
57243,chr22,50754464,50755454,ENSG00000279182,0,+,lncRNA,XX-C00717C00720L.1,Not Imprinted,Unknown,Unknown,Unknown,0.129504,-0.057252,0.300977
