In [6]:
import numpy as np
import pandas as pd
from pybedtools import BedTool

In [79]:
def getStatus(x):
    """Translate a colour-code string into an allelic-expression label.

    Parameters
    ----------
    x : object
        Value compared against the known colour codes, e.g. ``'255,0,0'``.

    Returns
    -------
    str
        ``'paternal'`` for ``'255,0,0'``, ``'maternal'`` for ``'0,0,255'``,
        ``'biallelic'`` for ``'128,128,128'``; every other value (including
        missing values) yields ``'monoallelic'``.
    """
    colour_labels = (
        ('255,0,0', 'paternal'),
        ('0,0,255', 'maternal'),
        ('128,128,128', 'biallelic'),
    )
    # Plain equality scan in a fixed order; first match wins.
    for colour, label in colour_labels:
        if x == colour:
            return label
    # Anything unrecognised falls through to the catch-all label.
    return 'monoallelic'

In [80]:
# Columns that together identify one gene record; the loaded tables are
# indexed on these so they can be merged index-to-index.
index = [
    'chrom',
    'start',
    'end',
    'symbol',
]

In [81]:
# GM12878 gene table: keep file columns 0-3 plus column 8 (loaded as 'status').
gmASEG = 'GM12878-all-GRCh37.bed'

# Column name -> dtype; the key order doubles as the column naming order.
names = {'chrom': str, 'start': int, 'end': int, 'symbol': 'str', 'status': 'str'}
gm = (
    pd.read_csv(
        gmASEG,
        sep='\t',
        usecols=[0, 1, 2, 3, 8],
        names=names.keys(),
        dtype=names,
    )
    .set_index(index)
    # Replace the raw status string with the label returned by getStatus.
    .assign(status=lambda frame: frame['status'].apply(getStatus))
)

In [82]:
# IMR90 gene table: keep file columns 0-3 plus column 8 (loaded as 'status').
imASEG = 'IMR90-ASEG-GRCh37.bed'

# Column name -> dtype; the key order doubles as the column naming order.
names = {'chrom': str, 'start': int, 'end': int, 'symbol': 'str', 'status': 'str'}
im = (
    pd.read_csv(
        imASEG,
        sep='\t',
        usecols=[0, 1, 2, 3, 8],
        names=names.keys(),
        dtype=names,
    )
    .set_index(index)
    # Replace the raw status string with the label returned by getStatus.
    .assign(status=lambda frame: frame['status'].apply(getStatus))
)

In [83]:
# H1hESC gene table: keep file columns 0-3 plus column 8 (loaded as 'status').
h1ASEG = 'H1hESC-ASEG-GRCh37.bed'

# Column name -> dtype; the key order doubles as the column naming order.
names = {'chrom': str, 'start': int, 'end': int, 'symbol': 'str', 'status': 'str'}
h1 = (
    pd.read_csv(
        h1ASEG,
        sep='\t',
        usecols=[0, 1, 2, 3, 8],
        names=names.keys(),
        dtype=names,
    )
    .set_index(index)
    # Replace the raw status string with the label returned by getStatus.
    .assign(status=lambda frame: frame['status'].apply(getStatus))
)

In [84]:
# Outer-join the three per-cell-line tables on their shared index so every gene
# seen in any table is kept. The first merge suffixes the clashing 'status'
# columns as status_x / status_y; the second adds h1's plain 'status'.
ase = (
    gm.merge(im, left_index=True, right_index=True, how='outer')
    .merge(h1, left_index=True, right_index=True, how='outer')
    .rename(columns={'status_x': 'GM12878', 'status_y': 'IMR90', 'status': 'H1hESC'})
    # Genes absent from a given table have no status there.
    .fillna('Unknown')
    .reset_index()
)

In [85]:
# Input BED files. The paths get their own names so that gmASTAD / imASTAD /
# h1ASTAD / imprinted only ever refer to the resulting sets of gene symbols.
gmASTADPath = '../diffTAD/results/GM12878/alleleGRCh37/GM12878_a1-vs-GM12878_a2-all-20000-SNPsplit_diff_tad-ASTAD.bed'
imASTADPath = '../diffTAD/results/IMR90/alleleGRCh37/IMR90_a1-vs-IMR90_a2-all-20000-SNPsplit_diff_tad-ASTAD.bed'
h1ASTADPath = '../diffTAD/results/H1hESC/alleleGRCh37/H1hESC_a1-vs-H1hESC_a2-all-20000-SNPsplit_diff_tad-ASTAD.bed'
imprintedPath = '../ASE_and_Imprinted/Imprinted-GeneImprint-GRCh37.bed'


def _overlappingSymbols(genes, bedPath):
    """Return the names of the intervals in ``genes`` that overlap ``bedPath``.

    Parameters
    ----------
    genes : BedTool
        Gene intervals whose 4th column holds the gene symbol.
    bedPath : str
        Path of the BED file to intersect against.

    Returns
    -------
    set of str
        Name field of every ``genes`` interval reported by
        ``genes.intersect(bedPath, wa=True)``; empty when nothing overlaps.
    """
    # Names are read straight from the intervals instead of going through
    # to_dataframe()['name']: an empty intersection then yields an empty set
    # rather than failing on a missing 'name' column, and the symbols stay
    # plain strings (no pandas dtype/NA inference), matching ase['symbol'].
    return {feature.name for feature in genes.intersect(bedPath, wa=True)}


# One BED4 record (chrom, start, end, symbol) per row of the merged gene table.
allGenes = BedTool.from_dataframe(ase[['chrom', 'start', 'end', 'symbol']])

gmASTAD = _overlappingSymbols(allGenes, gmASTADPath)
imASTAD = _overlappingSymbols(allGenes, imASTADPath)
h1ASTAD = _overlappingSymbols(allGenes, h1ASTADPath)
imprinted = _overlappingSymbols(allGenes, imprintedPath)

In [86]:
# Flag genes whose symbol appears in each cell line's ASTAD overlap set.
for flag_column, symbol_set in (
    ('GM12878_ASTAD', gmASTAD),
    ('IMR90_ASTAD', imASTAD),
    ('H1_ASTAD', h1ASTAD),
):
    ase[flag_column] = ase['symbol'].isin(symbol_set)

# A gene is flagged ASEG in a cell line when its status is one of these labels,
# so both 'biallelic' and 'Unknown' come out False.
aseg_labels = ['paternal', 'maternal', 'monoallelic']
for status_column, flag_column in (
    ('GM12878', 'GM12878_ASEG'),
    ('IMR90', 'IMR90_ASEG'),
    ('H1hESC', 'H1hESC_ASEG'),
):
    ase[flag_column] = ase[status_column].isin(aseg_labels)

# Flag genes whose symbol appears in the imprinted-gene overlap set.
ase['Imprinted'] = ase['symbol'].isin(imprinted)

In [68]:
# Export the annotated gene table as tab-separated text, without the row index.
ase.to_csv(
    'GM12878-IMR90-ASEG.tsv',
    sep='\t',
    index=False,
)

In [72]:
# Genes that are biallelic in GM12878, have a known IMR90 status, and are not
# flagged GM12878_ASTAD.
outside_gm_astad = ~ase['GM12878_ASTAD']
known_in_imr90 = ase['IMR90'].ne('Unknown')
biallelic_in_gm = ase['GM12878'].eq('biallelic')
ase[outside_gm_astad & known_in_imr90 & biallelic_in_gm]

Unnamed: 0,chrom,start,end,symbol,GM12878,IMR90,GM12878_ASTAD,IMR90_ASTAD
87,1,38147242,38157921,C1orf109,biallelic,monoallelic,False,False
129,1,71528974,71546972,ZRANB2,biallelic,monoallelic,False,False
155,1,90460669,90501092,ZNF326,biallelic,monoallelic,False,False
162,1,97187221,97289294,PTBP2,biallelic,monoallelic,False,False
245,1,162467041,162499421,UHMK1,biallelic,monoallelic,False,False
...,...,...,...,...,...,...,...,...
3605,8,33330904,33370703,TTI2,biallelic,monoallelic,False,False
3644,8,97251626,97273838,MTERF3,biallelic,monoallelic,False,False
3679,8,128806772,129199347,PVT1,biallelic,monoallelic,False,False
3718,9,19230433,19374279,DENND4C,biallelic,monoallelic,False,False


In [38]:
# NOTE(review): `allASE` is not defined in any cell visible here — presumably an
# earlier name for the merged table; confirm before a Restart & Run All.
# Count rows whose GM12878 status is anything other than 'biallelic'.
allASE['GM12878'].ne('biallelic').sum()

480