In [1]:
import pandas as pd
import pybedtools
from utils import *

In [2]:
# Collect the gene IDs found in the first column of the top-gene TPM table,
# passing each through geneIdVersion2geneId. The first line is a header.
with open ('../data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm_top.tab') as tpm_table:
    next(tpm_table, None)  # skip the header row (no-op if the file is empty)
    genes_top = {geneIdVersion2geneId(row.split()[0]) for row in tpm_table}

In [3]:
# Number of distinct gene IDs collected in genes_top.
len(genes_top)

5000

In [4]:
# Load the eQTL slope table (gene_id, variant_id, then one column per tissue)
# and run gene_id through geneIdVersion2geneId, the helper also used for genes_top.
slopes_path = '../data/GTEx_Analysis_v7_eQTL_allTissues_slope_top.csv.gz'
data = pd.read_csv(slopes_path)
data['gene_id'] = data['gene_id'].map(geneIdVersion2geneId)
data.head()

Unnamed: 0,gene_id,variant_id,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,...,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,Cells_EBV-transformed_lymphocytes
0,ENSG00000173801,17_39610981_A_G_b37,,,,,,-0.146567,,,...,,,,,,,,,,
1,ENSG00000100219,22_29149887_G_C_b37,,,,,,,,,...,,,,,,,,,,
2,ENSG00000206341,6_29817896_C_T_b37,0.692326,0.712692,1.14229,0.531796,,,,0.689146,...,0.941748,,0.826604,0.658147,0.427333,0.473418,,,0.794383,1.03219
3,ENSG00000198502,6_32226520_T_G_b37,,,,,-0.591224,-0.307221,,,...,-0.379419,,,-0.374231,,-0.439161,,,-0.317301,
4,ENSG00000112763,6_26601733_G_T_b37,,,,,,,,,...,,,,,,,,,0.22549,


In [5]:
# Filter by top genes
is_top_gene = data['gene_id'].isin(genes_top)
data = data[is_top_gene]
data.shape

(1349165, 50)

In [6]:
# Keep only the rows that have a value in the EBV-transformed lymphocytes column.
data = data[data['Cells_EBV-transformed_lymphocytes'].notna()]

In [7]:
# Table dimensions after keeping only rows with a non-missing
# Cells_EBV-transformed_lymphocytes value.
data.shape

(51259, 50)

In [8]:
# Count rows in which fewer than half of the value columns are missing.
# The denominator leaves out two columns — presumably gene_id and variant_id.
n_value_columns = len(data.columns) - 2
missing_fraction = pd.isnull(data).sum(axis=1) / n_value_columns
sum(missing_fraction < 0.5)

20627

In [9]:
# Freeze the (variant_id, gene_id) pairs of the filtered table.
# gene_id already went through geneIdVersion2geneId when the table was loaded;
# the second pass is kept exactly as in the original.
variant_ids = data['variant_id']
gene_ids = data['gene_id'].apply(geneIdVersion2geneId)
pairs_vg = tuple(zip(variant_ids, gene_ids))

In [10]:
# Variant id to enhancer table
# Path of the enhancer BED file that is loaded and intersected with the variants below.
enhancers_f = '../../../prototype/enhancers.bed'

In [11]:
# Variant bed table
# NOTE(review): BED intervals are 0-based and half-open — confirm that
# variantId2pos / variantId2end produce coordinates in that convention.
variant_ids = data['variant_id']
variants_df = pd.DataFrame(
    {
        'variant_chr': variant_ids.apply(variantId2chr),
        'variant_start': variant_ids.apply(variantId2pos),
        'variant_end': variant_ids.apply(variantId2end),
        'variant_id': variant_ids,
    },
    # Fix the column order explicitly: chrom, start, end, name.
    columns=['variant_chr', 'variant_start', 'variant_end', 'variant_id'],
)
variants = pybedtools.BedTool.from_dataframe(variants_df)

In [12]:
# Enhancer bed table
# Wrap the enhancer BED file (path in enhancers_f) as a BedTool for interval operations.
enhancers = pybedtools.BedTool(enhancers_f)

In [13]:
# Intersection
# wa/wb keep both the variant record and the enhancer record for every overlap,
# so each output row carries the four variant columns followed by the four
# enhancer columns.
variants_enhancers_names = [
    'variant_chr', 'variant_start', 'variant_end', 'variant_id',
    'enhancer_chr', 'enhancer_start', 'enhancer_end', 'enhancer_id',
]
variant_enhancer_hits = variants.intersect(enhancers, wa=True, wb=True)
variants_enhancers = variant_enhancer_hits.to_dataframe(names=variants_enhancers_names)

  return pandas.read_table(self.fn, *args, **kwargs)


In [14]:
# Save it as a dictionary
# Maps each variant_id to the list of enhancer_ids it overlaps, in table order
# (repeated overlaps are kept as repeated list entries).
# Iterating the two columns directly avoids materialising a full row Series
# twice per row, which the previous `.loc[i,]` lookups did.
variantID__enhancersID = {}
for v, e in zip(variants_enhancers['variant_id'], variants_enhancers['enhancer_id']):
    variantID__enhancersID.setdefault(v, []).append(e)

In [15]:
# Read genes table
# Wrap the gene annotation BED file as a BedTool for the promoter intersection below.
genes = pybedtools.BedTool('../../../prototype/genes.bed')

In [16]:
# Read promoters table
# Wrap the promoter BED file as a BedTool; it is intersected with genes and,
# later, with the peak tracks.
promoters = pybedtools.BedTool('../../../prototype/promoters.bed')

In [17]:
# Intersection
# Every gene/promoter overlap becomes one row: nine gene columns followed by
# four promoter columns.
genes_promoter_names = [
    'gene_chr', 'gene_start', 'gene_end', 'gene_id', 'smth1', 'strand', 'annotation', 'type', 'smth2',
    'promoter_chr', 'promoter_start', 'promoter_end', 'promoter_id',
]
gene_promoter_hits = genes.intersect(promoters, wa=True, wb=True)
genes_promoters = gene_promoter_hits.to_dataframe(names=genes_promoter_names)
print(genes_promoters.shape)

(7457, 13)


  return pandas.read_table(self.fn, *args, **kwargs)


In [18]:
# Save a dictionary for the future
# Maps each gene_id to the list of promoter_ids overlapping it, in table order.
# Iterating the two columns directly avoids materialising a full row Series
# twice per row, which the previous `.loc[i,]` lookups did.
geneID__promotersID = {}
for g, p in zip(genes_promoters['gene_id'], genes_promoters['promoter_id']):
    geneID__promotersID.setdefault(g, []).append(p)

In [19]:
# Keep only the coordinate/id columns, and only the rows whose gene is in genes_top.
kept_columns = [
    'gene_chr', 'gene_start', 'gene_end', 'gene_id',
    'promoter_chr', 'promoter_start', 'promoter_end', 'promoter_id',
]
in_top_genes = genes_promoters['gene_id'].isin(genes_top)
genes_promoters = genes_promoters.loc[in_top_genes, kept_columns]
genes_promoters.shape

(1397, 8)

In [20]:
# Get promoter to enhancer table
# Four parallel lists; position k of each describes one
# (variant, gene, enhancer, promoter) combination.
variantID, geneID, enhancerID, promoterID = [], [], [], []

for v, g in pairs_vg:
    # Skip pairs whose variant overlaps no enhancer or whose gene has no promoter.
    if v not in variantID__enhancersID or g not in geneID__promotersID:
        continue

    # Pair every enhancer of the variant with every promoter of the gene
    # (enhancers vary slowest, promoters fastest).
    for e in variantID__enhancersID[v]:
        for p in geneID__promotersID[g]:
            variantID.append(v)
            geneID.append(g)
            enhancerID.append(e)
            promoterID.append(p)

# One entry was appended per combination, so the list length is the running count.
c = len(variantID)
print(c)

14047


In [21]:
# Set a dataframe
# One row per (variant, gene, enhancer, promoter) combination; repeated
# combinations are collapsed with drop_duplicates(). The former mid-cell
# `pairs_df.head()` was removed: only a cell's last expression is displayed,
# so it had no effect.
pairs_df = pd.DataFrame({
    'variant_id': variantID,
    'gene_id': geneID,
    'enhancer_id': enhancerID,
    'promoter_id': promoterID,
}).drop_duplicates()

print(pairs_df.shape)

(10259, 4)


In [22]:
## Add linear data

In [23]:
# Load the peak track. The preview shows five tab-separated fields per record:
# chromosome, start, end, feature name and a numeric value.
peaks = pybedtools.BedTool('../../../prototype/peaks.bed.gz')
peaks.head()

chr1	10140	10374	H3K9me3	10.796883
 chr1	10166	10376	FAIRE-seq	0.0158
 chr1	235614	235797	H3K4me1	8.534075
 chr1	235869	236051	H3K4me1	9.24104
 chr1	237564	237934	RUNX3	40.4869547364138
 chr1	237593	237953	CTCF	17.021810811418
 chr1	237640	237790	DNase-seq	15.0
 chr1	237660	237850	RAD21	46.4827955207139
 chr1	521337	521697	CTCF	18.2400563234528
 chr1	521488	521678	RAD21	29.4130058402459
 

In [24]:
# Load the methylation track; it has the same five-field layout as the peak
# track, with "Methylation" as the feature name.
methylation = pybedtools.BedTool('../../../prototype/methylation.bed.gz')
methylation.head()

chr1	713375	713376	Methylation	83
 chr1	713375	713376	Methylation	60
 chr1	713387	713388	Methylation	46
 chr1	713387	713388	Methylation	60
 chr1	713399	713400	Methylation	21
 chr1	713399	713400	Methylation	33
 chr1	714583	714584	Methylation	4
 chr1	714583	714584	Methylation	4
 chr1	799084	799085	Methylation	46
 chr1	799084	799085	Methylation	46
 

In [25]:
# Load the CAGE track; it has the same five-field layout as the peak track,
# with "CAGE" as the feature name.
cage = pybedtools.BedTool('../../../prototype/cage.bed.gz')
cage.head()

chr1	17470	17505	CAGE	2.96
 chr1	61713	61742	CAGE	3.755
 chr1	534298	534331	CAGE	3.46
 chr1	564460	564489	CAGE	1.68
 chr1	564639	564666	CAGE	1.28
 chr1	564713	564750	CAGE	1.18
 chr1	565253	565288	CAGE	1.18
 chr1	565263	565290	CAGE	1.28
 chr1	565380	565409	CAGE	1.68
 chr1	565462	565489	CAGE	1.18
 

In [26]:
# Append the methylation and CAGE records to the peak set without merging
# overlapping intervals (postmerge=False), then sort the combined track.
# NOTE: `peaks` is rebuilt from itself, so re-running only this cell would
# append both tracks a second time; re-run from the cell that loads `peaks`.
peaks = peaks.cat(methylation, cage, postmerge=False).sort()
peaks.head()

chr1	10140	10374	H3K9me3	10.796883
 chr1	10166	10376	FAIRE-seq	0.0158
 chr1	17470	17505	CAGE	2.96
 chr1	61713	61742	CAGE	3.755
 chr1	235614	235797	H3K4me1	8.534075
 chr1	235869	236051	H3K4me1	9.24104
 chr1	237564	237934	RUNX3	40.4869547364138
 chr1	237593	237953	CTCF	17.021810811418
 chr1	237640	237790	DNase-seq	15.0
 chr1	237660	237850	RAD21	46.4827955207139
 

In [27]:
# Column names used to label bedtools output: four fields for an enhancer or
# promoter record, five for a peak record. Concatenate an interval list with
# peak_names to name the columns of an interval-vs-peak intersection.
# ('enhancert_end' was a typo for 'enhancer_end'.)
enhancer_names = ['enhancer_chr', 'enhancer_start', 'enhancer_end', 'enhancer_id']
promoter_names = ['promoter_chr', 'promoter_start', 'promoter_end', 'promoter_id']
peak_names = ['peak_chr', 'peak_start', 'peak_end', 'peak_name', 'peak_value']

In [28]:
# Materialise the combined peak track as a DataFrame labelled with peak_names.
peaks_df = peaks.to_dataframe(names=peak_names)

  return pandas.read_table(self.fn, *args, **kwargs)


In [29]:
# Preview the first rows of the peak table.
peaks_df.head()

Unnamed: 0,peak_chr,peak_start,peak_end,peak_name,peak_value
0,chr1,10140,10374,H3K9me3,10.796883
1,chr1,10166,10376,FAIRE-seq,0.0158
2,chr1,17470,17505,CAGE,2.96
3,chr1,61713,61742,CAGE,3.755
4,chr1,235614,235797,H3K4me1,8.534075


In [30]:
# Count peaks whose name is literally '.'.
# NOTE(review): presumably this confirms that '.' is free to act as the
# "no overlapping peak" placeholder in the left outer joins below — confirm.
sum(peaks_df['peak_name'] == '.')

0

In [31]:
# Left outer join of enhancers against the combined peak track, so enhancers
# without any overlapping peak still produce a row. Only the enhancer id and
# the peak name/value are kept.
enhancer_peak_hits = enhancers.intersect(peaks, loj=True, wa=True, wb=True)
enhancer_peaks = enhancer_peak_hits.to_dataframe(names = enhancer_names + peak_names)
enhancer_peaks = enhancer_peaks[['enhancer_id','peak_name','peak_value']]
enhancer_peaks.head()

  return pandas.read_table(self.fn, *args, **kwargs)


Unnamed: 0,enhancer_id,peak_name,peak_value
0,GM12878|chr1:235686-235784,H3K4me1,8.534075
1,GM12878|chr1:235686-235784,H3K4me1,9.24104
2,GM12878|chr1:235686-235784,RUNX3,40.486955
3,GM12878|chr1:235686-235784,CTCF,17.021811
4,GM12878|chr1:235686-235784,DNase-seq,15.0


In [32]:
# Reshape to one row per enhancer and one column per peak name. peak_value is
# summed when the same feature overlaps an enhancer more than once; features
# that never overlap an enhancer are filled with 0.
enhancer_peaks = (
    enhancer_peaks
    .pivot_table(index='enhancer_id', columns='peak_name', values='peak_value', aggfunc='sum')
    .reset_index()
    .fillna(0)
    # '.' is the placeholder peak_name the left outer join gives to enhancers
    # with no overlapping peak. errors='ignore' keeps the cell working when
    # every enhancer has at least one peak and the column does not exist.
    .drop(columns='.', errors='ignore')
)

In [33]:
# Prefix every feature column with 'enhancer_' so it stays distinguishable from
# the promoter features once both tables are merged; the id column keeps its name.
# NOTE: not safe to re-run on its own — a second run would prefix the columns again.
prefixed_features = ['enhancer_' + feature for feature in enhancer_peaks.columns[1:]]
enhancer_peaks.columns = ['enhancer_id'] + prefixed_features
enhancer_peaks.head()

Unnamed: 0,enhancer_id,enhancer_ATF2,enhancer_ATF3,enhancer_BATF,enhancer_BCL11A,enhancer_BCL3,enhancer_BCLAF1,enhancer_BHLHE40,enhancer_BRCA1,enhancer_CAGE,...,enhancer_USF1,enhancer_USF2,enhancer_WRNIP1,enhancer_YY1,enhancer_ZBTB33,enhancer_ZEB1,enhancer_ZNF143,enhancer_ZNF274,enhancer_ZNF384,enhancer_ZZZ3
0,GM12878|chr10:100027961-100028252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,GM12878|chr10:100099100-100099215,50.657848,0.0,38.09667,0.0,21.807906,0.0,79.276888,0.0,0.0,...,109.745819,48.057805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GM12878|chr10:100099435-100099559,50.657848,0.0,38.09667,0.0,21.807906,0.0,79.276888,0.0,0.0,...,109.745819,48.057805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GM12878|chr10:100099600-100100600,50.657848,0.0,38.09667,0.0,21.807906,0.0,79.276888,0.0,0.0,...,109.745819,48.057805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GM12878|chr10:100100600-100100675,50.657848,0.0,38.09667,0.0,21.807906,0.0,79.276888,0.0,0.0,...,109.745819,48.057805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Left outer join of promoters against the combined peak track, so promoters
# without any overlapping peak still produce a row. Only the promoter id and
# the peak name/value are kept.
promoter_peak_hits = promoters.intersect(peaks, loj=True, wa=True, wb=True)
promoter_peaks = promoter_peak_hits.to_dataframe(names = promoter_names + peak_names)
promoter_peaks = promoter_peaks[['promoter_id', 'peak_name', 'peak_value']]
promoter_peaks.head()

  return pandas.read_table(self.fn, *args, **kwargs)


Unnamed: 0,promoter_id,peak_name,peak_value
0,GM12878|chr1:713188-714800,H3K4me3,4.8638
1,GM12878|chr1:713188-714800,H3K9ac,12.593103
2,GM12878|chr1:713188-714800,H3K27ac,11.636454
3,GM12878|chr1:713188-714800,H3K79me2,14.015522
4,GM12878|chr1:713188-714800,H3K4me2,17.93889


In [35]:
# Reshape to one row per promoter and one column per peak name. peak_value is
# summed when the same feature overlaps a promoter more than once; features
# that never overlap a promoter are filled with 0.
promoter_peaks = (
    promoter_peaks
    .pivot_table(index='promoter_id', columns='peak_name', values='peak_value', aggfunc='sum')
    .reset_index()
    .fillna(0)
    # A promoter with no overlapping peak gets the placeholder peak_name '.'
    # from the left outer join. Drop that column when it exists, as the
    # enhancer table does, so it cannot leak into the features as 'promoter_.'.
    .drop(columns='.', errors='ignore')
)

In [36]:
# Prefix every feature column with 'promoter_' so it stays distinguishable from
# the enhancer features once both tables are merged; the id column keeps its name.
# NOTE: not safe to re-run on its own — a second run would prefix the columns again.
prefixed_features = ['promoter_' + feature for feature in promoter_peaks.columns[1:]]
promoter_peaks.columns = ['promoter_id'] + prefixed_features
promoter_peaks.head()

Unnamed: 0,promoter_id,promoter_ATF2,promoter_ATF3,promoter_BATF,promoter_BCL11A,promoter_BCL3,promoter_BCLAF1,promoter_BHLHE40,promoter_BRCA1,promoter_CAGE,...,promoter_TCF3,promoter_USF1,promoter_USF2,promoter_WRNIP1,promoter_YY1,promoter_ZBTB33,promoter_ZEB1,promoter_ZNF143,promoter_ZNF384,promoter_ZZZ3
0,GM12878|chr10:100174720-100175000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,GM12878|chr10:100205197-100207294,76.724122,73.400824,0.0,23.297729,0.0,0.0,147.281136,0.0,66.03136,...,25.516657,414.951343,336.582307,18.153197,0.0,0.0,0.0,49.353312,0.0,0.0
2,GM12878|chr10:101189543-101191510,0.0,0.0,29.116682,0.0,0.0,0.0,0.0,0.0,48.49248,...,19.440035,0.0,0.0,0.0,0.0,0.0,45.356204,0.0,0.0,0.0
3,GM12878|chr10:101418472-101419903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.09618,...,0.0,0.0,0.0,0.0,0.0,0.0,29.462871,0.0,122.9982,0.0
4,GM12878|chr10:101490920-101492820,37.483596,0.0,0.0,0.0,0.0,28.290804,0.0,0.0,41.351,...,0.0,0.0,0.0,0.0,0.0,0.0,25.523109,285.318391,5.52492,0.0


In [37]:
# Dimensions of the two feature tables: (rows, 1 id column + feature columns).
promoter_peaks.shape, enhancer_peaks.shape

((8453, 100), (100036, 101))

In [38]:
# Attach the enhancer features and then the promoter features to every pair.
# how='left' keeps all pairs, including those whose enhancer/promoter is missing
# from a feature table. validate='many_to_one' makes pandas raise a MergeError
# if a feature table ever holds the same id twice, instead of silently
# multiplying rows.
# NOTE: not safe to re-run on its own — the feature columns would be merged in
# again with _x/_y suffixes.
pairs_df = (
    pairs_df
    .merge(enhancer_peaks, on='enhancer_id', how='left', validate='many_to_one')
    .merge(promoter_peaks, on='promoter_id', how='left', validate='many_to_one')
)

In [39]:
# Dimensions after attaching the enhancer and promoter feature columns.
pairs_df.shape

(10259, 203)

In [40]:
# Defensive de-duplication after the merges; the shape displayed before and
# after this cell is the same, so no rows were removed in the recorded run.
pairs_df = pairs_df.drop_duplicates()

In [41]:
# Dimensions after the de-duplication step.
pairs_df.shape

(10259, 203)

In [42]:
# Preview the final table: the four id columns followed by the enhancer_* and
# promoter_* feature columns.
pairs_df.head()

Unnamed: 0,variant_id,gene_id,enhancer_id,promoter_id,enhancer_ATF2,enhancer_ATF3,enhancer_BATF,enhancer_BCL11A,enhancer_BCL3,enhancer_BCLAF1,...,promoter_TCF3,promoter_USF1,promoter_USF2,promoter_WRNIP1,promoter_YY1,promoter_ZBTB33,promoter_ZEB1,promoter_ZNF143,promoter_ZNF384,promoter_ZZZ3
0,1_150746406_G_T_b37,ENSG00000163131,GM12878|chr1:150743400-150743433,GM12878|chr1:150737600-150738647,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,72.707258,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1_150746406_G_T_b37,ENSG00000163131,GM12878|chr1:150743625-150743717,GM12878|chr1:150737600-150738647,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,72.707258,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1_150746406_G_T_b37,ENSG00000163131,GM12878|chr1:150743817-150744119,GM12878|chr1:150737600-150738647,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,72.707258,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20_33729442_A_C_b37,ENSG00000101000,GM12878|chr20:33732345-33732953,GM12878|chr20:35200330-35204291,0.0,0.0,0.0,0.0,0.0,0.0,...,20.522111,0.0,0.0,24.801033,0.0,0.0,0.0,0.0,0.0,0.0
4,11_842543_C_G_b37,ENSG00000177697,GM12878|chr11:843566-844649,GM12878|chr11:832697-833556,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Write the final table as a gzip-compressed CSV without the row index.
# NOTE(review): 'EVB' in the file name looks like a transposition of 'EBV'
# (cf. the Cells_EBV-transformed_lymphocytes column); the path is left unchanged
# because other code may read this exact file name — confirm before renaming.
pairs_df.to_csv('../data/GTEx_Analysis_v7_eQTL_EVB_linearData.csv.gz', index=False, compression='gzip')