In [2]:
import pandas as pd
import pybedtools 
import sys
sys.path.append('../../prototype/notebooks/')
import methodName
from sklearn.ensemble import ExtraTreesRegressor
import numpy as np
import matplotlib.pyplot as plt

import requests, sys
server = "http://rest.ensembl.org"

In [7]:
gtex_f = '../../prototype/GTEx_Analysis_v7_eQTL/Cells_EBV-transformed_lymphocytes.v7.signif_variant_gene_pairs.txt.gz'
hi_c_folder = '../../prototype/GM12878_diploid_intrachromosomal_contact_matrices/maternal/5kb_resolution_intrachromosomal/chr8/MAPQGE10/'
methylation_f = '../../prototype/methylation.bed.gz'
cage_f = '../../prototype/cage.bed.gz'
peaks_f = '../../prototype/peaks.bed.gz'
enhancers_f = '../../prototype/notebooks/list_of_enhancers_state.csv'
promoters_f = '../../prototype/notebooks/list_of_promoters_state.csv'
genes_f = '../../prototype/genes.bed'
rs_f = '../../prototype/rs_chromosome_8.tab'
rs_name = 'rs_id_dbSNP147_GRCh37p13'

# List of Pairs for chromosome 8

In [8]:
# Read eQTL 
eQTL = pd.read_csv(gtex_f, sep='\t')
eQTL = eQTL.loc[eQTL['variant_id'].apply(methodName.variantId2chrNum)=='8',]
#eQTL['gene_id'] = eQTL['gene_id'].apply(methodName.geneIdVersion2geneId)
eQTL.head()

Unnamed: 0,variant_id,gene_id,tss_distance,ma_samples,ma_count,maf,pval_nominal,slope,slope_se,pval_nominal_threshold,min_pval_nominal,pval_beta
138112,8_636099_G_A_b37,ENSG00000254207.1,-39778,60,73,0.311966,1.44588e-06,0.643478,0.125106,1e-05,8.27415e-20,4.03391e-15
138113,8_636101_A_C_b37,ENSG00000254207.1,-39776,45,51,0.217949,2.90078e-07,0.733659,0.132846,1e-05,8.27415e-20,4.03391e-15
138114,8_637069_C_G_b37,ENSG00000254207.1,-38808,49,57,0.24359,2.61069e-07,0.70668,0.127397,1e-05,8.27415e-20,4.03391e-15
138115,8_638503_G_C_b37,ENSG00000254207.1,-37374,50,61,0.260684,1.68516e-07,0.701395,0.124183,1e-05,8.27415e-20,4.03391e-15
138116,8_640522_G_A_b37,ENSG00000254207.1,-35355,49,58,0.247863,6.01446e-08,0.752441,0.127906,1e-05,8.27415e-20,4.03391e-15


In [4]:
pairs_vg = tuple(zip(eQTL['variant_id'], eQTL['gene_id'].apply(methodName.geneIdVersion2geneId)))
pairs_vg[100:110]

(('8_1710632_T_C_b37', 'ENSG00000253982'),
 ('8_1710977_A_C_b37', 'ENSG00000253982'),
 ('8_1712158_AG_A_b37', 'ENSG00000253982'),
 ('8_2064932_A_G_b37', 'ENSG00000036448'),
 ('8_2065430_A_T_b37', 'ENSG00000036448'),
 ('8_2068801_T_C_b37', 'ENSG00000036448'),
 ('8_2069598_GAC_G_b37', 'ENSG00000036448'),
 ('8_2070062_T_C_b37', 'ENSG00000036448'),
 ('8_2071002_T_C_b37', 'ENSG00000036448'),
 ('8_2074042_A_G_b37', 'ENSG00000036448'))

### Get variant id to enhancers dictionary

In [5]:
# Variant bed table
variants = pybedtools.BedTool.from_dataframe(pd.DataFrame({
    'variant_chr': eQTL['variant_id'].apply(methodName.variantId2chr),
    'variant_start': eQTL['variant_id'].apply(methodName.variantId2pos),
    'variant_end': eQTL['variant_id'].apply(methodName.variantId2end),
    'variant_id': eQTL['variant_id']}))

# Enhancer bed table
enhancers = pybedtools.BedTool(enhancers_f)

# Intersection
variants_enhancers_names = ['variant_chr', 'variant_start', 'variant_end', 'variant_id', \
                           'enhancer_chr', 'enhancer_start', 'enhancer_end', 'enhancer_state', 'enhancer_id']
variants_enhancers = variants.intersect(enhancers, wa=True, wb=True).to_dataframe(names=variants_enhancers_names)

# Save it as a dictionary for the future
variantID__enhancersID = {}
for i in variants_enhancers.index:
    v = variants_enhancers.loc[i,]['variant_id']
    e = variants_enhancers.loc[i,]['enhancer_id']
    if v not in variantID__enhancersID:
        variantID__enhancersID[v] = [e]
    else:
        variantID__enhancersID[v].append(e)

  return pandas.read_table(self.fn, *args, **kwargs)


### Get gene to promoter id dictionary

In [6]:
# Read genes table
genes = pybedtools.BedTool(genes_f)

# Read promoters table
promoters = pybedtools.BedTool(promoters_f)

# Intersection
genes_promoter_names = ['gene_chr', 'gene_start', 'gene_end', 'gene_id', 'smth1', 'strand', 'annotation', 'type', 'smth2', \
                       'promoter_chr', 'promoter_start', 'promoter_end', 'promoter_state', 'promoter_id']
genes_promoters = genes.intersect(promoters, wa=True, wb=True).to_dataframe(names = genes_promoter_names)
print(genes_promoters.shape)

# Save a dictionary for the future
geneID__promotersID = {}
for i in genes_promoters.index:
    g = genes_promoters.loc[i,]['gene_id']
    p = genes_promoters.loc[i,]['promoter_id']
    if g not in geneID__promotersID:
        geneID__promotersID[g] = [p]
    else:
        geneID__promotersID[g].append(p)
        
genes_promoters = genes_promoters[['gene_chr', 'gene_start', 'gene_end', 'gene_id', \
                                   'promoter_chr', 'promoter_start', 'promoter_end', 'promoter_state', 'promoter_id']]

# Filter to get only chromosome 8
genes_chromosome = set(eQTL['gene_id'])
geneID__promotersID  = {g:p for g, p in geneID__promotersID.items() if g in set(genes_chromosome)}

  return pandas.read_table(self.fn, *args, **kwargs)


(49767, 14)


In [7]:
print(len(pairs_vg), len(variantID__enhancersID), len(geneID__promotersID))

7952 257 106


In [8]:
# Get promoter enhancer table
variantID = []
geneID = []
enhancerID = []
promoterID = []

c = 0
for v, g in pairs_vg:
    if v in variantID__enhancersID and g in geneID__promotersID:
        promoters = geneID__promotersID[g]
        enhancers = variantID__enhancersID[v]

        pairs_ep = tuple((enhancers[i], promoters[u]) for i in range(len(enhancers)) for u in range(len(promoters)))
        
        for e, p in pairs_ep:
            variantID.append(v)
            geneID.append(g)
            enhancerID.append(e)
            promoterID.append(p)

            c += 1
print(c)

329


In [9]:
pairs_df = pd.DataFrame({
    'variant_id': variantID,
    'gene_id': geneID,
    'enhancer_id': enhancerID,
    'promoter_id': promoterID})

In [10]:
pairs_df.head()

Unnamed: 0,variant_id,gene_id,enhancer_id,promoter_id
0,8_6262831_A_T_b37,ENSG00000246089,chr8:6262801-6263000,chr8:6405000-6408201
1,8_8258448_C_G_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601
2,8_8258455_G_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601
3,8_8258712_T_C_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601
4,8_8259117_T_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601


In [11]:
rs = pd.read_csv(rs_f, sep='\t')
rs.head()

Unnamed: 0,chr,variant_pos,variant_id,ref,alt,num_alt_per_site,rs_id_dbSNP147_GRCh37p13
0,8,23845,8_23845_G_GT_b37,G,GT,1,.
1,8,44764,8_44764_C_T_b37,C,T,2,rs544896538
2,8,44811,8_44811_C_T_b37,C,T,1,.
3,8,46114,8_46114_C_T_b37,C,T,2,.
4,8,46125,8_46125_T_G_b37,T,G,2,.


In [12]:
rs_name = 'rs_id_dbSNP147_GRCh37p13'
pairs_df = pairs_df.merge(rs, right_on='variant_id', left_on='variant_id')
pairs_df.head()

Unnamed: 0,variant_id,gene_id,enhancer_id,promoter_id,chr,variant_pos,ref,alt,num_alt_per_site,rs_id_dbSNP147_GRCh37p13
0,8_6262831_A_T_b37,ENSG00000246089,chr8:6262801-6263000,chr8:6405000-6408201,8,6262831,A,T,1,rs725438
1,8_8258448_C_G_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258448,C,G,1,rs2920992
2,8_8258455_G_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258455,G,A,1,rs2979150
3,8_8258712_T_C_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258712,T,C,1,rs2976929
4,8_8259117_T_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8259117,T,A,1,rs2920991


In [13]:
pairs_df.shape

(329, 10)

In [14]:
pairs_df = pairs_df.merge(eQTL, right_on=['variant_id', 'gene_id'], left_on=['variant_id', 'gene_id'])
print(pairs_df.shape)
pairs_df.head()

(329, 20)


Unnamed: 0,variant_id,gene_id,enhancer_id,promoter_id,chr,variant_pos,ref,alt,num_alt_per_site,rs_id_dbSNP147_GRCh37p13,tss_distance,ma_samples,ma_count,maf,pval_nominal,slope,slope_se,pval_nominal_threshold,min_pval_nominal,pval_beta
0,8_6262831_A_T_b37,ENSG00000246089,chr8:6262801-6263000,chr8:6405000-6408201,8,6262831,A,T,1,rs725438,-1832,60,75,0.323276,1.28344e-06,-0.613816,0.118679,5e-06,1.77326e-08,9.8627e-05
1,8_8258448_C_G_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258448,C,G,1,rs2920992,174312,85,112,0.478632,7.15701e-06,-0.54385,0.114459,1.9e-05,7.25016e-11,3.03794e-07
2,8_8258455_G_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258455,G,A,1,rs2979150,174319,85,112,0.478632,7.15701e-06,-0.54385,0.114459,1.9e-05,7.25016e-11,3.03794e-07
3,8_8258712_T_C_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258712,T,C,1,rs2976929,174576,85,112,0.478632,7.15701e-06,-0.54385,0.114459,1.9e-05,7.25016e-11,3.03794e-07
4,8_8259117_T_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8259117,T,A,1,rs2920991,174981,81,107,0.490826,6.82261e-07,-0.616851,0.115898,1.9e-05,7.25016e-11,3.03794e-07


## Adding linear data

In [15]:
enhancers = pybedtools.BedTool(enhancers_f)
promoters = pybedtools.BedTool(promoters_f)
peaks = pybedtools.BedTool(peaks_f)
methylation = pybedtools.BedTool(methylation_f)
cage = pybedtools.BedTool(cage_f)
linear = peaks.cat(*[methylation, cage], postmerge=False).sort()

In [16]:
def normalizeByLen(row):
    to_norm = row[1:]
    start, end = row[0].split(':')[1].split('-')
    l = int(end) - int(start)
    row[1:] = row[1:] / l
    return row

In [17]:
enhancer_names = ['enhancer_chr', 'enhancer_start', 'enhancert_end', 'enhancer_state', 'enhancer_id']
peak_names = ['peak_chr', 'peak_start', 'peak_end', 'peak_name', 'peak_value']
enhancer_linear = enhancers.intersect(linear, loj=True, wa=True, wb=True).to_dataframe(names = enhancer_names + peak_names)
notDrop = ['enhancer_id', 'peak_name', 'peak_value']
enhancer_linear = enhancer_linear[notDrop]
enhancer_linear = enhancer_linear.groupby(['enhancer_id', 'peak_name']).sum().reset_index()
enhancer_linear = enhancer_linear.pivot_table(index='enhancer_id', columns='peak_name', values='peak_value').reset_index().fillna(0)
enhancer_linear.columns = ['enhancer_id'] + ['enhancer_' + i for i in enhancer_linear.columns[1:]]
enhancer_linear.tail()

  return pandas.read_table(self.fn, *args, **kwargs)


Unnamed: 0,enhancer_id,enhancer_.,enhancer_ATF2,enhancer_ATF3,enhancer_BATF,enhancer_BCL11A,enhancer_BCL3,enhancer_BCLAF1,enhancer_BHLHE40,enhancer_BRCA1,...,enhancer_USF1,enhancer_USF2,enhancer_WRNIP1,enhancer_YY1,enhancer_ZBTB33,enhancer_ZEB1,enhancer_ZNF143,enhancer_ZNF274,enhancer_ZNF384,enhancer_ZZZ3
126101,chrX:99711201-99712400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126102,chrX:99716001-99716800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126103,chrX:99725801-99726000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126104,chrX:99726401-99727000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126105,chrX:99737001-99737200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
peak_names = ['peak_chr', 'peak_start', 'peak_end', 'peak_name', 'peak_value']
promoter_names = ['promoter_chr', 'promoter_start', 'promoter_end', 'promoter_state', 'promoter_id']
promoter_linear = promoters.intersect(linear, loj=True, wa=True, wb=True).to_dataframe(names = promoter_names + peak_names)
notDrop = ['promoter_id', 'peak_name', 'peak_value']
promoter_linear = promoter_linear[notDrop]
promoter_linear = promoter_linear.groupby(['promoter_id', 'peak_name']).sum().reset_index()
promoter_linear = promoter_linear.pivot_table(index='promoter_id', columns='peak_name', values='peak_value').reset_index().fillna(0)
promoter_linear.columns = ['promoter_id'] + ['promoter_' + i for i in promoter_linear.columns[1:]]
promoter_linear.head()

  return pandas.read_table(self.fn, *args, **kwargs)


Unnamed: 0,promoter_id,promoter_.,promoter_ATF2,promoter_ATF3,promoter_BATF,promoter_BCL11A,promoter_BCL3,promoter_BCLAF1,promoter_BHLHE40,promoter_BRCA1,...,promoter_USF1,promoter_USF2,promoter_WRNIP1,promoter_YY1,promoter_ZBTB33,promoter_ZEB1,promoter_ZNF143,promoter_ZNF274,promoter_ZNF384,promoter_ZZZ3
0,chr10:100007000-100011201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chr10:100115000-100115401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chr10:100183200-100187601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chr10:100227800-100230401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,23.381434,0.0,0.0,0.0,0.0
4,chr10:100266000-100268800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Adding Hi-C

In [19]:
chromosome = '8'

In [20]:
chromosome__contacts = {}
chromosome__contacts[chromosome] = methodName.ContactMatrix(hi_c_folder, chrm=chromosome, resl=5000)

In [21]:
contactMatrix = chromosome__contacts[chromosome].rawMatrix

The raw observed matrix file has 4610383 lines.


In [22]:
pairs_df['enhancer_bin'] = pairs_df['enhancer_id'].apply(lambda x: int(x.split(':')[1].split('-')[0]))\
                            .apply(methodName.position2matrixBin)

pairs_df['promoter_bin'] = pairs_df['promoter_id'].apply(lambda x: int(x.split(':')[1].split('-')[0]))\
                            .apply(methodName.position2matrixBin)

In [23]:
hi_c_dict = {'promoter_id':[], 'enhancer_id':[], 'hi_c':[], 'variant_id':[], 'gene_id':[]}
for i in pairs_df.index:
    pb = pairs_df.loc[i,]['promoter_bin']
    eb = pairs_df.loc[i,]['enhancer_bin']
    p = pairs_df.loc[i,]['promoter_id']
    e = pairs_df.loc[i,]['enhancer_id']
    v = pairs_df.loc[i,]['variant_id']
    g = pairs_df.loc[i,]['gene_id']
    x, y = sorted([pb, eb])
    if x in contactMatrix and y in contactMatrix[x]:
        hi_c_dict['hi_c'].append(contactMatrix[x][y])
    else:
        hi_c_dict['hi_c'].append(0)
    hi_c_dict['variant_id'].append(v)
    hi_c_dict['promoter_id'].append(p)
    hi_c_dict['enhancer_id'].append(e)
    hi_c_dict['gene_id'].append(g)


## Adding eQTL from other tissues

In [24]:
tissues_names = ['Adipose_Subcutaneous',
 'Adipose_Visceral_Omentum',
 'Adrenal_Gland',
 'Artery_Aorta',
 'Artery_Coronary',
 'Artery_Tibial',
 'Brain_Amygdala',
 'Brain_Anterior_cingulate_cortex_BA24',
 'Brain_Caudate_basal_ganglia',
 'Brain_Cerebellar_Hemisphere',
 'Brain_Cerebellum',
 'Brain_Cortex',
 'Brain_Frontal_Cortex_BA9',
 'Brain_Hippocampus',
 'Brain_Hypothalamus',
 'Brain_Nucleus_accumbens_basal_ganglia',
 'Brain_Putamen_basal_ganglia',
 'Brain_Spinal_cord_cervical_c-1',
 'Brain_Substantia_nigra',
 'Breast_Mammary_Tissue',
 'Cells_Transformed_fibroblasts',
 'Colon_Sigmoid',
 'Colon_Transverse',
 'Esophagus_Gastroesophageal_Junction',
 'Esophagus_Mucosa',
 'Esophagus_Muscularis',
 'Heart_Atrial_Appendage',
 'Heart_Left_Ventricle',
 'Liver',
 'Lung',
 'Minor_Salivary_Gland',
 'Muscle_Skeletal',
 'Nerve_Tibial',
 'Ovary',
 'Pancreas',
 'Pituitary',
 'Prostate',
 'Skin_Not_Sun_Exposed_Suprapubic',
 'Skin_Sun_Exposed_Lower_leg',
 'Small_Intestine_Terminal_Ileum',
 'Spleen',
 'Stomach',
 'Testis',
 'Thyroid',
 'Uterus',
 'Vagina',
 'Whole_Blood']

In [25]:
c = 0
n = 0
interaction__tissues_pval = {}
for i in pairs_df.index:
    gene_stbl_id = pairs_df.loc[i,]['gene_id']
    variant_name = pairs_df.loc[i,][rs_name]
    for tissue in tissues_names:
        ext = "/eqtl/id/homo_sapiens/"+gene_stbl_id+"?statistic=p-value;variant_name="+variant_name+";tissue=" + tissue
        r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
        if not r.ok:
            try:
                r.raise_for_status()
            except:
                n += 1
        try:
            decoded = r.json()
        except:
            n += 1
        if "error" not in repr(decoded) and len(repr(decoded)) > 2: 
            if (gene_stbl_id, variant_name) not in interaction__tissues_pval:
                interaction__tissues_pval[(gene_stbl_id, variant_name)] = {}
                interaction__tissues_pval[(gene_stbl_id, variant_name)][tissue] = decoded[0]['value']
            else:
                interaction__tissues_pval[(gene_stbl_id, variant_name)][tissue] = decoded[0]['value']
            c += 1
print(c, n)

12668 1834


In [26]:
tissues_data_frame = {}
tissues_data_frame['gene_id'] = []
tissues_data_frame[rs_name] = []
for t in tissues_names: tissues_data_frame[t] = []
for i in pairs_df.index:
    gene_stbl_id = pairs_df.loc[i,]['gene_id']
    variant_name = pairs_df.loc[i,][rs_name]
    if (gene_stbl_id, variant_name) in interaction__tissues_pval:
        tissues_data_frame['gene_id'].append(gene_stbl_id)
        tissues_data_frame[rs_name].append(variant_name)
        for t in tissues_names:
            if t in interaction__tissues_pval[(gene_stbl_id, variant_name)]:
                tissues_data_frame[t].append(interaction__tissues_pval[(gene_stbl_id, variant_name)][t])
            else:
                tissues_data_frame[t].append(1)
tissues_pval = pd.DataFrame(tissues_data_frame)
tissues_pval.head()

Unnamed: 0,gene_id,rs_id_dbSNP147_GRCh37p13,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,...,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
0,ENSG00000246089,rs725438,1.043269e-21,6.992728e-11,1.086137e-16,2.7213820000000002e-17,7.93775e-09,2.789501e-14,1,0.003194,...,6.47606e-10,4.340546e-15,1.9e-05,3.08727e-10,2.22996e-08,3.699122e-25,2.288615e-13,0.043879,0.000399,9.007175e-25
1,ENSG00000253893,rs2920992,0.2691379,0.3598285,0.2005095,0.3509573,0.02239074,0.2079237,1,0.000353,...,0.1634199,0.1208264,0.744984,1.0,0.001831117,2.916586e-09,0.0008773499,0.004506,0.437609,1.0
2,ENSG00000253893,rs2979150,0.2811242,0.36006,0.2037117,0.3421716,0.02260352,0.230303,1,0.000356,...,0.1636915,0.1190036,0.746993,1.0,0.001853348,2.886361e-09,0.0009617443,0.003875,0.441878,1.0
3,ENSG00000253893,rs2976929,0.2671713,0.3598285,0.2003048,0.3478835,0.02228504,0.2089616,1,0.000353,...,0.1634199,0.1220044,0.744984,1.0,0.001828282,2.916586e-09,0.0008731569,0.004506,0.435632,1.0
4,ENSG00000253893,rs2920991,0.2595891,0.3598285,0.2001547,0.3428594,0.02200349,0.2061667,1,0.000353,...,0.1815006,0.1250446,0.744984,1.0,0.001823206,3.666259e-09,0.0009606651,0.004506,0.432639,1.0


In [27]:
c = 0
n = 0
interaction__tissues_beta = {}
for i in pairs_df.index:
    gene_stbl_id = pairs_df.loc[i,]['gene_id']
    variant_name = pairs_df.loc[i,][rs_name]
    for tissue in tissues_names:
        ext = "/eqtl/id/homo_sapiens/"+gene_stbl_id+"?statistic=beta;variant_name="+variant_name+";tissue=" + tissue
        r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
        if not r.ok:
            try:
                r.raise_for_status()
            except:
                n += 1
        try:
            decoded = r.json()
        except:
            n += 1
        if "error" not in repr(decoded) and len(repr(decoded)) > 2: 
            if (gene_stbl_id, variant_name) not in interaction__tissues_beta:
                interaction__tissues_beta[(gene_stbl_id, variant_name)] = {}
                interaction__tissues_beta[(gene_stbl_id, variant_name)][tissue] = decoded[0]['value']
            else:
                interaction__tissues_beta[(gene_stbl_id, variant_name)][tissue] = decoded[0]['value']
            c += 1
print(c, n)

12668 1832


In [28]:
tissues_data_frame = {}
tissues_data_frame['gene_id'] = []
tissues_data_frame[rs_name] = []
for t in tissues_names: tissues_data_frame[t] = []
for i in pairs_df.index:
    gene_stbl_id = pairs_df.loc[i,]['gene_id']
    variant_name = pairs_df.loc[i,][rs_name]
    if (gene_stbl_id, variant_name) in interaction__tissues_beta:
        tissues_data_frame['gene_id'].append(gene_stbl_id)
        tissues_data_frame[rs_name].append(variant_name)
        for t in tissues_names:
            if t in interaction__tissues_beta[(gene_stbl_id, variant_name)]:
                tissues_data_frame[t].append(interaction__tissues_beta[(gene_stbl_id, variant_name)][t])
            else:
                tissues_data_frame[t].append(1)
tissues_beta = pd.DataFrame(tissues_data_frame)
tissues_beta.head()

Unnamed: 0,gene_id,rs_id_dbSNP147_GRCh37p13,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,...,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
0,ENSG00000246089,rs725438,-0.658403,-0.647966,-0.936025,-0.713329,-0.631175,-0.452004,1,-0.464902,...,-0.461236,-0.447576,-0.638269,-0.81166,-0.473882,-0.883826,-0.480686,-0.38361,-0.616397,-0.677099
1,ENSG00000253893,rs2920992,-0.097214,0.107471,-0.182481,-0.099121,-0.27494,-0.118052,1,-0.617426,...,-0.169422,-0.135644,-0.065092,1.0,-0.320903,-0.737579,-0.292812,-0.598346,-0.118196,1.0
2,ENSG00000253893,rs2979150,-0.094421,0.107488,-0.181278,-0.101037,-0.274442,-0.112311,1,-0.618445,...,-0.169465,-0.136068,-0.064566,1.0,-0.320723,-0.738005,-0.290298,-0.602307,-0.115607,1.0
3,ENSG00000253893,rs2976929,-0.097675,0.107471,-0.182548,-0.099779,-0.275128,-0.117793,1,-0.617426,...,-0.169422,-0.135275,-0.065092,1.0,-0.320917,-0.737579,-0.293012,-0.598346,-0.118846,1.0
4,ENSG00000253893,rs2920991,-0.099342,0.107471,-0.182616,-0.100867,-0.275712,-0.118531,1,-0.617426,...,-0.164296,-0.134278,-0.065092,1.0,-0.32094,-0.735544,-0.292039,-0.598346,-0.119806,1.0


# Merge data

In [30]:
pairs_df = pairs_df.drop(columns=tissues_features_beta).drop_duplicates()

NameError: name 'tissues_features_beta' is not defined

In [None]:
tissues_features_beta

In [31]:
# Join all data 
pairs_df.head()

Unnamed: 0,variant_id,gene_id,enhancer_id,promoter_id,chr,variant_pos,ref,alt,num_alt_per_site,rs_id_dbSNP147_GRCh37p13,...,ma_count,maf,pval_nominal,slope,slope_se,pval_nominal_threshold,min_pval_nominal,pval_beta,enhancer_bin,promoter_bin
0,8_6262831_A_T_b37,ENSG00000246089,chr8:6262801-6263000,chr8:6405000-6408201,8,6262831,A,T,1,rs725438,...,75,0.323276,1.28344e-06,-0.613816,0.118679,5e-06,1.77326e-08,9.8627e-05,6260000,6405000
1,8_8258448_C_G_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258448,C,G,1,rs2920992,...,112,0.478632,7.15701e-06,-0.54385,0.114459,1.9e-05,7.25016e-11,3.03794e-07,8255000,8225000
2,8_8258455_G_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258455,G,A,1,rs2979150,...,112,0.478632,7.15701e-06,-0.54385,0.114459,1.9e-05,7.25016e-11,3.03794e-07,8255000,8225000
3,8_8258712_T_C_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258712,T,C,1,rs2976929,...,112,0.478632,7.15701e-06,-0.54385,0.114459,1.9e-05,7.25016e-11,3.03794e-07,8255000,8225000
4,8_8259117_T_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8259117,T,A,1,rs2920991,...,107,0.490826,6.82261e-07,-0.616851,0.115898,1.9e-05,7.25016e-11,3.03794e-07,8255000,8225000


In [None]:
pairs_df.shape

In [32]:
# enhancers
pairs_df = pairs_df.merge(enhancer_linear, right_on = 'enhancer_id', left_on = 'enhancer_id')
pairs_df.shape

(329, 123)

In [33]:
# promoters
pairs_df = pairs_df.merge(promoter_linear, right_on = 'promoter_id', left_on = 'promoter_id')
pairs_df.shape

(329, 224)

In [34]:
hi_c = pd.DataFrame(hi_c_dict)
hi_c.head()

Unnamed: 0,promoter_id,enhancer_id,hi_c,variant_id,gene_id
0,chr8:6405000-6408201,chr8:6262801-6263000,3.0,8_6262831_A_T_b37,ENSG00000246089
1,chr8:8226600-8230601,chr8:8258201-8259600,8.0,8_8258448_C_G_b37,ENSG00000253893
2,chr8:8226600-8230601,chr8:8258201-8259600,8.0,8_8258455_G_A_b37,ENSG00000253893
3,chr8:8226600-8230601,chr8:8258201-8259600,8.0,8_8258712_T_C_b37,ENSG00000253893
4,chr8:8226600-8230601,chr8:8258201-8259600,8.0,8_8259117_T_A_b37,ENSG00000253893


In [None]:
# hi-c
pairs_df = pairs_df.merge(hi_c, how='inner', right_on=['promoter_id', 'enhancer_id', 'variant_id', 'gene_id'], left_on=['promoter_id', 'enhancer_id', 'variant_id', 'gene_id'])
pairs_df.shape

In [None]:
pairs_df = pairs_df.drop(columns=tissues_features_beta)

In [38]:
# eQTL
tissues_beta = tissues_beta.drop_duplicates()
tissues_beta = tissues_beta.fillna(1)
pairs_df = pairs_df.merge(tissues_beta, how='left', right_on=['gene_id', rs_name], left_on=['gene_id', rs_name])
columns = []
for i in pairs_df.columns:
    if i in tissues_names: columns.append(i+'_beta')
    else: columns.append(i)
pairs_df.columns = columns

In [39]:
pairs_df.head()

Unnamed: 0,variant_id,gene_id,enhancer_id,promoter_id,chr,variant_pos,ref,alt,num_alt_per_site,rs_id_dbSNP147_GRCh37p13,...,Skin_Not_Sun_Exposed_Suprapubic_beta,Skin_Sun_Exposed_Lower_leg_beta,Small_Intestine_Terminal_Ileum_beta,Spleen_beta,Stomach_beta,Testis_beta,Thyroid_beta,Uterus_beta,Vagina_beta,Whole_Blood_beta
0,8_6262831_A_T_b37,ENSG00000246089,chr8:6262801-6263000,chr8:6405000-6408201,8,6262831,A,T,1,rs725438,...,-0.461236,-0.447576,-0.638269,-0.81166,-0.473882,-0.883826,-0.480686,-0.38361,-0.616397,-0.677099
1,8_8258448_C_G_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258448,C,G,1,rs2920992,...,-0.169422,-0.135644,-0.065092,1.0,-0.320903,-0.737579,-0.292812,-0.598346,-0.118196,1.0
2,8_8258455_G_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258455,G,A,1,rs2979150,...,-0.169465,-0.136068,-0.064566,1.0,-0.320723,-0.738005,-0.290298,-0.602307,-0.115607,1.0
3,8_8258712_T_C_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258712,T,C,1,rs2976929,...,-0.169422,-0.135275,-0.065092,1.0,-0.320917,-0.737579,-0.293012,-0.598346,-0.118846,1.0
4,8_8259117_T_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8259117,T,A,1,rs2920991,...,-0.164296,-0.134278,-0.065092,1.0,-0.32094,-0.735544,-0.292039,-0.598346,-0.119806,1.0


In [42]:
tissues_pval = tissues_pval.drop_duplicates()
tissues_pval = tissues_pval.fillna(1)
pairs_df = pairs_df.merge(tissues_pval, how='left', right_on=['gene_id', rs_name], left_on=['gene_id', rs_name])
columns = []
for i in pairs_df.columns:
    if i in tissues_names: columns.append(i+'_pval')
    else: columns.append(i)
pairs_df.columns = columns

In [43]:
pairs_df = pairs_df.drop_duplicates()
pairs_df.shape

(290, 412)

In [None]:
# Add Z-Scores

In [3]:
pairs_df = pd.read_csv('../data/chromosome8_all_data_pval.csv')
pairs_df.head()

Unnamed: 0,variant_id,gene_id,enhancer_id,promoter_id,chr,variant_pos,ref,alt,num_alt_per_site,rs_id_dbSNP147_GRCh37p13,...,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
0,8_6262831_A_T_b37,ENSG00000246089,chr8:6262801-6263000,chr8:6405000-6408201,8,6262831,A,T,1,rs725438,...,6.47606e-10,4.340546e-15,1.9e-05,3.08727e-10,2.22996e-08,3.699122e-25,2.288615e-13,0.043879,0.000399,9.007175e-25
1,8_8258448_C_G_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258448,C,G,1,rs2920992,...,0.1634199,0.1208264,0.744984,1.0,0.001831117,2.916586e-09,0.0008773499,0.004506,0.437609,1.0
2,8_8258455_G_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258455,G,A,1,rs2979150,...,0.1636915,0.1190036,0.746993,1.0,0.001853348,2.886361e-09,0.0009617443,0.003875,0.441878,1.0
3,8_8258712_T_C_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8258712,T,C,1,rs2976929,...,0.1634199,0.1220044,0.744984,1.0,0.001828282,2.916586e-09,0.0008731569,0.004506,0.435632,1.0
4,8_8259117_T_A_b37,ENSG00000253893,chr8:8258201-8259600,chr8:8226600-8230601,8,8259117,T,A,1,rs2920991,...,0.1815006,0.1250446,0.744984,1.0,0.001823206,3.666259e-09,0.0009606651,0.004506,0.432639,1.0


In [50]:
pairs_df['rsid'] = pairs_df[rs_name] + '_' + pairs_df['promoter_id'] + pairs_df['enhancer_id'] 
pairs_df = pairs_df.drop_duplicates(subset='rsid')

In [51]:
data_z = pd.read_csv('../data/chromosome8_to_run_finemap.snp', sep=' ')[['rsid', 'beta', 'se', 'z', 'prob']]
print(data_z.shape)
data_z.head()

(248, 5)


Unnamed: 0,rsid,beta,se,z,prob
0,rs2639898_chr8:70667200-70670001chr8:71918401-...,0.774308,0.099723,7.76456,1.0
1,rs2923444_chr8:42537600-42545401chr8:42396801-...,-0.63454,0.06089,-10.4211,1.0
2,rs1441438_chr8:78665200-78668001chr8:79641601-...,-0.991622,0.091605,-10.825,0.5
3,rs1441438_chr8:78668200-78670000chr8:79641601-...,-0.991622,0.091605,-10.825,0.5
4,rs72669109_chr8:109362200-109363201chr8:110523...,0.633513,0.142989,4.4305,0.333333


In [52]:
pairs_df = pairs_df.merge(data_z, how='inner', left_on='rsid', right_on='rsid')

In [53]:
pairs_df.shape

(248, 417)

In [44]:
enhancer_features = []
promoter_features = []

for f in pairs_df.columns:
    if   f.startswith('enhancer_') and f not in {'enhancer_id', 'enhancer_bin'}: enhancer_features.append(f)
    elif f.startswith('promoter_') and f not in {'promoter_id', 'promoter_bin'}: promoter_features.append(f)
        
tissues_features_pval = list(i for i in pairs_df.columns if i.endswith('_pval'))
tissues_features_beta = list(i for i in pairs_df.columns if i.endswith('_beta'))

hi_c_features = ['hi_c']

In [None]:
tissues_features_beta = list(i+'_beta' for i in tissues_names)

In [None]:
pairs_df.head()

In [None]:
x_line = pairs_df[enhancer_features+promoter_features].values
x_hi_c = pairs_df[hi_c_features].values
x_same = pairs_df[hi_c_features+enhancer_features+promoter_features].values
x_pval = pairs_df[tissues_features_pval].values
x_beta = pairs_df[tissues_features_beta].values
x_allb = pairs_df[hi_c_features+enhancer_features+promoter_features+tissues_features_beta].values
y = pairs_df['z'].values
y = pairs_df['z'].values

_, e_line, _, _, _ = methodName.train_random_forest(x_line, y)
_, e_hi_c, _, _, _ = methodName.train_random_forest(x_hi_c, y)
_, e_same, _, _, _ = methodName.train_random_forest(x_same, y)
_, e_pval, _, _, _ = methodName.train_random_forest(x_pval, y)
_, e_beta, _, _, _ = methodName.train_random_forest(x_beta, y)
_, e_allb, _, _, _ = methodName.train_random_forest(x_allb, y)

In [None]:
e_line, e_hi_c, e_same, e_pval, e_beta, e_allb

In [None]:
pairs_df.head()

# Test idea residuals

In [46]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [54]:
X = pairs_df[tissues_features_beta+enhancer_features+promoter_features]
y = pairs_df['z']

In [None]:
X_internal.head()

In [64]:
full_errors = []
resd_errors = []
eqtl_errors = []
for i in range(1000):
    
    X_internal, X_external, y_internal, y_external = train_test_split(X, y, test_size=0.5)

    # Models
    model_eqtl = RandomForestRegressor()
    model_resd = RandomForestRegressor()
    model_full = RandomForestRegressor()

    # Train eqtl model
    X_train, X_test, y_train, y_test = train_test_split(X_internal, y_internal, test_size=0.2, random_state=12)
    model_eqtl = model_eqtl.fit(X_train[tissues_features_beta], y_train)
    y_pred = model_eqtl.predict(X_test[tissues_features_beta])
    e_test = mean_squared_error(y_test, y_pred)
    eqtl_errors.append(e_test)
    eqtl_pred = model_eqtl.predict(X_internal[tissues_features_beta])
    eqtl_resd = eqtl_pred - y_internal

    # Train residual models
    X_train, X_test, y_train, y_test = train_test_split(X_internal, eqtl_resd, test_size=0.2, random_state=12)
    model_resd = model_resd.fit(X_train[enhancer_features+promoter_features], y_train)
    y_pred = model_resd.predict(X_test[enhancer_features+promoter_features])
    e_test = mean_squared_error(y_test, y_pred)

    # Train full model
    X_train, X_test, y_train, y_test = train_test_split(X_internal, y_internal, test_size=0.2, random_state=12)
    model_full = model_full.fit(X_train[tissues_features_beta+enhancer_features+promoter_features], y_train)
    y_pred = model_full.predict(X_test[tissues_features_beta+enhancer_features+promoter_features])
    e_test = mean_squared_error(y_test, y_pred)


    # Compare external predictions
    external_pred = model_eqtl.predict(X_external[tissues_features_beta])
    external_pred += model_resd.predict(X_external[enhancer_features+promoter_features])
    e_test = mean_squared_error(y_external, external_pred)
    resd_errors.append(e_test)

    external_pred = model_full.predict(X_external[tissues_features_beta+enhancer_features+promoter_features])
    e_test = mean_squared_error(y_external, external_pred)
    full_errors.append(e_test)

In [61]:
full_errors = np.array(full_errors)
resd_errors = np.array(resd_errors)

In [62]:
sum(full_errors-resd_errors < 0)

74

In [63]:
(3-4)

-1

In [59]:
import warnings
warnings.filterwarnings('ignore')

In [None]:

print(e_test)

In [None]:

print(e_test)

In [9]:
for i in eQTL['gene_id'].unique(): print(i)

ENSG00000254207.1
ENSG00000253982.1
ENSG00000036448.5
ENSG00000246089.3
ENSG00000271743.1
ENSG00000253893.2
ENSG00000269918.1
ENSG00000255310.2
ENSG00000154319.10
ENSG00000136573.8
ENSG00000154328.11
ENSG00000186523.10
ENSG00000255556.2
ENSG00000145002.8
ENSG00000129422.9
ENSG00000158863.17
ENSG00000173566.9
ENSG00000120910.10
ENSG00000254272.1
ENSG00000158941.12
ENSG00000248738.2
ENSG00000120889.8
ENSG00000253616.1
ENSG00000228451.3
ENSG00000104228.8
ENSG00000147419.12
ENSG00000246339.4
ENSG00000104671.3
ENSG00000253457.1
ENSG00000272338.1
ENSG00000172728.11
ENSG00000129696.8
ENSG00000158669.7
ENSG00000070718.7
ENSG00000176209.7
ENSG00000168172.4
ENSG00000120925.9
ENSG00000104738.12
ENSG00000253140.1
ENSG00000168300.9
ENSG00000206579.7
ENSG00000137574.6
ENSG00000169122.7
ENSG00000215114.3
ENSG00000104388.10
ENSG00000254802.1
ENSG00000177182.6
ENSG00000172817.3
ENSG00000104442.5
ENSG00000066855.11
ENSG00000255107.1
ENSG00000221947.3
ENSG00000121039.5
ENSG00000171033.8
ENSG00000104427.7