# Part II

# Remapping SNPs in promoters, genes and enhancers (GeneHancer)

For promoters, I will associate the center of the ENSEMBL curated promoters within 2500 base pairs of the TSS (Gao 2016)

For enhancers, I will remap using two sets of databases:

EnhancerAtlas (Thymus, Fetal Spinal Cord, and Astrocyte) (Gao et al, Bioinformatics 2016)

GeneHancer (predicted using Hi-C data and curated data) (Fishilevich et al., Database 2017)

In each case, the most significant SNP will be assigned to the gene

In [1]:
import pandas as pd

In [33]:
# Read the GeneHancer spreadsheet and keep only the coordinate and
# annotation columns that the SNP-to-enhancer mapping reads later.

genehancer = pd.read_excel('genehancer.xlsx').loc[
    :, ['chrom', 'start', 'end', 'attributes']
]
genehancer.head()

Unnamed: 0,chrom,start,end,attributes
0,chr22,25191646,25191697,genehancer_id=GH22H025191;connected_gene=GC22P...
1,chr10,36278128,36278746,genehancer_id=GH10H036278;connected_gene=GC10M...
2,chr4,141347800,141349400,genehancer_id=GH04H141347;connected_gene=LINC0...
3,chr6,36141287,36142442,genehancer_id=GH06H036141;connected_gene=MAPK1...
4,chr4,184644792,184651047,genehancer_id=GH04H184644;connected_gene=CENPU...


In [40]:
# Map SNPs to GeneHancer enhancers.

# Enhancer rows are (chrom, start, end, attributes), following the column
# selection applied to `genehancer`. `.values` is used because
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
enh_mat = genehancer.values

# SNP coordinates lifted over from hg18 to hg38; the file has no header row.
snp_mat = pd.read_csv('hglft_genome_snps_38.BED', header=None, sep='\t').values

# Pair column 3 of each SNP row with the attributes of every enhancer on the
# same chromosome whose inclusive [start, end] interval contains the SNP's
# column-1 position. Every SNP is compared with every enhancer, so a SNP can
# contribute several pairs.
snpmap_gh = [
    [snp[3], enh[3]]
    for snp in snp_mat
    for enh in enh_mat
    if enh[0] == snp[0] and enh[1] <= snp[1] <= enh[2]
]
len(snpmap_gh)

3365

In [58]:
# Wrap the [snp, attributes] pairs in a labelled DataFrame.
# The column names are passed to the constructor so that an empty mapping
# list still yields a (zero-row) two-column frame; assigning `.columns`
# afterwards raises a length-mismatch error in that case.
snpmap_gh = pd.DataFrame(snpmap_gh, columns=['snpid', 'attributes'])
snpmap_gh.head()

Unnamed: 0,snpid,attributes
0,2438059,genehancer_id=GH01H002436;connected_gene=MORN1...
1,2459149,genehancer_id=GH01H002456;connected_gene=SLC35...
2,3214671,genehancer_id=GH01H003214;connected_gene=LOC10...
3,6065024,genehancer_id=GH01H006063;connected_gene=CHD5;...
4,6069775,genehancer_id=GH01H006069;connected_gene=CHD5;...


In [67]:
# Reduce each attribute string to a list of ';'-separated tokens
# (connected gene names interleaved with their score fields).
#
# The leading "genehancer_id=...;" field is removed by pattern rather than by
# a fixed 26-character slice, so an identifier of a different length cannot
# shift the cut point. `regex=` is spelled out on both replacements because
# the default of Series.str.replace changed from regex to literal matching
# in pandas 2.0.
snpmap_gh['attributes'] = (
    snpmap_gh['attributes']
    .str.replace(r'^genehancer_id=[^;]*;', '', regex=True)
    .str.replace('connected_gene=', '', regex=False)
    .str.split(';')
)

In [68]:
# For every row, keep only the tokens that do not contain the substring
# 'score'; what remains in each list are the connected gene names.
x = [
    [token for token in tokens if 'score' not in token]
    for tokens in snpmap_gh['attributes']
]

In [69]:
# Replace the raw token lists with the score-free gene lists built in `x`.
snpmap_gh = snpmap_gh.assign(attributes=x)
snpmap_gh['attributes'].head(5)

0    [MORN1, FAAP20, ATAD3B, LOC100129534, ENSG0000...
1                 [SLC35E2B, GC01M002449, GC01M002425]
2                [LOC105378605, PIR41593, GC01P003241]
3                    [CHD5, GC01M006038, LOC105376687]
4    [CHD5, RNF207, LOC102724450, RPL22, GC01M00603...
Name: attributes, dtype: object

In [70]:
# Expand each SNP row into one row per connected gene.
#
# The gene lists are flattened directly, remembering the row label each gene
# came from. This replaces the row-wise `apply(pd.Series)` + `stack()` idiom,
# which builds one Series per row (slow on thousands of rows) and relies on
# `stack()` silently dropping the NaN padding it creates.
gene_row_labels = []
gene_names = []
for row_label, gene_list in snpmap_gh['attributes'].items():
    for gene_name in gene_list:
        gene_row_labels.append(row_label)
        gene_names.append(gene_name)

# The index repeats the source row label once per gene, so the join below
# duplicates the SNP row for each of its genes (rows with an empty gene list
# are kept with a missing gene).
genes = pd.Series(gene_names, index=gene_row_labels, name='gene', dtype=object)
snpmap_gh = snpmap_gh.drop('attributes', axis=1).join(genes)
snpmap_gh.head()

Unnamed: 0,snpid,gene
0,rs4592207,MORN1
0,rs4592207,FAAP20
0,rs4592207,ATAD3B
0,rs4592207,LOC100129534
0,rs4592207,ENSG00000234396


In [74]:
# Write the SNP/gene pairs as a bare two-column CSV: no header row and no
# index column.
snpmap_gh.to_csv(
    'genehancer_snps.csv',
    header=False,
    index=False,
)

In [73]:
#Remap snps to promoters and genes

def _map_snps_to_regions(snp_rows, region_rows):
    """Return {snp_row[3]: region_row[3]} for every SNP that falls in a region.

    A SNP row matches a region row when column 0 (chromosome) is equal and the
    SNP's column-1 position lies inside the region's inclusive
    [column 1, column 2] interval. Because the result is a dict keyed by the
    SNP's column 3, a SNP overlapping several regions keeps only the last
    matching region in `region_rows` order.
    """
    return {
        snp[3]: region[3]
        for snp in snp_rows
        for region in region_rows
        if region[0] == snp[0] and region[1] <= snp[1] <= region[2]
    }

# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
mapped_promoters = pd.read_csv('hg38_promoters_mapped_by_tss.csv')
p_mat = mapped_promoters.values
snpmap_p = _map_snps_to_regions(snp_mat, p_mat)
len(snpmap_p) #the number of significant snps mapped to promoters is 249

# NOTE(review): read with the default header handling, so the first line of
# genes_BED.BED is consumed as column names -- confirm the file has a header.
genes = pd.read_csv('genes_BED.BED', sep = '\t')
g_mat = genes.values
snpmap_g = _map_snps_to_regions(snp_mat, g_mat)
len(snpmap_g) #the number of significant snps mapped to genes was 4277

4277

In [76]:
#Load snp database

# Only SNPs with a p-value strictly below this cutoff are kept.
PVAL_CUTOFF = 0.005

# The first line of the file is skipped and column names are assigned by hand.
snps = pd.read_csv('snp_level_summary_stats_pmid_25056061.txt', sep = '\t', skiprows = 1, header = None)
snps.columns = ['snpid','chr','bp','a1','a2','or','se','pval','info','ngt','CEUaf']
snps = snps.loc[:,['snpid','chr','bp','pval']]

# Prefix the chromosome with 'chr' (e.g. 1 -> 'chr1'). Done as one vectorised
# string operation instead of a Python loop followed by assigning a
# one-column DataFrame to the column.
snps['chr'] = 'chr' + snps['chr'].astype(str)

snps = snps.loc[snps['pval'] < PVAL_CUTOFF]
snps.head()

Unnamed: 0,snpid,chr,bp,pval
147,rs3737721,chr1,1217760,0.00455
531,rs4592207,chr1,2359358,0.001813
537,rs10910078,chr1,2380448,0.002838
600,rs12133956,chr1,2518200,0.003047
789,rs10909901,chr1,3121095,0.002338


In [79]:
#make dataframe with snpid, gene symbol, p-value

# Stack the three SNP -> gene sources into one list of [snpid, gene] pairs:
# enhancer-connected genes, then promoter hits, then gene-body hits.
# `.values.tolist()` replaces iterating over DataFrame.as_matrix(), which was
# removed in pandas 1.0.
x = snpmap_gh.values.tolist()
x.extend([snp_id, gene] for snp_id, gene in snpmap_p.items())
x.extend([snp_id, gene] for snp_id, gene in snpmap_g.items())

snp_genes = pd.DataFrame(x, columns=['snpid', 'gene'])

# Inner join on the SNP id attaches chr / bp / pval and drops any pair whose
# SNP is absent from the filtered `snps` table.
snp_genes = snp_genes.merge(snps, how='inner', on='snpid')
snp_genes.head() #27953 most significant snps mapped

Unnamed: 0,snpid,gene,chr,bp,pval
0,rs4592207,MORN1,chr1,2359358,0.001813
1,rs4592207,FAAP20,chr1,2359358,0.001813
2,rs4592207,ATAD3B,chr1,2359358,0.001813
3,rs4592207,LOC100129534,chr1,2359358,0.001813
4,rs4592207,ENSG00000234396,chr1,2359358,0.001813


In [85]:
#only get smallest pvals within each gene

def func(group):
    """Return the rows of `group` whose 'pval' equals the group's minimum.

    All rows tied on the minimum are returned, so the result can hold more
    than one row.
    """
    lowest_pval = group['pval'].min()
    is_lowest = group['pval'] == lowest_pval
    return group.loc[is_lowest]

# Keep, for every gene, only the row(s) carrying that gene's smallest p-value.
# Each row's p-value is compared with a grouped `transform('min')` rather than
# going through `groupby(...).apply(...)`: pandas 2.2 deprecated apply
# operating on the grouping column, which this step needs in order to keep
# 'gene' in the result.
gene_min_pval = snp_genes.groupby('gene')['pval'].transform('min')
snp_genes = (
    snp_genes.loc[snp_genes['pval'].eq(gene_min_pval)]
    # Stable sort by gene reproduces the gene-ordered layout (original row
    # order within each gene) that groupby/apply returned.
    .sort_values('gene', kind='mergesort')
    .reset_index(drop=True)
)

# Row count (8299 in the recorded run); rows tied on a gene's minimum p-value
# are all kept, so this can exceed the number of distinct genes.
len(snp_genes)

8299

In [89]:
# Order the gene table from most to least significant (ascending p-value),
# renumber the rows, and preview the ten strongest associations.

snp_genes = snp_genes.sort_values('pval')
snp_genes = snp_genes.reset_index(drop=True)
snp_genes.head(10)

Unnamed: 0,snpid,gene,chr,bp,pval
0,rs2021722,TRIM31,chr6,30282110,4.30138e-11
1,rs2021722,TRIM39,chr6,30282110,4.30138e-11
2,rs2021722,ENSG00000233892,chr6,30282110,4.30138e-11
3,rs2021722,ERVK9-12,chr6,30282110,4.30138e-11
4,rs2021722,HCG18,chr6,30282110,4.30138e-11
5,rs2021722,PIR43263,chr6,30282110,4.30138e-11
6,rs2021722,HLA-A,chr6,30282110,4.30138e-11
7,rs2021722,ENSG00000237669,chr6,30282110,4.30138e-11
8,rs2021722,ZNRD1,chr6,30282110,4.30138e-11
9,rs2021722,HLA-W,chr6,30282110,4.30138e-11


In [91]:
# Export the ranked SNP/gene table. The row index is written as the first,
# unnamed column of the CSV (the to_csv default, made explicit here).

snp_genes.to_csv('snps_genehancer_list.csv', index=True)