# Part I

# Remapping SNPs in promoters, genes and enhancers (EnhancerAtlas)

For promoters, I will associate the center of the ENSEMBL curated promoters within 2500 base pairs of the TSS (Gao 2016)

For enhancers, I will remap using two sets of databases:

EnhancerAtlas (Thymus, Fetal Spinal Cord, and Astrocyte) (Gao et al, Bioinformatics 2016)

Genehancer (Predicted using HiC data and curated data) (Fishelevich et al, Database 2017)

In each case, the most significant SNP will be assigned to the gene

In [1]:
import pandas as pd
import mygene

In [25]:
snps = pd.read_csv('C:\\Users\\magne\\Documents\\Classes\\BNFO286\\final_project\\snp_level_summary_stats_pmid_25056061.txt', sep = '\t', skiprows = 1, header = None)
snps.columns = ['snpid','chr','bp','a1','a2','or','se','pval','info','ngt','CEUaf']
snps = snps.loc[:,['snpid','chr','bp','pval']]
snps.head()

Unnamed: 0,snpid,chr,bp,pval
0,rs3131972,1,742584,0.761033
1,rs3131969,1,744045,0.784919
2,rs3131967,1,744197,0.79352
3,rs1048488,1,750775,0.761041
4,rs12562034,1,758311,0.987899


In [26]:
x = []
for d in snps.loc[:,'chr']:
    x.append('chr'+str(d))
snps['chr'] = pd.DataFrame(x)
snps.head()

Unnamed: 0,snpid,chr,bp,pval
0,rs3131972,chr1,742584,0.761033
1,rs3131969,chr1,744045,0.784919
2,rs3131967,chr1,744197,0.79352
3,rs1048488,chr1,750775,0.761041
4,rs12562034,chr1,758311,0.987899


In [27]:
#I decided to choose the top 18,000 SNPS for remapping, which corresponds to a p-value of less than 0.005

snps = snps.loc[snps['pval']<0.005]

#Next make BED file to convert genomic positions via liftover (UCSC) from hg18 to hg38

In [120]:
snps_BED = snps
snps_BED = snps_BED.reset_index(drop=True)
x = []
for d in snps_BED.as_matrix():
    x.append(d[2]+1)
snps_BED['start'] = snps_BED['bp']
snps_BED['end'] = pd.DataFrame(x)
snps_BED = snps_BED.loc[:,['chr','start','end','snpid','pval']]
snps_BED.columns = ['chrom','chromStart','chromEnd','name','score']
snps_BED.to_csv('snps_BED.bed', sep = '\t', index = False, header = False)

snps_BED.head()

Unnamed: 0,chrom,chromStart,chromEnd,name,score
0,chr1,1217760,1217761,rs3737721,0.00455
1,chr1,2359358,2359359,rs4592207,0.001813
2,chr1,2380448,2380449,rs10910078,0.002838
3,chr1,2518200,2518201,rs12133956,0.003047
4,chr1,3121095,3121096,rs10909901,0.002338


In [5]:
#First we are going to assign gene symbols to the astrocyte enhancers 

enh_astrocyte = pd.read_csv('C:\\Users\\magne\\Documents\\Classes\\BNFO286\\final_project\\Astrocyte_EP.txt', sep = '\t', header = None)
enh_astrocyte = enh_astrocyte.iloc[:,[0,1,2,6]]
enh_astrocyte.columns = ['chr','start','end','ensembl']
enh_astrocyte.head()

Unnamed: 0,chr,start,end,ensembl
0,chrY,9991120,9996720,ENST00000515896
1,chrY,10011330,10030810,ENST00000515896
2,chrY,10030830,10041420,ENST00000515896
3,chrX,1763970,1782250,ENST00000381192
4,chrX,1763970,1782250,ENST00000381401


In [None]:
mg = mygene.MyGeneInfo()
geneList = enh_astrocyte.loc[:,'ensembl']
geneSyms = mg.querymany(geneList , scopes='ensembl.transcript', fields='symbol', species='human', as_dataframe = True)

In [7]:
sym = geneSyms
sym['ensembl'] = sym.index
sym = sym.loc[:,['symbol','ensembl']]
len(sym)

50964

In [8]:
sym.head()

Unnamed: 0_level_0,symbol,ensembl
query,Unnamed: 1_level_1,Unnamed: 2_level_1
ENST00000515896,RNA5-8SP6,ENST00000515896
ENST00000515896,RNA5-8SP6,ENST00000515896
ENST00000515896,RNA5-8SP6,ENST00000515896
ENST00000381192,CD99,ENST00000381192
ENST00000381401,SLC25A6,ENST00000381401


In [9]:
enh_astrocyte = enh_astrocyte.merge(sym, how = 'inner', right_on = 'ensembl', left_on = 'ensembl')
enh_astrocyte = enh_astrocyte.drop_duplicates()
enh_astrocyte.head()

Unnamed: 0,chr,start,end,ensembl,symbol
0,chrY,9991120,9996720,ENST00000515896,RNA5-8SP6
3,chrY,10011330,10030810,ENST00000515896,RNA5-8SP6
6,chrY,10030830,10041420,ENST00000515896,RNA5-8SP6
9,chrX,1763970,1782250,ENST00000381192,CD99
17,chrX,2254050,2296900,ENST00000381192,CD99


In [None]:
#now we are going to do the same for the fetal spinal cord enhancers

enh_fetalsc = pd.read_csv('C:\\Users\\magne\\Documents\\Classes\\BNFO286\\final_project\\Fetal_spinal_cord_EP.txt', sep = '\t', header = None)
enh_fetalsc = enh_fetalsc.iloc[:,[0,1,2,6]]
enh_fetalsc.columns = ['chr','start','end','ensembl']
mg = mygene.MyGeneInfo()
geneList = enh_fetalsc.loc[:,'ensembl']
geneSyms = mg.querymany(geneList , scopes='ensembl.transcript', fields='symbol', species='human', as_dataframe = True)
sym = geneSyms
sym['ensembl'] = sym.index
sym = sym.loc[:,['symbol','ensembl']]
enh_fetalsc = enh_fetalsc.merge(sym, how = 'inner', right_on = 'ensembl', left_on = 'ensembl')
enh_fetalsc = enh_fetalsc.drop_duplicates()
enh_fetalsc.head()

In [None]:
#now we are going to do the same for the thymus enhancers

enh_thymus = pd.read_csv('C:\\Users\\magne\\Documents\\Classes\\BNFO286\\final_project\\Thymus_EP.txt', sep = '\t', header = None)
enh_thymus = enh_thymus.iloc[:,[0,1,2,6]]
enh_thymus.columns = ['chr','start','end','ensembl']
mg = mygene.MyGeneInfo()
geneList = enh_thymus.loc[:,'ensembl']
geneSyms = mg.querymany(geneList , scopes='ensembl.transcript', fields='symbol', species='human', as_dataframe = True)
sym = geneSyms
sym['ensembl'] = sym.index
sym = sym.loc[:,['symbol','ensembl']]
enh_thymus = enh_thymus.merge(sym, how = 'inner', right_on = 'ensembl', left_on = 'ensembl')
enh_thymus = enh_thymus.drop_duplicates()
enh_thymus.head()

In [12]:
#concatenate all enhancers

enh_EA = pd.concat([enh_thymus, enh_fetalsc, enh_astrocyte])
enh_EA = enh_EA.iloc[:,[0,1,2,4]]
len(enh_EA)

98447

In [125]:
#export to BED file for conversion to hg38

enh_EA.to_csv('enh_BED.BED', sep = '\t', index = False, header = False)
enh_EA.head()

Unnamed: 0,chr,start,end,symbol
0,chrX,317820,319010,GTPBP6
3,chrX,330310,332820,GTPBP6
6,chrX,346250,346800,GTPBP6
9,chrX,317820,319010,PPP2R3B
10,chrX,330310,332820,GTPBP6


In [10]:
enh_mat = pd.read_csv('C:\\Users\\magne\\Documents\\Classes\\BNFO286\\final_project\\hglft_genome_enh_38.BED', header = None, sep = '\t')
enh_mat = enh_mat.as_matrix() #from hg19 to hg38
snp_mat = pd.read_csv('C:\\Users\\magne\\Documents\\Classes\\BNFO286\\final_project\\hglft_genome_snps_38.BED', header = None, sep = '\t')
snp_mat = snp_mat.as_matrix() #from hg18 to hg38

snpmap = {d[3]:f[3] for d in snp_mat for f in enh_mat if f[0] == d[0] and d[1] >= f[1] and d[1] <= f[2]}

In [155]:
len(snpmap)#We were able to map 2361 enhancers using this method

2361

In [156]:
import pickle

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

save_object(snpmap, 'snmmap_enhancer_atlas.pkl')

In [21]:
import pickle

with open('snmmap_enhancer_atlas.pkl', 'rb') as input:
    snpmap = pickle.load(input)

In [34]:
#convert gene positions to hg38 from hg18

genes = pd.read_csv('C:\\Users\\magne\\Documents\\Classes\\BNFO286\\final_project\\gene_level_summary_stats_pmid_25056061.txt', sep = '\t')
genes = genes.iloc[:,1:5]
x = []
for d in genes.loc[:,'Chr']:
    x.append('chr'+str(d))
genes['Chr'] = pd.DataFrame(x)
genes = genes.iloc[:,[1,2,3,0]]
genes.to_csv('genes_BED.BED', sep = '\t', index = False, header = False)
genes.head()

Unnamed: 0,Chr,Gene Start,Gene End,Gene
0,chr6,27906930,27907284,HIST1H4K
1,chr6,27913636,27914096,HIST1H2AK
2,chr6,27914418,27914867,HIST1H2BN
3,chr6,27941085,27941555,HIST1H2AL
4,chr6,27942548,27943338,HIST1H1B


In [192]:
#get TSS's

tss = pd.read_csv('refGene_hg19_TSS.BED', sep = '\t', header = None)
tss = tss.iloc[:,[0,1,2,4]]
x = []
for d in tss.as_matrix():
    x.append(d[1]+1)
tss[2]=x
tss.to_csv('tss_BED.BED',sep='\t',header = False,index = False)
tss.head()

Unnamed: 0,0,1,2,4
0,chr19,50432773,50432774,NUP62
1,chr13,45563613,45563614,NUFIP1
2,chr1,36081,36082,FAM138A
3,chr14,74353317,74353318,ZNF410
4,chr15,88799962,88799963,NTRK3


In [2]:
#promoters from ENSEMBL, tss from ENSEMBL

promoters = pd.read_csv('promoters_biomart.txt', sep = '\t')
x = []
for d in promoters.as_matrix():
    x.append(abs((d[1]-d[2])/2+d[1]))
promoters['average'] = x

tss = pd.read_csv('hglft_genome_tss_38.BED',sep='\t',header=None)
tss = tss.drop_duplicates()
tss = tss.as_matrix()            

In [7]:
#functionally map promoters to genes
y = []
for d in promoters.as_matrix():
    for f in tss:
        if abs(d[4] - f[1]) < 2500:
            y.append([d[0],d[1],d[2],f[3]])

In [8]:
x = []
for d in y:
    x.append('chr'+str(d[0]))
mapped_promoters = pd.DataFrame(y)
mapped_promoters[0]=x
mapped_promoters.to_csv('hg38_promoters_mapped_by_tss.csv')
mapped_promoters.head()

Unnamed: 0,0,1,2,3
0,chr5,134758200,134759801,DDX46
1,chr1,179081400,179083201,TOR3A
2,chr3,58491600,58493001,KCTD6
3,chr3,58491600,58493001,GAPT
4,chr1,32221600,32223201,TMEM234


In [15]:
p_mat = mapped_promoters.as_matrix()

snpmap_p = {d[3]:f[3] for d in snp_mat for f in p_mat if f[0] == d[0] and d[1] >= f[1] and d[1] <= f[2]}
len(snpmap_p) #the number of significant snps mapped to promoters is 249

249

In [38]:
#map genes
g_mat = genes.as_matrix()

snpmap_g = {d[3]:f[3] for d in snp_mat for f in g_mat if f[0] == d[0] and d[1] >= f[1] and d[1] <= f[2]}
len(snpmap_g) #the number of significant snps mapped to genes was 4277

4277

In [44]:
#make dataframe with snpid, gene symbol, p-value, and source

x = []
for k,v in snpmap.items():
    x.append([k,v])
for k,v in snpmap_p.items():
    x.append([k,v])
for k,v in snpmap_g.items():
    x.append([k,v])
snp_genes = pd.DataFrame(x)
snp_genes.columns = ['snpid','gene']
snp_genes = snp_genes.merge(snps, how = 'inner', right_on = 'snpid', left_on = 'snpid')
snp_genes.head() #6887 snps mapped

Unnamed: 0,snpid,gene,chr,bp,pval
0,rs4592207,SKI,chr1,2359358,0.001813
1,rs4592207,PANK4,chr1,2359358,0.001813
2,rs10910078,SKI,chr1,2380448,0.002838
3,rs10909901,HES5,chr1,3121095,0.002338
4,rs10909901,PRDM16,chr1,3121095,0.002338


In [45]:
#only get smallest pvals within each gene

def func(group):
    return group.loc[group['pval'] == group['pval'].min()]

snp_genes = snp_genes.groupby('gene', as_index=False).apply(func).reset_index(drop=True)

In [46]:
len(snp_genes)

2124

In [54]:
snp_genes = snp_genes.sort_values(by = 'pval').reset_index(drop = True) #Reset list

# Importing the Genehancer database

Please see the file Remapping_SNPs_EnhancerAtlas