In [165]:
import pandas as pd
import requests
from requests_futures.sessions import FuturesSession
from concurrent.futures import as_completed
import time
import json
import re

snp_list = ['rs61766198','rs2477678']

result_dict = {}
problems = []

session = FuturesSession(max_workers=10)
batch = 0
futures = []
    
headers={"Content-Type" : "application/json",
             "Accept" : "application/json"}
    

futures.append(session.post("https://rest.ensembl.org/variation/human/?pops=1", 
                                    headers=headers,
                                    data = json.dumps({'ids' : snp_list})))

n_complete = 0
n_total = len(snp_list)
for f in as_completed(futures):
    json_data = f.result().json()
    result_dict.update(json_data)
    n_complete += 1
    print("Finished {} of {}".format(n_complete, n_total))
    
for key in result_dict.keys():
    pop_list = result_dict[key]['populations']
    KGP_list = [pop for pop in pop_list if pop.get('population') == '1000GENOMES:phase_3:ALL']
    

Finished 1 of 2
{'frequency': 0.8161, 'allele_count': 50679, 'allele': 'T', 'population': 'gnomADg:nfe'}
{'frequency': 0.496, 'allele_count': 31848, 'allele': 'T', 'population': 'gnomADg:nfe'}


In [3]:
import pandas as pd
import requests
from requests_futures.sessions import FuturesSession
from concurrent.futures import as_completed
import time
import json
import re

def ensembl_snp_info(snp_list):
    start_time = time.time()
    
    # Check input
    if not isinstance(snp_list, list):
        raise TypeError("The input is not a list!")
        
    result_dict = {}
    problems = []
    
    # Start up the non-blocking requests in batches of 100 SNPs per request
    session = FuturesSession(max_workers=10)
    batch = 0
    futures = []
    
    headers={"Content-Type" : "application/json",
             "Accept" : "application/json"}
    
    if len(snp_list) < 100:
        futures.append(session.post("https://rest.ensembl.org/variation/human/?pops=1", 
                                    headers=headers,
                                    data = json.dumps({'ids' : snp_list})))

    else:
        for i in range(0, len(snp_list)//100):
            futures.append(session.post("https://rest.ensembl.org/variation/human/?pops=1", 
                                        headers=headers,
                                        data = json.dumps({'ids' : snp_list[batch*100:(batch+1)*100]})))
            batch += 1
        
        # Add the last batch (with less than 100 IDs)
        futures.append(session.post("https://rest.ensembl.org/variation/human/?pops=1", 
                                    headers=headers,
                                    data = json.dumps({'ids' : snp_list[(batch)*100:]})))

    # As the requests complete, try to parse the data into the results_dict, otherwise report a problem URL
    n_complete = 0
    n_total = len(snp_list)
    for f in as_completed(futures):
        try:
            json_data = f.result().json()
            result_dict.update(json_data)
            n_complete += 1
            print("Finished {} of {}".format(n_complete, n_total))

            
        except:
            #problems.append(f.result().request.url)
            problems = problems + json.loads(f.result().request.body)['ids']
            print("Found a problem.")
    print("Done!")
    print("Elapsed: {0} minutes {1:0f} seconds".format((time.time() - start_time) // 60, (time.time() - start_time) % 60))
    #snp_info = pd.DataFrame.from_dict(result_dict)
    snp_allele_info = {'snp_id':[],
                   'allele_string':[],
                   'location' : [],
                   'gnomAD_allele':[],
                   'gnomAD_allele_freq':[],
                   '1KGP_allele':[],
                   '1KGP_allele_freq':[]}
    for key in result_dict.keys():
        snp_allele_info['snp_id'].append(key)
        pop_list = result_dict[key]['populations']
        KGP_list = [pop for pop in pop_list if pop.get('population') == '1000GENOMES:phase_3:ALL']
        gnomAD_list = [pop for pop in pop_list if pop.get('population') == 'gnomADg:ALL']
        KGP_allele=[]
        KGP_allele_freq=[]
        gnomAD_allele=[]
        gnomAD_allele_freq=[]
        for i in range(len(KGP_list)):
            KGP_allele.append(KGP_list[i]['allele'])
            KGP_allele_freq.append(KGP_list[i]['frequency'])
        snp_allele_info['1KGP_allele'].append(KGP_allele)
        snp_allele_info['1KGP_allele_freq'].append(KGP_allele_freq)
        
        for j in range(len(gnomAD_list)):
            gnomAD_allele.append(gnomAD_list[j]['allele'])
            gnomAD_allele_freq.append(gnomAD_list[j]['frequency'])
        snp_allele_info['gnomAD_allele'].append(gnomAD_allele)
        snp_allele_info['gnomAD_allele_freq'].append(gnomAD_allele_freq)
    # Some of these have missing mappings? Or maybe it isn't a list?
        try:
            snp_allele_info['allele_string'].append(result_dict[key]['mappings'][0]['allele_string'])
            snp_allele_info['location'].append(result_dict[key]['mappings'][0]['location'])
        except:
            print(result_dict[key])
            snp_allele_info['allele_string'].append(None)
            snp_allele_info['location'].append(None)

    
    return snp_allele_info, problems


snp_list_df = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/final_snp_list.txt', names=['snp_id'])
snp_list = list(snp_list_df['snp_id'])

snp_allele_info, problems = ensembl_snp_info(snp_list)
snp_allele_info_df = pd.DataFrame.from_dict(snp_allele_info)

coords = snp_allele_info_df['location'].str.split(r":|-", expand=True)
snp_allele_info_df['chrom'], snp_allele_info_df['start'], snp_allele_info_df['end'] = 'chr' + coords[0], coords[1], coords[2]

snp_allele_info_df


Finished 1 of 2330
Finished 2 of 2330
Finished 3 of 2330
Finished 4 of 2330
Finished 5 of 2330
Finished 6 of 2330
Finished 7 of 2330
Finished 8 of 2330
Finished 9 of 2330
Finished 10 of 2330
Finished 11 of 2330
Finished 12 of 2330
Finished 13 of 2330
Finished 14 of 2330
Finished 15 of 2330
Finished 16 of 2330
Finished 17 of 2330
Finished 18 of 2330
Finished 19 of 2330
Finished 20 of 2330
Finished 21 of 2330
Finished 22 of 2330
Finished 23 of 2330
Finished 24 of 2330
Done!
Elapsed: 2.0 minutes 37.105392 seconds


Unnamed: 0,1KGP_allele,1KGP_allele_freq,allele_string,gnomAD_allele,gnomAD_allele_freq,location,snp_id,chrom,start,end
0,"[C, A]","[0.698282747603834, 0.301717252396166]",A/C/G,"[G, A, C]","[6.996e-06, 0.328, 0.6723]",8:78695658-78695658,rs10957897,chr8,78695658,78695658
1,"[T, C]","[0.312699680511182, 0.687300319488818]",C/A/T,"[C, T]","[0.7133, 0.2867]",10:62637156-62637156,rs10995249,chr10,62637156,62637156
2,"[A, G]","[0.479832268370607, 0.520167731629393]",G/A,"[A, G]","[0.4972, 0.5028]",2:197785535-197785535,rs771018,chr2,197785535,197785535
3,"[G, A]","[0.857428115015974, 0.142571884984026]",G/A,"[A, G]","[0.2085, 0.7915]",22:37148446-37148446,rs3218255,chr22,37148446,37148446
4,"[C, T]","[0.909944089456869, 0.090055910543131]",C/G/T,"[T, C]","[0.1541, 0.8459]",11:128522042-128522042,rs61907765,chr11,128522042,128522042
5,"[G, A]","[0.916134185303514, 0.0838658146964856]",G/A,"[G, A]","[0.9099, 0.09006]",16:67864894-67864894,rs1123072,chr16,67864894,67864894
6,"[C, T]","[0.270367412140575, 0.729632587859425]",T/C,"[C, T]","[0.2208, 0.7792]",1:119988510-119988510,rs1493698,chr1,119988510,119988510
7,"[G, A]","[0.245806709265176, 0.754193290734824]",G/A/C,"[G, A]","[0.2556, 0.7444]",6:32609443-32609443,rs9271144,chr6,32609443,32609443
8,"[G, C]","[0.629992012779553, 0.370007987220447]",G/A/C,"[G, C]","[0.6064, 0.3936]",6:29738351-29738351,rs2523389,chr6,29738351,29738351
9,"[A, C]","[0.30870607028754, 0.69129392971246]",C/A,"[C, A]","[0.6831, 0.3169]",11:118692672-118692672,rs585039,chr11,118692672,118692672


In [64]:
snp_allele_info_df[snp_allele_info_df['1KGP_allele_freq']==0]

Unnamed: 0,1KGP_allele,1KGP_allele_freq,allele_string,gnomAD_allele,gnomAD_allele_freq,location,snp_id,chrom,start,end


In [33]:
import numpy as np

snp_allele_df = snp_allele_info_df[['chrom','start','end','snp_id','allele_string','1KGP_allele','1KGP_allele_freq','gnomAD_allele','gnomAD_allele_freq']]

gnomAD_ref_allele=[]
KGP_ref_allele=[]
for n in range(0,len(snp_allele_df)):
    if len(snp_allele_df['gnomAD_allele_freq'][n])!=0:
        idx = snp_allele_df['gnomAD_allele_freq'][n].index(max(snp_allele_df['gnomAD_allele_freq'][n]))
        gnomAD_ref_allele.append(snp_allele_df['gnomAD_allele'][n][idx])
    else:
        gnomAD_ref_allele.append('')
        
    if len(snp_allele_df['1KGP_allele_freq'][n])!=0:
        idx = snp_allele_df['1KGP_allele_freq'][n].index(max(snp_allele_df['1KGP_allele_freq'][n]))
        KGP_ref_allele.append(snp_allele_df['1KGP_allele'][n][idx])
    else:
        KGP_ref_allele.append('')

snp_allele_df['gnomAD_ref_allele'] = gnomAD_ref_allele
snp_allele_df['1KGP_ref_allele'] = KGP_ref_allele

snp_allele_df['difference'] = snp_allele_df['end'].astype(int) - snp_allele_df['start'].astype(int)

snp_allele_df['end'] = np.where(snp_allele_df['difference']<0, snp_allele_df['end'].astype(int)+np.abs(snp_allele_df['difference']) , snp_allele_df['end'])
snp_allele_df['start_1'] = snp_allele_df['start'].astype(int)-1
snp_allele_df['chr_start_end'] = snp_allele_df['chrom']+"_"+snp_allele_df['start_1'].astype(str)+"_"+snp_allele_df['end'].astype(str)

snp_allele_df[['chrom','start_1','end']].to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/variant_coordinates.bed', sep='\t', header=None, index=False)





In [59]:
snp_allele_df[snp_allele_df['gnomAD_allele_freq']==0]

Unnamed: 0,chrom,start,end,snp_id,allele_string,1KGP_allele,1KGP_allele_freq,gnomAD_allele,gnomAD_allele_freq,gnomAD_ref_allele,1KGP_ref_allele,variant_ref_sequence,score,strand


In [28]:
%%bash
sort -k1,1 -k2,2n -k3,3n /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/variant_coordinates.bed \
> /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/variant_coordinates.sorted.bed

In [29]:
%%bash
module load bedtools2
bedtools getfasta -fi /data/reddylab/Reference_Data/Genomes/hg38/hg38.fa \
-bed /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/variant_coordinates.sorted.bed \
-fo /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/variant_coordinates_ref_seq.fa

In [35]:
import pandas as pd


variant_fasta = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/variant_coordinates_ref_seq.fa', sep='\t', header=None)
variant_fasta = pd.DataFrame(variant_fasta.values.reshape(-1, 2), columns = ['position', 'sequence'])
variant_fasta['position'] = variant_fasta['position'].str.replace('>','')
variant_fasta['position'] = variant_fasta['position'].str.replace(r':|-','_')
variant_fasta = variant_fasta.rename(columns = {'position':'chr_start_end',
                                                    'sequence':'variant_ref_sequence'})
# variant_fasta.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/large_snp_atac_gene_library_strand_bed_phase2_hg38.txt',sep='\t',index=False)
snp_allele_fasta_df = snp_allele_df.merge(variant_fasta, on='chr_start_end', how='inner')
snp_allele_fasta_df.drop(columns=['start_1','difference','chr_start_end']).to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/final_snp_list_with_allele_frequencies.txt', sep='\t', index=False)


In [43]:
import pandas as pd

snp_allele_df = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/final_snp_list_with_allele_frequencies.txt', sep='\t')
snp_allele_df['score'] = '0'
snp_allele_df['strand'] = '.'

snp_allele_df[['chrom','start','end','snp_id','score','strand','allele_string','variant_ref_sequence','1KGP_ref_allele','1KGP_allele','1KGP_allele_freq','gnomAD_ref_allele','gnomAD_allele','gnomAD_allele_freq']].to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/final_snp_list_with_allele_frequencies.bed', sep='\t',header=None, index=False)


In [66]:
snp_allele_df[snp_allele_df[['variant_ref_sequence','1KGP_ref_allele','gnomAD_ref_allele']].nunique(axis=1) == 1]

Unnamed: 0,chrom,start,end,snp_id,allele_string,1KGP_allele,1KGP_allele_freq,gnomAD_allele,gnomAD_allele_freq,gnomAD_ref_allele,1KGP_ref_allele,variant_ref_sequence,score,strand
1,chr10,62637156,62637156,rs10995249,C/A/T,"['T', 'C']","[0.312699680511182, 0.687300319488818]","['C', 'T']","[0.7133, 0.2867]",C,C,C,0,.
2,chr2,197785535,197785535,rs771018,G/A,"['A', 'G']","[0.479832268370607, 0.520167731629393]","['A', 'G']","[0.4972, 0.5028]",G,G,G,0,.
3,chr22,37148446,37148446,rs3218255,G/A,"['G', 'A']","[0.857428115015974, 0.142571884984026]","['A', 'G']","[0.2085, 0.7915]",G,G,G,0,.
4,chr11,128522042,128522042,rs61907765,C/G/T,"['C', 'T']","[0.909944089456869, 0.090055910543131]","['T', 'C']","[0.1541, 0.8459]",C,C,C,0,.
5,chr16,67864894,67864894,rs1123072,G/A,"['G', 'A']","[0.916134185303514, 0.0838658146964856]","['G', 'A']","[0.9099, 0.09006]",G,G,G,0,.
6,chr1,119988510,119988510,rs1493698,T/C,"['C', 'T']","[0.270367412140575, 0.729632587859425]","['C', 'T']","[0.2208, 0.7792]",T,T,T,0,.
8,chr6,29738351,29738351,rs2523389,G/A/C,"['G', 'C']","[0.629992012779553, 0.370007987220447]","['G', 'C']","[0.6064, 0.3936]",G,G,G,0,.
9,chr11,118692672,118692672,rs585039,C/A,"['A', 'C']","[0.30870607028754, 0.69129392971246]","['C', 'A']","[0.6831, 0.3169]",C,C,C,0,.
13,chr3,46335514,46335514,rs71615445,C/T,"['C', 'T']","[0.931709265175719, 0.0682907348242811]","['C', 'T']","[0.945, 0.05502]",C,C,C,0,.
15,chr11,65982174,65982174,rs551659,C/G/T,"['C', 'T']","[0.58905750798722, 0.41094249201278]","['C', 'T']","[0.6242, 0.3758]",C,C,C,0,.


In [44]:
%%bash
sort -k1,1 -k2,2n -k3,3n /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/final_snp_list_with_allele_frequencies.bed \
> /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/final_snp_list_with_allele_frequencies.sorted.bed


In [45]:
!head -n 20 /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/final_snp_list_with_allele_frequencies.sorted.bed

chr1	1308516	1308516	rs61766198	0	.	C/G/T	C	C	['C', 'T']	[0.689496805111821, 0.310503194888179]	T	['T', 'C']	[0.56, 0.44]
chr1	2547072	2547072	rs2477678	0	.	T/A/G	T	A	['A', 'T']	[0.631988817891374, 0.368011182108626]	A	['A', 'T', 'G']	[0.587, 0.4132, 7.02e-06]
chr1	2548544	2548544	rs1974044	0	.	A/G	a	G	['G', 'A']	[0.625399361022364, 0.374600638977636]	G	['A', 'G']	[0.4249, 0.5751]
chr1	2556224	2556224	rs2227312	0	.	C/A/G	C	A	['A', 'C']	[0.599241214057508, 0.400758785942492]	A	['C', 'A']	[0.4562, 0.5438]
chr1	2556327	2556327	rs2227313	0	.	T/A/C/G	T	C	['T', 'C']	[0.38917731629393, 0.61082268370607]	C	['C', 'T']	[0.5609, 0.4393]
chr1	2556714	2556714	rs4870	0	.	A/G	A	G	['G', 'A']	[0.614816293929712, 0.385183706070288]	G	['A', 'G']	[0.4439, 0.5561]
chr1	2557169	2557169	rs1886730	0	.	T/A/C	T	C	['C', 'T']	[0.645766773162939, 0.354233226837061]	C	['C', 'T']	[0.5907, 0.4093]
chr1	2585342	2585342	rs60733400	0	.	G/A/C	G	A	['G', 'A']	[0.478634185303514, 0.521365814696486]	G	['G', 'A']	[0.56