### ANCESTRY based SNP identification and LD expansion for each disorder

SNPs associated with all four disorders (MS, IBD, RA, PS) were retrieved from the EBI GWAS Catalog and then separated into four lists by ancestry (ancestries under consideration: EUR for European, ASW for African, EAS for East Asian, and SAS for South Asian).

#### SNP data wrangling

In [None]:
import pandas as pd

# Load the GWAS SNP table; the code below uses its SNPS and ANCESTRY columns.
gwas_snps_all = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/gwas_snps_ancestry_disorder.txt', sep='\t')

# european ancestry snp list (disabled; note that it matched both 'EUR' and 'EUR ')
# gwas_snps_european = gwas_snps_all[(gwas_snps_all.ANCESTRY=='EUR') | (gwas_snps_all.ANCESTRY=='EUR ')]
# european_snp_list = gwas_snps_european.SNPS.unique().tolist()
# with open('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/european_snp_list.txt', "w") as f:
#     for snp in european_snp_list:
#         f.write("%s\n" % snp)

## east asian ancestry: unique SNP identifiers, one per line
gwas_snps_eastasian = gwas_snps_all.loc[gwas_snps_all['ANCESTRY'] == 'EAS']
eastasian_snp_list = gwas_snps_eastasian['SNPS'].unique().tolist()
with open('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/eastasian_snp_list.txt', "w") as f:
    f.writelines("%s\n" % snp for snp in eastasian_snp_list)

## african ancestry: unique SNP identifiers, one per line
gwas_snps_african = gwas_snps_all.loc[gwas_snps_all['ANCESTRY'] == 'ASW']
african_snp_list = gwas_snps_african['SNPS'].unique().tolist()
with open('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/african_snp_list.txt', "w") as f:
    f.writelines("%s\n" % snp for snp in african_snp_list)

## south asian ancestry: unique SNP identifiers, one per line
gwas_snps_southasian = gwas_snps_all.loc[gwas_snps_all['ANCESTRY'] == 'SAS']
southasian_snp_list = gwas_snps_southasian['SNPS'].unique().tolist()
with open('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/southasian_snp_list.txt', "w") as f:
    f.writelines("%s\n" % snp for snp in southasian_snp_list)

# Report how many unique SNPs each list holds (south asian, african, east asian).
for ancestry_snps in (southasian_snp_list, african_snp_list, eastasian_snp_list):
    print(len(ancestry_snps))

In [None]:
import pandas as pd

# Reload the GWAS SNP table and discard rows that are exact duplicates.
gwas_snps_all = pd.read_csv(
    '/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/gwas_snps_ancestry_disorder.txt',
    sep='\t',
).drop_duplicates()

# Every distinct identifier in the SNPS column, in order of first appearance.
gwas_snps_all_list = gwas_snps_all['SNPS'].unique().tolist()

# Disabled exploration code, kept for reference:
# #gwas_snps_all['ANCESTRY'] = gwas_snps_all.groupby(['SNPS'])['ANCESTRY'].transform(lambda x: ','.join(x))
# gwas_snps_all['CATEGORY'] = gwas_snps_all.groupby(['SNPS', 'ANCESTRY'])['CATEGORY'].transform(lambda x: ','.join(x))
# gwas_snps_all_dedup = gwas_snps_all[['SNPS','ANCESTRY','CATEGORY']].drop_duplicates()
#gwas_snps_all.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/gwas_snps_ancestry_disorder_deduplicated.txt', sep='\t', index=False)
#gwas_snps_all[gwas_snps_all['SNPS']=='rs112768831']



In [None]:
import requests
from requests_futures.sessions import FuturesSession
from concurrent.futures import as_completed
import time
import json

def getVarID(chrom, start):
    """Return the name of the variant found at one genomic position.

    Posts a GA4GH variant search to the Ensembl REST API for the window
    ``[start, start + 1)`` on ``chrom`` and returns the first name of the
    first variant in the response.

    Parameters
    ----------
    chrom : int or str
        Chromosome name without a ``chr`` prefix. Numeric names are sent as
        JSON numbers (unchanged from before); any other name, e.g. ``"X"``,
        is sent as a JSON string instead of raising ``ValueError``.
    start : int or str
        Start coordinate of the search window; converted with ``int``.

    Returns
    -------
    str or None
        The first name of the first returned variant, or ``None`` when the
        response is not JSON or contains no variant / no name.
    """
    try:
        reference_name = int(chrom)
    except ValueError:
        # Non-numeric chromosomes (X, Y, MT, alternate contigs) cannot be cast
        # to int; pass them through as text so one such row does not abort a
        # whole batch of lookups.
        reference_name = str(chrom).strip()
    start = int(start)
    end = start + 1
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    # json.dumps quotes and escapes the payload correctly for both the
    # numeric and the textual chromosome form.
    payload = json.dumps({"variantSetId": 1,
                          "referenceName": reference_name,
                          "start": start,
                          "end": end})
    r = requests.post("https://rest.ensembl.org/ga4gh/variants/search",
                      headers=headers, data=payload)
    try:
        result = r.json()['variants'][0]['names'][0]
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError: body is not JSON; the others: no variants or no names.
        result = None
    return result

In [None]:
# Take the identifiers that start with 'chr' (coordinate-style names rather than
# rsIDs) and split each at ':' into separate columns: column 0 is the
# chromosome, column 1 the position text.
s = pd.Series([x for x in gwas_snps_all_list if x.startswith('chr')]).str.split(':', expand = True)
# Column 2 = position + 1; it is written as the third (end) field of the file.
s[2] = s[1].astype(int) + 1
# Remove spaces from the position text. This runs after the integer conversion
# above, so column 2 is computed from the unmodified text.
s[1] = s[1].str.replace(' ', '')
# Column 3 = '<chrom>:<position>' rebuilt without spaces; written as the name field.
s[3] = s[0].astype(str)+':'+s[1].astype(str)
# Write the four columns tab-separated, without header or index.
s.to_csv("/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/no_id_hg19.bed", sep="\t", header=None, index=False)

In [None]:
# Download liftOver chain files to convert to hg38
!curl 'http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz' | gunzip > "/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/hg19ToHg38.over.chain"


In [None]:
%%bash
/data/reddylab/Revathy/software/liftOver/liftOver \
/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/no_id_hg19.bed \
/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/hg19ToHg38.over.chain \
/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/no_id_hg38.bed \
/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/no_id.unmapped

In [None]:
!head /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/no_id_hg38.bed

In [None]:
# Load the lifted-over coordinates (headerless, so columns are numbered 0..3).
lifted = pd.read_table("/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/no_id_hg38.bed", header=None)
# Remove only a leading "chr". str.strip("chr") would instead strip any run of
# the characters 'c', 'h', 'r' from BOTH ends of the name.
lifted[0] = lifted[0].str.replace(r'^chr', '', regex=True)

# One Ensembl lookup per row (chromosome from column 0, position from column 1).
# Building the column in a single assignment avoids per-row .loc writes.
lifted['id'] = [getVarID(chrom, pos) for chrom, pos in zip(lifted[0], lifted[1])]
lifted.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/lifted_ids.txt', sep='\t', index=False)

# Keep only the lookups that produced an rsID (failed lookups are None).
lifted_ids = [x for x in lifted['id'] if str(x).startswith('rs')]
print(len(lifted_ids), "rsIDs recovered")

In [None]:
# Reload the saved lookup table, name the original identifier column (header
# '3' in the file) 'SNPS', keep only the identifier/rsID pair, and drop rows
# where either value is missing.
lifted = (
    pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/lifted_ids.txt', sep='\t')
    .rename(columns={'3': 'SNPS'})
    .loc[:, ['SNPS', 'id']]
    .dropna()
)
lifted_ids = lifted['id'].tolist()
lifted_ids


In [None]:
# GWAS rows whose identifier is a 'chr...' coordinate rather than an rsID.
# na=False keeps rows with a missing SNPS value out of the mask instead of
# failing on it; .copy() makes chr_ids an independent frame so the column
# rewrite below is not a chained assignment on a slice of gwas_snps_all.
chr_ids = gwas_snps_all[gwas_snps_all['SNPS'].str.startswith('chr', na=False)].copy()
# Remove spaces so the identifiers match the 'SNPS' values in `lifted`.
chr_ids['SNPS'] = chr_ids['SNPS'].str.replace(' ', '')
# Attach the recovered rsID ('id') to each coordinate-style row.
chrID_rsID = chr_ids.merge(lifted, on='SNPS', how='inner').drop_duplicates()
chrID_rsID = chrID_rsID[['id','ANCESTRY','CATEGORY']]
chrID_rsID = chrID_rsID.rename(columns={'id':'SNPS'})

### Keep the rows of gwas_snps_all that already carry an rsID and append the mapped ones

gwas_rsID = gwas_snps_all[gwas_snps_all['SNPS'].str.startswith('rs', na=False)]
gwas_snps_rsID_mapped = pd.concat([gwas_rsID,chrID_rsID])
# Every row in this table is a GWAS-reported SNP.
gwas_snps_rsID_mapped['origin'] = 'gwas'
gwas_snps_rsID_mapped.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/gwas_snps_all_rsID_mapped.txt', sep='\t', index=False)


In [None]:
gwas_snps_rsID_mapped

In [None]:
import pandas as pd

# Read the European SNP list (one identifier per line, no header) into a
# single 'snp_id' column and keep only identifiers that start with 'rs'.
european_snp = pd.read_csv(
    '/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/european_snp_list.txt',
    sep='\t',
    names=['snp_id'],
)
has_rs_prefix = european_snp['snp_id'].str.startswith('rs')
european_snp = european_snp.loc[has_rs_prefix]

snp_euro_list = european_snp['snp_id'].tolist()

# problem_list = ['DRB*15:01','chr2:2062560332','DRB*08:01','DRB*03:01','DRB*13:03','A*02:01','chr1:208286009','chr10:2081043743','chr16:2011354091','chr16:2031021078','chr2:2061072183','chr18:2051816394','chr22:2021974703','chr11:20128391937','chr1:20152591953','chr6:20111929862','chr6:2030916259','chr3:20189662658','chr14:2035839236','chr11:20109962432','chr2:20163167746','chr5:20150469973','chr6:2020689945',
#                 'chr1:2067713346','chr19:2010463118','chr17:2026124908','chr13:2040745693','chr5:20158829527','chr6:20138197824','chr1:2024518206','chr5:2096118852','chr12:2056741228','chr5:20131996445','chr20:2048574454','chr3:2016996623','chr21:2036488822','chr10:2075601596','chr1:2025291010','chr19:2010886206','chr13:2045321731','chr1:20197757846','chr6:20159506600','chr6:20577820','chr17:2078175483','chr9:2032523737',
#                 'chr17:2040536396','chr11:2064053157','chr7:2037385365','chr3:20101647309','chr9:20110792282','chrX:78464616','chr17:38031857','2-62564875','chr1:2523811','chr6:14103212','chr11:107967350','chr21:35928240','chr7:128580042','chr8:129011095','chr20:52210360','chr6:119215402','chr10:6390285','rs67934705','chr5:1519833','chr13:100026952','chr3:5035903','chr6:14691215','chr3:121783015',
#                 'chr3:100656795','chr3:121765368','chr3:100848597','chr1:198573373','chr14:88523488','chr3:112693983','chr1:1682374','chr1:154983036','chr16:68694818','chr1:150593391','chr11:118783424','chr11:14868316','chr16:11213951','chr1:32715071','chr1:32738415','chr16:11353879','chr16:68335911','chr18:60902282','chr2:112492986','chr5:40429250','chr8:129177769','chr20:52783991','chr8:95851818',
#                 'chr7:50328339','chr9:86543849','rs34723276','chr3:47005668','chr6:130348257','chr16:30171625','chr10:21867179','chr1:208286011','chr10:2081043743','chr16:2031021078','chr16:2011354091','chr2:2062560332','chr18:2051816394','chr22:2021974703','chr11:20109962432','chr1:20152591953','chr3:20189662658','chr14:2035839236','chr6:2020689945','chr2:2061072183','chr1:2067713346','chr17:2026124908','chr11:20128391937','chr2:20163167746',
#                 'chr6:20111929862','chr5:20150469973','chr6:2030916259','chr12:2056741228','chr19:2010463118','chr13:2040745693','chr5:20158829527','chr6:20138197824','chr5:20131996445','chr20:2048574454','chr10:2075601596','chr17:2040536396','chr21:2036488822','chr1:2025291010','chr3:20101647309','chr5:2096118852','chr1:2024518206','chr6:20577820','chr7:2037385365','chr3:2016996623','chr19:2010886206','chr1:20197757846','chr6:20159506600',
#                 'chr9:2032523737','chr13:2045321731','chr11:2064053157','chr17:2078175483','chr9:20110792282']


# problem_df = pd.DataFrame(problem_list, columns=['snp_id'])

# final_european_snp = pd.concat([european_snp,problem_df], axis=0, ignore_index=True).drop_duplicates(keep=False)
# final_european_snp_list = list(final_european_snp['snp_id'])


# European query list: the rsIDs read from european_snp_list.txt followed by
# the rsIDs recovered for coordinate-style identifiers.
# NOTE(review): lifted_ids is derived from every 'chr...' identifier in
# gwas_snps_all, with no ancestry filter visible in this notebook — confirm
# that all of them belong in the European list.
european_snp_list = snp_euro_list + lifted_ids
# Display the size of the combined list.
len(european_snp_list)


In [None]:
!mkdir -p /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/

#### LD expansion based on ancestry

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import requests
from requests_futures.sessions import FuturesSession
from concurrent.futures import as_completed
import time
import json
import re

def _collect_ld_records(urls, max_workers, ld_snps, ok_message=None, fail_message=None):
    """Fetch LD URLs concurrently, append parsed rows to ``ld_snps``, return the URLs that failed."""
    failed = []
    if not urls:
        # Nothing to request: do not start a session or a worker pool.
        return failed

    session = FuturesSession(max_workers=max_workers)
    headers = {"Content-Type": "application/json"}
    # Remember which URL every future belongs to, so a failed request can be
    # reported without calling future.result() a second time (which would
    # re-raise the same error).
    url_by_future = {session.get(url, headers=headers): url for url in urls}

    for future in as_completed(url_by_future):
        url = url_by_future[future]
        try:
            response = future.result()
            response.raise_for_status()
            records = response.json()
            if not isinstance(records, list):
                raise ValueError("expected a JSON list of LD records")
            # Parse the whole response before touching ld_snps, so a malformed
            # record can never leave the four lists with different lengths.
            rows = [(rec['variation1'], rec['variation2'], rec['r2'], rec['d_prime'])
                    for rec in records]
        except Exception:
            # Best effort by design: any transport, HTTP or parsing failure
            # marks the URL as a problem instead of aborting the whole run.
            failed.append(url)
            if fail_message:
                print(fail_message)
            continue

        for query, result, r2, d_prime in rows:
            ld_snps['query'].append(query)
            ld_snps['result'].append(result)
            ld_snps['r2'].append(r2)
            ld_snps['d_prime'].append(d_prime)
        if ok_message:
            print(ok_message)

    return failed


def ensembl_LD(snp_list, population="1000GENOMES:phase_3:EUR"):
    """Query the Ensembl REST LD endpoint for every SNP in ``snp_list``.

    Each SNP is requested once with up to 10 concurrent workers; requests that
    fail are retried once with up to 8 workers.

    Parameters
    ----------
    snp_list : list
        Variant identifiers to use as LD queries.
    population : str, optional
        Population name placed in the request URL. Defaults to
        ``"1000GENOMES:phase_3:EUR"``, the value that was previously
        hard-coded, so existing calls behave as before.

    Returns
    -------
    ld_snps : pandas.DataFrame
        Columns ``query``, ``result``, ``r2`` and ``d_prime``, filled from the
        ``variation1``, ``variation2``, ``r2`` and ``d_prime`` fields of every
        returned record.
    problems2 : list of str
        Request URLs that failed on both attempts.

    Raises
    ------
    TypeError
        If ``snp_list`` is not a list.
    """
    start_time = time.time()

    # Check input
    if not isinstance(snp_list, list):
        raise TypeError("The input is not a list!")

    # Init empty dict for storing results
    ld_snps = {"query" : [],
               "result" : [],
               "r2" : [],
               "d_prime" : []}

    urls = ["https://rest.ensembl.org/ld/human/{}/{}?".format(snp, population)
            for snp in snp_list]

    # First pass over every SNP
    problems = _collect_ld_records(urls, 10, ld_snps,
                                   fail_message="Found a problem. Will be requeued.")

    # Requeue the problem URLs in case there was a fluke
    problems2 = _collect_ld_records(problems, 8, ld_snps,
                                    ok_message="Succeeded on requeue!",
                                    fail_message="Failed on a requeued problem.")

    print("Done!")
    minutes, seconds = divmod(time.time() - start_time, 60)
    print("Elapsed: {0:.0f} minutes {1:.0f} seconds".format(minutes, seconds))
    ld_snps = pd.DataFrame.from_dict(ld_snps)
    return ld_snps, problems2

def ensembl_snp_info(snp_list):
    """Fetch per-variant annotation from the Ensembl REST variation endpoint.

    The identifiers are posted in batches of at most 100 with up to 10
    concurrent workers. Batches that fail (transport error, HTTP error status
    or a body that is not a JSON object) are skipped and reported on stdout.

    Parameters
    ----------
    snp_list : list
        Variant identifiers to look up. An empty list sends no request.

    Returns
    -------
    dict of list
        Keys ``snp_id``, ``MAF``, ``minor_allele``, ``consequence``,
        ``allele_string``, ``location`` and ``synonyms``; all lists have the
        same length, one entry per variant returned by Ensembl.
        ``consequence`` comes from the record's ``most_severe_consequence``;
        ``allele_string`` and ``location`` come from the record's first
        mapping and are ``None`` when that mapping is missing or incomplete.

    Raises
    ------
    TypeError
        If ``snp_list`` is not a list.
    """
    start_time = time.time()

    # Check input
    if not isinstance(snp_list, list):
        raise TypeError("The input is not a list!")

    result_dict = {}
    problems = []

    # Stepping through the list in strides of 100 never yields an empty batch,
    # also when the length is an exact multiple of 100.
    batch_size = 100
    batches = [snp_list[i:i + batch_size] for i in range(0, len(snp_list), batch_size)]

    if batches:
        session = FuturesSession(max_workers=10)
        headers = {"Content-Type" : "application/json",
                   "Accept" : "application/json"}
        # Remember which IDs were sent with each request so a failed batch can
        # be reported without calling future.result() again.
        ids_by_future = {
            session.post("https://rest.ensembl.org/variation/homo_sapiens",
                         headers=headers,
                         data=json.dumps({'ids' : ids})): ids
            for ids in batches
        }

        n_complete = 0
        n_total = len(batches)  # progress is counted in batches, not SNPs
        for future in as_completed(ids_by_future):
            try:
                response = future.result()
                # Without this check an error body would be merged into
                # result_dict and break the parsing below.
                response.raise_for_status()
                json_data = response.json()
                if not isinstance(json_data, dict):
                    raise ValueError("expected a JSON object keyed by variant ID")
            except Exception:
                # Best effort by design: note the affected IDs and carry on.
                problems.extend(ids_by_future[future])
                print("Found a problem.")
                continue
            result_dict.update(json_data)
            n_complete += 1
            print("Finished {} of {}".format(n_complete, n_total))

    print("Done!")
    if problems:
        print("{} IDs were in failed batches".format(len(problems)))
    minutes, seconds = divmod(time.time() - start_time, 60)
    print("Elapsed: {0:.0f} minutes {1:.0f} seconds".format(minutes, seconds))

    all_snp_info = {'snp_id' : [],
                    'MAF' :[],
                    'minor_allele' : [],
                    'consequence' : [],
                    'allele_string' : [],
                    'location' : [],
                    'synonyms' : []}
    for key, info in result_dict.items():
        all_snp_info['snp_id'].append(key)
        # .get() keeps one record with a missing field from aborting the run.
        all_snp_info['MAF'].append(info.get('MAF'))
        all_snp_info['minor_allele'].append(info.get('minor_allele'))
        all_snp_info['synonyms'].append(info.get('synonyms'))
        all_snp_info['consequence'].append(info.get('most_severe_consequence'))
        # Some records have no usable first mapping. Resolve both values
        # before appending so the two lists always grow together.
        try:
            first_mapping = info['mappings'][0]
            allele_string = first_mapping['allele_string']
            location = first_mapping['location']
        except (KeyError, IndexError, TypeError):
            print(info)
            allele_string = None
            location = None
        all_snp_info['allele_string'].append(allele_string)
        all_snp_info['location'].append(location)

    return all_snp_info

# LD expansion for the African SNP list.
# NOTE(review): this call feeds african_snp_list to ensembl_LD, whose request
# URL targets the 1000GENOMES:phase_3:EUR population — confirm that EUR LD is
# what is intended for the African list.
ld_snps, problems = ensembl_LD(african_snp_list)
ld_snps['r2'] = ld_snps['r2'].astype(float)

# Keep LD partners with r2 of at least 0.8 and collect their distinct IDs.
passes_r2 = ld_snps['r2'] >= 0.8
ld_snps_threshold = ld_snps.loc[passes_r2]
ld_snps_threshold_list = ld_snps_threshold['result'].unique().tolist()

# Annotate the retained partners and join the annotation onto the LD table.
ld_snp_info = ensembl_snp_info(ld_snps_threshold_list)
all_snp_df = pd.DataFrame.from_dict(ld_snp_info)

ld_snps_with_info = pd.merge(
    ld_snps_threshold,
    all_snp_df,
    how='left',
    left_on='result',
    right_on='snp_id',
)

# Split the location text at ':' and '-' into chrom / start / end columns,
# prefixing the chromosome with 'chr'; the raw location column is then dropped.
coords = ld_snps_with_info['location'].str.split(r":|-", expand=True)
ld_snps_with_info['chrom'] = 'chr' + coords[0]
ld_snps_with_info['start'] = coords[1]
ld_snps_with_info['end'] = coords[2]
ld_snps_with_info = ld_snps_with_info.drop(columns=['location'])
ld_snps_with_info.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/ld_snps_african.txt', sep='\t', index=False)



In [None]:
problems

#### Concatenate all the ld_expansions 

In [None]:
!wc -l /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/ld_snps_eastasian.txt

In [None]:
import pandas as pd

# Load the per-ancestry LD tables and tag each with its ancestry label.
european_df = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/ld_snps_european.txt', sep='\t').assign(category='European')

eastasian_df = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/ld_snps_eastasian.txt', sep='\t').assign(category='EastAsian')

southasian_df = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/ld_snps_southasian.txt', sep='\t').assign(category='SouthAsian')

african_df = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/ld_snps_african.txt', sep='\t').assign(category='African')

# Stack the four tables, drop rows without a chromosome, attach the GWAS
# columns of each query SNP (inner join on query == SNPS) and remove exact
# duplicate rows.
all_snp_df = (
    pd.concat([european_df, eastasian_df, southasian_df, african_df])
    .dropna(subset=['chrom'])
    .merge(gwas_snps_rsID_mapped, left_on='query', right_on='SNPS', how='inner')
    .drop_duplicates()
)
all_snp_df

# NOTE(review): the disabled to_csv below is the only visible write of
# all_ld_snp.txt, which later cells read — confirm the file on disk is current.
# all_except_eur = all_snp_df[(all_snp_df['category']=='SouthAsian') | (all_snp_df['category']=='EastAsian') | (all_snp_df['category']=='African')]
# all_except_eur
# all_snp_df.to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_ld_snp.txt', sep='\t', index=False)
# #all_snp_df[all_snp_df.isnull().any(axis=1)]
# all_snp_df[all_snp_df['query']=='rs12601925']

In [None]:
!head /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/ld_snps_european.txt

In [None]:
!wc -l /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_ld_snp.txt

In [None]:
len(european_df.snp_id.unique())

In [None]:
len(all_snp_df['result'][all_snp_df['CATEGORY']=='RA'].unique())

In [None]:
import pandas as pd

all_snp_df = pd.read_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_ld_snp.txt', sep='\t')

# Source column -> output column, in output order. Selecting and renaming in
# one step builds an independent frame, instead of assigning new columns onto
# a slice of all_snp_df.
bed_columns = {
    'chrom': 'chrom',
    'start': 'start',
    'end': 'end',
    'snp_id': 'name',
    'category': 'ancestry',
    'CATEGORY': 'disorder',
    'allele_string': 'allele_string',
    'minor_allele': 'minor_allele',
}
ld_to_bed = (
    all_snp_df[list(bed_columns)]
    .rename(columns=bed_columns)
    .astype({'start': int, 'end': int})
    .drop_duplicates()
)

# Only the origin flag is needed from the GWAS table. Reducing it to one row
# per SNP first keeps the left join from repeating a BED row once for every
# ancestry/disorder row that SNP has in the GWAS table.
gwas_origin = gwas_snps_rsID_mapped[['SNPS', 'origin']].drop_duplicates()
ld_to_bed_annotated = (
    ld_to_bed
    .merge(gwas_origin, left_on='name', right_on='SNPS', how='left')
    .drop(columns=['SNPS'])
)
# SNPs absent from the GWAS table were reached through LD expansion only.
ld_to_bed_annotated['origin'] = ld_to_bed_annotated['origin'].fillna('ld')
# ld_to_bed_annotated[ld_to_bed_annotated.origin=='ld']

ld_to_bed_annotated.to_csv("/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_ld_snps_r2_0.8.bed", sep="\t", header=None, index=None)
# len(ld_to_bed_annotated['name'].unique())


In [None]:
%%bash
# Work in the ancestry output directory.
cd /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry
# Sort by field 1 (chromosome, as text) and then by field 2 (start, numeric).
sort -k1,1 -k2,2n all_ld_snps_r2_0.8.bed > all_ld_snps_r2_0.8.sorted.bed

In [None]:
ld_to_bed_annotated

In [None]:
!head /data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_ld_snps_r2_0.8.sorted.bed

In [None]:
import pandas as pd
import numpy as np

# Column names for the headerless sorted BED file.
bed_column_names = ['chr','start','end','snp_id','ancestry','disorder','allele_string','minor_allele','origin']
all_ld_snp = pd.read_csv(
    '/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_ld_snps_r2_0.8.sorted.bed',
    sep='\t',
    names=bed_column_names,
)

# end - start for every row; a negative value means end lies before start.
all_ld_snp['difference'] = all_ld_snp['end'] - all_ld_snp['start']
# For those rows, move end up by the size of the gap (making it equal to
# start); all other rows keep their end unchanged.
end_before_start = all_ld_snp['difference'] < 0
all_ld_snp['end'] = np.where(
    end_before_start,
    all_ld_snp['end'] + all_ld_snp['difference'].abs(),
    all_ld_snp['end'],
)

# The 'difference' column is not dropped, so it is written as the last column
# and still holds the value computed before the correction.
all_ld_snp.drop_duplicates().to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_ld_snps_r2_0.8_new.sorted.bed', sep='\t',index=False, header=None)
all_ld_snp

# Disabled split into point / non-point variants, kept for reference:
# # all_ld_snp_point = all_ld_snp[all_ld_snp['difference']==0]
# all_ld_snp_non_point = all_ld_snp[(all_ld_snp['difference']<=12)&(all_ld_snp['difference']!=0)&(all_ld_snp['difference']!=-1)]

# # all_ld_snp_point = all_ld_snp_point.drop(columns='difference')
# all_ld_snp_non_point = all_ld_snp_non_point.drop(columns='difference')
# all_ld_snp_non_point.drop_duplicates().to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_non_point_ld_snps_r2_0.8.sorted.bed', sep='\t',index=False, header=None)
# all_ld_snp_point.drop_duplicates().to_csv('/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/ancestry/all_point_ld_snps_r2_0.8.sorted.bed', sep='\t',index=False, header=None)

In [None]:
# all_ld_snp_point is otherwise only defined in the disabled code of the
# previous cell, so this cell raised NameError on a top-to-bottom run. Rebuild
# it here the way that disabled code did: rows whose recorded end - start
# difference is 0, without the helper 'difference' column.
all_ld_snp_point = (
    all_ld_snp[all_ld_snp['difference'] == 0]
    .drop(columns='difference')
    .drop_duplicates()
)
# Show the point variants that were reported by GWAS directly.
all_ld_snp_point[all_ld_snp_point.origin=='gwas']

#### Checking the Query SNPs in MS

In [None]:
import pandas as pd

# Reload the raw GWAS SNP table (no de-duplication in this cell).
gwas_snps_all = pd.read_csv(
    '/data/reddylab/Revathy/collabs/Maria/human-th-ms_new/data_v1/snp_data/source_snps/gwas_snps_ancestry_disorder.txt',
    sep='\t',
)
# Distinct identifiers from the SNPS column, in order of first appearance.
gwas_snps_all_list = pd.unique(gwas_snps_all['SNPS']).tolist()
# Disabled: the set of SNPs whose CATEGORY is 'MS'.
# gwas_snps_ms = set(gwas_snps_all['SNPS'][gwas_snps_all['CATEGORY']=='MS'].unique())
# len(gwas_snps_ms)