# Demo of Utilizing New GraphQL API for SNP Data

## Setting Up API Connection

The script begins by setting up a connection to a GraphQL API, specifying the base URL and the endpoint.

In [None]:
import requests
import json
import pandas as pd
from config.settings import settings

# Normalise the configured site URL so it always ends with exactly one "/".
# URLs in this notebook are built by plain concatenation
# (e.g. f"{BASE_URL}annotations"), which breaks if the slash is missing.
BASE_URL = str(settings.SITE_URL).rstrip('/') + '/'
GRAPHQL_ENDPOINT = 'graphql'

## Understanding Annotations in the API

The next cell sends a GET request to the `annotations` endpoint to retrieve the list of annotations. These annotations describe the data fields available through the API, detailing characteristics such as name, description, and how they relate to each other hierarchically (each row carries a `parent_id`), much like a structured catalog of options you can query.

**api_field:** Specifies the field name as it should be used in API requests, particularly when crafting queries for a GraphQL API. This ensures you're asking for data in a format the API understands.

In [3]:
# Fetch the annotation catalogue from the REST endpoint.
response = requests.get(f"{BASE_URL}annotations")
# Stop on an HTTP error status instead of failing later on a missing 'results' key.
response.raise_for_status()

annotations = response.json()
# The records live under the 'results' key; each record becomes one row.
annotations_df = pd.DataFrame(annotations['results'])
annotations_df

Unnamed: 0,id,leaf,name,label,sort,parent_id,detail,link,pmid,field_type,keyword_searchable,api_field,root_url,sample_url,value_type
0,0,False,root,Annotation,0.0,,,,,,,,,,
1,1,False,Basic Info,,1.0,0,"Basic information about the variant, such as c...",,,,,,,,
2,26,False,ANNOVAR,,2.0,0,Pre-computed ANNOVAR annotations for all alter...,http://annovar.openbioinformatics.org/en/lates...,20601685,,,,,,
3,208,False,SnpEff,,3.0,0,AnpEff is a program for annotating and predict...,http://pcingola.github.io/SnpEff/,22728672,,,,,,
4,132,False,VEP,,4.0,0,Variant Effect Predictor (VEP) is developed by...,https://uswest.ensembl.org/info/docs/tools/vep...,27268795,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,621,True,sno_miRNA_type,,,495,the type of snoRNA or miRNA (from miRBase/snoR...,,,text,True,sno_miRNA_type,,,
542,622,True,splicing_consensus_ada_score,,,495,splicing-change prediction for splicing consen...,,,float,,splicing_consensus_ada_score,,,
543,623,True,splicing_consensus_rf_score,,,495,splicing-change prediction for splicing consen...,,,float,,splicing_consensus_rf_score,,,
544,624,True,target_gene,,,495,"target gene (for promoter, enhancer, etc.) bas...",,,text,True,target_gene,,,


## Extracting SNP Data Through a GraphQL Query

The script continues by constructing a GraphQL query designed to fetch specific information about Single Nucleotide Polymorphisms (SNPs) based on criteria such as chromosome number and position range. This query illustrates GraphQL's capability to precisely target and retrieve the needed data from the server, thus optimizing the data acquisition process. The response from this query provides detailed attributes of SNPs for subsequent processing or analysis.

In [15]:
# Select a handful of fields for SNPs on chromosome "1" with start=10 and end=1000000.
query = """
query MyQuery {
  GetSNPsByChromosome(chr: "1", end: 1000000, start: 10) {
    alt 
    chr 
    pos
    rs_dbSNP151
    ref
    ANNOVAR_ensembl_Effect 
    ANNOVAR_refseq_Effect
  }
}
"""

response = requests.post(f"{BASE_URL}{GRAPHQL_ENDPOINT}", json={'query': query})

data = response.json()
# GraphQL servers report failed queries in an "errors" list; surface it here
# instead of hitting an opaque TypeError/KeyError on the lookup below.
if data.get('errors'):
    raise RuntimeError(f"GraphQL query failed: {data['errors']}")
snps_by_chromosome = data['data']['GetSNPsByChromosome']

## Processing and Displaying the Data

After receiving data from the GraphQL query, the script processes it for analysis. This involves flattening the nested structure of the data response to a more straightforward, table-like format.

In [17]:
# json_normalize already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
flattened_data = pd.json_normalize(snps_by_chromosome)
snp_df = flattened_data
snp_df

Unnamed: 0,alt,chr,pos,rs_dbSNP151,ref,ANNOVAR_ensembl_Effect,ANNOVAR_refseq_Effect
0,A,1,54353,rs140052487,C,ncRNA_intronic|downstream,intergenic
1,T,1,54564,rs558796213,G,ncRNA_intronic|downstream,intergenic
2,G,1,54591,rs561234294,A,ncRNA_intronic|downstream,intergenic
3,A,1,16071,rs541172944,G,ncRNA_intronic|downstream,ncRNA_intronic|downstream
4,T,1,16141,rs529651976,C,ncRNA_intronic|downstream,ncRNA_intronic|downstream
5,C,1,16280,rs866639523,T,ncRNA_intronic|downstream,ncRNA_intronic|downstream
6,G,1,13380,rs571093408,C,ncRNA_exonic|ncRNA_intronic|downstream,ncRNA_exonic|downstream
7,C,1,49298,rs10399793,T,upstream,intergenic
8,T,1,54676,rs2462492,C,ncRNA_intronic|downstream,intergenic


# Get SNP by VCF File
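Variant IDs of the form `chr:posREF>ALT` (for example `1:115921355T>C`) are built from each VCF record and passed to the `GetSNPsByIDs` query.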

In [39]:
# Read the sample VCF; the context manager closes the handle even if reading fails.
with open("../data/sample-vcf.txt", "r") as file:
    lines = file.readlines()
lines

['##fileformat=VCFv4.1\n',
 '\n',
 '\n',
 '#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  Test\n',
 'chr1    115921355   rs7552722   T   C   50.46   .   AC=1;AF=0.500;AN=2;BaseQRankSum=-1.920;ClippingRankSum=0.322;DB;DP=8;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=-1.517;QD=11.22;ReadPosRankSum=0.322;SOR=1.329 GT:AD:DP:GQ:PL  "0/1:3,5:8:77:118,0,77"\n',
 'chr1    12046063    rs2336384   G   T   581.46  .   AC=1;AF=0.500;AN=2;BaseQRankSum=0.736;ClippingRankSum=0.736;DB;DP=3;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=-0.736;QD=10.60;ReadPosRankSum=0.736;SOR=1.179  GT:AD:DP:GQ:PL  "0/1:1,2:3:23:60,0,23"\n',
 'chr1    154418879   rs4537545   C   T   599.45  .   AC=1;AF=0.500;AN=2;BaseQRankSum=0.727;ClippingRankSum=0.727;DB;DP=4;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.727;QD=10.95;ReadPosRankSum=0.727;SOR=0.916   GT:AD:DP:GQ:PL  "0/1:1,3:4:20:72,0,20"\n',
 'chr1    154426970   rs2228145   A   C   1161.77 .   AC=1;AF=0.500;AN=2;BaseQRankSum=0.531;ClippingRankSum=

In [53]:
def get_ids(text):
    """Build variant IDs of the form ``chrom:posREF>ALT`` from VCF lines.

    Args:
        text: Iterable of VCF lines (e.g. the result of ``file.readlines()``).

    Returns:
        list[str]: One ID per data line, in input order. A leading "chr" on
        the chromosome name is dropped, so ``chr1`` becomes ``1``.

    Raises:
        IndexError: If a data line has fewer than five whitespace-separated
            columns (CHROM, POS, ID, REF, ALT).
    """
    ids = []
    for line in text:
        stripped = line.strip()
        # Skip header/meta lines and blank lines, including whitespace-only
        # ones, which would otherwise split into an empty list and fail below.
        if not stripped or stripped.startswith("#"):
            continue
        fields = stripped.split()
        chrom = fields[0]
        # Drop only a leading "chr"; named `chrom` to avoid shadowing builtin chr().
        if chrom.startswith("chr"):
            chrom = chrom[len("chr"):]
        pos, ref, alt = fields[1], fields[3], fields[4]
        ids.append(f"{chrom}:{pos}{ref}>{alt}")
    return ids
# Convert the VCF lines read earlier into the ID strings used by the ID-based query.
ids = get_ids(lines)
# Display the list serialised as JSON (double-quoted strings), the same form
# that is embedded in the GraphQL query text.
json.dumps(ids)

'["1:115921355T>C", "1:12046063G>T", "1:154418879C>T", "1:154426970A>C", "1:169073346G>A", "1:169099483A>G", "1:183081194A>C", "1:183266182C>T", "1:218860068C>T", "1:235600129T>C", "1:237266603A>G", "1:237349738T>C", "1:237990122A>G", "1:96944797A>C", "10:123337335A>G", "10:123346116G>A", "10:123346190A>G", "10:123348662T>G", "10:123352317A>G", "10:35295431C>G", "10:35535695A>G", "10:62085337T>C", "10:78646536G>T", "11:108175462G>A", "11:112085316T>C", "11:112991618G>T", "11:113106455A>G", "11:117156893G>C", "11:13293905C>T", "11:13331226G>A", "11:13361524C>T", "11:18822037A>G", "11:2138139A>G", "11:2484803T>C", "11:2550730A>G", "11:2691471A>G", "11:2752609G>A", "11:2757985C>T", "11:27583129T>C", "11:2766282G>A", "11:27667202T>G", "11:27725986T>A", "11:27728539A>C", "11:2839751C>T", "11:2847069T>G", "11:35329615G>T", "11:47275064G>A", "11:61746291A>C", "11:65249145A>G", "11:65260646A>G", "11:68201295C>T", "12:102912558G>A", "12:124894184T>C", "12:13641706C>A", "12:13872044G>T", "12:139

In [55]:
# json.dumps renders the Python list as a double-quoted array literal that is
# embedded directly in the query text; page_args requests the first 5 matches.
query = f"""
query myQuery {{
  GetSNPsByIDs(
    ids: {json.dumps(ids)}
    page_args: {{from_: 0, size: 5}}
  ) {{
    id
    chr
    pos
    ANNOVAR_ensembl_Effect
  }}
}}
"""

response = requests.post(f"{BASE_URL}{GRAPHQL_ENDPOINT}", json={'query': query})

data = response.json()
# GraphQL servers report failed queries in an "errors" list; surface it here
# instead of hitting an opaque TypeError/KeyError on the lookup below.
if data.get('errors'):
    raise RuntimeError(f"GraphQL query failed: {data['errors']}")
snps_by_id = data['data']['GetSNPsByIDs']
snps_by_id

[]

In [27]:
# json_normalize already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
flattened_data = pd.json_normalize(snps_by_id)
snp_df = flattened_data
snp_df

Unnamed: 0,id,chr,pos,ANNOVAR_ensembl_Effect
0,2:10662G>C,2,10662,intergenic
1,2:10632C>A,2,10632,intergenic


# Get SNP by RsID

In [28]:
# Look up a single SNP by rsID.
# NOTE(review): filter_args.exists presumably keeps only records that have the
# listed fields populated; confirm against the API schema.
query = """
query myQuery {
  GetSNPsByRsID(rsID: "rs189126619", filter_args: {exists: ["rs_dbSNP151"]}) {
    rs_dbSNP151
    chr
  }
}
"""

response = requests.post(f"{BASE_URL}{GRAPHQL_ENDPOINT}", json={'query': query})

data = response.json()
# GraphQL servers report failed queries in an "errors" list; surface it here
# instead of hitting an opaque TypeError/KeyError on the lookup below.
if data.get('errors'):
    raise RuntimeError(f"GraphQL query failed: {data['errors']}")
snps_by_RsID = data['data']['GetSNPsByRsID']
snps_by_RsID

[{'rs_dbSNP151': 'rs189126619', 'chr': '2'}]

In [29]:
# json_normalize already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
flattened_data = pd.json_normalize(snps_by_RsID)
snp_df = flattened_data
snp_df

Unnamed: 0,rs_dbSNP151,chr
0,rs189126619,2


# Get SNP by RsIDs

In [30]:
# Look up several SNPs at once by passing a list of rsIDs.
# NOTE(review): filter_args.exists presumably keeps only records that have the
# listed fields populated; confirm against the API schema.
query = """
query myQuery {
  GetSNPsByRsIDs(rsIDs: ["rs115366554", "rs189126619"], filter_args: {exists: ["rs_dbSNP151"]}) {
    rs_dbSNP151
    chr
  }
}
"""

response = requests.post(f"{BASE_URL}{GRAPHQL_ENDPOINT}", json={'query': query})

data = response.json()
# GraphQL servers report failed queries in an "errors" list; surface it here
# instead of hitting an opaque TypeError/KeyError on the lookup below.
if data.get('errors'):
    raise RuntimeError(f"GraphQL query failed: {data['errors']}")
snps_by_RsIDs = data['data']['GetSNPsByRsIDs']
snps_by_RsIDs

[{'rs_dbSNP151': 'rs115366554', 'chr': '2'},
 {'rs_dbSNP151': 'rs189126619', 'chr': '2'}]

In [32]:
# json_normalize already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
flattened_data = pd.json_normalize(snps_by_RsIDs)
snp_df = flattened_data
snp_df

Unnamed: 0,rs_dbSNP151,chr
0,rs115366554,2
1,rs189126619,2


# Get aggregates by chromosome

In [18]:
# Request aggregate statistics for the `chr` and `pos` fields over chromosome "2",
# start=10, end=100000.
query = """
query myQuery {
  GetAggsByChromosome(chr: "2", end: 100000, start: 10) {
    chr {
      histogram {
        key
        doc_count
      }
      min
      max
    }
    pos {
      frequency {
        doc_count
        key
      }
      missing {
        doc_count
      }
      doc_count
    }
  }
}
"""

response = requests.post(f"{BASE_URL}{GRAPHQL_ENDPOINT}", json={'query': query})

data = response.json()
# GraphQL servers report failed queries in an "errors" list; surface it here
# instead of hitting an opaque TypeError/KeyError on the lookup below.
if data.get('errors'):
    raise RuntimeError(f"GraphQL query failed: {data['errors']}")
aggs_by_chromosome = data['data']['GetAggsByChromosome']
aggs_by_chromosome

{'chr': {'histogram': [{'key': '9786.54', 'doc_count': 9},
   {'key': '14679.810000000001', 'doc_count': 0},
   {'key': '19573.08', 'doc_count': 0},
   {'key': '24466.350000000002', 'doc_count': 0},
   {'key': '29359.620000000003', 'doc_count': 0},
   {'key': '34252.89', 'doc_count': 0},
   {'key': '39146.16', 'doc_count': 0},
   {'key': '44039.43000000001', 'doc_count': 0},
   {'key': '48932.700000000004', 'doc_count': 0},
   {'key': '53825.97', 'doc_count': 0},
   {'key': '58719.240000000005', 'doc_count': 0},
   {'key': '63612.51000000001', 'doc_count': 0},
   {'key': '68505.78', 'doc_count': 0},
   {'key': '73399.05', 'doc_count': 0},
   {'key': '78292.32', 'doc_count': 0},
   {'key': '83185.59000000001', 'doc_count': 0},
   {'key': '88078.86000000002', 'doc_count': 0},
   {'key': '92972.13', 'doc_count': 0},
   {'key': '97865.40000000001', 'doc_count': 0},
   {'key': '102758.67000000001', 'doc_count': 0},
   {'key': '107651.94', 'doc_count': 0},
   {'key': '112545.21', 'doc_count'

In [20]:
# json_normalize already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
# Nested dicts become dotted columns (e.g. pos.missing.doc_count); list-valued
# fields such as chr.histogram stay as lists inside a single cell.
flattened_data = pd.json_normalize(aggs_by_chromosome, max_level=4)
snp_df = flattened_data
snp_df

Unnamed: 0,chr.histogram,chr.min,chr.max,pos.frequency,pos.missing.doc_count,pos.doc_count
0,"[{'key': '9786.54', 'doc_count': 9}, {'key': '...",10597.0,10662.0,"[{'doc_count': 1, 'key': '10597'}, {'doc_count...",0,9


# Download SNP Data by Chromosome

In [24]:
# Ask the server to export the selected fields for the matching SNPs.
# The query has no selection set: the field itself is the result value.
query = """
query myQuery {
  DownloadSNPsByChromosome(
    chr: "2"
    end: 1000000
    fields: ["chr", "pos", "ref"]
    start: 10
    page_args: {from_: 2, size: 2}
  )
}
"""

response = requests.post(f"{BASE_URL}{GRAPHQL_ENDPOINT}", json={'query': query})

data = response.json()
# GraphQL servers report failed queries in an "errors" list; surface it here
# instead of hitting an opaque TypeError/KeyError on the lookup below.
if data.get('errors'):
    raise RuntimeError(f"GraphQL query failed: {data['errors']}")
download_by_chromosome = data['data']['DownloadSNPsByChromosome']
download_by_chromosome

'/downloads/22074ef2-6729-4839-8dc2-a51014f14382.txt'

# Count SNPs by Chromosome

In [25]:
# Count the SNPs on chromosome "2" for start=10, end=100000.
# NOTE(review): filter_args.exists presumably keeps only records that have the
# listed fields populated; confirm against the API schema.
query = """
query myQuery {
  CountSNPsByChromosome(
    chr: "2"
    end: 100000
    start: 10
    filter_args: {exists: ["chr", "ref", "pos"]}
  )
}
"""

response = requests.post(f"{BASE_URL}{GRAPHQL_ENDPOINT}", json={'query': query})

data = response.json()
# GraphQL servers report failed queries in an "errors" list; surface it here
# instead of hitting an opaque TypeError/KeyError on the lookup below.
if data.get('errors'):
    raise RuntimeError(f"GraphQL query failed: {data['errors']}")
count_by_chromosome = data['data']['CountSNPsByChromosome']
count_by_chromosome

9