In [1]:
import subprocess
from pathlib import Path

import pandas as pd
import requests

from api.data import (store_database_for_eys_gene,
                      parse_lovd,
                      LOVD_PATH,
                      set_lovd_dtypes,
                      request_clinvar_api_data,
                      get_variant_ids_from_clinvar_name_api,
                      request_gnomad_api_data,
                      )
from api.data import save_lovd_as_vcf

# NOTE(review): presumably meant to stop pandas from hiding columns of wide frames.
# Per the pandas options docs, 0 asks for terminal-width auto-detection (which a
# notebook cannot do reliably) while None means "no limit" — confirm which is wanted.
pd.options.display.max_columns = 0

In [None]:
# Store the "lovd" database for the EYS gene through the project helper.
# NOTE(review): override=False presumably keeps an already stored copy instead of
# fetching it again — confirm against api.data.store_database_for_eys_gene.
store_database_for_eys_gene("lovd", override=False)

In [None]:
# Parse the LOVD text export located under LOVD_PATH; the result is kept in
# `data` and indexed by table name in the cells that follow.
lovd_file = LOVD_PATH + "/lovd_data.txt"
data = parse_lovd(lovd_file)

In [None]:
# Show every entry of the parsed LOVD data, each preceded by its key.
for table_name, table in data.items():
    print(table_name)
    display(table)

In [None]:
# Apply the project's dtype conversion to the parsed LOVD tables. The return
# value is not used, so the helper is expected to update `data` itself
# (NOTE(review): confirm against api.data.set_lovd_dtypes).
set_lovd_dtypes(data)

# Print a dtype/null-count summary per table. DataFrame.info() writes its report
# to stdout and returns None, so it is called directly: wrapping it in display()
# only added a stray "None" below every summary.
for table_name, table in data.items():
    print(table_name)
    table.info()

In [None]:
# Export the "Variants_On_Genome" table as a VCF file in the working directory;
# the SpliceAI cell below reads this same ./lovd.vcf path.
variants_on_genome = data["Variants_On_Genome"]
save_lovd_as_vcf(variants_on_genome, "./lovd.vcf")

In [None]:
# Run the SpliceAI command-line tool on the exported LOVD variants.
# The command is spelled out as an argument list (no string splitting), and
# check=True raises subprocess.CalledProcessError when SpliceAI exits with a
# non-zero status, so a failed run can no longer pass unnoticed and leave a
# missing or stale ./lovd_output.vcf behind.
spliceai_cmd = [
    "spliceai",
    "-I", "./lovd.vcf",                 # input VCF written by the previous cell
    "-O", "./lovd_output.vcf",          # output VCF
    "-R", "../tools/spliceai/hg38.fa",  # reference FASTA
    "-A", "grch38",                     # annotation
]
process = subprocess.run(spliceai_cmd, check=True)

# Cell output: the exit status (0 whenever this line is reached).
process.returncode

In [None]:
from api.tools import get_revel_scores

# Look up the REVEL scores for a single genomic coordinate and show the result.
chromosome, position = 6, 65655758
results = get_revel_scores(chromosome, position)
display(results)

In [None]:
# Request ClinVar data for a single variation ID and show what comes back.
# The ID is passed as a string; the chunked download further down passes several
# IDs to the same function as one comma-separated string.
variation_ids = '148002'

frames = request_clinvar_api_data(variation_ids)

display(frames)

In [None]:
# Load a tab-separated ClinVar export from the current user's Desktop.
# The location is derived from the home directory instead of the hard-coded
# "C:\Users\Kajus\Desktop" so the cell is not tied to one machine; it resolves
# to the same file for that user.
clinvar_results_path = Path.home() / "Desktop" / "clinvar_results.txt"
clinvar_data = pd.read_csv(clinvar_results_path, sep='\t')

display(clinvar_data)

Explanation of what's happening in the code below:

Function to get all the ids from a gene name:
```python
get_variant_ids_from_clinvar_name_api(name: str, count: int)
```

The function gets the IDs from the ClinVar API: `name` is the gene name and `count` is the maximum number of IDs to get (the API's limit is 500 IDs per request).

The function returns a dictionary with the count and the list of IDs:

```json
{
    'count': int,
    'idlist': List[str]
}
```

If the count is greater than the API's limit, the code below splits the list of IDs into smaller lists of at most 500 IDs and then requests the data from the API one chunk at a time:

```python
id_lists = [id_list[i:i + max] for i in range(0, size, max)]
```

The data for each chunk is then requested from the API, and the resulting DataFrames are concatenated into a single DataFrame:

```python
frames = request_clinvar_api_data(join)
variations = pd.concat([variations, frames], ignore_index=True)
```

The data handled by the variant extraction function contains a lot of nested lists and dictionaries, so the function flattens the data before the DataFrames are concatenated into a single DataFrame.

**NOTE**

> The joining function may have been implemented incorrectly due to the waiting time of the API.


In [None]:
# Download the ClinVar variations for the EYS gene. The variation IDs are
# fetched first, then the records are requested in chunks because a single
# request may only carry a limited number of IDs.
CHUNK_SIZE = 500      # IDs per request (was named `max`, which shadowed the built-in)
name = "EYS"
count = 2147483647    # upper bound on IDs to retrieve (largest signed 32-bit integer)

id_array = get_variant_ids_from_clinvar_name_api(name, count)
id_list = id_array['idlist']

# Chunk by the number of IDs actually returned, not by the API-reported 'count':
# if fewer IDs come back than were counted, chunking by 'count' yields empty
# chunks and therefore requests with an empty ID string.
size = len(id_list)
id_lists = [id_list[i:i + CHUNK_SIZE] for i in range(0, size, CHUNK_SIZE)]

# Collect the per-chunk frames and concatenate once at the end; concatenating
# inside the loop re-copies everything gathered so far on every iteration.
chunk_frames = []
for track, lists in enumerate(id_lists, start=1):
    chunk_frames.append(request_clinvar_api_data(",".join(lists)))
    print(f"{track}/{len(id_lists)}")

# pd.concat raises on an empty list, so fall back to an empty frame when the
# search returned no IDs.
variations = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()

display(variations)


In [None]:
# Load a tab-separated ClinVar export from the current user's Desktop.
# The location is derived from the home directory instead of the hard-coded
# "C:\Users\Kajus\Desktop" so the cell is not tied to one machine; it resolves
# to the same file for that user.
# NOTE(review): this rebinds `clinvar_data`, which an earlier cell loads from
# "clinvar_results.txt" (plural) — confirm both files are really intended.
clinvar_result_path = Path.home() / "Desktop" / "clinvar_result.txt"
clinvar_data = pd.read_csv(clinvar_result_path, sep='\t')

display(clinvar_data)

In [2]:
# Request the gnomAD variant table through the project helper and show it.
# The saved output lists one row per variant with the columns variant_id,
# cDNA change, Protein change, Allele Frequency, Homozygote Count, Popmax and
# Popmax population.
# NOTE(review): the meaning of the positional False argument is not visible in
# this notebook — check request_gnomad_api_data and prefer passing it by keyword.
gnomad_from_api = request_gnomad_api_data(False)

display(gnomad_from_api)

Unnamed: 0,variant_id,cDNA change,Protein change,Allele Frequency,Homozygote Count,Popmax,Popmax population
0,6-63720525-A-G,c.*71T>C,0,1.807419e-06,0.0,0.000016,African/African American
1,6-63720525-A-T,c.*71T>A,0,6.573844e-06,0.0,0.000192,East Asian
2,6-63720525-A-C,c.*71T>G,0,0.000000e+00,0.0,0.000000,
3,6-63720526-T-A,c.*70A>T,0,1.045299e-06,0.0,0.000020,South Asian
4,6-63720527-G-T,c.*69C>A,0,0.000000e+00,0.0,0.000000,
...,...,...,...,...,...,...,...
14295,6-65495479-G-T,c.-69C>A,0,0.000000e+00,0.0,0.000000,
14296,6-65495479-G-A,c.-69C>T,0,1.446349e-06,0.0,0.000031,African/African American
14297,6-65495482-A-G,c.-72T>C,0,2.629510e-06,0.0,0.000070,Admixed American
14298,6-65495484-T-G,c.-74A>C,0,3.645085e-06,0.0,0.000060,South Asian
