In [1]:
import pandas as pd
import requests

from api.data import (store_database_for_eys_gene,
                      parse_lovd,
                      LOVD_PATH,
                      set_lovd_dtypes,
                      request_clinvar_api_data,
                      get_variant_ids_from_clinvar_name_api,
                      )
from api.data import save_lovd_as_vcf

# Notebook-wide pandas display setting. Per the pandas options documentation, 0 asks pandas to
# auto-detect how many columns fit instead of using a fixed cap.
# NOTE(review): that auto-detection is designed for terminals — confirm it gives the intended
# width when frames are rendered in Jupyter.
pd.options.display.max_columns = 0

In [None]:
# Store the "lovd" database for the EYS gene via the project helper.
# NOTE(review): override=False presumably keeps an already-stored copy instead of
# re-fetching it — confirm against api.data.
store_database_for_eys_gene("lovd", override=False)

In [None]:
# Parse the LOVD export found at LOVD_PATH + "/lovd_data.txt".
# The following cells iterate `data` and index it by key (e.g. data["Variants_On_Genome"]),
# so it is presumably a mapping of table name -> table — confirm against api.data.
data = parse_lovd(LOVD_PATH + "/lovd_data.txt")

In [None]:
# Walk over every parsed LOVD table: print its key, then render the table itself.
for table_name in data:
    print(table_name)
    display(data[table_name])

In [None]:
# Apply the project's dtype conversion to the parsed LOVD tables. The helper's return value
# is not used, so it presumably updates `data` in place — confirm against api.data.
set_lovd_dtypes(data)

# Print a dtype / non-null summary for every table.
for table_name in data:
    print(table_name)
    # Assuming each table is a pandas DataFrame: DataFrame.info() writes its summary to
    # stdout and returns None, so wrapping it in display() only added a stray "None".
    data[table_name].info()

In [None]:
# Write the "Variants_On_Genome" table to ./lovd.vcf; that path is the input (-I) of the
# SpliceAI command run in the next cell.
save_lovd_as_vcf(data["Variants_On_Genome"], "./lovd.vcf")

In [None]:
import subprocess

# Annotate the exported LOVD variants with SpliceAI.
# The command is spelled out as an argument list (no string splitting), which stays correct
# even if a path ever contains whitespace.
spliceai_command = [
    "spliceai",
    "-I", "./lovd.vcf",
    "-O", "./lovd_output.vcf",
    "-R", "../tools/spliceai/hg38.fa",
    "-A", "grch38",
]

# check=True raises CalledProcessError on a non-zero exit status, so a failed SpliceAI run
# stops the notebook here instead of silently leaving a missing or partial output file.
process = subprocess.run(spliceai_command, check=True)

# Show the exit status as the cell result (what `process.wait()` used to display).
process.returncode

In [None]:
from api.tools import get_revel_scores

# Example lookup: fetch REVEL scores for one genomic position using the project helper.
# NOTE(review): the genome build the position refers to is not visible here — confirm
# against api.tools.
chromosome = 6
position = 65655758

results = get_revel_scores(chromosome, position)

# Render whatever the helper returned (its structure is defined in api.tools).
display(results)

In [None]:
# Both names below are already imported in the first cell; the re-imports are harmless.
import requests
from api.data import request_clinvar_api_data

# Comma-separated list of ids handed to the ClinVar helper.
# NOTE(review): despite the name `gene_id`, the next cell passes the same kind of value as the
# `id=` parameter of an esummary request with db=clinvar, so these look like ClinVar record
# ids rather than gene ids — confirm and consider renaming.
gene_id = '1519785,1519786'

frames = request_clinvar_api_data(gene_id)

display(frames)

In [None]:
# Manual walk-through of flattening one ClinVar esummary response into a DataFrame
# (one row per uid, one column per nested field).
gene_id = '1519785'


clinvar_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json"


def _join_flattened_records(frame, records, prefix, suffix_head, suffix_tag):
    """Flatten every dict in ``records`` and join it onto ``frame`` as extra columns.

    The record at position ``n`` contributes columns named ``{prefix}{n}_<key>``.
    If such a name already exists in ``frame``, the incoming column is renamed with
    the suffix ``{suffix_head}_{n}_{suffix_tag}`` instead of raising.

    Args:
        frame: single-row DataFrame to extend.
        records: list of dicts to flatten; ``None`` or empty leaves ``frame`` as is.
        prefix: column prefix placed before the record position.
        suffix_head: leading part of the collision suffix (may be empty).
        suffix_tag: trailing tag of the collision suffix.

    Returns:
        ``frame`` with the flattened records joined on.
    """
    for pos, record in enumerate(records or []):
        flat = pd.json_normalize(record, sep='_').add_prefix(f'{prefix}{pos}_')
        frame = frame.join(flat, rsuffix=f'{suffix_head}_{pos}_{suffix_tag}')
    return frame


def _flatten_clinvar_entry(entry):
    """Flatten one entry of the esummary ``result`` mapping into a single-row DataFrame.

    Nested dicts are expanded by ``pd.json_normalize``; the nested lists
    (``variation_set`` with its ``variation_loc`` / ``variation_xrefs`` /
    ``allele_freq_set``, ``genes``, and ``germline_classification_trait_set`` with
    its ``trait_xrefs``) are expanded into index-numbered columns. Any of these
    lists may be missing from ``entry``; a missing list simply adds no columns.
    """
    flat_entry = pd.json_normalize(entry, sep='_')

    # variation_set: one group of columns per variation, plus its nested lists.
    for idx, var_set in enumerate(entry.get('variation_set') or []):
        var_prefix = f'variation_set_{idx}_'
        flat_var_set = pd.json_normalize(var_set, sep='_').add_prefix(var_prefix)

        flat_var_set = _join_flattened_records(
            flat_var_set, var_set.get('variation_loc'), f'{var_prefix}loc_', f'_{idx}', 'vl')
        flat_var_set = _join_flattened_records(
            flat_var_set, var_set.get('variation_xrefs'), f'{var_prefix}var_xrefs_', f'_{idx}', 'vx')
        flat_var_set = _join_flattened_records(
            flat_var_set, var_set.get('allele_freq_set'), f'{var_prefix}allele_freq_', f'_{idx}', 'af')

        # Drop the raw nested-list columns now that they are expanded. errors='ignore'
        # because a list that is absent from the payload has no column to drop.
        flat_var_set = flat_var_set.drop(
            columns=[
                f'{var_prefix}variation_loc',
                f'{var_prefix}variation_xrefs',
                f'{var_prefix}allele_freq_set',
            ],
            errors='ignore',
        )

        flat_entry = flat_entry.join(flat_var_set, rsuffix=f'_{idx}_vs')

    # genes: gene_<n>_<key> columns.
    flat_entry = _join_flattened_records(flat_entry, entry.get('genes'), 'gene_', '', 'g')

    # germline_classification_trait_set: germline_set_<n>_<key> plus trait_xref_<m>_<key>.
    for idx, germline_set in enumerate(entry.get('germline_classification_trait_set') or []):
        flat_germline_set = pd.json_normalize(germline_set, sep='_').add_prefix(f'germline_set_{idx}_')
        flat_germline_set = _join_flattened_records(
            flat_germline_set, germline_set.get('trait_xrefs'), 'trait_xref_', f'_{idx}', 'tx')
        flat_germline_set = flat_germline_set.drop(
            columns=[f'germline_set_{idx}_trait_xrefs'], errors='ignore')
        flat_entry = flat_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls')

    # Remove the original nested-list columns from the top-level row.
    return flat_entry.drop(
        columns=['variation_set', 'genes', 'germline_classification_trait_set'],
        errors='ignore',
    )


# A timeout keeps the cell from hanging forever if the service does not answer.
response = requests.get(clinvar_url, timeout=30)

if response.status_code != 200:
    raise ValueError(f"Request failed with status code {response.status_code}")

# Kept under its own name so the LOVD `data` parsed earlier in the notebook is not overwritten.
clinvar_payload = response.json()

# Extract the 'result' part of the JSON; 'uids' lists the keys of the individual entries.
results = clinvar_payload['result']

flattened_data = [_flatten_clinvar_entry(results[uid]) for uid in results['uids']]

# Concatenate all flattened entries into a single DataFrame (empty if there were no uids,
# since pd.concat refuses an empty list).
df = pd.concat(flattened_data, ignore_index=True) if flattened_data else pd.DataFrame()

display(df)

Explanation of what is happening in the code below:

Function to get all the ids from a gene name:
```python
get_variant_ids_from_clinvar_name_api(name: str, count: int)
```

The function fetches the IDs from the ClinVar API: `name` is the gene name and `count` is the maximum number of IDs to fetch (the API's limit is 500).

The function returns a dictionary with the count and the list of IDs:

```json
{
    'count': int,
    'idlist': List[str]
}
```

If the count is greater than the API's limit, the list of IDs is split into smaller lists of 500, and the data is then requested from the API in chunks of 500 IDs:

```python
id_lists = [id_list[i:i + max] for i in range(0, size, max)]
```

Each chunk is then requested from the API, and the resulting dataframes are concatenated into a single dataframe:

```python
frames = request_clinvar_api_data(join)
variations = pd.concat([variations, frames], ignore_index=True)
```

The variant data returned by the API contains many nested lists and dictionaries, so the variant extraction function flattens the data and then concatenates the dataframes into a single dataframe.

**NOTE**

> The joining function may have been implemented incorrectly due to the waiting time of the API.


In [37]:
import pandas as pd

# Download the ClinVar summaries for every variant id found for the gene, in batches.

# Number of ids sent per summary request (named so the builtin `max` is not shadowed).
CHUNK_SIZE = 500
name = "EYS"
# Upper bound passed to the id lookup: the largest signed 32-bit integer, i.e. "as many as exist".
count = 2147483647

id_array = get_variant_ids_from_clinvar_name_api(name, count)
size = int(id_array['count'])
id_list = id_array['idlist']

# The reported total and the ids actually returned can differ; make that visible.
if size != len(id_list):
    print(f"Warning: API reported {size} ids but returned {len(id_list)}")

# Chunk over the ids actually received. Stepping up to the reported `size` would create
# empty chunks (and empty-id requests) whenever fewer ids were returned than reported.
id_lists = [id_list[start:start + CHUNK_SIZE] for start in range(0, len(id_list), CHUNK_SIZE)]

# Collect the per-chunk frames and concatenate once at the end; concatenating inside the
# loop re-copies all previously fetched rows on every iteration.
frames = []
for track, lists in enumerate(id_lists, start=1):
    join = ",".join(lists)
    frames.append(request_clinvar_api_data(join))

    print(f"{track}/{len(id_lists)}")

# pd.concat refuses an empty list, so fall back to an empty frame when nothing was fetched.
variations = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

display(variations)


1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10


Unnamed: 0,uid,obj_type,accession,accession_version,title,record_status,gene_sort,chr_sort,location_sort,variation_set_name,variation_set_id,molecular_consequence_list,protein_change,fda_recognized_database,supporting_submissions_scv,supporting_submissions_rcv,germline_classification_description,germline_classification_last_evaluated,germline_classification_review_status,germline_classification_fda_recognized_database,clinical_impact_classification_description,clinical_impact_classification_last_evaluated,clinical_impact_classification_review_status,clinical_impact_classification_fda_recognized_database,clinical_impact_classification_trait_set,oncogenicity_classification_description,oncogenicity_classification_last_evaluated,oncogenicity_classification_review_status,oncogenicity_classification_fda_recognized_database,oncogenicity_classification_trait_set,variation_set_0_measure_id,variation_set_0_variation_name,variation_set_0_cdna_change,variation_set_0_aliases,variation_set_0_variant_type,variation_set_0_canonical_spdi,variation_set_0_loc_0_status,variation_set_0_loc_0_assembly_name,variation_set_0_loc_0_chr,variation_set_0_loc_0_band,...,gene_1020_symbol,gene_1020_geneid,gene_1020_strand,gene_1020_source,gene_1021_symbol,gene_1021_geneid,gene_1021_strand,gene_1021_source,gene_1022_symbol,gene_1022_geneid,gene_1022_strand,gene_1022_source,gene_1023_symbol,gene_1023_geneid,gene_1023_strand,gene_1023_source,gene_1024_symbol,gene_1024_geneid,gene_1024_strand,gene_1024_source,gene_1025_symbol,gene_1025_geneid,gene_1025_strand,gene_1025_source,gene_1026_symbol,gene_1026_geneid,gene_1026_strand,gene_1026_source,gene_1027_symbol,gene_1027_geneid,gene_1027_strand,gene_1027_source,gene_1028_symbol,gene_1028_geneid,gene_1028_strand,gene_1028_source,gene_1029_symbol,gene_1029_geneid,gene_1029_strand,gene_1029_source
0,3251429,single nucleotide variant,VCV003251429,VCV003251429.,NM_001142800.2(EYS):c.5886T>C (p.Thr1962=),,EYS,06,00000000000064436215,,,[synonymous variant],,,[SCV005076913],[RCV004587835],Likely benign,2024/04/08 00:00,"criteria provided, single submitter",,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],3410228,NM_001142800.2(EYS):c.5886T>C (p.Thr1962=),c.5886T>C,[],single nucleotide variant,NC_000006.12:64436214:A:G,current,GRCh38,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,3246148,Deletion,VCV003246148,VCV003246148.,NC_000006.11:g.(?_66204859)_(66217229_?)del,,EYS,06,99999999999999999999,,,[],,,[SCV005067530],[RCV004578792],Pathogenic,2023/01/02 00:00,"criteria provided, single submitter",,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],3403857,NC_000006.11:g.(?_66204859)_(66217229_?)del,NC_000006.11:g.(?_66204859)_(66217229_?)del,[],Deletion,,previous,GRCh37,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3246147,Deletion,VCV003246147,VCV003246147.,NC_000006.11:g.(?_64511633)_(64516181_?)del,,EYS,06,99999999999999999999,,,[],,,[SCV005067529],[RCV004578791],Likely pathogenic,2023/03/08 00:00,"criteria provided, single submitter",,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],3403856,NC_000006.11:g.(?_64511633)_(64516181_?)del,NC_000006.11:g.(?_64511633)_(64516181_?)del,[],Deletion,,previous,GRCh37,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3246146,Deletion,VCV003246146,VCV003246146.,NC_000006.11:g.(?_65523280)_(65527746_?)del,,EYS,06,99999999999999999999,,,[],,,[SCV005067528],[RCV004578790],Likely pathogenic,2023/04/30 00:00,"criteria provided, single submitter",,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],3403855,NC_000006.11:g.(?_65523280)_(65527746_?)del,NC_000006.11:g.(?_65523280)_(65527746_?)del,[],Deletion,,previous,GRCh37,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,3246145,Deletion,VCV003246145,VCV003246145.,NC_000006.11:g.(?_65587645)_(65596716_?)del,,EYS,06,99999999999999999999,,,[],,,[SCV005067527],[RCV004578789],Likely pathogenic,2023/06/27 00:00,"criteria provided, single submitter",,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],3403854,NC_000006.11:g.(?_65587645)_(65596716_?)del,NC_000006.11:g.(?_65587645)_(65596716_?)del,[],Deletion,,previous,GRCh37,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4778,538,single nucleotide variant,VCV000000538,VCV000000538.,NM_001142800.2(EYS):c.9405T>A (p.Tyr3135Ter),,EYS,06,00000000000063720626,,,"[3 prime UTR variant, nonsense]","Y3156*, Y3135*",,"[SCV000020717, SCV000894389, SCV000709692, SCV...","[RCV000000568, RCV000593252, RCV003914789, RCV...",Pathogenic/Likely pathogenic,2024/03/09 00:00,"criteria provided, multiple submitters, no con...",,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],15577,NM_001142800.2(EYS):c.9405T>A (p.Tyr3135Ter),c.9405T>A,[],single nucleotide variant,NC_000006.12:63720625:A:T,current,GRCh38,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4779,537,single nucleotide variant,VCV000000537,VCV000000537.,NM_001142800.2(EYS):c.5857G>T (p.Glu1953Ter),,EYS,06,00000000000064436244,,,[nonsense],E1953*,,"[SCV000020716, SCV002519636, SCV004195857, SCV...","[RCV000000567, RCV001387157]",Pathogenic,2024/02/15 00:00,"criteria provided, multiple submitters, no con...",,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],15576,NM_001142800.2(EYS):c.5857G>T (p.Glu1953Ter),c.5857G>T,[],single nucleotide variant,NC_000006.12:64436243:C:A,current,GRCh38,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4780,536,Deletion,VCV000000536,VCV000000536.,NM_001142800.1(EYS):c.1767-24596_2023+238135del,,LOC441155,06,00000000000065057728,,,[],,,[SCV000020715],[RCV000000566],Pathogenic,2008/11/01 00:00,no assertion criteria provided,,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],15575,NM_001142800.1(EYS):c.1767-24596_2023+238135del,NM_001142800.1(EYS):c.1767-24596_2023+238135del,[EX12DEL],Deletion,,current,GRCh38,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4781,535,Deletion,VCV000000535,VCV000000535.,NM_001142800.1(EYS):c.2260-51191_2992+45990del,,EYS,06,00000000000064840707,,,[],,,[SCV000020714],[RCV000000565],Pathogenic,2008/11/01 00:00,no assertion criteria provided,,,1/01/01 00:00,,,[],,1/01/01 00:00,,,[],15574,NM_001142800.1(EYS):c.2260-51191_2992+45990del,NM_001142800.1(EYS):c.2260-51191_2992+45990del,[EX15-19DEL],Deletion,,current,GRCh38,6,6q12,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
