# Exploring Orthology AGR data for BioThings Studio Parser

In [76]:
import pandas as pd
import numpy as np
import os, csv
import gzip
from biothings.utils.dataload import dict_convert, dict_sweep


## Example Data : Drug Labels


In [35]:
infile = os.path.join("c:\\Users\\19802\\Documents\\dev\scripps\\BioThings_SuLab\\data\\drugLabels.byGene.tsv")
assert os.path.exists(infile2)

In [42]:
example_data=pd.read_csv(infile2, sep="\t",squeeze=True,quoting=csv.QUOTE_NONE).to_dict(orient='records') 

View the first record of the `example_data`. 


In [43]:
example_data[0]

{'Gene ID': 'PA267',
 'Gene Symbol': 'ABCB1',
 'Label IDs': 'PA166123409;PA166159586',
 'Label Names': 'Annotation of EMA Label for aliskiren and ABCB1;Annotation of HCSC Label for aliskiren and ABCB1'}

In [37]:
# we'll remove space in keys to make queries easier. Also, lowercase is preferred
# for a BioThings API. We'll an helper function from BioThings SDK
process_key = lambda k: k.replace(" ","_").lower()

In [44]:
# Test Example Parse
for rec in example_data[:2]:
    label_ids = rec.pop("Label IDs").split(";")
    label_names = rec.pop("Label Names").split(";")
    assert len(label_ids) == len(label_names)
    labels = []
    for i,_ in enumerate(label_ids):
        #print(label_ids[i])
        labels.append({"id" : label_ids[i],
                        "name" : label_names[i]})
    _id = rec["Gene ID"]
    rec = dict_convert(rec,keyfn=process_key)
    doc = {"_id" : _id, "drug_labels" : labels}

print(doc)

{'_id': 'PA390', 'drug_labels': [{'id': 'PA166184427', 'name': 'Annotation of Swissmedic Label for ezetimibe / rosuvastatin and ABCG2, SLCO1B1'}, {'id': 'PA166184499', 'name': 'Annotation of Swissmedic Label for rosuvastatin and ABCG2, SLCO1B1'}]}


>**Notes/Questsions**    
> - Ways to test BioThings/ES mapping?  
> - geneID search on mygene.info comparison to parsed mapping, *concerns over inconsistency*  
> - remove any columns/variables?  
> - only searchable index is `gene1id` (it is saved as a text vs. keyword)




In [45]:
example_data[0]

{'Gene ID': 'PA267', 'Gene Symbol': 'ABCB1'}

---

## Ortholog AGR Data  



### Load input data   

Datasource: ```https://fms.alliancegenome.org/download/ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz```     
  
  
*The first lines (0-14) are summary information and are disregarded when formatting the dataframe.*

In [22]:
ortho_infile = os.path.join("c:\\Users\\19802\\Documents\\dev\scripps\\BioThings_SuLab\\data\\ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz")
assert os.path.exists(ortho_infile)

In [26]:
# open and print the summary key at top of file
# open and load tsv into dataframe
with gzip.open(ortho_infile, 'rb') as f_in:
    orthoAGR=pd.read_csv(f_in, header=15, sep="\\t", engine="python")
    for line in f_in.readlines()[:15]:
        print(line)
    print("\n")
data_ortho_agr=orthoAGR.to_dict(orient='records')

print("[INFO] completed loading data.")


b'##########################################################################\n'
b'#\n'
b'# Data type: Orthology\n'
b'# Data format: tsv\n'
b'# README: \n'
b'# Source: Alliance of Genome Resources (Alliance)\n'
b'# Source URL: http://alliancegenome.org/downloads\n'
b'# Help Desk: help@alliancegenome.org\n'
b'# Orthology Filter: Stringent\n'
b'# Taxon IDs: NCBITaxon:9606, NCBITaxon:10116, NCBITaxon:10090, NCBITaxon:7955, NCBITaxon:7227, NCBITaxon:6239, NCBITaxon:559292\n'
b'# Species: Homo sapiens, Rattus norvegicus, Mus musculus, Danio rerio, Drosophila melanogaster, Caenorhabditis elegans, Saccharomyces cerevisiae\n'
b'# Alliance Database Version: 4.1.0\n'
b'# Date file generated (UTC): 2021-08-10 17:59\n'
b'#\n'
b'##########################################################################\n'


[INFO] completed loading data.


### Explore Data

For each column in the dataframe `orthoAGR` list out the name and the unique value count. *If the count is low, list the values.* 

In [29]:
# view orthology data columns available
orthoAGR.columns.values

array(['Gene1ID', 'Gene1Symbol', 'Gene1SpeciesTaxonID',
       'Gene1SpeciesName', 'Gene2ID', 'Gene2Symbol',
       'Gene2SpeciesTaxonID', 'Gene2SpeciesName', 'Algorithms',
       'AlgorithmsMatch', 'OutOfAlgorithms', 'IsBestScore',
       'IsBestRevScore'], dtype=object)

In [11]:
# Count unique values found in columns, if under a certain threshold view the unique list of values
for col in orthoAGR.columns.values:
    list_len=len(orthoAGR[col].unique().tolist())
    print("column: %s length: %s"%(col, list_len))
    if list_len < 20:
        print("unique values: ", orthoAGR[col].unique().tolist())


column: Gene1ID length: 100430
column: Gene1Symbol length: 81990
column: Gene1SpeciesTaxonID length: 7
unique values:  ['NCBITaxon:6239', 'NCBITaxon:7955', 'NCBITaxon:10116', 'NCBITaxon:10090', 'NCBITaxon:9606', 'NCBITaxon:7227', 'NCBITaxon:559292']
column: Gene1SpeciesName length: 7
unique values:  ['Caenorhabditis elegans', 'Danio rerio', 'Rattus norvegicus', 'Mus musculus', 'Homo sapiens', 'Drosophila melanogaster', 'Saccharomyces cerevisiae']
column: Gene2ID length: 100430
column: Gene2Symbol length: 81990
column: Gene2SpeciesTaxonID length: 7
unique values:  ['NCBITaxon:559292', 'NCBITaxon:7227', 'NCBITaxon:7955', 'NCBITaxon:10116', 'NCBITaxon:9606', 'NCBITaxon:6239', 'NCBITaxon:10090']
column: Gene2SpeciesName length: 7
unique values:  ['Saccharomyces cerevisiae', 'Drosophila melanogaster', 'Danio rerio', 'Rattus norvegicus', 'Homo sapiens', 'Caenorhabditis elegans', 'Mus musculus']
column: Algorithms length: 2529
column: AlgorithmsMatch length: 11
unique values:  [9, 8, 7, 10, 6

In [61]:
orthoAGR.head()

Unnamed: 0,Gene1ID,Gene1Symbol,Gene1SpeciesTaxonID,Gene1SpeciesName,Gene2ID,Gene2Symbol,Gene2SpeciesTaxonID,Gene2SpeciesName,Algorithms,AlgorithmsMatch,OutOfAlgorithms,IsBestScore,IsBestRevScore
0,WB:WBGene00011502,vps-53,NCBITaxon:6239,Caenorhabditis elegans,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes
1,ZFIN:ZDB-GENE-041114-199,vps53,NCBITaxon:7955,Danio rerio,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roun...,8,10,Yes,Yes
2,RGD:1311391,Vps53,NCBITaxon:10116,Rattus norvegicus,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,OrthoFinder|Hieranoid|OMA|Ensembl Compara|InPa...,7,9,Yes,Yes
3,MGI:1915549,Vps53,NCBITaxon:10090,Mus musculus,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes
4,HGNC:25608,VPS53,NCBITaxon:9606,Homo sapiens,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes


In [62]:
orthoAGR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558386 entries, 0 to 558385
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Gene1ID              558386 non-null  object
 1   Gene1Symbol          558361 non-null  object
 2   Gene1SpeciesTaxonID  558386 non-null  object
 3   Gene1SpeciesName     558386 non-null  object
 4   Gene2ID              558386 non-null  object
 5   Gene2Symbol          558361 non-null  object
 6   Gene2SpeciesTaxonID  558386 non-null  object
 7   Gene2SpeciesName     558386 non-null  object
 8   Algorithms           558386 non-null  object
 9   AlgorithmsMatch      558386 non-null  int64 
 10  OutOfAlgorithms      558386 non-null  int64 
 11  IsBestScore          558386 non-null  object
 12  IsBestRevScore       558386 non-null  object
dtypes: int64(2), object(11)
memory usage: 55.4+ MB


In [33]:
# view first record
data_ortho_agr[0]

{'Gene1ID': 'WB:WBGene00011502',
 'Gene1Symbol': 'vps-53',
 'Gene1SpeciesTaxonID': 'NCBITaxon:6239',
 'Gene1SpeciesName': 'Caenorhabditis elegans',
 'Gene2ID': 'SGD:S000003566',
 'Gene2Symbol': 'VPS53',
 'Gene2SpeciesTaxonID': 'NCBITaxon:559292',
 'Gene2SpeciesName': 'Saccharomyces cerevisiae',
 'Algorithms': 'PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|OrthoInspector',
 'AlgorithmsMatch': 9,
 'OutOfAlgorithms': 10,
 'IsBestScore': 'Yes',
 'IsBestRevScore': 'Yes'}

In [65]:
# Lets view a chunk of IDs to see the formatting
orthoAGR['Gene1ID'][:10].to_list()

['WB:WBGene00011502',
 'ZFIN:ZDB-GENE-041114-199',
 'RGD:1311391',
 'MGI:1915549',
 'HGNC:25608',
 'FB:FBgn0031598',
 'ZFIN:ZDB-GENE-070112-1002',
 'RGD:1586427',
 'MGI:2444430',
 'HGNC:19743']

In [75]:
#len(orthoAGR["Gene1ID"].value_counts().to_list())
#len(orthoAGR["Gene1ID"].unique())
orthoAGR["Gene1ID"].value_counts()#.to_list()


ZFIN:ZDB-GENE-990415-190    356
ZFIN:ZDB-GENE-050510-2      245
WB:WBGene00021931           206
ZFIN:ZDB-GENE-070806-57     173
ZFIN:ZDB-GENE-070806-56     173
                           ... 
RGD:2325594                   1
MGI:3619266                   1
MGI:2676900                   1
RGD:2325337                   1
SGD:S000004300                1
Name: Gene1ID, Length: 100430, dtype: int64

In [10]:
orthoAGR['Gene2ID'][:10].to_list()

['SGD:S000003566',
 'SGD:S000003566',
 'SGD:S000003566',
 'SGD:S000003566',
 'SGD:S000003566',
 'SGD:S000003566',
 'SGD:S000000021',
 'SGD:S000000021',
 'SGD:S000000021',
 'SGD:S000000021']

### Parse Data
Now that we have loaded the data and correctly formatted it into records, we can loop through the new dictionary and access our data.  

The id variables are using a "compact uri" , or [curie](https://en.wikipedia.org/wiki/CURIE).  
*"The left part is called the "prefix".  I think the prefixes should be stripped to match what's in mygene.info.  but you should probably do some spotchecking of other prefixes just to be sure."*

In [34]:

# Build Parser 
def load_orthology(data, data_list):
    results = {}

    for rec in data_ortho_agr[:4]:
        orig_id1= rec["Gene1ID"].split(':')
        id1_tag2=orig_id1[1]
        _id = id1_tag2
        rec = dict_convert(rec,keyfn=process_key)
        # remove NaN values, not indexable
        rec = dict_sweep(rec,vals=[np.nan])
        results.setdefault(_id,[]).append(rec)
        #print(rec)
    for _id,docs in results.items():
        doc = {"_id": _id, "orthology_data" : docs}
        yield doc
    
    #return results;

In [35]:
def load_testcase():
    results=load_orthology(orthoAGR, data_ortho_agr)
    #print(results)
    print(next(results))
load_testcase()

{'_id': 'WBGene00011502', 'orthology_data': [{'gene1id': 'WB:WBGene00011502', 'gene1symbol': 'vps-53', 'gene1speciestaxonid': 'NCBITaxon:6239', 'gene1speciesname': 'Caenorhabditis elegans', 'gene2id': 'SGD:S000003566', 'gene2symbol': 'VPS53', 'gene2speciestaxonid': 'NCBITaxon:559292', 'gene2speciesname': 'Saccharomyces cerevisiae', 'algorithms': 'PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|OrthoInspector', 'algorithmsmatch': 9, 'outofalgorithms': 10, 'isbestscore': 'Yes', 'isbestrevscore': 'Yes'}]}


### Testing the API

In [77]:
!curl localhost:8000/metadata 

{"biothing_type": "gene", "build_date": "2021-09-15T16:11:48.091144", "build_version": "20210915", "src": {"orthologyAGR": {"stats": {"orthologyAGR": 99452}, "version": "2021-09"}}, "stats": {"total": 99452}}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   208  100   208    0     0    208      0  0:00:01 --:--:--  0:00:01  4425


In [78]:
import requests

In [44]:
r=requests.get("http://localhost:8000/metadata")
print(r.content)

b'{"biothing_type": "gene", "build_date": "2021-09-15T16:11:48.091144", "build_version": "20210915", "src": {"orthologyAGR": {"stats": {"orthologyAGR": 99452}, "version": "2021-09"}}, "stats": {"total": 99452}}'


In [51]:
# method to search Biothings API  
def test_query(query_input, basecase=True):
    # run basecase first -- search metadata
    if basecase==True:
        query="http://localhost:8000/metadata"
        r=requests.get(query)
        print("\n[INFO] metadata query: \n", r.content)
        
    # run user query
    query="http://localhost:8000/query?q=%s"%query_input
    r=requests.get(query)
    print("\n[INFO] query for %s: \n"%query_input, r.content)

In [None]:
#"ZDB-GENE-041114-199", "1311391", "WBGene00011502"

In [90]:
test_query("ZDB-GENE-041114-199", basecase=False)



[INFO] query for ZDB-GENE-041114-199: 
 b'{"took": 8, "total": 18614, "max_score": 26.043224, "hits": [{"_id": "ZDB-GENE-041114-199", "_score": 26.043224, "ortholog_info": [{"algorithms": "OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|OrthoInspector", "algorithmsmatch": 8, "gene1id": "ZFIN:ZDB-GENE-041114-199", "gene1speciesname": "Danio rerio", "gene1speciestaxonid": "NCBITaxon:7955", "gene1symbol": "vps53", "gene2id": "SGD:S000003566", "gene2speciesname": "Saccharomyces cerevisiae", "gene2speciestaxonid": "NCBITaxon:559292", "gene2symbol": "VPS53", "isbestrevscore": "Yes", "isbestscore": "Yes", "outofalgorithms": 10}, {"algorithms": "OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|TreeFam|OrthoInspector", "algorithmsmatch": 9, "gene1id": "ZFIN:ZDB-GENE-041114-199", "gene1speciesname": "Danio rerio", "gene1speciestaxonid": "NCBITaxon:7955", "gene1symbol": "vps53", "gene2id": "FB:FBgn0031598", "gene2speciesname": "Drosophila melanogaster", "

---

  Gene1ID: 
- left hand symbol into column, count unique 
- determine error cases/ isolate out 
  
  
  
## <u>References</u>  
  
https://en.wikipedia.org/wiki/CURIE  
http://mygene.info/  
http://mygene.info/v3/gene/WB:WBGene00011502    
https://github.com/sirloon/pharmgkb/tree/pharmgkb_v1  



---