# Exploring Orthology data for BioThings Studio Parser

In [1]:
import pandas as pd
import numpy as np
import os, glob, csv, re
import gzip
#import shutil

In [2]:
# BioThings APIs favour lowercase field names without spaces, which also keeps
# queries simple. This key normaliser is meant to be passed as `keyfn` to the
# BioThings SDK's dict_convert helper.
def process_key(k):
    """Return key `k` lowercased, with every space replaced by an underscore."""
    return "_".join(k.split(" ")).lower()

---
## Load input data   

Datasource: ```https://fms.alliancegenome.org/download/ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz```     
  
  
*The first lines (0-14) are summary information and are disregarded when formatting the dataframe.*

In [3]:
# Location of the gzipped Alliance orthology TSV on the author's machine.
# A raw string is used so that no backslash is read as an escape sequence
# (the previous literal contained the invalid escape "\s").
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
infile = r"c:\Users\19802\Documents\dev\scripps\BioThings\ORTHOLOGY-ALLIANCE_COMBINED.tsv.gz"
# Fail early if the input file is missing.
assert os.path.exists(infile)

In [4]:
# Location of the drug-label TSV used for the comparison example further down.
# A raw string is used so that no backslash is read as an escape sequence
# (the previous literal contained the invalid escape "\s").
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
infile2 = r"c:\Users\19802\Documents\dev\scripps\BioThings\drugLabels.byGene.tsv"
# Fail early if the input file is missing.
assert os.path.exists(infile2)

In [5]:
# Number of summary lines at the top of the file; the column header is the
# line that follows them (0-based row index 15).
N_BANNER_LINES = 15

# Print the summary banner found at the top of the file. The handle is iterated
# lazily so the whole archive is not decompressed just to show its first lines.
with gzip.open(infile, 'rb') as f_in:
    for line_number, line in enumerate(f_in):
        if line_number >= N_BANNER_LINES:
            break
        print(line)
    print("\n")

# Load the tab-separated table into a dataframe. `sep` must be a real tab
# character: the two-character string "\\t" is treated by pandas as a regular
# expression, which forces the slower Python parsing engine and emits a
# ParserWarning. QUOTE_NONE keeps quote characters as literal data, matching
# how the regex-based split handled them.
with gzip.open(infile, 'rb') as f_in:
    data = pd.read_csv(f_in, header=N_BANNER_LINES, sep="\t", quoting=csv.QUOTE_NONE)
# One plain dict per row, keyed by column name.
data_list = data.to_dict(orient='records')

print("[INFO] completed loading data.")


b'##########################################################################\n'
b'#\n'
b'# Data type: Orthology\n'
b'# Data format: tsv\n'
b'# README: \n'
b'# Source: Alliance of Genome Resources (Alliance)\n'
b'# Source URL: http://alliancegenome.org/downloads\n'
b'# Help Desk: help@alliancegenome.org\n'
b'# Orthology Filter: Stringent\n'
b'# Taxon IDs: NCBITaxon:9606, NCBITaxon:10116, NCBITaxon:10090, NCBITaxon:7955, NCBITaxon:7227, NCBITaxon:6239, NCBITaxon:559292\n'
b'# Species: Homo sapiens, Rattus norvegicus, Mus musculus, Danio rerio, Drosophila melanogaster, Caenorhabditis elegans, Saccharomyces cerevisiae\n'
b'# Alliance Database Version: 4.1.0\n'
b'# Date file generated (UTC): 2021-08-10 17:59\n'
b'#\n'
b'##########################################################################\n'




  return func(*args, **kwargs)


[INFO] completed loading data.


In [6]:
# Load the drug-label TSV as a list of row dicts keyed by column name.
# `squeeze=True` was dropped: it is deprecated (removed in pandas 2.0) and has
# no effect on a file with more than one column.
data2 = pd.read_csv(infile2, sep="\t", quoting=csv.QUOTE_NONE).to_dict(orient='records')

## Explore Data  


In [7]:
# For every column, report how many distinct values it holds; when there are
# fewer than 20 of them, list the values as well.
for column_name in data.columns.values:
    distinct_values = data[column_name].unique().tolist()
    list_len = len(distinct_values)
    print("column: %s length: %s"%(column_name, list_len))
    if list_len < 20:
        print("unique values: ", distinct_values)


column: Gene1ID length: 100430
column: Gene1Symbol length: 81990
column: Gene1SpeciesTaxonID length: 7
unique values:  ['NCBITaxon:6239', 'NCBITaxon:7955', 'NCBITaxon:10116', 'NCBITaxon:10090', 'NCBITaxon:9606', 'NCBITaxon:7227', 'NCBITaxon:559292']
column: Gene1SpeciesName length: 7
unique values:  ['Caenorhabditis elegans', 'Danio rerio', 'Rattus norvegicus', 'Mus musculus', 'Homo sapiens', 'Drosophila melanogaster', 'Saccharomyces cerevisiae']
column: Gene2ID length: 100430
column: Gene2Symbol length: 81990
column: Gene2SpeciesTaxonID length: 7
unique values:  ['NCBITaxon:559292', 'NCBITaxon:7227', 'NCBITaxon:7955', 'NCBITaxon:10116', 'NCBITaxon:9606', 'NCBITaxon:6239', 'NCBITaxon:10090']
column: Gene2SpeciesName length: 7
unique values:  ['Saccharomyces cerevisiae', 'Drosophila melanogaster', 'Danio rerio', 'Rattus norvegicus', 'Homo sapiens', 'Caenorhabditis elegans', 'Mus musculus']
column: Algorithms length: 2529
column: AlgorithmsMatch length: 11
unique values:  [9, 8, 7, 10, 6

In [8]:
# Peek at the first record loaded from the drug-label file to see its raw keys.
data2[0]

{'Gene ID': 'PA267',
 'Gene Symbol': 'ABCB1',
 'Label IDs': 'PA166123409;PA166159586',
 'Label Names': 'Annotation of EMA Label for aliskiren and ABCB1;Annotation of HCSC Label for aliskiren and ABCB1'}

In [9]:
# View the first orthology record as a plain dict (one entry per column).
data_list[0]

{'Gene1ID': 'WB:WBGene00011502',
 'Gene1Symbol': 'vps-53',
 'Gene1SpeciesTaxonID': 'NCBITaxon:6239',
 'Gene1SpeciesName': 'Caenorhabditis elegans',
 'Gene2ID': 'SGD:S000003566',
 'Gene2Symbol': 'VPS53',
 'Gene2SpeciesTaxonID': 'NCBITaxon:559292',
 'Gene2SpeciesName': 'Saccharomyces cerevisiae',
 'Algorithms': 'PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|OrthoInspector',
 'AlgorithmsMatch': 9,
 'OutOfAlgorithms': 10,
 'IsBestScore': 'Yes',
 'IsBestRevScore': 'Yes'}

In [10]:
# List the column names of the orthology dataframe.
data.columns.values

array(['Gene1ID', 'Gene1Symbol', 'Gene1SpeciesTaxonID',
       'Gene1SpeciesName', 'Gene2ID', 'Gene2Symbol',
       'Gene2SpeciesTaxonID', 'Gene2SpeciesName', 'Algorithms',
       'AlgorithmsMatch', 'OutOfAlgorithms', 'IsBestScore',
       'IsBestRevScore'], dtype=object)

In [11]:
# Preview the first rows of the orthology dataframe.
data.head()

Unnamed: 0,Gene1ID,Gene1Symbol,Gene1SpeciesTaxonID,Gene1SpeciesName,Gene2ID,Gene2Symbol,Gene2SpeciesTaxonID,Gene2SpeciesName,Algorithms,AlgorithmsMatch,OutOfAlgorithms,IsBestScore,IsBestRevScore
0,WB:WBGene00011502,vps-53,NCBITaxon:6239,Caenorhabditis elegans,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes
1,ZFIN:ZDB-GENE-041114-199,vps53,NCBITaxon:7955,Danio rerio,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roun...,8,10,Yes,Yes
2,RGD:1311391,Vps53,NCBITaxon:10116,Rattus norvegicus,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,OrthoFinder|Hieranoid|OMA|Ensembl Compara|InPa...,7,9,Yes,Yes
3,MGI:1915549,Vps53,NCBITaxon:10090,Mus musculus,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes
4,HGNC:25608,VPS53,NCBITaxon:9606,Homo sapiens,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes


---

## Parse Data
Now that we have loaded the data and converted it into a list of record dictionaries (one per row), we can loop through the records and access their fields.

In [16]:
# Test Example Parse
#for rec in data2[:4]:
    #label_ids = rec.pop("Label IDs").split(";")
    #label_names = rec.pop("Label Names").split(";")
    #assert len(label_ids) == len(label_names)
    #labels = []
    #for i,_ in enumerate(label_ids):
        #labels.append({"id" : label_ids[i],
         #               "name" : label_names[i]})
   # _id = rec["Gene ID"]
    #rec = dict_convert(rec,keyfn=process_key)
    #doc = {"_id" : _id, "drug_labels" : labels}

#print(doc)

In [12]:
# Show the first rows again as a reference for the parsing step.
data.head()

Unnamed: 0,Gene1ID,Gene1Symbol,Gene1SpeciesTaxonID,Gene1SpeciesName,Gene2ID,Gene2Symbol,Gene2SpeciesTaxonID,Gene2SpeciesName,Algorithms,AlgorithmsMatch,OutOfAlgorithms,IsBestScore,IsBestRevScore
0,WB:WBGene00011502,vps-53,NCBITaxon:6239,Caenorhabditis elegans,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes
1,ZFIN:ZDB-GENE-041114-199,vps53,NCBITaxon:7955,Danio rerio,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roun...,8,10,Yes,Yes
2,RGD:1311391,Vps53,NCBITaxon:10116,Rattus norvegicus,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,OrthoFinder|Hieranoid|OMA|Ensembl Compara|InPa...,7,9,Yes,Yes
3,MGI:1915549,Vps53,NCBITaxon:10090,Mus musculus,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes
4,HGNC:25608,VPS53,NCBITaxon:9606,Homo sapiens,SGD:S000003566,VPS53,NCBITaxon:559292,Saccharomyces cerevisiae,PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Co...,9,10,Yes,Yes


 
Example record:   
``` 
{'Gene1ID': 'WB:WBGene00011502',
  'Gene1Symbol': 'vps-53',
  'Gene1SpeciesTaxonID': 'NCBITaxon:6239',
  'Gene1SpeciesName': 'Caenorhabditis elegans',
  'Gene2ID': 'SGD:S000003566',
  'Gene2Symbol': 'VPS53',
  'Gene2SpeciesTaxonID': 'NCBITaxon:559292',
  'Gene2SpeciesName': 'Saccharomyces cerevisiae',
  'Algorithms': 'PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|OrthoInspector',
  'AlgorithmsMatch': 9,
  'OutOfAlgorithms': 10,
  'IsBestScore': 'Yes',
  'IsBestRevScore': 'Yes'} 
  
```

In [13]:
# Column names that become the (lowercased) keys of each parsed record.
data.columns.values

array(['Gene1ID', 'Gene1Symbol', 'Gene1SpeciesTaxonID',
       'Gene1SpeciesName', 'Gene2ID', 'Gene2Symbol',
       'Gene2SpeciesTaxonID', 'Gene2SpeciesName', 'Algorithms',
       'AlgorithmsMatch', 'OutOfAlgorithms', 'IsBestScore',
       'IsBestRevScore'], dtype=object)

### **Build Parser**  

In [14]:
import biothings
#from biothings import config
from biothings.utils.dataload import dict_convert, dict_sweep

In [17]:
def _is_nan(value):
    """Return True when `value` is a float NaN (the only float unequal to itself)."""
    return isinstance(value, float) and value != value


def load_orthology(data, data_list):
    """Group orthology records by their 'Gene1ID' and yield one document per gene.

    Args:
        data: Unused. Retained so existing ``load_orthology(data, data_list)``
            calls keep working.
        data_list: Iterable of record dicts; every record must have a
            'Gene1ID' key.

    Yields:
        dict: ``{"_id": <Gene1ID>, "orthology_data": [<record>, ...]}`` where
        each record has its keys lowercased (spaces replaced by underscores)
        and its NaN-valued fields removed. Documents are yielded in order of
        first appearance of each 'Gene1ID'.

    Raises:
        KeyError: If a record has no 'Gene1ID' key.
    """
    process_key = lambda k: k.replace(" ","_").lower()

    results = {}

    for rec in data_list:
        _id = rec['Gene1ID']
        # Drop NaN values explicitly: they are not indexable, and NaN never
        # compares equal to anything, so an equality-based sweep against
        # [np.nan] cannot be relied on to catch them.
        rec = {key: value for key, value in rec.items() if not _is_nan(value)}
        rec = dict_convert(rec,keyfn=process_key)
        # Kept from the original pipeline.
        # NOTE(review): presumably redundant now that NaNs are filtered above
        # -- confirm against the dict_sweep implementation.
        rec = dict_sweep(rec,vals=[np.nan])
        results.setdefault(_id,[]).append(rec)

    # All records must be grouped before anything is yielded, because records
    # of the same gene are not guaranteed to be adjacent in the input.
    for _id,docs in results.items():
        yield {"_id": _id, "orthology_data" : docs}

In [18]:
def load_testcase():
    """Smoke-test load_orthology by printing the first document it yields."""
    doc_stream = load_orthology(data, data_list)
    first_doc = next(doc_stream)
    print(first_doc)


load_testcase()

{'_id': 'WB:WBGene00011502', 'orthology_data': [{'gene1id': 'WB:WBGene00011502', 'gene1symbol': 'vps-53', 'gene1speciestaxonid': 'NCBITaxon:6239', 'gene1speciesname': 'Caenorhabditis elegans', 'gene2id': 'SGD:S000003566', 'gene2symbol': 'VPS53', 'gene2speciestaxonid': 'NCBITaxon:559292', 'gene2speciesname': 'Saccharomyces cerevisiae', 'algorithms': 'PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|OrthoInspector', 'algorithmsmatch': 9, 'outofalgorithms': 10, 'isbestscore': 'Yes', 'isbestrevscore': 'Yes'}, {'gene1id': 'WB:WBGene00011502', 'gene1symbol': 'vps-53', 'gene1speciestaxonid': 'NCBITaxon:6239', 'gene1speciesname': 'Caenorhabditis elegans', 'gene2id': 'FB:FBgn0031598', 'gene2symbol': 'Vps53', 'gene2speciestaxonid': 'NCBITaxon:7227', 'gene2speciesname': 'Drosophila melanogaster', 'algorithms': 'PhylomeDB|OrthoFinder|Hieranoid|OMA|Ensembl Compara|Roundup|InParanoid|PANTHER|OrthoInspector', 'algorithmsmatch': 9, 'outofalgorithms': 10, 'isbestscore': 'Y

Example from ```load_druglabels```

In [69]:

# Build drug-label documents for the first three records.
# Each record is copied before its label fields are popped, so the dicts held
# in `data2` stay intact and this cell can be re-run without a KeyError.
for raw_rec in data2[:3]:
    rec = dict(raw_rec)
    label_ids = rec.pop("Label IDs").split(";")
    label_names = rec.pop("Label Names").split(";")
    # IDs and names are parallel ';'-separated lists and must pair up one-to-one.
    assert len(label_ids) == len(label_names)
    labels = [{"id" : label_id, "name" : label_name}
              for label_id, label_name in zip(label_ids, label_names)]
    _id = rec["Gene ID"]
    rec = dict_convert(rec,keyfn=process_key)
    doc = {"_id" : _id, "drug_labels" : labels}
    print(rec)
    print(doc)


{'gene_id': 'PA267', 'gene_symbol': 'ABCB1'}
{'_id': 'PA267', 'drug_labels': [{'id': 'PA166123409', 'name': 'Annotation of EMA Label for aliskiren and ABCB1'}, {'id': 'PA166159586', 'name': 'Annotation of HCSC Label for aliskiren and ABCB1'}]}
{'gene_id': 'PA390', 'gene_symbol': 'ABCG2'}
{'_id': 'PA390', 'drug_labels': [{'id': 'PA166184427', 'name': 'Annotation of Swissmedic Label for ezetimibe / rosuvastatin and ABCG2, SLCO1B1'}, {'id': 'PA166184499', 'name': 'Annotation of Swissmedic Label for rosuvastatin and ABCG2, SLCO1B1'}]}
{'gene_id': 'PA36144', 'gene_symbol': 'ABI1'}
{'_id': 'PA36144', 'drug_labels': [{'id': 'PA166127660', 'name': 'Annotation of HCSC Label for dasatinib and ABI1, BCR'}, {'id': 'PA166127682', 'name': 'Annotation of HCSC Label for imatinib and ABI1, BCR, FIP1L1, KIT, PDGFRA, PDGFRB'}, {'id': 'PA166127693', 'name': 'Annotation of HCSC Label for nilotinib and ABI1, BCR'}, {'id': 'PA166129527', 'name': 'Annotation of HCSC Label for ponatinib and ABI1, BCR'}]}


---
Draft extras - 

In [None]:
# Format Version 2
# Each ID has a unique doc
#info_list=[]
#for i,_ in enumerate(g1_ids):
#    info_list.append({
#                "id": g1_ids[i],
            #"ortholog_data": 
#   })

"""
# Format Version 1
# Two IDs, gene1_id & gene2_id
ids=[]


for i,_ in enumerate(g1_ids):
ids.append({
            "gene1_id": g1_ids[i],
            "gene2_id": g2_ids[i]
})
"""

"""
# cols with the gene IDS col 1 & 2 taken out 
data_cols=['Gene1Symbol', 'Gene1SpeciesTaxonID',
    'Gene1SpeciesName', 'Gene2Symbol',
    'Gene2SpeciesTaxonID', 'Gene2SpeciesName', 'Algorithms',
    'AlgorithmsMatch', 'OutOfAlgorithms', 'IsBestScore',
    'IsBestRevScore']
"""

---