In [None]:
#!/usr/bin/python
import requests, sys


def convert_and_map_human_gene_symbol(gene, target_taxon, timeout=30):
    """Fetch the condensed orthologue record of a human gene symbol from Ensembl.

    Sends a GET request to the Ensembl REST endpoint
    ``/homology/symbol/human/<gene>`` restricted to orthologues in
    ``target_taxon``.

    Args:
        gene: Human gene symbol; inserted verbatim into the request path.
        target_taxon: Taxon identifier sent as the ``target_taxon`` query
            parameter (converted with ``str``).
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The first element of the ``data`` list in the JSON response. Other
        functions in this notebook read its ``id`` and ``homologies`` keys.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    server = "https://rest.ensembl.org"
    ext = "/homology/symbol/human/"+gene+"?target_taxon="+str(target_taxon)+";format=condensed;type=orthologues"
    r = requests.get(server+ext, headers={"Content-Type" : "application/json"}, timeout=timeout)

    # raise_for_status() raises for every response where `r.ok` is False, so
    # no separate exit path is needed after it.
    r.raise_for_status()

    return r.json()['data'][0]

def get_goterms(gene, timeout=30):
    """Fetch the GO cross-references of an identifier from the Ensembl REST API.

    Sends a GET request to ``/xrefs/id/<gene>`` with ``external_db=GO`` and
    ``all_levels=1``.

    Args:
        gene: Identifier inserted verbatim into the request path (the callers
            in this notebook pass the ``id`` field of a mapping record).
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded JSON response, unmodified.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    server = "https://rest.ensembl.org"
    ext = "/xrefs/id/"+gene+"?external_db=GO;all_levels=1"
    r = requests.get(server+ext, headers={"Content-Type" : "application/json"}, timeout=timeout)

    # raise_for_status() raises for every response where `r.ok` is False, so
    # no separate exit path is needed after it.
    r.raise_for_status()

    return r.json()

def parse_goterms(gotermjson):
    """Collect the distinct ``description`` values of a sequence of records.

    Args:
        gotermjson: Iterable of mappings, each with a ``description`` key.

    Returns:
        A list of the descriptions in order of first appearance, with
        repeated values dropped.
    """
    unique_descriptions = []
    for description in (entry['description'] for entry in gotermjson):
        if description in unique_descriptions:
            continue
        unique_descriptions.append(description)
    return unique_descriptions
        
def convert_symbol_file(fname, target_taxon):
    """Map every gene symbol listed in a text file to its Ensembl record.

    The first two lines of the file are treated as header lines and skipped.
    Every following non-blank line is stripped and looked up with
    ``convert_and_map_human_gene_symbol``.

    Args:
        fname: Path of the symbol list, one symbol per line after the header.
        target_taxon: Taxon identifier forwarded to the lookup.

    Returns:
        A dict mapping each symbol to the record returned by the lookup.
    """
    mappings = {}
    with open(fname) as f:
        # Discard the two header lines; their content is not used.
        f.readline()
        f.readline()
        for line in f:
            gene = line.strip()
            # A blank line (e.g. a trailing newline) would be sent as an empty
            # symbol and fail the request; a repeated symbol would only repeat
            # an identical lookup.
            if not gene or gene in mappings:
                continue
            mappings[gene] = convert_and_map_human_gene_symbol(gene, target_taxon)

    return mappings


def output_terms(fname, mappings, origin=True):
    """Write a tab-separated table of GO term descriptions for each symbol.

    Columns are ``SYMBOL``, ``ID`` and ``GOterms``; the terms of one row are
    joined with ``", "``.

    Args:
        fname: Path of the output file (overwritten).
        mappings: Dict of symbol -> record with ``id`` and ``homologies`` keys,
            as produced by ``convert_symbol_file``.
        origin: If true, look up GO terms for the record's own ``id``.
            Otherwise use the ``id`` of the first entry in ``homologies``, and
            write empty ID/GO columns when there is none.
    """
    # `with` guarantees the file is closed even if a lookup raises mid-loop.
    with open(fname, 'w') as outf:
        outf.write('\t'.join(["SYMBOL","ID","GOterms"])+'\n')

        for gid, record in mappings.items():
            if origin:
                lookup_id = record['id']
            elif record['homologies']:
                lookup_id = record['homologies'][0]['id']
            else:
                lookup_id = None

            if lookup_id is None:
                # No orthologue to query: keep the row but leave it blank.
                outf.write('\t'.join([gid, "", ""])+'\n')
            else:
                terms = ", ".join(parse_goterms(get_goterms(lookup_id)))
                outf.write('\t'.join([gid, lookup_id, terms])+'\n')

def output_converted_ids(fname, mappings):
    """Write a tab-separated table of symbol, source ID and first orthologue ID.

    Columns are ``SYMBOL``, ``ID`` and ``MOUSE``. The third column holds the
    ``id`` of the first entry in the record's ``homologies`` list, or is empty
    when that list is empty. The ``MOUSE`` header is fixed and does not depend
    on the taxon the mappings were built for.

    Args:
        fname: Path of the output file (overwritten).
        mappings: Dict of symbol -> record with ``id`` and ``homologies`` keys.
    """
    # `with` guarantees the file is closed even if a record is malformed.
    with open(fname, 'w') as outf:
        outf.write('\t'.join(["SYMBOL","ID","MOUSE"])+'\n')
        for gid, record in mappings.items():
            homologies = record['homologies']
            ortholog_id = homologies[0]['id'] if homologies else ""
            outf.write('\t'.join([gid, record['id'], ortholog_id])+'\n')
    
    


In [None]:
# NCBI Taxonomy ID for Homo sapiens (human), the species of the input gene symbols.
# NOTE(review): origin_taxon is not referenced by any of the visible cells.
origin_taxon=9606
# NCBI Taxonomy ID for Mus musculus (mouse); sent to Ensembl as target_taxon.
target_taxon=10090

In [None]:
# Look up every gene symbol listed in List1.txt (one REST request per symbol),
# keeping the result in `mappings` for the later cells, then save the
# symbol / ID / orthologue-ID table.
mappings = convert_symbol_file("../shared/"+"List1.txt", target_taxon)
output_converted_ids("../"+"List1.converted.txt", mappings)

In [None]:
# Write the GO term table for the ID set. With origin=True the terms are looked
# up for each record's own 'id' rather than for its first orthologue.
# Requires `mappings` from the symbol-mapping cell.
output_terms("../"+"List1_ID.GO.txt", mappings, origin=True)

In [61]:
# Spot-check the generated GO table by printing the rows that contain "CRISP".
# NOTE(review): the saved output separates GO terms with "," while output_terms
# joins them with ", " - the output looks stale; re-run to confirm.
! echo "Official ID"
! grep CRISP ../List1_ID.GO.txt

Official ID
CRISP1	ENSG00000124812	extracellular region,extracellular space,nucleus,fusion of sperm to egg plasma membrane,calcium channel regulator activity,binding of sperm to zona pellucida,regulation of acrosome reaction
CRISP2	ENSG00000124490	extracellular region,extracellular space,single organismal cell-cell adhesion
CRISP3	ENSG00000096006	molecular_function,extracellular region,proteinaceous extracellular matrix,extracellular space,defense response,specific granule lumen,specific granule,neutrophil degranulation,innate immune response,extracellular exosome,tertiary granule lumen,membrane,integral component of membrane


In [60]:
# Commit all tracked changes, tag the commit as v1.3 and push.
# NOTE(review): the saved push output reports that the repository moved to
# git@github.com:UBEC/ReproducibleResearch.git - consider updating the remote.
# NOTE(review): the saved output shows only master being pushed; confirm the
# v1.3 tag reached the remote (a plain `git push` may not send tags).
! git commit -a -m "ID version"
! git tag -a v1.3 -m "ID version"
! git push


[master e08d925] ID version
 2 files changed, 62 insertions(+), 9 deletions(-)
Counting objects: 4, done.
Delta compression using up to 8 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 871 bytes | 0 bytes/s, done.
Total 4 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
remote: This repository moved. Please use the new location:[K
remote:   git@github.com:UBEC/ReproducibleResearch.git[K
To github.com:UBEC/ReproducibleScience.git
   6c38497..e08d925  master -> master
