In [1]:
from Bio.Blast import NCBIWWW
from Bio import SeqIO
import os
import Bio.KEGG.REST as REST
import pandas as pd

from Bio.Blast import NCBIXML
from Bio import Entrez
import pandas as pd

In [2]:
def getOrgs():
    """Download the KEGG organism list and return it as a DataFrame.

    Calls ``REST.kegg_list('organism')`` and splits every tab-separated line
    of the response into the columns ``Tnumber``, ``orgCode``, ``orgName``
    and ``phylogeny``.

    Returns:
        pandas.DataFrame: one row per non-empty response line, with a
        default integer index. Line endings are stripped so the last
        column does not carry a trailing newline.
    """
    columns = ['Tnumber', 'orgCode', 'orgName', 'phylogeny']
    handle = REST.kegg_list('organism')
    rows = []
    try:
        for line in handle:
            line = line.rstrip("\r\n")
            if not line:
                # A blank line would not split into the expected columns.
                continue
            rows.append(line.split("\t"))
    finally:
        handle.close()
    # Build the frame once: DataFrame.append in a loop is quadratic and the
    # method no longer exists in pandas >= 2.0.
    return pd.DataFrame(rows, columns=columns)

##### The KEGG REST helpers below are copied from Biopython; they are not my own code.
from Bio._py3k import urlopen as _urlopen
from Bio._py3k import _binary_to_string_handle


def _q(op, arg1, arg2=None, arg3=None):
    """Send one request to the KEGG REST service and return the response.

    The request path is ``op/arg1``, extended with ``arg2`` when it is
    truthy and with ``arg3`` only when both ``arg2`` and ``arg3`` are truthy.

    Returns:
        The raw object returned by ``_urlopen`` when ``arg2`` equals
        ``"image"``; otherwise that response wrapped by
        ``_binary_to_string_handle``.
    """
    segments = [op, arg1]
    if arg2:
        segments.append(arg2)
        # arg3 is only honoured together with arg2.
        if arg3:
            segments.append(arg3)
    path = "/".join(str(segment) for segment in segments)

    response = _urlopen("http://rest.kegg.jp/" + path)

    # Image responses are handed back untouched instead of being wrapped.
    if arg2 == "image":
        return response
    return _binary_to_string_handle(response)



def kegg_info(database):
    """Issue a KEGG ``info`` request for ``database`` and return the handle from ``_q``."""
    response = _q("info", database)
    return response


def kegg_list(database, org=None):
    """Issue a KEGG ``list`` request and return the handle from ``_q``.

    Args:
        database: a database name, or a list of at most 100 names which are
            joined with ``+`` into a single request.
        org: optional organism; only accepted together with the
            ``"pathway"`` or ``"module"`` database.

    Raises:
        Exception: if ``org`` is given with any other non-empty database
            string, or if more than 100 databases are listed.
    """
    if isinstance(database, str) and org:
        if database in ("pathway", "module"):
            return _q("list", database, org)
        if database:
            raise Exception("Invalid database arg for kegg list request.")

    if isinstance(database, list):
        if len(database) > 100:
            raise Exception("Maximuim number of databases is 100 for kegg list query")
        database = "+".join(database)

    return _q("list", database)


def kegg_find(database, query, option=None):
    """Issue a KEGG ``find`` request and return the handle from ``_q``.

    Args:
        database: database to search.
        query: search term, or a list of terms joined with ``+``.
        option: ``"formula"``, ``"exact_mass"`` or ``"mol_weight"``; only
            accepted for the ``"compound"`` and ``"drug"`` databases.

    Raises:
        Exception: if a truthy ``option`` is given that is not valid for
            the chosen database.
    """
    option_applies = (
        database in ("compound", "drug")
        and option in ("formula", "exact_mass", "mol_weight")
    )
    if option_applies:
        # The query is forwarded unchanged on this branch.
        return _q("find", database, query, option)

    if option:
        raise Exception("Invalid option arg for kegg find request.")

    if isinstance(query, list):
        query = "+".join(query)
    return _q("find", database, query)


def kegg_get(dbentries, option=None):
    """Issue a KEGG ``get`` request and return the handle from ``_q``.

    Args:
        dbentries: an entry identifier, or a list of at most 10 identifiers
            which are joined with ``+``.
        option: one of ``"aaseq"``, ``"ntseq"``, ``"mol"``, ``"kcf"``,
            ``"image"`` or ``"kgml"``; omitted from the request when falsy.

    Raises:
        Exception: if more than 10 entries are listed, or if a truthy
            ``option`` is not one of the accepted values.
    """
    if isinstance(dbentries, list):
        if len(dbentries) > 10:
            raise Exception("Maximum number of dbentries is 10 for kegg get query")
        dbentries = "+".join(dbentries)

    if option in ("aaseq", "ntseq", "mol", "kcf", "image", "kgml"):
        return _q("get", dbentries, option)
    if option:
        raise Exception("Invalid option arg for kegg get request.")
    return _q("get", dbentries)


def kegg_conv(target_db, source_db, option=None):
    """Issue a KEGG ``conv`` request and return the handle from ``_q``.

    Args:
        target_db: target database or organism code.
        source_db: source database, or a list of entries joined with ``+``.
        option: optional output format, ``"turtle"`` or ``"n-triple"``.

    Raises:
        Exception: if ``option`` is truthy but not an accepted format, or if
            the ``target_db`` / ``source_db`` pair matches none of the
            accepted combinations checked below.
    """
    if option and option not in ("turtle", "n-triple"):
        raise Exception("Invalid option arg for kegg conv request.")

    if isinstance(source_db, list):
        source_db = "+".join(source_db)

    gene_side = ("ncbi-gi", "ncbi-geneid", "ncbi-proteinid", "uniprot", "genes")
    kegg_chemical = ("drug", "compound", "glycan")
    outside_chemical = ("pubchem", "glycan")

    # Accepted when either side is a gene-identifier database, or when the
    # pair links a chemical database with an outside chemical database in
    # either direction.
    pair_ok = (
        target_db in gene_side
        or source_db in gene_side
        or (target_db in kegg_chemical and source_db in outside_chemical)
        or (target_db in outside_chemical and source_db in kegg_chemical)
    )
    if not pair_ok:
        raise Exception("Bad argument target_db or source_db for kegg conv request.")

    if option:
        return _q("conv", target_db, source_db, option)
    return _q("conv", target_db, source_db)


def kegg_link(target_db, source_db, option=None):
    """Issue a KEGG ``link`` request and return the handle from ``_q``.

    Args:
        target_db: target database or organism code.
        source_db: source database, or a list of entries joined with ``+``.
        option: optional output format, ``"turtle"`` or ``"n-triple"``.

    Raises:
        Exception: if ``option`` is truthy but not an accepted format.
    """
    if option and option not in ["turtle", "n-triple"]:
        # The message used to say "conv" (copied from kegg_conv), which
        # pointed at the wrong function when debugging.
        raise Exception("Invalid option arg for kegg link request.")

    if isinstance(source_db, list):
        source_db = "+".join(source_db)

    if option:
        resp = _q("link", target_db, source_db, option)
    else:
        resp = _q("link", target_db, source_db)

    return resp

In [3]:
# Read the query sequence from the FASTA file into `record`; the following
# cells use `record.seq`, `record.id` and `record.format("fasta")`.
record = SeqIO.read("./data/anisakis_coi.fasta", format="fasta")

In [5]:
# Sanity check of the loaded record: the bare sequence, the same record
# rendered back as FASTA text, and the sequence length.
print('Output 1:\n', record.seq, '\n')
print('Output 2:\n', record.format("fasta"), '\n')
print(f'Length: {len(record.seq)}')

Output 1:
 CATCCTGAGGTTTATATTTTGATTTTACCGGCTTTTGGTATTATTAGACAGTCTAGTTTGTATTTAACTGGTAAGAAAGAAGTTTTTGGTTCTTTGGGGATGGTATATGCTATTTTAAGAATTGGTCTTATTGGTTGTGTAGTTTGGGCTCATCATATGTATACTGTGGGCATGGATCTTGATTCTCGTGCTTATTTTACTGCTGCTACTATGGTTATTGCTGTTCCTACTGGGGTAAAAGTTTTTAGTTGATTAGCTACTCTTTTTGGTATGAAGATGGTTTTTCAACCTTTGCTTCTTTGGGTATTGGGCTTTATCTTTTTATTTACTGTTGGTGGTTTGACTGGTGTTGTTCTTTCTAATTCTAGTTTGGATGTTATTCTGCATGATACTTATTATGTAGTAAGTCATTTTCATTATGT 

Output 2:
 >KY973682.1 Anisakis simplex cytochrome c oxidase subunit I (COI) gene, partial cds; mitochondrial
CATCCTGAGGTTTATATTTTGATTTTACCGGCTTTTGGTATTATTAGACAGTCTAGTTTG
TATTTAACTGGTAAGAAAGAAGTTTTTGGTTCTTTGGGGATGGTATATGCTATTTTAAGA
ATTGGTCTTATTGGTTGTGTAGTTTGGGCTCATCATATGTATACTGTGGGCATGGATCTT
GATTCTCGTGCTTATTTTACTGCTGCTACTATGGTTATTGCTGTTCCTACTGGGGTAAAA
GTTTTTAGTTGATTAGCTACTCTTTTTGGTATGAAGATGGTTTTTCAACCTTTGCTTCTT
TGGGTATTGGGCTTTATCTTTTTATTTACTGTTGGTGGTTTGACTGGTGTTGTTCTTTCT
AATTCTAGTTTGGATGTTATTCTGCATGATACTTATTATGTAGTAAGTCATTTTCATTAT
GT
 

Length: 422


In [None]:
# blastx translates the nucleotide query and searches a PROTEIN database, so
# the database argument must name one ("nr" = non-redundant proteins).
# "nucleotide" is not a BLAST database name; the annotation cell further down
# also looks the hits up in Entrez db="protein".
blastResult = NCBIWWW.qblast("blastx", "nr", record.format("fasta"))

In [None]:
# Save the raw BLAST XML next to the input data so it can be re-parsed
# later without repeating the remote search.
try:
    # The `with` block closes the file itself; no extra f.close() is needed.
    with open(f'./data/blastOutput_{record.id}.xml', "w") as f:
        f.write(blastResult.read())
finally:
    # Release the BLAST result handle even if writing the file fails.
    blastResult.close()

In [None]:
# Build the organism table with getOrgs() (defined above), which queries
# REST.kegg_list('organism').
dfAllOrgsInfo = getOrgs()

In [None]:
# Save the organism table to disk; a later cell reloads this file with
# pd.read_excel("./data/dfOrgs.xlsx").
dfAllOrgsInfo.to_excel('./data/dfOrgs.xlsx')

In [None]:
# Annotate every BLAST hit with its source organism (from NCBI Entrez) and,
# when the organism is present in the saved KEGG organism table, with the
# matching KEGG gene identifier. The result is collected in `dfinfo`.

# Contact address sent along with the Entrez requests.
Entrez.email = 'giacomo.villa.mi@gmail.com'

dfOrgs = pd.read_excel("./data/dfOrgs.xlsx")

queryNames = []
ncbiIDs = []
descriptions = []
orgNames = []
expvalues = []
lengths = []
keggIds = []

# Parse the BLAST XML written earlier; the context manager closes the file.
with open(f'./data/blastOutput_{record.id}.xml', "r") as result_handle:
    blast_records = NCBIXML.read(result_handle)

queryName = blast_records.query

aligns = blast_records.alignments

# Pre-compute ("Genus species", orgCode) pairs once instead of re-splitting
# every organism name for every hit. Names with fewer than two words (or
# missing values) cannot form such a prefix and are skipped.
orgNamePrefixes = []
for fullName, code in zip(dfOrgs['orgName'], dfOrgs['orgCode']):
    words = str(fullName).split(" ")
    if len(words) >= 2:
        orgNamePrefixes.append((words[0] + " " + words[1], code))

# orgCode -> {"ncbi-proteinid:<id>": "<kegg id>"}; the conversion table of an
# organism is downloaded only once even when many hits share that organism.
keggConvCache = {}

for hit in aligns:
    ncbiID = hit.accession  # The NCBI identifier
    description = hit.hit_def  # The description of the sequence
    length = hit.length
    hsp = hit.hsps
    evalues = [h.expect for h in hsp]  # E-value of each HSP

    entrez_result = Entrez.efetch(db="protein", id=ncbiID, rettype="gp", retmode="xml")
    try:
        # Use a dedicated name: rebinding `record` here would clobber the
        # query SeqRecord whose id is needed for the output file name.
        gbRecords = Entrez.read(entrez_result)
    finally:
        entrez_result.close()

    orgName = gbRecords[0]['GBSeq_organism']

    print('NCBI ID: ', ncbiID)
    print('Description: ', description)
    print('Sequence length: ', length)
    print('Evalue: ', evalues)
    print('Organism: ', orgName)

    # As before, the last matching row of the organism table wins.
    orgCode = ''
    for prefix, code in orgNamePrefixes:
        if prefix in orgName:
            orgCode = code
    print('Organism code: ', orgCode)

    ## Conversion of RefSeq Protein ID to KEGG ID
    keggId = ''
    if orgCode != '':
        if orgCode not in keggConvCache:
            conversion = {}
            keggSearch = kegg_conv(orgCode, "ncbi-proteinid")
            try:
                for line in keggSearch:
                    fields = line.rstrip("\r\n").split("\t")
                    # Skip blank or malformed lines instead of raising IndexError.
                    if len(fields) >= 2:
                        # setdefault keeps the first occurrence, like the
                        # original first-match-then-break scan.
                        conversion.setdefault(fields[0], fields[1])
            finally:
                keggSearch.close()
            keggConvCache[orgCode] = conversion
        keggId = keggConvCache[orgCode].get('ncbi-proteinid:' + ncbiID, '')
    print('KEGG ID: ', keggId, '\n')
    print()

    queryNames.append(queryName)
    ncbiIDs.append(ncbiID)
    descriptions.append(description)
    orgNames.append(orgName)
    expvalues.append(evalues)
    lengths.append(length)
    keggIds.append(keggId)

dfinfo = pd.DataFrame(
    {'QueryName': queryNames, 'Description': descriptions, 'E-value': expvalues,
     'NCBI ID': ncbiIDs, 'KEGG ID': keggIds, 'Organism': orgNames, 'Length': lengths},
    columns=['QueryName', 'Description', 'E-value', 'NCBI ID', 'KEGG ID', 'Organism', 'Length'],
)
# Write a real .xlsx file: the previous target reused the BLAST output's
# .xml path, which to_excel cannot handle and which would have replaced the
# saved BLAST result.
dfinfo.to_excel(f'./data/blastOutput_{record.id}.xlsx')