### Uniprot protein alignment analysis
This notebook takes the output of the UniProt Proteins API (https://www.ebi.ac.uk/proteins/api/doc/#!/proteins/search). The goal is to align the CDS from the samples' BGCs against the proteins associated with "antibiotic biosynthesis" in the API.

### Search in the Uniprot API
This section exists to build a DataFrame from the API results. The query used was "antibiotic biosynthesis".

In [1]:
import requests, sys
# Fetch UniProt entries tagged with the keyword "antibiotic biosynthesis".
# Change `size` in the URL to adjust the number of query results.
# NOTE(review): the URL asks for `format=tsv` while the Accept header asks for XML;
# the recorded output is XML, so the header appears to win — confirm and drop one of them.
requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=10&keywords=antibiotic%20biosynthesis&format=tsv"

# A timeout keeps the cell from hanging indefinitely if the server stops responding.
r = requests.get(requestURL, headers={"Accept": "application/xml"}, timeout=60)

# Raises requests.HTTPError on a 4xx/5xx status and is a no-op otherwise, so no
# separate `r.ok` check (or sys.exit) is needed.
r.raise_for_status()

# Full XML payload, consumed by the parsing cell.
responseBody = r.text

# Show only the beginning of the payload to keep the notebook output readable.
print(responseBody[:1000])

<?xml version='1.0' encoding='UTF-8'?><uniprot xmlns="http://uniprot.org/uniprot" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><entry xmlns="https://uniprot.org/uniprot" dataset="TrEMBL" created="2014-06-11" modified="2025-04-02" version="29"><accession>A0A010REA4</accession><name>A0A010REA4_9PEZI</name><protein><submittedName><fullName evidence="5">TfdA family Taurine catabolism dioxygenase TauD</fullName></submittedName></protein><gene><name evidence="5" type="ORF">CFIO01_07979</name></gene><organism evidence="5 6"><name type="scientific">Colletotrichum fioriniae PJ7</name><dbReference type="NCBI Taxonomy" id="1445577"/><lineage><taxon>Eukaryota</taxon><taxon>Fungi</taxon><taxon>Dikarya</taxon><taxon>Ascomycota</taxon><taxon>Pezizomycotina</taxon><taxon>Sordariomycetes</taxon><taxon>Hypocreomycetidae</taxon><taxon>Glomerellales</taxon><taxon>Glomerellaceae</taxon><taxon>Colletotri

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
root = ET.fromstring(responseBody)

# The response uses two namespaces: the root <uniprot> element is declared with the
# "http://" URI, while each <entry> (and its children) is declared with the
# "https://" URI. Entry-level lookups therefore use the "e" prefix.
ns = {
    "u": "http://uniprot.org/uniprot",
    "e": "https://uniprot.org/uniprot"
}


def _entry_to_record(entry):
    """Flatten one UniProt <entry> element into a dict of table columns.

    Parameters
    ----------
    entry : xml.etree.ElementTree.Element
        An <entry> element in the "e" namespace.

    Returns
    -------
    dict
        Keys: accession, submitted_name, gene_id, organism, kingdom, taxon_id,
        lineage (taxa joined with ";") and sequence. Values are None when the
        corresponding element is absent.
    """
    accession = entry.findtext("e:accession", namespaces=ns)

    # Prefer the submitted name; fall back to the recommended name so entries
    # that only carry <recommendedName> do not end up without a protein name.
    protein_name = None
    for name_kind in ("submittedName", "recommendedName"):
        protein_name = entry.findtext(
            f"e:protein/e:{name_kind}/e:fullName",
            namespaces=ns
        )
        if protein_name is not None:
            break

    # First <gene>/<name> only.
    gene_id = entry.findtext("e:gene/e:name", namespaces=ns)

    organism_name = entry.findtext(
        "e:organism/e:name[@type='scientific']",
        namespaces=ns
    )

    # First taxon of the lineage (e.g. "Eukaryota" in the sample response).
    kingdom = entry.findtext(
        "e:organism/e:lineage/e:taxon[1]",
        namespaces=ns
    )

    tax_ref = entry.find("e:organism/e:dbReference[@type='NCBI Taxonomy']", ns)
    taxon_id = tax_ref.get("id") if tax_ref is not None else None

    # Skip taxa without text: str.join() raises TypeError on None.
    lineage = [
        tax.text
        for tax in entry.findall("e:organism/e:lineage/e:taxon", ns)
        if tax.text
    ]

    # Remove any whitespace/line breaks inside the sequence text so the stored
    # value is one contiguous residue string.
    raw_sequence = entry.findtext("e:sequence", namespaces=ns)
    sequence = "".join(raw_sequence.split()) if raw_sequence is not None else None

    return {
        "accession": accession,
        "submitted_name": protein_name,
        "gene_id": gene_id,
        "organism": organism_name,
        "kingdom": kingdom,
        "taxon_id": taxon_id,
        "lineage": ";".join(lineage),
        "sequence": sequence
    }


# One record per <entry>; build the DataFrame once from the full list.
records = [_entry_to_record(entry) for entry in root.findall("e:entry", ns)]

df = pd.DataFrame(records)
df.to_csv("uniprot_table_2.csv", index=False)
