In [1]:
import sys, os, json
sys.path.append('../')
from scripts.uniprot import UniprotInterface
import pandas as pd

In [2]:
def replace_char_at_index(s, i, new_char):
    if i < 0 or i >= len(s):
        raise IndexError("Index out of range.")
    return s[:i] + new_char + s[i+1:]

In [11]:
ids = ["Q75UA4"]
from_db = 'UniProtKB_AC-ID'
to_db = 'UniProtKB'
disease = "CRC"

In [12]:
downloader = UniprotInterface()

job_id = downloader.submit_id_mapping(from_db=from_db, to_db=to_db, ids=ids)

In [13]:
if downloader.check_id_mapping_results_ready(job_id):
    link = downloader.get_id_mapping_results_link(job_id)
    results = downloader.get_id_mapping_results_search(link)

Fetched: 1 / 1


In [10]:
with open("results.json", "w") as f:
    json.dump(results, f)

In [14]:
results['results'][0]['to']['sequence']['value']

'MKFGKFVLLAASTALAVVGLGGPAAADSTPQAQPSIIGGSNATSGPWAARLFVNGRQNCTATIIAPQYILTAKHCVSSSGTYTFRIGSLDQTSGGTMATGSTITRYPGSADLAIVRLTTSVNATYSPLGSVGDVSVGQNVSVYGWGATSQCGSEINCQSRYLKVATVRVNSISCSDYTGGVAVCANRVNGITAGGDSGGPMFASGRQVGVASTSDRVNNTAYTNITRYRSWISQVAGV'

In [15]:
for result in results['results']:
    print(result['from'])

Q75UA4


In [None]:
export_data = []
sequence = results['results'][0]['to']['sequence']['value']
for feature in results['results'][0]['to']['features']:
    row = []
    if feature['type'] == 'Natural variant' and disease in feature['description']:     
        row.append(feature['featureId'])
        location_start = feature['location']['start']['value']
        location_end = feature['location']['end']['value']
        if location_start == location_end:
            row.append(location_start)
            original_sequence = feature['alternativeSequence']['originalSequence']
            new_sequence = feature['alternativeSequence']['alternativeSequences'][0]
            row.append(f"{original_sequence}->{new_sequence}")
            row.append(replace_char_at_index(sequence, int(location_start)-1, new_sequence))
        else:
            row.append(f"{location_start}-{location_end}")
            row.append("missing")
            row.append(sequence[:int(location_start)-1] + sequence[int(location_end)-1:])
        export_data.append(row)
export_data

In [None]:
df = pd.DataFrame(export_data, columns=["variant id", "position", "change", "sequence"])
df

In [None]:
df.to_csv("results.csv", index=False)

In [46]:
result = results['results'][0]
for reference in result['to']['references']:
    print(reference['citation']['citationCrossReferences'])

[{'database': 'PubMed', 'id': '11133465'}, {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}]
[{'database': 'PubMed', 'id': '16237016'}, {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}]


In [49]:
references_list = []
result = results['results'][0]                 

try:
    for r in result['to']['references']:
        tmp = {}
        tmp["citacionCrossReferences"] = r['citation']['citationCrossReferences']
        tmp.update({"title": r['citation']['title']})
        references_list.append(tmp)
except KeyError:
    pass

In [50]:
references_list

[{'citacionCrossReferences': [{'database': 'PubMed', 'id': '11133465'},
   {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}],
  'title': 'Purification and characterization of an extracellular poly(L-lactic acid) depolymerase from a soil isolate, Amycolatopsis sp. strain K104-1.'},
 {'citacionCrossReferences': [{'database': 'PubMed', 'id': '16237016'},
   {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}],
  'title': 'Gene cloning and molecular characterization of an extracellular poly(L-lactic acid) depolymerase from Amycolatopsis sp. strain K104-1.'}]

## Blast

In [1]:
import os, argparse
import shutil
import subprocess
import tarfile
from pathlib import Path
from urllib.request import urlopen
import re
from typing import List

import pandas as pd

DB_DIR = os.path.join("scripts", "db")
BLAST_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/"
UNIPROT_BASE_URL = "https://ftp.uniprot.org/pub/databases/uniprot/current_release"
BLAST_DIR = Path("blast_bin")
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref100/uniref.xsd
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz

databases = {
    "uniprotkb_reviewed": "knowledgebase/complete/uniprot_sprot",
    "uniprotkb_unreviewed": "knowledgebase/complete/uniprot_trembl",
    "uniref100": "uniref/niref100/uniref100",
    "uniref90": "uniref/uniref90/uniref90",
    "uniref50": "uniref/uniref50/uniref50",
}

def download_uniprot_database(db_name: str, extension: str = "xml"):
    """ Download a Uniprot database from the Uniprot FTP server.
    Args:
        db_name (str): Name of the database to download.
        extension (str): File extension of the database. Default is "xml".
    """

    if db_name not in databases:
        raise ValueError(f"Database {db_name} is not supported. Supported databases are: {', '.join(databases.keys())}.")
    
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    
    if not os.path.exists(db_path):
        os.makedirs(DB_DIR, exist_ok=True)
        url = f"{UNIPROT_BASE_URL}/{databases[db_name]}.{extension}.gz"
        os.system(f"wget {url} -O {db_path}.gz")
        print(f"Unzipping {db_path}...")
        subprocess.run(["gunzip", db_path], check=True)
    else:
        print(f"Database {db_name} already exists at {db_path}.")

def get_latest_version_url():
    """Retrieve the latest BLAST+ tarball URL from the NCBI FTP site."""
    with urlopen(BLAST_BASE_URL) as response:
        html = response.read().decode("utf-8")
    # Look for something like: ncbi-blast-2.16.0+-x64-linux.tar.gz
    match = re.search(r'ncbi-blast-(\d+\.\d+\.\d+\+)-x64-linux\.tar\.gz', html)
    if match:
        version = match.group(1)
        tar_name = f"ncbi-blast-{version}-x64-linux.tar.gz"
        return version, BLAST_BASE_URL + tar_name
    else:
        raise RuntimeError("Could not find the latest BLAST version from NCBI.")

def is_blast_installed():
    """Check if 'blastp' is available in the system PATH."""
    try:
        subprocess.run(["blastp", "-version"], check=True, stdout=subprocess.DEVNULL)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False


def download_and_extract_blast(version: str, url: str):
    """Download and extract the BLAST+ tarball."""
    tarball_name = url.split("/")[-1]
    if not Path(tarball_name).exists():
        print(f"Downloading BLAST+ {version}...")
        subprocess.run(["wget", url], check=True)

    print("Extracting BLAST+...")
    with tarfile.open(tarball_name, "r:gz") as tar:
        tar.extractall(BLAST_DIR)
    print(f"BLAST extracted to: {BLAST_DIR.resolve()}")


def get_local_blastp_path(version: str):
    """Return the path to local blastp binary."""
    return BLAST_DIR / f"ncbi-blast-{version}" / "bin" / "blastp"


def check_blast():
    """Ensure BLAST is installed. Return path to `blastp` binary."""
    if is_blast_installed():
        print("System-wide BLAST is installed.")
        return shutil.which("blastp")
    else:
        version, url = get_latest_version_url()
        local_blastp = get_local_blastp_path(version)
        if not local_blastp.exists():
            print(f"BLAST {version} not found locally. Installing...")
            BLAST_DIR.mkdir(exist_ok=True)
            download_and_extract_blast(version, url)
        else:
            print(f"Using already downloaded BLAST {version}.")
        return str(local_blastp)

def make_blast_database(db_name: str, db_type: str = "prot", extension: str = "xml"):
    """Create a BLAST database from the Uniprot database."""
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    if not os.path.exists(db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {db_path}. Please download it first.")
    
    # Check if the database is already created
    blast_db_path = os.path.join(DB_DIR, db_name)
    extensions = [".pdb", ".phr", ".pin", ".psq", ".pot", ".psq", ".ptf", ".pto"]
    makedb = False
    # For all extensions check if exists if there is one failing makedb again
    for ext in extensions:
        if not os.path.exists(blast_db_path + "/db" + ext):
            makedb = True
            break
    if makedb:
        print(f"Creating BLAST database for {db_name}...")
        blast_db_cmd = [
            "makeblastdb",
            "-in", db_path,
            "-dbtype", db_type,
            "-out", os.path.join(DB_DIR, db_name) + "/db",
        ]
    
        subprocess.run(blast_db_cmd, check=True)
        print(f"BLAST database created at: {os.path.join(DB_DIR, databases[db_name])}")
    else:
        print(f"BLAST database already exists at {blast_db_path}. No need to create it again.")

def run_blast(sequences: List[str], db_name: str, blast_type: str = "blastp", evalue: float = 0.001):
    """Run BLAST search."""
    blast_db_path = os.path.join(DB_DIR, db_name)
    if not os.path.exists(blast_db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {blast_db_path}. Please download it first.")

    # Make tmp directory if it does not exist
    os.makedirs("tmp", exist_ok=True)

    # Write sequences to a temporary file
    with open("tmp/sequences.fasta", "w") as f:
        for i, seq in enumerate(sequences):
            f.write(f">{i}\n{seq}\n")
    
    blast_cmd = [
        blast_type,
        "-query", "tmp/sequences.fasta",
        "-db", blast_db_path + "/db",
        "-outfmt", "6",
        "-evalue", str(evalue),
    ]
    
    print(f"Running BLAST search...")
    with open("tmp/blast_results.txt", "w") as f:
        subprocess.run(blast_cmd, stdout=f, check=True)
    print(f"BLAST results saved to tmp/blast_results.txt")
    # Clean up temporary file
    os.remove("tmp/sequences.fasta")

def parse_blast_results(file_path: str, identity_threshold: float = 90.0):
    """Parse BLAST results from a file."""
    with open(file_path, "r") as f:
        results = f.readlines()
    
    parsed_results = []
    for line in results:
        fields = line.strip().split("\t")
        identity = float(fields[2])
        if identity >= identity_threshold:
            parsed_results.append({
                "query": fields[0],
                "subject": fields[1],
                "identity": fields[2],
                "alignment_length": fields[3],
                "evalue": fields[4],
                "bit_score": fields[5],
            })
    
    return parsed_results

In [24]:
df = pd.read_csv("data/test.csv")
sequences = df["sequences"].dropna().unique().tolist()
    
download_uniprot_database("uniprotkb_reviewed", "fasta")
    
blastp_path = check_blast()
print(f"Using blastp at: {blastp_path}")

make_blast_database("uniprotkb_reviewed", extension="fasta")

run_blast(sequences, "uniprotkb_reviewed", blast_type="blastp", evalue=0.0001)

results = parse_blast_results("tmp/blast_results.txt")

# Convert to DataFrame
sequences_df = pd.DataFrame(sequences, columns=["sequences"])
sequences_df["id"] = sequences_df.index

sequences_df

Database uniprotkb_reviewed already exists at scripts/db/uniprotkb_reviewed.fasta.
System-wide BLAST is installed.
Using blastp at: /home/diego/micromamba/envs/bioseqdownloader/bin/blastp
BLAST database already exists at scripts/db/uniprotkb_reviewed. No need to create it again.
Running BLAST search...
BLAST results saved to tmp/blast_results.txt


Unnamed: 0,sequences,id
0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,0
1,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,1
2,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,2


In [26]:
df_blast = pd.DataFrame(results)

df_blast = df_blast.rename(columns={"query": "id", "subject": "subject_id"})
df_blast["id"] = df_blast["id"].astype(int)
df_blast = df_blast.merge(sequences_df, on="id", how="left")
df_blast = df_blast.drop(columns=["id"])
df_blast = df_blast.rename(columns={"sequences": "sequence"})

# Separate subject into source, accession, entry_name
df_blast["source"] = df_blast["subject_id"].apply(lambda x: x.split("|")[0])
df_blast["accession"] = df_blast["subject_id"].apply(lambda x: x.split("|")[1])
df_blast["entry_name"] = df_blast["subject_id"].apply(lambda x: x.split("|")[2])
df_blast = df_blast.drop(columns=["subject_id"])


In [27]:
df_blast

Unnamed: 0,identity,alignment_length,evalue,bit_score,sequence,source,accession,entry_name
0,100.0,438,0,0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,sp,Q6GZX2,003R_FRG3G
1,100.0,180,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q197F2,008L_IIV3
2,100.0,50,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q6GZW6,009L_FRG3G
3,100.0,345,0,0,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,sp,Q6GZW6,009L_FRG3G


# GO

In [14]:
from scripts.description_go import *
import os, ast
import pandas as pd
from tqdm import tqdm

In [17]:
DOCKER_IMAGE_NAME = "metastudent"
DOCKER_CONTAINER_NAME = "metastudent_container"
HOST_INPUT_FILE = os.path.abspath("tmp/sequences.fasta")
HOST_OUTPUT_DIR = os.path.abspath("tmp/")
CONTAINER_INPUT_FILE = "/app/input.fasta"
CONTAINER_OUTPUT_FILE = "/app/output.result"


print("[DESCRIPTION_GO] Getting Gen Ontology")
tqdm.pandas()

if not check_dependencies(DOCKER_IMAGE_NAME):
    print("[DESCRIPTION_GO] Metastudent not found. Installing...")
    install_dependencies(DOCKER_IMAGE_NAME)
else:
    print("[DESCRIPTION_GO] Metastudent found.")

input_df = pd.read_csv("results/umami_uniprot.csv")
obsolete_df = pd.read_csv("scripts/resources/amiGO_data.csv", sep="\t", names=["id_go", "description", "is_obsolete"])
input_df['go_terms'] = input_df['go_terms'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

parsed_df = pd.DataFrame()
if os.path.isfile(f"{HOST_OUTPUT_DIR}/output.BPO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.CCO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.MFO.txt"):
    print("[DESCRIPTION_GO] Metastudent results found.")
    parsed_df = parse_outputs("uniprot_id")

# Filter input_df with go_terms ~= null
input_df_with_go_terms = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) > 0)]
input_df = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) == 0)]

if not input_df_with_go_terms.empty:
    print("[DESCRIPTION_GO] Go terms found in input data.")
    input_df_with_go_terms = input_df_with_go_terms[["uniprot_id", "go_terms"]]
    input_df_with_go_terms = input_df_with_go_terms.explode("go_terms")
    parsed_df = pd.concat(
        [
            parsed_df,
            pd.merge(
                input_df_with_go_terms, 
                obsolete_df, 
                left_on="go_terms", 
                right_on="id_go", 
                how="left"
            )
            .drop(columns=["go_terms"])
            .rename(columns={"id_go": "go"})  
        ]
    )

input_df

[DESCRIPTION_GO] Getting Gen Ontology
Docker version 28.0.0, build f9ced58158
[DESCRIPTION_GO] Metastudent found.
[DESCRIPTION_GO] Go terms found in input data.


Unnamed: 0,uniprot_id,entry_type,protein_name,ec_numbers,organism,taxon_id,sequence,length,go_terms,pfam_ids,references,features,keywords,source_db


In [18]:
if not parsed_df.empty:
    # Check if all sequences have been processed
    parsed_ids = parsed_df["uniprot_id"].unique()
    input_ids = input_df["uniprot_id"].unique()
    if len(parsed_ids) == len(input_ids):
        print("[DESCRIPTION_GO] All sequences have been processed.")
        input_df = pd.DataFrame()
    else:
        input_df = input_df[~input_df["uniprot_id"].isin(parsed_ids)]
        print(f"[DESCRIPTION_GO] {len(input_df)} sequences have not been processed.")

[DESCRIPTION_GO] 0 sequences have not been processed.


In [19]:
os.makedirs(HOST_OUTPUT_DIR, exist_ok=True)
if not input_df.empty:
    print("[DESCRIPTION_GO] Running in batches of 50...")
    for i in tqdm(range(0, len(input_df), 50)):
        run_in_batches(input_df[i:i+50], HOST_OUTPUT_DIR)

In [None]:
test = pd.concat(
    [
        parsed_df,
        parse_outputs("uniprot_id")
    ]
)
    
test = test.sort_values(by="uniprot_id")
test = test.merge(obsolete_df, left_on="go", right_on="id_go", how="left")
test = test.drop(columns=["id_go"])

test

File '/home/diego/Documents/PythonProjects/BioSeqDownloader/tmp/output.BPO.txt' not found.


NameError: name 'exit' is not defined

## Uniprot query

In [1]:
import os, json
os.chdir("src/")
print(f"Current working directory: {os.getcwd()}")

Current working directory: /home/diego/Documents/PythonProjects/BioSeqDownloader/src


In [7]:
query="organism_name:homo sapiens (human) AND length:[15 TO 30] AND reviewed:true"
fields="accession,protein_name,sequence,ec,lineage,organism_name,xref_pfam,xref_alphafolddb,xref_pdb,go_id"
sort="accession asc"
download=True
format="json"

In [8]:
from uniprot import UniprotInterface

uniprot = UniprotInterface()
response = uniprot.submit_stream(
    query=query,
    fields=fields,
    sort=sort,
    include_isoform=True,
    download=download,
    format=format
)

In [9]:
uniprot.parse_stream_response(
    query=query,
    response=response
)

Unnamed: 0,query,accession,protein_name,organism_name,taxon_id,ineage,sequence,length,go_terms,pfam_ids,alphafold_ids,pdb_ids
0,organism_name:homo sapiens (human) AND length:...,A0A075B6S0,T cell receptor gamma joining 1,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",NYYKKLFGSGTTLVVT,16,"[GO:0042101, GO:0002250]",[],[A0A075B6S0],[]
1,organism_name:homo sapiens (human) AND length:...,A0A075B6Y3,T cell receptor alpha joining 3,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",GYSSASKIIFGSGTRLSIRP,20,"[GO:0042101, GO:0002250]",[],[A0A075B6Y3],[]
2,organism_name:homo sapiens (human) AND length:...,A0A075B6Y9,T cell receptor alpha joining 42,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",YGGSQGNLIFGKGTKLSVKP,20,"[GO:0005886, GO:0002250]",[],[A0A075B6Y9],[]
3,organism_name:homo sapiens (human) AND length:...,A0A075B700,T cell receptor alpha joining 31,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",NNNARLMFGDGTQLVVKP,18,"[GO:0005886, GO:0002250]",[],[A0A075B700],[]
4,organism_name:homo sapiens (human) AND length:...,A0A075B706,T cell receptor delta joining 1,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",TDKLIFGKGTRVTVEP,16,"[GO:0042101, GO:0002250]",[],[A0A075B706],[]
5,organism_name:homo sapiens (human) AND length:...,A0A0A0MT70,T cell receptor beta joining 2-6,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",SGANVLTFGAGSRLTVL,17,"[GO:0042101, GO:0002250]",[],[A0A0A0MT70],[]
6,organism_name:homo sapiens (human) AND length:...,A0A0A0MT78,T cell receptor beta joining 2-7,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",SYEQYFGPGTRLTVT,15,"[GO:0042101, GO:0002250]",[],[],[]
7,organism_name:homo sapiens (human) AND length:...,A0A0A0MT87,T cell receptor beta joining 2-4,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",AKNIQYFGAGTRLSVL,16,"[GO:0042101, GO:0002250]",[],[A0A0A0MT87],[]
8,organism_name:homo sapiens (human) AND length:...,A0A0A0MT94,T cell receptor beta joining 2-2,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",NTGELFFGEGSRLTVL,16,"[GO:0042101, GO:0002250]",[],[A0A0A0MT94],[]
9,organism_name:homo sapiens (human) AND length:...,A0A0A0MTA4,T cell receptor beta joining 2-5,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",QETQYFGPGTRLLVL,15,"[GO:0042101, GO:0002250]",[],[],[]


## Activity search

In [1]:
import os
import pandas as pd

In [2]:
uniprot_search_files = os.listdir("uniprot_search")
uniprot_search_files

['uniprot_celiac-toxic.csv',
 'uniprot_embryotoxic.csv',
 'uniprot_ace-inhibitor.csv',
 'uniprot_anuran-defense.csv',
 'uniprot_campde-inhibitor.csv',
 'uniprot_anti-neurotensive.csv',
 'uniprot_antitrypanosomic.csv',
 'uniprot_anticancer.csv',
 'uniprot_anorectic.csv',
 'uniprot_chemotactic.csv',
 'uniprot_targeting-GP.csv',
 'uniprot_Blood-Brain-Barrier.csv',
 'uniprot_antitumor.csv',
 'uniprot_cytotoxic.csv',
 'uniprot_activating-ubiquitin-mediated-proteolysis.csv',
 'uniprot_protein-kinase-c-inhibitor.csv',
 'uniprot_antihiv.csv',
 'uniprot_antidiabetic.csv',
 'uniprot_calpain-2-inhibitor.csv',
 'uniprot_antileishmania.csv',
 'uniprot_inhibitor.csv',
 'uniprot_antimicrobial.csv',
 'uniprot_antituberculosis.csv',
 'uniprot_antiviral.csv',
 'uniprot_wound-healing.csv',
 'uniprot_targeting-GN.csv',
 'uniprot_hmg-coa-reductase-inhibitor.csv',
 'uniprot_opioid-agonist.csv',
 'uniprot_toxicology.csv',
 'uniprot_antibacterial.csv',
 'uniprot_antibiofilm.csv',
 'uniprot_hypocholesterolemic

In [3]:
# Create an empty list to store the results
activity_data = []

# Iterate over each file in uniprot_search_files
for file in uniprot_search_files:
    # Load the CSV file into a DataFrame
    df = pd.read_csv(f"uniprot_search/{file}")
    
    # Extract the activity name from the file name
    activity = file.replace("uniprot_", "").replace(".csv", "").replace("-", " ").capitalize()
    
    # Count the number of sequences in the DataFrame
    sequence_count = len(df)
    
    # Append the activity and sequence count to the list
    activity_data.append({"activity": activity, "sequence_count": sequence_count})

# Create a DataFrame from the activity data
activity_df = pd.DataFrame(activity_data)

# Sort the DataFrame by sequence count in descending order
activity_df = activity_df.sort_values(by="sequence_count", ascending=False)

# Calculate the total number of sequences
total_sequences = activity_df["sequence_count"].sum()

# Add a row for the total sequences
activity_df = pd.concat([activity_df, pd.DataFrame([{"activity": "Total", "sequence_count": total_sequences}])], ignore_index=True)

# Display the resulting DataFrame
activity_df

Unnamed: 0,activity,sequence_count
0,Binding,411281
1,Inhibitor,18950
2,Surface binding,15663
3,Regulating,11361
4,Antimicrobial,5945
...,...,...
85,Dipeptidyl peptidaseiv,0
86,Antiamnestic,0
87,Edema inducer,0
88,Antiendotoxin,0


In [4]:
anti_activities_df = activity_df[activity_df['activity'].str.startswith('Anti', na=False)]
anti_activities_df

Unnamed: 0,activity,sequence_count
4,Antimicrobial,5945
6,Antiviral,3835
10,Antibacterial,2366
17,Antifungal,1209
19,Antitoxin,832
24,Antitumor,479
25,Anticancer,393
32,Antiparasitic,140
37,Antiangiogenic,86
41,Antioxidative,55


### ALphafold

In [1]:
import requests, re, os, time, threading
from requests.adapters import HTTPAdapter, Retry
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
API_URL = "https://alphafold.ebi.ac.uk/api/prediction/"
POLLING_INTERVAL = 3
#SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
CACHE_DIR = os.path.join("src", ".alphafold_cache")

class AlphafoldInterface():
    def __init__(self, total_retries=5):
        self.retries = Retry(total=total_retries, backoff_factor=0.25, status_forcelist=[ 500, 502, 503, 504 ])
        self.session = requests.Session()
        self.session.mount('https://', HTTPAdapter(max_retries=self.retries))
        self.session.headers.update({"Content-Type": "application/json"})
        

    def get_prediction(self, uniprot_id) -> dict:
        """
        Get prediction for a given UniProt ID.
        Args:
            uniprot_id (str): UniProt ID to fetch prediction for.
        Returns:
            dict: Prediction data.
        """
        url = f"{API_URL}{uniprot_id}"
        
        # Try and retry if necesary
        try:
            response = self.session.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching prediction for {uniprot_id}: {e}")
            return None
        return response.json()
    
    def _camel_to_snake(self, name: str) -> str:
        """
        Convert camelCase to snake_case.
        Args:
            name (str): String in camelCase.
        Returns:
            str: String in snake_case.
        """
        return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
    
    def parse_prediction(self, prediction: dict, download_structure: bool = False) -> dict:
        """
        Parse the prediction data.
        Args:
            prediction (dict): Prediction data.
            download_structure (bool): Whether to download the structure.
        Returns:
            dict: Parsed prediction data.
        """
        # Change keys from camelCase to snake_case
        parsed_prediction = {}
        for key, value in prediction.items():
            if isinstance(value, dict):
                parsed_prediction[key] = {self._camel_to_snake(k): v for k, v in value.items()}
            else:
                parsed_prediction[self._camel_to_snake(key)] = value
        
        # Download structure if requested
        if download_structure:
            # Get keys ending with "_url"
            structure_url = [k for k in parsed_prediction.keys() if k.endswith("_url") and parsed_prediction[k]]
            if structure_url:
                # Download the structure file
                for url in structure_url:
                    file_name = parsed_prediction[url].split("/")[-1]
                    # Check if file already exists
                    if os.path.exists(CACHE_DIR + "/" + file_name):
                        continue
                    else:
                        response = self.session.get(parsed_prediction[url])
                        with open(CACHE_DIR + "/" + file_name, "wb") as f:
                            f.write(response.content)
        return parsed_prediction

    def get_and_parse_predictions(self, uniprot_id: str, download_structure: bool = False) -> list:
        """
        Get and parse predictions for a given UniProt ID.
        Args:
            uniprot_id (str): UniProt ID to fetch predictions for.
            download_structure (bool): Whether to download structure files.
        Returns:
            list: List of parsed predictions.
        """
        print(f"Fetching prediction for {uniprot_id}")
        prediction = self.get_prediction(uniprot_id)
        time.sleep(POLLING_INTERVAL)  # control spacing between calls

        if not prediction:
            return []
        
        return [self.parse_prediction(p, download_structure) for p in prediction]
    
    def get_predictions_for_ids(self, ids: list, download_structure: bool = False, max_workers: int = 5) -> pd.DataFrame:
        """
        Get predictions in parallel for a list of UniProt IDs.
        Args:
            ids (list): List of UniProt IDs to fetch predictions for.
            download_structure (bool): Whether to download structure files.
            max_workers (int): Maximum number of parallel requests.
        Returns:
            pd.DataFrame: DataFrame containing parsed predictions.
        """
        if not os.path.exists(CACHE_DIR):
            os.makedirs(CACHE_DIR)

        predictions = []
        lock = threading.Lock()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self.get_and_parse_predictions, uid, download_structure) for uid in ids]
            for future in as_completed(futures):
                result = future.result()
                if result:
                    with lock:
                        predictions.extend(result)

        return pd.DataFrame(predictions)

In [7]:
instance = AlphafoldInterface()
uniprot_ids = ["P02666", "Q9TSI0", "P33048", "P11839", "O15552", "P76011"]

In [None]:
# Extraer la calidad del modelo y solo el modelo
instance.get_predictions_for_ids_parallel(uniprot_ids, download_structure=True, max_workers=2)

[START] Fetching prediction for P02666
[START] Fetching prediction for Q9TSI0
[START] Fetching prediction for P33048
[START] Fetching prediction for P11839
[START] Fetching prediction for O15552
[START] Fetching prediction for P76011


Unnamed: 0,entry_id,gene,sequence_checksum,sequence_version_date,uniprot_accession,uniprot_id,uniprot_description,tax_id,organism_scientific_name,uniprot_start,...,bcif_url,cif_url,pdb_url,pae_image_url,pae_doc_url,am_annotations_url,am_annotations_hg19_url,am_annotations_hg38_url,is_reviewed,is_reference_proteome
0,AF-Q9TSI0-F1,CSN2,14FD3687DD17C5A9,2000-05-01,Q9TSI0,CASB_BUBBU,Beta-casein,89462,Bubalus bubalis,1,...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,,,,True,False
1,AF-P02666-F1,CSN2,F0BBDD8148A238AE,1989-07-01,P02666,CASB_BOVIN,Beta-casein,9913,Bos taurus,1,...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,,,,True,True
2,AF-P33048-F1,CSN2,96AE17746A01CD05,1993-10-01,P33048,CASB_CAPHI,Beta-casein,9925,Capra hircus,1,...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,,,,True,True
3,AF-P11839-F1,CSN2,061B4424DCB49EB1,1995-11-01,P11839,CASB_SHEEP,Beta-casein,9940,Ovis aries,1,...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,,,,True,True
4,AF-O15552-F1,FFAR2,F4A8AC6AFBDF1E90,1998-01-01,O15552,FFAR2_HUMAN,Free fatty acid receptor 2,9606,Homo sapiens,1,...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,True,True
5,AF-P76011-F1,ymgE,78ED929220264E62,2001-07-11,P76011,YMGE_ECOLI,UPF0410 protein YmgE,83333,Escherichia coli (strain K12),1,...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,,,,True,True


## PDB

In [None]:
!pip install --upgrade rcsb-api

Puedo usar rcbapi.search para hacer busquedas como en uniprot

In [78]:
import os
import pandas as pd
from rcsbapi.data import DataQuery as Query
import requests
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
# Check https://data.rcsb.org/rest/v1/core/entry/4HHB for more attributes
# https://pdb101.rcsb.org/train/training-events/apis-python

In [9]:
query = Query(
    input_type="entries",
    input_ids=["4HHB"],  # CSM IDs can be used as well
    return_data_list=["exptl.method", "rcsb_entry_info", "rcsb_entry_info"]
)

In [11]:
results = query.exec(
    batch_size=5000,
    progress_bar=True
)
results

100%|██████████| 1/1 [00:00<00:00,  1.07it/s]


{'data': {'entries': [{'rcsb_id': '4HHB',
    'exptl': [{'method': 'X-RAY DIFFRACTION'}],
    'rcsb_entry_info': {'polymer_monomer_count_maximum': 146,
     'polymer_entity_count_nucleic_acid_hybrid': 0,
     'deposited_unmodeled_polymer_monomer_count': 0,
     'deposited_atom_count': 4779,
     'experimental_method': 'X-ray',
     'structure_determination_methodology': 'experimental',
     'nonpolymer_bound_components': ['HEM'],
     'resolution_combined': [1.74],
     'na_polymer_entity_types': 'Other',
     'diffrn_radiation_wavelength_minimum': None,
     'assembly_count': 1,
     'polymer_composition': 'heteromeric protein',
     'cis_peptide_count': 0,
     'branched_entity_count': 0,
     'disulfide_bond_count': 0,
     'diffrn_radiation_wavelength_maximum': None,
     'structure_determination_methodology_priority': 10,
     'polymer_entity_count_protein': 2,
     'nonpolymer_molecular_weight_maximum': 0.62,
     'deposited_modeled_polymer_monomer_count': 574,
     'deposited_hy

In [3]:
PDB_CACHE_DIR = os.path.join("src", ".pdb_cache")
class PdbInterface():
    def __init__(self):
        self.base_url = "https://files.rcsb.org/download/"
    
    def search_ids(self, input_ids: list, return_data_list: list = None, batch_size: int = 5000) -> dict:
        if return_data_list is None:
            return_data_list = ["exptl.method", "rcsb_entry_info", "rcsb_entry_info"]

        query = Query(
            input_type="entries",
            input_ids=input_ids,
            return_data_list=return_data_list
        )

        results = query.exec(
            batch_size=batch_size,
            progress_bar=True
        )

        return results 
    
    def download_structure(self, pdb_id: str, file_format: str = "pdb") -> str:
        """
        Download the structure file for a given PDB ID.
        Args:
            pdb_id (str): PDB ID to download.
            file_format (str): Format of the file to download. Default is "pdb".
        Returns:
            str: Path to the downloaded file.
        """
        url = f"{self.base_url}{pdb_id}.{file_format}"
        response = requests.get(url)
        
        if response.status_code == 200:
            file_path = os.path.join(PDB_CACHE_DIR, f"{pdb_id}.{file_format}")
            with open(file_path, "wb") as f:
                f.write(response.content)
            return file_path
        else:
            print(f"Failed to download {pdb_id}: {response.status_code}")
            return None
        
    def download_structures_parallel(self, pdb_ids: list, file_format: str = "pdb", max_workers: int = 5) -> list:
        """
        Download structure files in parallel
        Args:
            pdb_ids (list): List of PDB IDs to download.
            file_format (str): Format of the files to download. Default is "pdb".
            max_workers (int): Maximum number of parallel downloads.
        Returns:

            list: List of paths to the downloaded files.
        """
        if not os.path.exists(PDB_CACHE_DIR):
            os.makedirs(PDB_CACHE_DIR)

        downloaded_files = []
        lock = threading.Lock()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self.download_structure, pdb_id, file_format) for pdb_id in pdb_ids]
            for future in as_completed(futures):
                result = future.result()
                if result:
                    with lock:
                        downloaded_files.append(result)

        return downloaded_files
    
    def parse_search(self, results: dict) -> pd.DataFrame:
        export_df = pd.DataFrame()
        for entry in results["data"]["entries"]:
            df = pd.json_normalize(
                entry, 
                sep="_",
                record_path=None
            )

            # Expand nested fields manually
            df.insert(1,'method',df['exptl'].apply(lambda x: [y['method'] for y in x]))
            df.drop(columns=['exptl'], inplace=True)

            export_df = pd.concat([export_df, df], ignore_index=True)
        return export_df
            

In [None]:
# TODO PDB Y resolucion
instance = PdbInterface()
# result = instance.search_ids(
#     ["4HHB", "1A8I", "1A8J", "1A8K", "1A8L", "1A8M"], 
#     return_data_list=["exptl.method", "rcsb_entry_info", "rcsb_entry_info"]
# )
#instance.parse_search(result)
instance.download_structures_parallel(
    ["4HHB", "1A8I", "1A8J", "1A8K", "1A8L", "1A8M"], 
    file_format="pdb",
    max_workers=2
)


['src/.pdb_cache/4HHB.pdb',
 'src/.pdb_cache/1A8I.pdb',
 'src/.pdb_cache/1A8J.pdb',
 'src/.pdb_cache/1A8K.pdb',
 'src/.pdb_cache/1A8L.pdb',
 'src/.pdb_cache/1A8M.pdb']

In [32]:
import json
with open("results.json", "w") as f:
    json.dump(result, f)

## Refseq

In [56]:
import pandas as pd
import subprocess
import json
from Bio import Entrez, SeqIO

In [22]:
Entrez.email = "diego.fernandez@umag.cl"
handle = Entrez.einfo() # or esearch, efetch, ...
record = Entrez.read(handle)
handle.close()

In [44]:
handle = Entrez.efetch(db="protein", id="XP_010804480.1", retmode="xml")
record = Entrez.read(handle)
handle.close()

In [45]:
record



In [75]:
class RefSeqInterface():
    def __init__(self):
        pass

    def get_refseq_id(self, id: str) -> pd.DataFrame:
        handle = Entrez.efetch(db="protein", id=id, retmode="xml")
        records = Entrez.read(handle)
        handle.close()
        return pd.DataFrame(records)

    def get_refseq_ids(self, ids: list, max_workers: int = 5) -> pd.DataFrame:
        """
        Get data in parallel for a list of RefSeq IDs.
        Args:
            ids (list): List of RefSeq IDs to fetch predictions for.
            max_workers (int): Maximum number of parallel requests.
        Returns:
            pd.DataFrame: DataFrame containing parsed predictions.
        """
        export_df = pd.DataFrame()
        lock = threading.Lock()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self.get_refseq_id, uid) for uid in ids]
            for future in as_completed(futures):
                result = future.result()
                if type(result) == pd.DataFrame:
                    with lock:
                        export_df = pd.concat([export_df, result], ignore_index=True)
        return export_df


In [None]:
# TODO CSV o JSON para parseo
instance = RefSeqInterface()
#result = instance.get_refseq_id("XP_010804480.1")
df = instance.get_refseq_ids(["XP_010804480.1", "XP_010804481.1", "XP_010804482.1"], max_workers=2)

df

Unnamed: 0,GBSeq_locus,GBSeq_length,GBSeq_moltype,GBSeq_topology,GBSeq_division,GBSeq_update-date,GBSeq_create-date,GBSeq_definition,GBSeq_primary-accession,GBSeq_accession-version,...,GBSeq_project,GBSeq_keywords,GBSeq_source,GBSeq_organism,GBSeq_taxonomy,GBSeq_comment,GBSeq_source-db,GBSeq_feature-table,GBSeq_sequence,GBSeq_xrefs
0,XP_010804480,259,AA,linear,MAM,26-JAN-2016,30-DEC-2014,PREDICTED: beta-casein isoform X1 [Bos taurus],XP_010804480,XP_010804480.1,...,PRJNA33843,[RefSeq],Bos taurus (cattle),Bos taurus,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,MODEL REFSEQ: This record is predicted by auto...,REFSEQ: accession XM_010806178.1,"[{'GBFeature_key': 'source', 'GBFeature_locati...",mplntiykqpqnqiiihsappsllvlyfgkkelramkvlilaclva...,"[{'GBXref_dbname': 'BioProject', 'GBXref_id': ..."
1,XP_010804481,233,AA,linear,MAM,30-DEC-2014,30-DEC-2014,PREDICTED: proline-rich protein 27 isoform X1 ...,XP_010804481,XP_010804481.1,...,PRJNA33843,[RefSeq],Bos taurus (cattle),Bos taurus,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,MODEL REFSEQ: This record is predicted by auto...,REFSEQ: accession XM_010806179.1,"[{'GBFeature_key': 'source', 'GBFeature_locati...",mkfllwaclmyvsfardysdnigspypvnpsasisypvipsasipy...,"[{'GBXref_dbname': 'BioProject', 'GBXref_id': ..."
2,XP_010804482,218,AA,linear,MAM,30-DEC-2014,30-DEC-2014,PREDICTED: proline-rich protein 27 isoform X2 ...,XP_010804482,XP_010804482.1,...,PRJNA33843,[RefSeq],Bos taurus (cattle),Bos taurus,Eukaryota; Metazoa; Chordata; Craniata; Verteb...,MODEL REFSEQ: This record is predicted by auto...,REFSEQ: accession XM_010806180.1,"[{'GBFeature_key': 'source', 'GBFeature_locati...",mfvgstnmkfllwaclmyvsfardysdnigspypvnpsasisypvi...,"[{'GBXref_dbname': 'BioProject', 'GBXref_id': ..."


### Brenda

In [112]:
import string, os
import hashlib
from zeep import Client
from zeep.helpers import serialize_object
import pandas as pd
from typing import List, Tuple
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# TODO Sleep random, ver lo de abajo
# For aditional implementations see: https://www.brenda-enzymes.org/soap.php
func_map = {
    #'getReference': ["ecNumber", "organism"],
    # 'getEcNumbersFromSequence': ,
    # 'getOrganismsFromSequence': ,
    'getKmValue': ["ecNumber", "kmValue", "kmValueMaximum", "substrate", "commentary", "organism", "ligandStructureId", "literature"],
    # 'getGeneralInformation': ,
    # 'getExpression': ,
    'getIc50Value': ["ecNumber", "ic50Value", "ic50ValueMaximum", "inhibitor", "commentary", "organism", "ligandStructureId", "literature"],
    'getKcatKmValue': ["ecNumber", "kcatKmValue", "kcatKmValueMaximum", "substrate", "commentary", "organism", "ligandStructureId", "literature"],
    'getPhStability': ["ecNumber", "phStability", "phStabilityMaximum", "commentary", "organism", "literature"]
    # 'getOxidationStability': ,
    # 'getNaturalSubstratesProducts': ,
    # 'getEngineering': ,
    # 'getNaturalProduct': ,
    # 'getMetalsIons': ,
    # 'getActivatingCompound': ,
    # 'getInhibitors': ,
    # 'getCofactor': ,
    # 'getGeneralStability': ,
    # 'getNaturalSubstrate': ,
    # 'getMolecularWeight': ,
    # 'getCrystallization': ,
    # 'getSubstratesProducts': ,
    # 'getReactionType': ,
    # 'getOrganismSynonyms': ,
    # 'getEnzymeNames': ,
    # 'getOrganicSolventStability': ,
    # 'getApplication': ,
    # 'getSynonyms': ,
    # 'getTemperatureOptimum': ,
    # 'getTemperatureStability': ,
    # 'getPiValue': ,
    # 'getTemperatureRange': ,
    # 'getRecommendedName': ,
    # 'getProduct': ,
    # 'getCasRegistryNumber': ,
    # 'getLocalization': ,
    # 'getPosttranslationalModification': ,
    # 'getSystematicName': ,
    # 'getCloned': ["ecNumber", "organism"],
    # 'getSpecificActivity': ,
    # 'getSubunits': ,
    # 'getLigands': ,
    # 'getTurnoverNumber': ,
    # 'getReaction': ,
    # 'getSourceTissue': ,
    # 'getSubstrate': ,
    # 'getPhRange': ,
    # 'getStorageStability': ,
    # 'getPhOptimum': ,
    # 'getDisease': ,
    # 'getPurification': ,
    # 'getRenatured': ,
    # 'getKiValue': ,
    # 'getPathway': ,
    # 'getPdb': ,
}

In [131]:
class BrendaInstance():
    def __init__(self, email: str, password: str):
        wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
        self.email = email
        self.password = hashlib.sha256(password.encode("utf-8")).hexdigest()
        self.client = Client(wsdl)
        self.result = {}

    def show_all_operations(self):
        print("Available operations:")
        for service in self.client.wsdl.services.values():
            for port in service.ports.values():
                for operation_name in port.binding._operations.keys():
                    print(f"- {operation_name}")
    
    def fetch_single_record(self, ec_number: str, organism: str) -> pd.DataFrame:
        input_parameters = {
            "ecNumber": ec_number,
            "organism": organism
        }
        export_data = {}
        
        for operation_name in func_map.keys():
            print(f"Executing {operation_name}")
            try:
                # Get the fields required for this function
                field_names = func_map[operation_name]

                # Build parameters in order
                param_list = [f"{key}*{input_parameters.get(key, '')}" for key in field_names]

                # Add credentials
                parameters = [self.email, self.password] + param_list

                func = getattr(self.client.service, operation_name)
                result = serialize_object(func(*parameters))

                # Convert to DataFrame
                if isinstance(result, list):
                    df = pd.DataFrame(result)
                else:
                    df = pd.DataFrame([result])
                

            
            except Exception as e:
                print(f"Error executing {operation_name}: {e}")
                continue

            # Add the DataFrame to the export data
            df_name = operation_name.replace("get", "")
            export_data[df_name] = df
        
        return export_data
    
    def fetch_brenda_data_parallel(self, parameters: List[Tuple[str, str]], max_workers: int = 5) -> pd.DataFrame:
        """
        Get data in parallel for a list of EC numbers and organisms.
        Args:
            parameters (List[str, str]): List of tuples containing EC numbers and organisms.
        Returns:
            pd.DataFrame: DataFrame containing parsed predictions.
        """
        lock = threading.Lock()

        with ThreadPoolExecutor(max_workers) as executor:
            futures = [executor.submit(self.fetch_single_record, ec, org) for ec, org in parameters]
            for future in as_completed(futures):
                result = future.result()
                if isinstance(result, dict):
                    with lock:
                        # Append to every correpsonding DataFrame in self.result
                        for key, df in result.items():
                            if key not in self.result:
                                self.result[key] = df
                            else:
                                self.result[key] = pd.concat([self.result[key], df], ignore_index=True)
    
    def show_results(self):
        """
        Show the results of the queries.
        """
        if not self.result:
            print("No results found.")
            return
        for key, df in self.result.items():
            print(f"Results for {key}:")
            print(df.head())
        

    def save_results(self, directory: str):
        """
        Save the results to the specified directory.
        Args:
            directory (str): Directory to save the results.
        """
        if not os.path.exists(directory):
            os.makedirs(directory)

        for key, df in self.result.items():
            if df.empty:
                print(f"No results for {key}.")
                continue
            df.to_csv(os.path.join(directory, f"{key}.csv"), index=False)
            print(f"Saved {key} to {directory}/{key}.csv")


In [132]:
instance.fetch_single_record("1.1.1.1", "Homo sapiens")

Executing getIc50Value
Executing getKcatKmValue
Executing getPhStability


{'Ic50Value': Empty DataFrame
 Columns: []
 Index: [],
 'KcatKmValue': Empty DataFrame
 Columns: []
 Index: [],
 'PhStability':   phStabilityMaximum phStability literature commentary      organism ecNumber
 0               10.6           7   [285567]     stable  Homo sapiens  1.1.1.1}

In [None]:
# instance = BrendaInstance("diego.fernandez@umag.cl", "")
# instance.fetch_brenda_data_parallel([("1.1.1.1", "Homo sapiens")])
# instance.save_results("results/brenda")

Executing getIc50Value
Executing getKcatKmValue
Executing getPhStability
No results for Ic50Value.
No results for KcatKmValue.
Saved PhStability to results/brenda/PhStability.csv


## Reactome

In [2]:
import os
import pandas as pd
import requests
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
REACTOME_BASE_URL = "https://reactome.org/ContentService/"
class ReactomeInstance():
    def __init__(self):
        pass

    def get_pathway(self, id: str) -> dict:
        """
        Download pathways from a given UniProt ID.
        Args:
            id (str): Reactome pathway ID.
        Returns:
            dict: Pathway data.
        """
        url = f"{REACTOME_BASE_URL}data/pathway/{id}/containedEvents"
        response = requests.get(url)
        
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to fetch pathways for {id}: {response.status_code}")
            return None
    
    def parse_pathway(self, raw_result: dict) -> pd.DataFrame:
        """
        Parse the pathway data.
        Args:
            raw_result (dict): Raw pathway data.
        Returns:
            pd.DataFrame: Parsed pathway data.
        """
        if not raw_result:
            return pd.DataFrame()
        
        # Convert to DataFrame
        df = pd.json_normalize(raw_result)
        
        # Rename columns
        df.rename(columns={"stId": "pathway_id", "displayName": "pathway_name"}, inplace=True)
        
        return df

In [4]:
instance = ReactomeInstance()
respponse = instance.get_pathway("R-DME-1834941")
df = instance.parse_pathway(respponse)
df

Unnamed: 0,dbId,pathway_name,pathway_id,stIdVersion,isInDisease,isInferred,name,releaseDate,speciesName,category,className,schemaClass,hasDiagram,hasEHLD
0,10790592,STING binds cyclic GMP-AMP,R-DME-3244643,R-DME-3244643.1,False,True,[STING binds cyclic GMP-AMP],2025-03-26,Drosophila melanogaster,omitted,Reaction,BlackBoxEvent,,
1,10819344,"DDX41 binds bacterial c-di-AMP, c-di-GMP",R-DME-9013869,R-DME-9013869.1,False,True,"[DDX41 binds bacterial c-di-AMP, c-di-GMP ]",2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,
2,10790177,STING dimerization,R-DME-3134800,R-DME-3134800.1,False,True,[STING dimerization],2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,
3,10788129,STING binds c-di-GMP,R-DME-2396009,R-DME-2396009.1,False,True,[STING binds c-di-GMP],2025-03-26,Drosophila melanogaster,omitted,Reaction,BlackBoxEvent,,
4,10829117,STAT6-mediated induction of chemokines,R-DME-3249367,R-DME-3249367.1,False,True,[STAT6-mediated induction of chemokines],2025-03-26,Drosophila melanogaster,,Pathway,Pathway,False,False
5,10790821,STING recruits TBK1 and STAT6,R-DME-3249378,R-DME-3249378.1,False,True,[STING recruits TBK1 and STAT6],2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,
6,10790823,p-S407-STAT6 is phosphorylated at Tyr641,R-DME-3249379,R-DME-3249379.1,False,True,[p-S407-STAT6 is phosphorylated at Tyr641],2025-03-26,Drosophila melanogaster,transition,Reaction,Reaction,,
7,10790813,TBK1 phosphorylates STAT6 at Ser407,R-DME-3249371,R-DME-3249371.1,False,True,[TBK1 phosphorylates STAT6 at Ser407],2025-03-26,Drosophila melanogaster,transition,Reaction,Reaction,,
8,10790798,"p-S407,Y641-STAT6 dimer migrates to the nucleus",R-DME-3249370,R-DME-3249370.1,False,True,"[p-S407,Y641-STAT6 dimer migrates to the nucleus]",2025-03-26,Drosophila melanogaster,transition,Reaction,Reaction,,
9,10790815,"p-S407,Y641-STAT6 is dimerized",R-DME-3249372,R-DME-3249372.1,False,True,"[p-S407,Y641-STAT6 is dimerized]",2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,
