In [9]:
from src.uniprot import UniprotInterface
import pandas as pd

In [10]:
def replace_char_at_index(s, i, new_char):
    if i < 0 or i >= len(s):
        raise IndexError("Index out of range.")
    return s[:i] + new_char + s[i+1:]

In [11]:
ids = ["Q75UA4"]
from_db = 'UniProtKB_AC-ID'
to_db = 'UniProtKB'
disease = "CRC"

In [12]:
downloader = UniprotInterface()

job_id = downloader.submit_id_mapping(from_db=from_db, to_db=to_db, ids=ids)

In [None]:
if downloader.check_id_mapping_results_ready(job_id):
    link = downloader.get_id_mapping_results_link(job_id)
    results = downloader.get_id_mapping_results_search(link)

Fetched: 1 / 1


In [None]:
with open("results.json", "w") as f:
    json.dump(results, f)

In [None]:
results['results'][0]['to']['sequence']['value']

'MKFGKFVLLAASTALAVVGLGGPAAADSTPQAQPSIIGGSNATSGPWAARLFVNGRQNCTATIIAPQYILTAKHCVSSSGTYTFRIGSLDQTSGGTMATGSTITRYPGSADLAIVRLTTSVNATYSPLGSVGDVSVGQNVSVYGWGATSQCGSEINCQSRYLKVATVRVNSISCSDYTGGVAVCANRVNGITAGGDSGGPMFASGRQVGVASTSDRVNNTAYTNITRYRSWISQVAGV'

In [None]:
for result in results['results']:
    print(result['from'])

Q75UA4


In [None]:
export_data = []
sequence = results['results'][0]['to']['sequence']['value']
for feature in results['results'][0]['to']['features']:
    row = []
    if feature['type'] == 'Natural variant' and disease in feature['description']:     
        row.append(feature['featureId'])
        location_start = feature['location']['start']['value']
        location_end = feature['location']['end']['value']
        if location_start == location_end:
            row.append(location_start)
            original_sequence = feature['alternativeSequence']['originalSequence']
            new_sequence = feature['alternativeSequence']['alternativeSequences'][0]
            row.append(f"{original_sequence}->{new_sequence}")
            row.append(replace_char_at_index(sequence, int(location_start)-1, new_sequence))
        else:
            row.append(f"{location_start}-{location_end}")
            row.append("missing")
            row.append(sequence[:int(location_start)-1] + sequence[int(location_end)-1:])
        export_data.append(row)
export_data

In [None]:
df = pd.DataFrame(export_data, columns=["variant id", "position", "change", "sequence"])
df

In [None]:
df.to_csv("results.csv", index=False)

In [None]:
result = results['results'][0]
for reference in result['to']['references']:
    print(reference['citation']['citationCrossReferences'])

[{'database': 'PubMed', 'id': '11133465'}, {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}]
[{'database': 'PubMed', 'id': '16237016'}, {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}]


In [None]:
references_list = []
result = results['results'][0]                 

try:
    for r in result['to']['references']:
        tmp = {}
        tmp["citacionCrossReferences"] = r['citation']['citationCrossReferences']
        tmp.update({"title": r['citation']['title']})
        references_list.append(tmp)
except KeyError:
    pass

In [None]:
references_list

[{'citacionCrossReferences': [{'database': 'PubMed', 'id': '11133465'},
   {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}],
  'title': 'Purification and characterization of an extracellular poly(L-lactic acid) depolymerase from a soil isolate, Amycolatopsis sp. strain K104-1.'},
 {'citacionCrossReferences': [{'database': 'PubMed', 'id': '16237016'},
   {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}],
  'title': 'Gene cloning and molecular characterization of an extracellular poly(L-lactic acid) depolymerase from Amycolatopsis sp. strain K104-1.'}]

## Blast

In [None]:
import os, argparse
import shutil
import subprocess
import tarfile
from pathlib import Path
from urllib.request import urlopen
import re
from typing import List

import pandas as pd

DB_DIR = os.path.join("scripts", "db")
BLAST_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/"
UNIPROT_BASE_URL = "https://ftp.uniprot.org/pub/databases/uniprot/current_release"
BLAST_DIR = Path("blast_bin")
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref100/uniref.xsd
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz

databases = {
    "uniprotkb_reviewed": "knowledgebase/complete/uniprot_sprot",
    "uniprotkb_unreviewed": "knowledgebase/complete/uniprot_trembl",
    "uniref100": "uniref/niref100/uniref100",
    "uniref90": "uniref/uniref90/uniref90",
    "uniref50": "uniref/uniref50/uniref50",
}

def download_uniprot_database(db_name: str, extension: str = "xml"):
    """ Download a Uniprot database from the Uniprot FTP server.
    Args:
        db_name (str): Name of the database to download.
        extension (str): File extension of the database. Default is "xml".
    """

    if db_name not in databases:
        raise ValueError(f"Database {db_name} is not supported. Supported databases are: {', '.join(databases.keys())}.")
    
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    
    if not os.path.exists(db_path):
        os.makedirs(DB_DIR, exist_ok=True)
        url = f"{UNIPROT_BASE_URL}/{databases[db_name]}.{extension}.gz"
        os.system(f"wget {url} -O {db_path}.gz")
        print(f"Unzipping {db_path}...")
        subprocess.run(["gunzip", db_path], check=True)
    else:
        print(f"Database {db_name} already exists at {db_path}.")

def get_latest_version_url():
    """Retrieve the latest BLAST+ tarball URL from the NCBI FTP site."""
    with urlopen(BLAST_BASE_URL) as response:
        html = response.read().decode("utf-8")
    # Look for something like: ncbi-blast-2.16.0+-x64-linux.tar.gz
    match = re.search(r'ncbi-blast-(\d+\.\d+\.\d+\+)-x64-linux\.tar\.gz', html)
    if match:
        version = match.group(1)
        tar_name = f"ncbi-blast-{version}-x64-linux.tar.gz"
        return version, BLAST_BASE_URL + tar_name
    else:
        raise RuntimeError("Could not find the latest BLAST version from NCBI.")

def is_blast_installed():
    """Check if 'blastp' is available in the system PATH."""
    try:
        subprocess.run(["blastp", "-version"], check=True, stdout=subprocess.DEVNULL)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False


def download_and_extract_blast(version: str, url: str):
    """Download and extract the BLAST+ tarball."""
    tarball_name = url.split("/")[-1]
    if not Path(tarball_name).exists():
        print(f"Downloading BLAST+ {version}...")
        subprocess.run(["wget", url], check=True)

    print("Extracting BLAST+...")
    with tarfile.open(tarball_name, "r:gz") as tar:
        tar.extractall(BLAST_DIR)
    print(f"BLAST extracted to: {BLAST_DIR.resolve()}")


def get_local_blastp_path(version: str):
    """Return the path to local blastp binary."""
    return BLAST_DIR / f"ncbi-blast-{version}" / "bin" / "blastp"


def check_blast():
    """Ensure BLAST is installed. Return path to `blastp` binary."""
    if is_blast_installed():
        print("System-wide BLAST is installed.")
        return shutil.which("blastp")
    else:
        version, url = get_latest_version_url()
        local_blastp = get_local_blastp_path(version)
        if not local_blastp.exists():
            print(f"BLAST {version} not found locally. Installing...")
            BLAST_DIR.mkdir(exist_ok=True)
            download_and_extract_blast(version, url)
        else:
            print(f"Using already downloaded BLAST {version}.")
        return str(local_blastp)

def make_blast_database(db_name: str, db_type: str = "prot", extension: str = "xml"):
    """Create a BLAST database from the Uniprot database."""
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    if not os.path.exists(db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {db_path}. Please download it first.")
    
    # Check if the database is already created
    blast_db_path = os.path.join(DB_DIR, db_name)
    extensions = [".pdb", ".phr", ".pin", ".psq", ".pot", ".psq", ".ptf", ".pto"]
    makedb = False
    # For all extensions check if exists if there is one failing makedb again
    for ext in extensions:
        if not os.path.exists(blast_db_path + "/db" + ext):
            makedb = True
            break
    if makedb:
        print(f"Creating BLAST database for {db_name}...")
        blast_db_cmd = [
            "makeblastdb",
            "-in", db_path,
            "-dbtype", db_type,
            "-out", os.path.join(DB_DIR, db_name) + "/db",
        ]
    
        subprocess.run(blast_db_cmd, check=True)
        print(f"BLAST database created at: {os.path.join(DB_DIR, databases[db_name])}")
    else:
        print(f"BLAST database already exists at {blast_db_path}. No need to create it again.")

def run_blast(sequences: List[str], db_name: str, blast_type: str = "blastp", evalue: float = 0.001):
    """Run BLAST search."""
    blast_db_path = os.path.join(DB_DIR, db_name)
    if not os.path.exists(blast_db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {blast_db_path}. Please download it first.")

    # Make tmp directory if it does not exist
    os.makedirs("tmp", exist_ok=True)

    # Write sequences to a temporary file
    with open("tmp/sequences.fasta", "w") as f:
        for i, seq in enumerate(sequences):
            f.write(f">{i}\n{seq}\n")
    
    blast_cmd = [
        blast_type,
        "-query", "tmp/sequences.fasta",
        "-db", blast_db_path + "/db",
        "-outfmt", "6",
        "-evalue", str(evalue),
    ]
    
    print(f"Running BLAST search...")
    with open("tmp/blast_results.txt", "w") as f:
        subprocess.run(blast_cmd, stdout=f, check=True)
    print(f"BLAST results saved to tmp/blast_results.txt")
    # Clean up temporary file
    os.remove("tmp/sequences.fasta")

def parse_blast_results(file_path: str, identity_threshold: float = 90.0):
    """Parse BLAST results from a file."""
    with open(file_path, "r") as f:
        results = f.readlines()
    
    parsed_results = []
    for line in results:
        fields = line.strip().split("\t")
        identity = float(fields[2])
        if identity >= identity_threshold:
            parsed_results.append({
                "query": fields[0],
                "subject": fields[1],
                "identity": fields[2],
                "alignment_length": fields[3],
                "evalue": fields[4],
                "bit_score": fields[5],
            })
    
    return parsed_results

In [None]:
df = pd.read_csv("data/test.csv")
sequences = df["sequences"].dropna().unique().tolist()
    
download_uniprot_database("uniprotkb_reviewed", "fasta")
    
blastp_path = check_blast()
print(f"Using blastp at: {blastp_path}")

make_blast_database("uniprotkb_reviewed", extension="fasta")

run_blast(sequences, "uniprotkb_reviewed", blast_type="blastp", evalue=0.0001)

results = parse_blast_results("tmp/blast_results.txt")

# Convert to DataFrame
sequences_df = pd.DataFrame(sequences, columns=["sequences"])
sequences_df["id"] = sequences_df.index

sequences_df

Database uniprotkb_reviewed already exists at scripts/db/uniprotkb_reviewed.fasta.
System-wide BLAST is installed.
Using blastp at: /home/diego/micromamba/envs/bioseqdownloader/bin/blastp
BLAST database already exists at scripts/db/uniprotkb_reviewed. No need to create it again.
Running BLAST search...
BLAST results saved to tmp/blast_results.txt


Unnamed: 0,sequences,id
0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,0
1,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,1
2,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,2


In [None]:
df_blast = pd.DataFrame(results)

df_blast = df_blast.rename(columns={"query": "id", "subject": "subject_id"})
df_blast["id"] = df_blast["id"].astype(int)
df_blast = df_blast.merge(sequences_df, on="id", how="left")
df_blast = df_blast.drop(columns=["id"])
df_blast = df_blast.rename(columns={"sequences": "sequence"})

# Separate subject into source, accession, entry_name
df_blast["source"] = df_blast["subject_id"].apply(lambda x: x.split("|")[0])
df_blast["accession"] = df_blast["subject_id"].apply(lambda x: x.split("|")[1])
df_blast["entry_name"] = df_blast["subject_id"].apply(lambda x: x.split("|")[2])
df_blast = df_blast.drop(columns=["subject_id"])


In [None]:
df_blast

Unnamed: 0,identity,alignment_length,evalue,bit_score,sequence,source,accession,entry_name
0,100.0,438,0,0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,sp,Q6GZX2,003R_FRG3G
1,100.0,180,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q197F2,008L_IIV3
2,100.0,50,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q6GZW6,009L_FRG3G
3,100.0,345,0,0,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,sp,Q6GZW6,009L_FRG3G


## GO

In [None]:
from src.description_go import *
import os, ast
import pandas as pd
from tqdm import tqdm

In [None]:
DOCKER_IMAGE_NAME = "metastudent"
DOCKER_CONTAINER_NAME = "metastudent_container"
HOST_INPUT_FILE = os.path.abspath("tmp/sequences.fasta")
HOST_OUTPUT_DIR = os.path.abspath("tmp/")
CONTAINER_INPUT_FILE = "/app/input.fasta"
CONTAINER_OUTPUT_FILE = "/app/output.result"


print("[DESCRIPTION_GO] Getting Gen Ontology")
tqdm.pandas()

if not check_dependencies(DOCKER_IMAGE_NAME):
    print("[DESCRIPTION_GO] Metastudent not found. Installing...")
    install_dependencies(DOCKER_IMAGE_NAME)
else:
    print("[DESCRIPTION_GO] Metastudent found.")

input_df = pd.read_csv("results/umami_uniprot.csv")
obsolete_df = pd.read_csv("scripts/resources/amiGO_data.csv", sep="\t", names=["id_go", "description", "is_obsolete"])
input_df['go_terms'] = input_df['go_terms'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

parsed_df = pd.DataFrame()
if os.path.isfile(f"{HOST_OUTPUT_DIR}/output.BPO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.CCO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.MFO.txt"):
    print("[DESCRIPTION_GO] Metastudent results found.")
    parsed_df = parse_outputs("uniprot_id")

# Filter input_df with go_terms ~= null
input_df_with_go_terms = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) > 0)]
input_df = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) == 0)]

if not input_df_with_go_terms.empty:
    print("[DESCRIPTION_GO] Go terms found in input data.")
    input_df_with_go_terms = input_df_with_go_terms[["uniprot_id", "go_terms"]]
    input_df_with_go_terms = input_df_with_go_terms.explode("go_terms")
    parsed_df = pd.concat(
        [
            parsed_df,
            pd.merge(
                input_df_with_go_terms, 
                obsolete_df, 
                left_on="go_terms", 
                right_on="id_go", 
                how="left"
            )
            .drop(columns=["go_terms"])
            .rename(columns={"id_go": "go"})  
        ]
    )

input_df

[DESCRIPTION_GO] Getting Gen Ontology
Docker version 28.0.0, build f9ced58158
[DESCRIPTION_GO] Metastudent found.
[DESCRIPTION_GO] Go terms found in input data.


Unnamed: 0,uniprot_id,entry_type,protein_name,ec_numbers,organism,taxon_id,sequence,length,go_terms,pfam_ids,references,features,keywords,source_db


In [None]:
if not parsed_df.empty:
    # Check if all sequences have been processed
    parsed_ids = parsed_df["uniprot_id"].unique()
    input_ids = input_df["uniprot_id"].unique()
    if len(parsed_ids) == len(input_ids):
        print("[DESCRIPTION_GO] All sequences have been processed.")
        input_df = pd.DataFrame()
    else:
        input_df = input_df[~input_df["uniprot_id"].isin(parsed_ids)]
        print(f"[DESCRIPTION_GO] {len(input_df)} sequences have not been processed.")

[DESCRIPTION_GO] 0 sequences have not been processed.


In [None]:
os.makedirs(HOST_OUTPUT_DIR, exist_ok=True)
if not input_df.empty:
    print("[DESCRIPTION_GO] Running in batches of 50...")
    for i in tqdm(range(0, len(input_df), 50)):
        run_in_batches(input_df[i:i+50], HOST_OUTPUT_DIR)

In [None]:
test = pd.concat(
    [
        parsed_df,
        parse_outputs("uniprot_id")
    ]
)
    
test = test.sort_values(by="uniprot_id")
test = test.merge(obsolete_df, left_on="go", right_on="id_go", how="left")
test = test.drop(columns=["id_go"])

test

File '/home/diego/Documents/PythonProjects/BioSeqDownloader/tmp/output.BPO.txt' not found.


NameError: name 'exit' is not defined

## Uniprot query

In [None]:
import os
os.chdir("src")

In [None]:
query="organism_name:homo sapiens (human) AND length:[15 TO 30] AND reviewed:true"
fields="accession,protein_name,sequence,ec,lineage,organism_name,xref_pfam,xref_alphafolddb,xref_pdb,go_id"
sort="accession asc"
download=True
format="json"

In [None]:
query="P05067"
fields="accession,xref_interpro"
sort="accession asc"
download=True
format="json"

In [None]:
from src.uniprot import UniprotInterface

uniprot = UniprotInterface()
response = uniprot.submit_stream(
    query=query,
    fields=fields,
    sort=sort,
    include_isoform=True,
    download=download,
    format=format
)

In [None]:
response.json()

{'results': [{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
   'primaryAccession': 'P05067',
   'uniProtKBCrossReferences': [{'database': 'InterPro',
     'id': 'IPR036669',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_Cu-bd_sf'}]},
    {'database': 'InterPro',
     'id': 'IPR008155',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco'}]},
    {'database': 'InterPro',
     'id': 'IPR013803',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_Abeta'}]},
    {'database': 'InterPro',
     'id': 'IPR037071',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_Abeta_sf'}]},
    {'database': 'InterPro',
     'id': 'IPR011178',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_Cu-bd'}]},
    {'database': 'InterPro',
     'id': 'IPR024329',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_E2_domain'}]},
    {'database': 'InterPro',
     'id': 'IPR008154',
     'properties': [{'key': 'EntryName', 'value': 'A

In [None]:
parsed = uniprot.parse_stream_response(
    query=query,
    response=response
)

In [None]:
parsed

Unnamed: 0,query,accession,alphafold_ids,biogrid_ids,brenda_ids,go_terms,interpro_ids,kegg_ids,pdb_ids,pfam_ids,reactome_ids,refseq_ids,string_ids
0,P05067,P05067,[],[],[],[],"[IPR036669, IPR008155, IPR013803, IPR037071, I...",[],[],[],[],[],[]
1,P05067,P05067-10,,,,,,,,,,,
2,P05067,P05067-11,,,,,,,,,,,
3,P05067,P05067-2,,,,,,,,,,,
4,P05067,P05067-3,,,,,,,,,,,
5,P05067,P05067-4,,,,,,,,,,,
6,P05067,P05067-5,,,,,,,,,,,
7,P05067,P05067-6,,,,,,,,,,,
8,P05067,P05067-7,,,,,,,,,,,
9,P05067,P05067-8,,,,,,,,,,,


## Activity search

In [None]:
import os
import pandas as pd

In [None]:
uniprot_search_files = os.listdir("uniprot_search")
uniprot_search_files

['uniprot_celiac-toxic.csv',
 'uniprot_embryotoxic.csv',
 'uniprot_ace-inhibitor.csv',
 'uniprot_anuran-defense.csv',
 'uniprot_campde-inhibitor.csv',
 'uniprot_anti-neurotensive.csv',
 'uniprot_antitrypanosomic.csv',
 'uniprot_anticancer.csv',
 'uniprot_anorectic.csv',
 'uniprot_chemotactic.csv',
 'uniprot_targeting-GP.csv',
 'uniprot_Blood-Brain-Barrier.csv',
 'uniprot_antitumor.csv',
 'uniprot_cytotoxic.csv',
 'uniprot_activating-ubiquitin-mediated-proteolysis.csv',
 'uniprot_protein-kinase-c-inhibitor.csv',
 'uniprot_antihiv.csv',
 'uniprot_antidiabetic.csv',
 'uniprot_calpain-2-inhibitor.csv',
 'uniprot_antileishmania.csv',
 'uniprot_inhibitor.csv',
 'uniprot_antimicrobial.csv',
 'uniprot_antituberculosis.csv',
 'uniprot_antiviral.csv',
 'uniprot_wound-healing.csv',
 'uniprot_targeting-GN.csv',
 'uniprot_hmg-coa-reductase-inhibitor.csv',
 'uniprot_opioid-agonist.csv',
 'uniprot_toxicology.csv',
 'uniprot_antibacterial.csv',
 'uniprot_antibiofilm.csv',
 'uniprot_hypocholesterolemic

In [None]:
# Create an empty list to store the results
activity_data = []

# Iterate over each file in uniprot_search_files
for file in uniprot_search_files:
    # Load the CSV file into a DataFrame
    df = pd.read_csv(f"uniprot_search/{file}")
    
    # Extract the activity name from the file name
    activity = file.replace("uniprot_", "").replace(".csv", "").replace("-", " ").capitalize()
    
    # Count the number of sequences in the DataFrame
    sequence_count = len(df)
    
    # Append the activity and sequence count to the list
    activity_data.append({"activity": activity, "sequence_count": sequence_count})

# Create a DataFrame from the activity data
activity_df = pd.DataFrame(activity_data)

# Sort the DataFrame by sequence count in descending order
activity_df = activity_df.sort_values(by="sequence_count", ascending=False)

# Calculate the total number of sequences
total_sequences = activity_df["sequence_count"].sum()

# Add a row for the total sequences
activity_df = pd.concat([activity_df, pd.DataFrame([{"activity": "Total", "sequence_count": total_sequences}])], ignore_index=True)

# Display the resulting DataFrame
activity_df

Unnamed: 0,activity,sequence_count
0,Binding,411281
1,Inhibitor,18950
2,Surface binding,15663
3,Regulating,11361
4,Antimicrobial,5945
...,...,...
85,Dipeptidyl peptidaseiv,0
86,Antiamnestic,0
87,Edema inducer,0
88,Antiendotoxin,0


In [None]:
anti_activities_df = activity_df[activity_df['activity'].str.startswith('Anti', na=False)]
anti_activities_df

Unnamed: 0,activity,sequence_count
4,Antimicrobial,5945
6,Antiviral,3835
10,Antibacterial,2366
17,Antifungal,1209
19,Antitoxin,832
24,Antitumor,479
25,Anticancer,393
32,Antiparasitic,140
37,Antiangiogenic,86
41,Antioxidative,55


## Alphafold

In [1]:
from src.alphafold import AlphafoldInterface

In [2]:
instance = AlphafoldInterface(
    structures=['pdb'],
    output_dir="results",
)

In [3]:
print(instance.query_usage())

Usage: To fetch predictions, use the UniProt ID as the query.
        Example: 
            - fetch_single("P02666", parse=True)
            - fetch_batch(["P02666", "P12345"], parse=True)

        Also you can download structures by setting the `structures` parameter in the constructor.
        Example:
            - alphafold = AlphafoldInterface(structures=["pdb", "cif"])
            - prediction = alphafold.fetch_single("P02666")

        Available structures to download:
            - pdb: Protein Data Bank format
            - cif: Crystallographic Information File format
            - bcif: Binary Crystallographic Information File format
        

Example fields in the response:
	- entryId: str
	- gene: str
	- sequenceChecksum: str
	- sequenceVersionDate: str
	- uniprotAccession: str
	- uniprotId: str
	- uniprotDescription: str
	- taxId: int
	- organismScientificName: str
	- uniprotStart: int
	- uniprotEnd: int
	- uniprotSequence: str
	- modelCreatedDate: str
	- latestVersion: i

In [4]:
instance.fetch_single(
    query="P02666",
    parse=True
)

[{'entryId': 'AF-P02666-F1',
  'gene': 'CSN2',
  'sequenceChecksum': 'F0BBDD8148A238AE',
  'sequenceVersionDate': '1989-07-01',
  'uniprotAccession': 'P02666',
  'uniprotId': 'CASB_BOVIN',
  'uniprotDescription': 'Beta-casein',
  'taxId': 9913,
  'organismScientificName': 'Bos taurus',
  'uniprotStart': 1,
  'uniprotEnd': 224,
  'uniprotSequence': 'MKVLILACLVALALARELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV',
  'modelCreatedDate': '2022-06-01',
  'latestVersion': 4,
  'allVersions': [2, 3, 4],
  'bcifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.bcif',
  'cifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.cif',
  'paeImageUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-predicted_aligned_error_v4.png',
  'paeDocUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-predicted_aligned_error

In [6]:
instance.fetch_batch(
    queries=["P02666", "Q9TSI0", "P33048", "P11839", "O15552", "P76011"],
    parse=False,
)

[{'entryId': 'AF-P02666-F1',
  'gene': 'CSN2',
  'sequenceChecksum': 'F0BBDD8148A238AE',
  'sequenceVersionDate': '1989-07-01',
  'uniprotAccession': 'P02666',
  'uniprotId': 'CASB_BOVIN',
  'uniprotDescription': 'Beta-casein',
  'taxId': 9913,
  'organismScientificName': 'Bos taurus',
  'uniprotStart': 1,
  'uniprotEnd': 224,
  'uniprotSequence': 'MKVLILACLVALALARELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV',
  'modelCreatedDate': '2022-06-01',
  'latestVersion': 4,
  'allVersions': [2, 3, 4],
  'bcifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.bcif',
  'cifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.cif',
  'paeImageUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-predicted_aligned_error_v4.png',
  'paeDocUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-predicted_aligned_error

In [None]:
instance.save(
    data = result,
    filename = "alphafold2_results",
    extension= "csv"
)

'results/alphafold2_results.csv'

## BioGRID

In [1]:
from src.biogrid import BioGRIDInterface
import os

In [2]:
from dotenv import load_dotenv
load_dotenv()
biogrid_api_key = os.getenv("biogrid_api_key")

In [3]:
instance = BioGRIDInterface(
)

In [4]:
print(instance.query_usage())

Usage: To fetch interactions, use the BioGRID API with the following parameters.
        Example:
            - fetch_single(method="interactions", query={})
        Available methods: interactions

Query Parameters:
	start: 0 (integer) - Start index for pagination
	max: 10000 (integer) - Maximum number of results to return
	interSpeciesExclude: False (boolean) - Include interactions between different species
	selfInteractionsExclude: False (boolean) - If ‘true’, interactions with one interactor will be excluded
	includeEvidence: False (boolean) - If ‘true’, evidence codes will be included in the results
	searchBiogridIds: False (boolean) - If ‘true’, the interactor BIOGRID_ID will be examined for a match with the geneList.
	searchIds: False (boolean) - If ‘true’, the interactor ENTREZ_GENE, ORDERED LOCUS and SYSTEMATIC_NAME (orf) will be examined for a match with the geneList.
	format: json (string) - Format of the response. Options are 'tab1','tab2', 'extendedTab2', 'count', 'json', 

In [9]:
instance.get_dummy(biogrid_api_key, parse=True)

{'interactions': {'interaction_b': 'str',
  'synonyms_a': 'str',
  'synonyms_b': 'str'}}

In [6]:
instance.fetch_single(
    query={
        "accessKey": biogrid_api_key,
        "geneList": ["cdc27", "apc1", "apc2"],
        "taxId": "559292",
    },
    method="interactions",
    parse=True
)

[{'interaction_b': 'APC1',
  'synonyms_a': 'RSI1|TID2|anaphase promoting complex subunit 2|L000003970|L000004348',
  'synonyms_b': 'anaphase promoting complex subunit 1|L000004053'},
 {'interaction_b': 'CDC16',
  'synonyms_a': 'RSI1|TID2|anaphase promoting complex subunit 2|L000003970|L000004348',
  'synonyms_b': 'anaphase promoting complex subunit CDC16|L000000256'},
 {'interaction_b': 'CDC23',
  'synonyms_a': 'RSI1|TID2|anaphase promoting complex subunit 2|L000003970|L000004348',
  'synonyms_b': 'anaphase promoting complex subunit CDC23|L000000261'},
 {'interaction_b': 'CDC27',
  'synonyms_a': 'RSI1|TID2|anaphase promoting complex subunit 2|L000003970|L000004348',
  'synonyms_b': 'APC3|SNB1|anaphase promoting complex subunit CDC27|L000000266'},
 {'interaction_b': 'APC2',
  'synonyms_a': 'APC10|anaphase promoting complex subunit DOC1|L000004350',
  'synonyms_b': 'RSI1|TID2|anaphase promoting complex subunit 2|L000003970|L000004348'},
 {'interaction_b': 'APC1',
  'synonyms_a': 'CKI2|se

In [10]:
instance.fetch_single(
    query={
        "id" : "103",
        "accessKey": biogrid_api_key,
    },
    method="interactions",
    parse=True
)

[{'interaction_b': 'FLNC',
  'synonyms_a': 'JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAPKK1|SEK1|SERK1|SKK1',
  'synonyms_b': 'ABP-280|ABP280A|ABPA|ABPL|FLN2|MFM5|MPD4'}]

In [11]:
instance.fetch_batch(
    queries=[
        {
            "accessKey": biogrid_api_key,
            "geneList": ["P53", "CDK2", "BRCA1"],
        },
        {
            "accessKey": biogrid_api_key,
            "geneList": ["BRCA2", "ATM", "CHEK2"],
        }
    ],
    method="interactions"
)

[{'2368': {'BIOGRID_INTERACTION_ID': 2368,
   'ENTREZ_GENE_A': '672',
   'ENTREZ_GENE_B': '466',
   'BIOGRID_ID_A': 107140,
   'BIOGRID_ID_B': 106956,
   'SYSTEMATIC_NAME_A': '-',
   'SYSTEMATIC_NAME_B': '-',
   'OFFICIAL_SYMBOL_A': 'BRCA1',
   'OFFICIAL_SYMBOL_B': 'ATF1',
   'SYNONYMS_A': 'BRCAI|BRCC1|BROVCA1|FANCS|IRIS|PNCA4|PPP1R53|PSCP|RNF53',
   'SYNONYMS_B': 'EWS-ATF1|FUS/ATF-1|TREB36',
   'EXPERIMENTAL_SYSTEM': 'Two-hybrid',
   'EXPERIMENTAL_SYSTEM_TYPE': 'physical',
   'PUBMED_AUTHOR': 'Houvras Y (2000)',
   'PUBMED_ID': 10945975,
   'ORGANISM_A': 9606,
   'ORGANISM_B': 9606,
   'THROUGHPUT': 'Low Throughput',
   'QUANTITATION': '-',
   'MODIFICATION': '-',
   'ONTOLOGY_TERMS': {},
   'QUALIFICATIONS': '-',
   'TAGS': '-',
   'SOURCEDB': 'BIOGRID'},
  '2398': {'BIOGRID_INTERACTION_ID': 2398,
   'ENTREZ_GENE_A': '672',
   'ENTREZ_GENE_B': '4436',
   'BIOGRID_ID_A': 107140,
   'BIOGRID_ID_B': 110573,
   'SYSTEMATIC_NAME_A': '-',
   'SYSTEMATIC_NAME_B': '-',
   'OFFICIAL_SYMBOL_A'

## Brenda

In [1]:
from src.brenda import BrendaInstance
import os

In [2]:
from dotenv import load_dotenv
load_dotenv()
brenda_email = os.getenv("brenda_email")
brenda_password = os.getenv("brenda_password")


In [3]:
instance = BrendaInstance(
    email=brenda_email,
    password=brenda_password,
)

In [23]:
instance.fetch_single(
    query=
        {
            "ecNumber": "1.1.1.10",
            "organism": "Homo sapiens",
            #"kcatKmValue": "0.5"
        }, 
    operation="getKmValue",
)

[{'literature': [656049],
  'substrate': 'NADP+',
  'kmValue': '0.001',
  'kmValueMaximum': None,
  'commentary': 'pH 7.0, 25?C',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.10',
  'ligandStructureId': 6},
 {'literature': [656049],
  'substrate': 'NADPH',
  'kmValue': '0.002',
  'kmValueMaximum': None,
  'commentary': 'pH 7.0, 25?C',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.10',
  'ligandStructureId': 5},
 {'literature': [657322],
  'substrate': 'NADPH',
  'kmValue': '0.002',
  'kmValueMaximum': None,
  'commentary': 'pH 7.0, 25?C, wild-type enzyme',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.10',
  'ligandStructureId': 5},
 {'literature': [656049],
  'substrate': 'diacetyl',
  'kmValue': '0.077',
  'kmValueMaximum': None,
  'commentary': 'pH 7.0, 25?C',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.10',
  'ligandStructureId': 603},
 {'literature': [657322],
  'substrate': 'diacetyl',
  'kmValue': '0.077',
  'kmValueMaximum': None,
  'commentary': 'pH 7.0,

In [7]:
instance.fetch_single(
    query=
        {
            #"ecNumber": "1.1.1.10",
            "organism": "Homo sapiens",
            "temperatureRange": "25-37"
        }, 
    operation="getTemperatureRange",
)

[{'temperatureRangeMaximum': '37',
  'literature': [699917],
  'commentary': 'the activity increases gradually as the temperature increases from 25 to 37?C, and then decreases significantly at 45?C compared with that at 37?C',
  'organism': 'Homo sapiens',
  'temperatureRange': '25',
  'ecNumber': '1.1.1.188'},
 {'temperatureRangeMaximum': '65',
  'literature': [485618],
  'commentary': None,
  'organism': 'Homo sapiens',
  'temperatureRange': '25',
  'ecNumber': '2.1.1.77'},
 {'temperatureRangeMaximum': '45',
  'literature': [489533],
  'commentary': 'less than 50% of maximal activity above and below',
  'organism': 'Homo sapiens',
  'temperatureRange': '25',
  'ecNumber': '2.4.1.38'},
 {'temperatureRangeMaximum': '45',
  'literature': [489533],
  'commentary': 'less than 50% of maximal activity above and below',
  'organism': 'Homo sapiens',
  'temperatureRange': '25',
  'ecNumber': '2.4.1.90'},
 {'temperatureRangeMaximum': '37',
  'literature': [661066],
  'commentary': 'highest act

In [None]:
print(instance.query_usage())

Usage: To fetch data from BRENDA, use the following parameters.
        Example:
            - fetch(query={}, operations=["getKmValue", "getIc50Value"])
        Available operations: getKmValue, getIc50Value, getKcatKmValue, getKiValue, getPhRange, getPhOptimum, getPhStability, getCofactor, getTemperatureOptimum, getTemperatureStability, getTemperatureRange

For more information about each operation, please refer to the BRENDA documentation.
Or use `show_operation({operation_name})` to see the parameters required for each operation.


In [None]:
print(instance.show_operation("getTemperatureOptimum"))

Parameters for getTemperatureOptimum: ecNumber, temperatureOptimum, temperatureOptimumMaximum, commentary, organism, literature


In [None]:
instance.get_dummy()

{'getKmValue': {'literature': 'list(int)',
  'substrate': 'str',
  'kmValue': 'str',
  'kmValueMaximum': 'NoneType',
  'commentary': 'NoneType',
  'organism': 'str',
  'ecNumber': 'str',
  'ligandStructureId': 'int'},
 'getIc50Value': {},
 'getKcatKmValue': {},
 'getKiValue': {'kiValueMaximum': 'NoneType',
  'literature': 'list(int)',
  'kiValue': 'str',
  'commentary': 'NoneType',
  'inhibitor': 'str',
  'organism': 'str',
  'ecNumber': 'str',
  'ligandStructureId': 'int'},
 'getPhRange': {},
 'getPhOptimum': {},
 'getPhStability': {},
 'getCofactor': {'literature': 'list(int)',
  'commentary': 'str',
  'ecNumber': 'str',
  'organism': 'str',
  'cofactor': 'str',
  'ligandStructureId': 'int'},
 'getTemperatureOptimum': {},
 'getTemperatureStability': {},
 'getTemperatureRange': {}}

## Gen Ontology

In [3]:
from src.genontology import GenOntologyInterface

In [4]:
instance = GenOntologyInterface(
    #fields_to_extract = ["goid", "label"]
)

In [3]:
instance.get_dummy()

{'ontology-term': {'goid': 'str',
  'label': 'str',
  'definition': 'str',
  'comment': 'str',
  'creation_date': 'str',
  'synonyms': 'list(str)',
  'relatedSynonyms': 'list(str)',
  'alternativeIds': 'list(str)',
  'xrefs': 'list(str)',
  'subsets': 'list(str)'},
 'go': {'goid': 'str',
  'label': 'str',
  'definition': 'str',
  'comment': 'str',
  'creation_date': 'str',
  'synonyms': 'list(str)',
  'relatedSynonyms': 'list(str)',
  'alternativeIds': 'list(str)',
  'xrefs': 'list(str)',
  'subsets': 'list(str)'}}

In [9]:
instance.fetch_batch(
    method="bioentity-function",
    queries=["GO:0008150", "GO:0006915"],
    option="",
    parse=False
)

[[{'bioentity_label': 'ccdc51',
   'bioentity_name': 'coiled-coil domain containing 51',
   'date': '20060825',
   'assigned_by': 'ZFIN',
   'taxon': 'NCBITaxon:7955',
   'taxon_label': 'Danio rerio',
   'panther_family': 'PANTHER:PTHR28624',
   'panther_family_label': 'coiled-coil domain-containing protein 51 pthr28624',
   'evidence_type': 'ND',
   'evidence': 'ECO:0000307',
   'reference': ['ZFIN:ZDB-PUB-031118-1'],
   'synonym': ['im:7138455', 'zgc:153290'],
   'annotation_extension_class': ['GO:0008150'],
   'annotation_extension_class_label': ['biological_process']},
  {'bioentity_label': 'im:705546',
   'bioentity_name': 'im:705546',
   'date': '20050302',
   'assigned_by': 'ZFIN',
   'taxon': 'NCBITaxon:7955',
   'taxon_label': 'Danio rerio',
   'evidence_type': 'ND',
   'evidence': 'ECO:0000307',
   'reference': ['ZFIN:ZDB-PUB-031118-1'],
   'annotation_extension_class': ['GO:0008150'],
   'annotation_extension_class_label': ['biological_process']},
  {'bioentity_label': 'im:7

In [None]:
instance.fetch_batch(
    method="ontology-term",
    queries=["GO:0008150", "GO:0006915"],
    option=None,
    parse=True
)

In [None]:
instance.fetch_to_dataframe(
    method= "ontology-term",
    query= "GO:0008150", 
    option=None,
    look_for_relationships=True
)

Fetching data from https://api.geneontology.org/api/ontology/term/GO%3A0008150
Fetching data from https://api.geneontology.org/api/ontology/term/GO%3A0008150/graph


Unnamed: 0,goid,label,relationships
0,GO:0008150,biological_process,"[GO:0044848, GO:0044419, GO:0008150, GO:006500..."


## Interpro

In [1]:
from src.interpro import InterproInstance
from typing import List, Dict, Any, Union

In [2]:
instance = InterproInstance()

### Search Interpro Id IPR036669

In [1]:
query = {
    "id": "IPR002223",
    "db" : "InterPro",
}

### Search uniProt id Q29537

In [3]:
query = {
    "db" : "InterPro",
    "modifiers" : {},
    "filters" : [
        {
            "type": "protein",
            "db": "reviewed",
            "value": "Q29537"
        }
    ]
}

In [3]:
query = {
    "db" : "InterPro",
    "modifiers" : {},
    "filters" : [
        {
            "type": "protein",
            "db": "reviewed",
            "value": "P05067"
        }
    ]
}

In [4]:
instance.fetch_single(query=query, method="entry", pages_to_fetch=1, parse=True, config_key="entry")

Parsing data with fields: {'accession': 'proteins.accession', 'length': 'proteins.protein_length', 'organism_id': 'proteins.organism.id', 'protein_source_db': 'proteins.source_database', 'in_alphafold': 'proteins.in_alphafold', 'interpro_accession': 'metadata.accession', 'name': 'metadata.name', 'type': 'metadata.type', 'source_database': 'metadata.source_database', 'integrated': 'metadata.integrated', 'pfam': 'metadata.member_databases.pfam', 'prosite': 'metadata.member_databases.prosite', 'smart': 'metadata.member_databases.smart', 'prints': 'metadata.member_databases.prints', 'panther': 'metadata.member_databases.panther', 'cathgene3d': 'metadata.member_databases.cathgene3d', 'ssf': 'metadata.member_databases.ssf', 'go_identifier': 'metadata.go_terms.identifier', 'go_name': 'metadata.go_terms.name', 'go_category': 'metadata.go_terms.category.name', 'locations': 'proteins.entry_protein_locations'}


[{'accession': 'p05067',
  'length': 770,
  'organism_id': '9606',
  'protein_source_db': 'reviewed',
  'in_alphafold': True,
  'interpro_accession': 'IPR002223',
  'name': 'Pancreatic trypsin inhibitor Kunitz domain',
  'type': 'domain',
  'source_database': 'interpro',
  'integrated': None,
  'pfam': {'PF00014': 'Kunitz/Bovine pancreatic trypsin inhibitor domain'},
  'prosite': None,
  'smart': {'SM00131': 'BPTI/Kunitz family of serine protease inhibitors.'},
  'prints': {'PR00759': 'BASICPTASE'},
  'panther': None,
  'cathgene3d': None,
  'ssf': None,
  'go_identifier': 'GO:0004867',
  'go_name': 'serine-type endopeptidase inhibitor activity',
  'go_category': 'molecular_function',
  'locations': {'fragments': [{'start': 288,
     'end': 342,
     'dc-status': 'CONTINUOUS'}],
   'representative': False,
   'model': None,
   'score': None}},
 {'accession': 'p05067',
  'length': 770,
  'organism_id': '9606',
  'protein_source_db': 'reviewed',
  'in_alphafold': True,
  'interpro_access

### Search in Batches

In [6]:
queries: List[Union[str, Dict[str, Any]]] = [
    {
        "db" : "InterPro",
        "modifiers" : {},
        "filters" : [
            {
                "type": "protein",
                "db": "reviewed",
                "value": "Q29537"
            }
        ]
    },
    {
        "db": "InterPro",
        "modifiers": {},
        "filters": [
            {
                "type": "protein",
                "db": "reviewed",
                "value": "P05067"
            }
        ]
    }
]
instance.fetch_batch(queries=queries, method="entry", pages_to_fetch=1, parse=True, config_key="entry")

Parsing data with fields: {'accession': 'proteins.accession', 'length': 'proteins.protein_length', 'organism_id': 'proteins.organism.id', 'protein_source_db': 'proteins.source_database', 'in_alphafold': 'proteins.in_alphafold', 'interpro_accession': 'metadata.accession', 'name': 'metadata.name', 'type': 'metadata.type', 'source_database': 'metadata.source_database', 'integrated': 'metadata.integrated', 'pfam': 'metadata.member_databases.pfam', 'prosite': 'metadata.member_databases.prosite', 'smart': 'metadata.member_databases.smart', 'prints': 'metadata.member_databases.prints', 'panther': 'metadata.member_databases.panther', 'cathgene3d': 'metadata.member_databases.cathgene3d', 'ssf': 'metadata.member_databases.ssf', 'go_identifier': 'metadata.go_terms.identifier', 'go_name': 'metadata.go_terms.name', 'go_category': 'metadata.go_terms.category.name', 'locations': 'proteins.entry_protein_locations'}
Fetching data from InterPro API with URL: https://www.ebi.ac.uk:443/interpro/api/entry/

[[{'accession': 'q29537',
   'length': 381,
   'organism_id': '9615',
   'protein_source_db': 'reviewed',
   'in_alphafold': True,
   'interpro_accession': 'IPR002117',
   'name': 'p53 tumour suppressor family',
   'type': 'family',
   'source_database': 'interpro',
   'integrated': None,
   'pfam': None,
   'prosite': None,
   'smart': None,
   'prints': {'PR00386': 'P53SUPPRESSR'},
   'panther': {'PTHR11447': 'CELLULAR TUMOR ANTIGEN P53'},
   'cathgene3d': None,
   'ssf': None,
   'go_identifier': ['GO:0003677',
    'GO:0003700',
    'GO:0006355',
    'GO:0006915',
    'GO:0005634'],
   'go_name': ['DNA binding',
    'DNA-binding transcription factor activity',
    'regulation of DNA-templated transcription',
    'apoptotic process',
    'nucleus'],
   'go_category': ['molecular_function',
    'molecular_function',
    'biological_process',
    'biological_process',
    'cellular_component'],
   'locations': {'fragments': [{'start': 3,
      'end': 354,
      'dc-status': 'CONTINUOUS

In [None]:
instance.fetch_to_dataframe(
    [
        {
            "type": "entry",
            "db": "InterPro",
            "entry_integration": "",
            "modifiers": {
                "go_term": "GO:0004867"
            },
            "filter_type": "protein",
            "filter_db": "UniProt",
            "filter_value": "P05067"
        }
    ]
)

17 records found


Unnamed: 0,metadata,proteins
0,"{'accession': 'IPR002223', 'name': 'Pancreatic...","[{'accession': 'p05067', 'protein_length': 770..."
1,"{'accession': 'IPR008154', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
2,"{'accession': 'IPR008155', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
3,"{'accession': 'IPR011178', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
4,"{'accession': 'IPR011993', 'name': 'PH-like do...","[{'accession': 'p05067', 'protein_length': 770..."
5,"{'accession': 'IPR013803', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
6,"{'accession': 'IPR015849', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
7,"{'accession': 'IPR019543', 'name': 'Beta-amylo...","[{'accession': 'p05067', 'protein_length': 770..."
8,"{'accession': 'IPR019744', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
9,"{'accession': 'IPR019745', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."


## KEGG

In [1]:
from src.kegg import KEGGInterface

In [2]:
instance = KEGGInterface()

In [3]:
instance.fetch_single(
    method="get",
    query={
        "entries": ["hsa:10458", "ece:Z5100"]
    },
    parse=True
    #fields_to_extract=["ENTRY", "NAME", "PATHWAY"]
)

[{'entry': '10458             CDS       T01001',
  'name': '(RefSeq) BAR/IMD domain containing adaptor protein 2'},
 {'entry': 'Z5100             CDS       T00044', 'name': '(GenBank) espF'}]

In [None]:
instance.fetch_to_dataframe(
    method="get",
    query=["hsa:10458", "ece:Z5100"]
)

Unnamed: 0,ENTRY,NAME,PATHWAY
0,10458 CDS T01001,(RefSeq) BAR/IMD domain containing adaptor pro...,hsa04520 Adherens junction hsa04810 Regulati...
1,Z5100 CDS T00044,(GenBank) espF,ece05130 Pathogenic Escherichia coli infection


## PDB

In [1]:
from src.proteindatabank import PDBInterface

In [2]:
instance = PDBInterface(
    download_structures=True,
    return_data_list=["rcsb_id", "rcsb_comp_model_provenance", "rcsb_entry_info"],
    output_dir="results"
)

In [3]:
instance.fetch_batch(
    queries=["4HHB", "1A8I", "1A8J", "1A8K", "1A8L", "1A8M"],
    parse=True
)

Info: Structure for 4HHB already exists in pdb format.
Info: Structure for 1A8I already exists in pdb format.
Info: Structure for 1A8J already exists in pdb format.
Info: Structure for 1A8K already exists in pdb format.
Info: Structure for 1A8L already exists in pdb format.
Info: Structure for 1A8M already exists in pdb format.


[{'rcsb_id': '4HHB',
  'model_provenance': None,
  'branched_molecular_weight_minimum': None,
  'resolution_combined': 1.74,
  'experimental_method': 'X-ray',
  'diffrn_resolution_high': 1.74},
 {'rcsb_id': '1A8I',
  'model_provenance': None,
  'branched_molecular_weight_minimum': None,
  'resolution_combined': 1.78,
  'experimental_method': 'X-ray',
  'diffrn_resolution_high': 1.78},
 {'rcsb_id': '1A8J',
  'model_provenance': None,
  'branched_molecular_weight_minimum': None,
  'resolution_combined': 2.7,
  'experimental_method': 'X-ray',
  'diffrn_resolution_high': 2.7},
 {'rcsb_id': '1A8K',
  'model_provenance': None,
  'branched_molecular_weight_minimum': None,
  'resolution_combined': 2.0,
  'experimental_method': 'X-ray',
  'diffrn_resolution_high': 2.0},
 {'rcsb_id': '1A8L',
  'model_provenance': None,
  'branched_molecular_weight_minimum': None,
  'resolution_combined': 1.9,
  'experimental_method': 'X-ray',
  'diffrn_resolution_high': 1.9},
 {'rcsb_id': '1A8M',
  'model_proven

In [4]:
instance.get_dummy()

Info: Structure for 4HHB already exists in pdb format.


{'audit_author.name': 'str',
 'audit_author.pdbx_ordinal': 'int',
 'cell.angle_alpha': 'dict(float)',
 'cell.angle_beta': 'dict(float)',
 'cell.angle_gamma': 'dict(float)',
 'cell.length_a': 'dict(float)',
 'cell.length_b': 'dict(float)',
 'cell.length_c': 'dict(float)',
 'cell.zpdb': 'dict(int)',
 'cell': 'dict',
 'citation.country': 'str',
 'citation.id': 'str',
 'citation.journal_abbrev': 'str',
 'citation.journal_id_astm': 'str',
 'citation.journal_id_csd': 'str',
 'citation.journal_id_issn': 'str',
 'citation.journal_volume': 'str',
 'citation.page_first': 'str',
 'citation.page_last': 'str',
 'citation.pdbx_database_id_doi': 'str',
 'citation.pdbx_database_id_pub_med': 'int',
 'citation.rcsb_authors': 'list(str)',
 'citation.rcsb_is_primary': 'str',
 'citation.rcsb_journal_abbrev': 'str',
 'citation.title': 'str',
 'citation.year': 'int',
 'database2.database_code': 'str',
 'database2.database_id': 'str',
 'database2.pdbx_doi': 'str',
 'database2.pdbx_database_accession': 'str',


In [None]:
instance.fetch_single(
    "4HHB",
    parse=True
)

Info: Downloading 4HHB in pdb format...


{'rcsb_id': '4HHB', 'rcsb_entry_info.experimental_method': 'X-ray'}

## Reactome

In [1]:
from src.reactome import ReactomeInstance

In [2]:
instance = ReactomeInstance()

In [3]:
instance.fetch_single(query="R-HSA-5673001", parse=True, method="data/pathways/low/diagram/entity", option="")

[{'id': 'R-HSA-9648002',
  'display_name': 'RAS processing',
  'class_name': 'Pathway'}]

## Refseq

In [1]:
from src.refseq import RefSeqInterface

In [2]:
features = [
    "GBSeq_locus",
    "GBSeq_length",
    "GBSeq_keywords",
    "GBSeq_feature-table.GBFeature_intervals"
]

In [3]:
instance = RefSeqInterface()

In [None]:
instance.fetch_single(
    ["XP_010804480.1", "XP_010804481.1", "XP_010804482.1"],
    method="popset",
    parse=True,
    #fields_to_extract=features,
)

[{'GBSeq_locus': 'XP_010804480',
  'GBSeq_length': '259',
  'GBSeq_moltype': 'AA',
  'GBSeq_topology': 'linear',
  'GBSeq_division': 'MAM',
  'GBSeq_update-date': '26-JAN-2016',
  'GBSeq_create-date': '30-DEC-2014',
  'GBSeq_definition': 'PREDICTED: beta-casein isoform X1 [Bos taurus]',
  'GBSeq_primary-accession': 'XP_010804480',
  'GBSeq_accession-version': 'XP_010804480.1',
  'GBSeq_other-seqids': ['ref|XP_010804480.1|', 'gi|741930202'],
  'GBSeq_project': 'PRJNA33843',
  'GBSeq_keywords': ['RefSeq'],
  'GBSeq_source': 'Bos taurus (cattle)',
  'GBSeq_organism': 'Bos taurus',
  'GBSeq_taxonomy': 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Laurasiatheria; Cetartiodactyla; Ruminantia; Pecora; Bovidae; Bovinae; Bos',
  'GBSeq_source-db': 'REFSEQ: accession XM_010806178.1',
  'GBSeq_feature-table': [{'GBFeature_key': 'source',
    'GBFeature_location': '1..259',
    'GBFeature_intervals': [{'GBInterval_from': '1',
      'GBInterval_to': '259',
 

## STRING

In [1]:
from src.stringdb import StringInterface

In [2]:
instance = StringInterface()

In [3]:
instance.fetch_single(
    query={
        "identifiers": ["p53"],
    },
    outfmt="json",
    method="interaction_partners",
    parse=True
)

[{'id_a': '7227.FBpp0083753',
  'id_b': '7227.FBpp0110174',
  'name_a': 'p53',
  'name_b': 'tefu',
  'score': 0.985,
  'nscore': 0,
  'fscore': 0,
  'pscore': 0,
  'ascore': 0,
  'escore': 0.6,
  'dscore': 0.9,
  'tscore': 0.659},
 {'id_a': '7227.FBpp0083753',
  'id_b': '7227.FBpp0074047',
  'name_a': 'p53',
  'name_b': 'mei-41',
  'score': 0.962,
  'nscore': 0,
  'fscore': 0,
  'pscore': 0,
  'ascore': 0.071,
  'escore': 0,
  'dscore': 0.9,
  'tscore': 0.627},
 {'id_a': '7227.FBpp0083753',
  'id_b': '7227.FBpp0304253',
  'name_a': 'p53',
  'name_b': 'hpo',
  'score': 0.94,
  'nscore': 0,
  'fscore': 0,
  'pscore': 0,
  'ascore': 0.062,
  'escore': 0.225,
  'dscore': 0.9,
  'tscore': 0.28},
 {'id_a': '7227.FBpp0083753',
  'id_b': '7227.FBpp0080860',
  'name_a': 'p53',
  'name_b': 'lok',
  'score': 0.932,
  'nscore': 0,
  'fscore': 0,
  'pscore': 0,
  'ascore': 0.076,
  'escore': 0.6,
  'dscore': 0.75,
  'tscore': 0.36},
 {'id_a': '7227.FBpp0083753',
  'id_b': '7227.FBpp0076806',
  'nam

In [None]:
instance.fetch_to_dataframe(
    outfmt="json",
    method="interaction_partners",
    params={
        "identifiers": ["p53", "cdk2"],
        "species": 9606,
    }
)

Unnamed: 0,stringId_A,stringId_B,preferredName_A,preferredName_B,ncbiTaxonId,score,nscore,fscore,pscore,ascore,escore,dscore,tscore
0,9606.ENSP00000266970,9606.ENSP00000481380,CDK2,CCNA2,9606,0.999,0,0.003,0.0,0.453,0.999,0.9,0.999
1,9606.ENSP00000266970,9606.ENSP00000413720,CDK2,CDKN1C,9606,0.999,0,0.000,0.0,0.085,0.859,0.9,0.970
2,9606.ENSP00000266970,9606.ENSP00000228872,CDK2,CDKN1B,9606,0.999,0,0.000,0.0,0.085,0.999,0.9,0.999
3,9606.ENSP00000266970,9606.ENSP00000429089,CDK2,CCNE2,9606,0.999,0,0.000,0.0,0.200,0.995,0.9,0.996
4,9606.ENSP00000266970,9606.ENSP00000255465,CDK2,CCNA1,9606,0.999,0,0.000,0.0,0.292,0.942,0.9,0.999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,9606.ENSP00000269305,9606.ENSP00000310928,TP53,PPARD,9606,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
2957,9606.ENSP00000269305,9606.ENSP00000419945,TP53,ERVW-1,9606,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
2958,9606.ENSP00000269305,9606.ENSP00000462980,TP53,TAF4B,9606,0.400,0,0.000,0.0,0.000,0.000,0.4,0.000
2959,9606.ENSP00000269305,9606.ENSP00000431885,TP53,TYK2,9606,0.400,0,0.000,0.0,0.083,0.000,0.0,0.372


### Show graph test

In [None]:
import json

# Convierte a formato Cytoscape.js
def convert_to_cytoscape_format(graph_json):
    elements = []

    for node in graph_json['nodes']:
        elements.append({
            "data": {
                "id": node["id"],
                "label": node["lbl"]
            }
        })

    for edge in graph_json['edges']:
        elements.append({
            "data": {
                "source": edge["sub"],
                "target": edge["obj"],
                "label": edge["pred"]
            }
        })

    return elements

# Genera el HTML
def create_cytoscape_html(graph_json):
    elements = convert_to_cytoscape_format(graph_json)

    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Grafo GO Interactivo</title>
        <script src="https://unpkg.com/cytoscape@3.19.0/dist/cytoscape.min.js"></script>
        <script src="https://unpkg.com/dagre@0.8.5/dist/dagre.min.js"></script>
        <script src="https://unpkg.com/cytoscape-dagre@2.3.2/cytoscape-dagre.js"></script>
        <style>
            html, body {{
                margin: 0;
                padding: 0;
                height: 100%;
                width: 100%;
                font-family: Arial, sans-serif;
            }}
            #cy {{
                height: 100%;
                width: 100%;
                display: block;
            }}
        </style>
    </head>
    <body>
        <div id="cy"></div>
        <script>
            cytoscape.use(cytoscapeDagre);

            var cy = cytoscape({{
                container: document.getElementById('cy'),
                elements: {json.dumps(elements)},
                style: [
                    {{
                        selector: 'node',
                        style: {{
                            'shape': 'roundrectangle',
                            'label': 'data(label)',
                            'text-valign': 'center',
                            'text-halign': 'center',
                            'background-color': '#AED6F1',
                            'color': '#1B2631',
                            'font-size': '8px',
                            'width': 'label',
                            'height': 'label',
                            'padding': '6px',
                            'border-width': 1,
                            'border-color': '#2980B9'
                        }}
                    }},
                    {{
                        selector: 'edge',
                        style: {{
                            'width': 2,
                            'label': 'data(label)',
                            'line-color': '#B2BABB',
                            'target-arrow-color': '#B2BABB',
                            'target-arrow-shape': 'triangle',
                            'curve-style': 'bezier',
                            'font-size': '7px',
                            'color': '#5D6D7E',
                            'text-background-opacity': 1,
                            'text-background-color': '#fff',
                            'text-background-shape': 'roundrectangle',
                            'text-background-padding': 2
                        }}
                    }}
                ],
                layout: {{
                    name: 'dagre',
                    rankDir: 'TB',
                    nodeSep: 30,
                    edgeSep: 10,
                    rankSep: 50
                }},
                zoomingEnabled: true,
                userZoomingEnabled: true,
                boxSelectionEnabled: false
            }});
        </script>
    </body>
    </html>
    """
    return html

# Guardar archivo HTML
def save_graph_to_html(graph_json, output_file='cytoscape_graph.html'):
    html = create_cytoscape_html(graph_json)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f"✅ Grafo guardado en: {output_file}")

# Ejecutar
save_graph_to_html(instance.parse(response)["topology_graph_json"], 'mi_grafo.html')

✅ Grafo guardado en: mi_grafo.html


# Uniprot + Databases union

In [37]:
import ast
import os

from dotenv import load_dotenv
import pandas as pd 
from src.uniprot import UniprotInterface
from src.alphafold import AlphafoldInterface
from src.biogrid import BioGRIDInterface
from src.interpro import InterproInstance

In [38]:
load_dotenv()

# Retrieve the BioGRID API key
biogrid_api_key = os.getenv("biogrid_api_key")

In [39]:
# Argparse arguments
output = "results/output.csv"
#query = "antimicrobial peptide"
query = "organism_name:homo sapiens (human) AND length:[15 TO 30] AND reviewed:true"
fields = "accession,protein_name,sequence,ec,lineage,organism_name,gene_primary"
out_db = "alphafold"
sort = "accession asc"
fmt = "json"

include_isoform = False
download = False

# Outside db argument
outside_db = {
    "alphafold": "xref_alphafolddb",
    "biogrid": "xref_biogrid",
    "brenda": "xref_brenda",
    "go": "go_id",
    "interpro": "xref_interpro",
    "kegg": "xref_kegg",
    "pfam": "xref_pfam",
    "pdb": "xref_pdb",
    "reactome": "xref_reactome",
    "refseq": "xref_refseq",   
    "string": "xref_string",
}

interfaces = {
    "alphafold": AlphafoldInterface(
        structures=['pdb'],
        output_dir="results",
        fields_to_extract={"entry": "entryId", "gene": "gene", "tax_id": "taxId"},
    ),
    "biogrid": BioGRIDInterface(),
    "interpro": InterproInstance(
        fields_to_extract={
            "accession": "protein.accession",
            "length": "proteins.protein_length",
            "organism_id": "proteins.organism.id",
            "protein_source_db": "proteins.source_database",
            "in_alphafold": "proteins.in_alphafold",
            "ipr_id": "metadata.ipr_id",
            "name": "metadata.name",
            "type": "metadata.type",
            "interaction_source_db": "metadata.source_database",
            "integrated": "metadata.integrated",
            "pfam": "metadata.member_databases.pfam",
            "prosite": "metadata.member_databases.prosite",
            "smart": "metadata.member_databases.smart",
            "prints": "metadata.member_databases.prints",
            "panther": "metadata.member_databases.panther",
            "cathgene3d": "metadata.member_databases.cathgene3d",
            "ssf": "metadata.member_databases.ssf",
            "go_identifier": "metadata.go_terms.identifier",
            "go_name": "metadata.go_terms.name",
            "go_category": "metadata.go_terms.category.name",
            "fragments_start": "entry_protein_locations.fragments.start",
            "fragments_end": "entry_protein_locations.fragments.end",
            "fragments_dc_status": "entry_protein_locations.fragments.dc-status",
            "representative": "entry_protein_locations.representative",
            "model": "entry_protein_locations.model",
            "score": "entry_protein_locations.score"
        }
    )
}

### DOwnload from UNiprot

In [None]:
instance = UniprotInterface()
print(f"Downloading data using query {query} and fields {fields}")

out_db_fields = [outside_db[db] for db in out_db.split(",") if db in outside_db]

response = instance.submit_stream(
    query=query,
    fields=fields + "," + ",".join(out_db_fields),
    sort=sort,
    include_isoform=include_isoform,
    download=download,
    format=fmt
)

if not response:
    raise ValueError("No response received from the Uniprot API. Aborting the process.")

# Parsear y guardar como CSV
print("Parsing results...")
export_df = instance.parse_stream_response(
    query=query,
    response=response
)

export_df.to_csv(output, index=False)

Downloading data using query organism_name:homo sapiens (human) AND length:[15 TO 30] AND reviewed:true and fields accession,protein_name,sequence,ec,lineage,organism_name,gene_primary
Parsing results...


### Download external db

In [10]:
export_df = pd.read_csv("results/output.csv")
export_df.head()

Unnamed: 0,query,accession,protein_name,organism_name,gene_primary,taxon_id,ineage,sequence,length,alphafold_ids,biogrid_ids,brenda_ids,go_terms,interpro_ids,kegg_ids,pdb_ids,pfam_ids,reactome_ids,refseq_ids,string_ids
0,organism_name:homo sapiens (human) AND length:...,A0A075B6S0,T cell receptor gamma joining 1,Homo sapiens,['TRGJ1'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",NYYKKLFGSGTTLVVT,16,['A0A075B6S0'],[],[],[],[],[],[],[],[],[],[]
1,organism_name:homo sapiens (human) AND length:...,A0A075B6Y3,T cell receptor alpha joining 3,Homo sapiens,['TRAJ3'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",GYSSASKIIFGSGTRLSIRP,20,['A0A075B6Y3'],[],[],[],[],[],[],[],[],[],[]
2,organism_name:homo sapiens (human) AND length:...,A0A075B6Y9,T cell receptor alpha joining 42,Homo sapiens,['TRAJ42'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",YGGSQGNLIFGKGTKLSVKP,20,['A0A075B6Y9'],[],[],[],[],[],[],[],[],[],[]
3,organism_name:homo sapiens (human) AND length:...,A0A075B700,T cell receptor alpha joining 31,Homo sapiens,['TRAJ31'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",NNNARLMFGDGTQLVVKP,18,['A0A075B700'],[],[],[],[],[],[],[],[],[],[]
4,organism_name:homo sapiens (human) AND length:...,A0A075B706,T cell receptor delta joining 1,Homo sapiens,['TRDJ1'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",TDKLIFGKGTRVTVEP,16,['A0A075B706'],[],[],[],[],[],[],[],[],[],[]


In [None]:
def parse_ids(df, db_name):
    raw_ids = df.get(f"{db_name}_ids", pd.Series(dtype=str)).dropna().unique().tolist()
    ids = []
    for entry in raw_ids:
        parsed = ast.literal_eval(entry) if isinstance(entry, str) and entry.startswith("[") else [entry]
        ids.extend(parsed)

    # Cleaning: remove "[]"
    return [id for id in ids if id != "[]"]

# Fetch data from alphafold given a ID from xref_alphafolddb
def fetch_alphafold(ids, interface):
    if not ids:
        return []
    # TODO: Remove Limit
    return interface.fetch_batch(queries=ids, parse=True)

# Fetch data from biogrid given a list of IDs from xref_biogrid or gene_primary and taxon_id fields
def fetch_biogrid(df, ids, interface):
    queries = []

    # Part 1: by ID if they exist
    if ids:
        queries.extend([{
            "id": id_,
            "accessKey": biogrid_api_key
        } for id_ in ids])

    # Part 2: build queries by taxId and geneList
    tmp_df = df[df["biogrid_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    tmp_df = tmp_df.dropna(subset=["gene_primary", "taxon_id"])
    tmp_df = tmp_df[["gene_primary", "taxon_id"]].drop_duplicates()

    tmp_df["gene_primary"] = tmp_df["gene_primary"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
    )

    tmp_df = tmp_df.explode("gene_primary")
    grouped = tmp_df.groupby("taxon_id")["gene_primary"].agg(list).reset_index()

    queries.extend([
        {
            "accessKey": biogrid_api_key,
            "geneList": row["gene_primary"],
            "taxId": row["taxon_id"],
            "format": "json"
        }
        for _, row in grouped.iterrows()
    ])

    return interface.fetch_batch(
        queries=queries,
        method="interactions",
        parse=True
    )

def fetch_brenda(df, ids, interface):
    return []

# TODO: Interpro retorna muchos none, fijarse en que campos tiene una busqueda
def fetch_interpro(df, interface):
    queries = []

    # Get IDs from xref_interpro with accession
    ids = df[~df["interpro_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    ids.loc[:, 'interpro_ids'] = ids["interpro_ids"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
    )
    ids = ids.explode("interpro_ids").drop_duplicates(subset=["interpro_ids"])

    if not ids.empty:
        queries.extend([{
            "id": id["interpro_ids"],
            "db": "InterPro",
            "filters": {
                "type": "protein",
                "db": "reviewed",
                "value": id["accession"]
            }
        } for _, id in ids.iterrows() if isinstance(id["interpro_ids"], str) and id["interpro_ids"] != "[]"])


    tmp_df = df[df["interpro_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    tmp_df = df[["accession", "taxon_id"]].drop_duplicates()
    tmp_df = tmp_df.dropna(subset=["accession", "taxon_id"])

    if not tmp_df.empty:
        queries.extend([
            {
                "db": "InterPro",
                "modifiers": {},
                "filters": [
                    {
                        "type": "protein",
                        "db": "reviewed",
                        "value": row["accession"]
                    },
                    {
                        "type": "taxonomy",
                        "db": "uniprot",
                        "value": row["taxon_id"]
                    }
                ]

            }
            for _, row in tmp_df.iterrows()
        ])
    
    return interface.fetch_batch(
        queries=queries,
        method="entry",
        pages_to_fetch=1,
        parse=True
    )


# TODO Add a parameter to convert to dataframe.
# fetch_batch(..., parse=True, to_dataframe=True)

def fetch_from_external(db_name, df, interface):
    ids = parse_ids(df, db_name)
    print(f"Fetching from {db_name} using {len(ids)} IDs")

    match db_name:
        case "alphafold":
            return fetch_alphafold(ids, interface)
        case "biogrid":
            return fetch_biogrid(df, ids, interface)
        case "brenda":
            return fetch_brenda(df, ids, interface)
        # Add more cases as needed
        case "interpro":
            return fetch_interpro(df, interface)
        case _:
            raise ValueError(f"Unsupported database: {db_name}")

In [12]:
export_dfs = {}
for db in out_db.split(","):
    if db in interfaces:
        test = fetch_from_external(db, export_df, interfaces[db])
        export_dfs[db] = test

export_dfs["alphafold"]

Fetching from alphafold using 35 IDs


[{'entry': 'AF-A0A075B6S0-F1', 'gene': 'TRGJ1', 'tax_id': 9606},
 {'entry': 'AF-A0A075B6Y3-F1', 'gene': 'TRAJ3', 'tax_id': 9606},
 {'entry': 'AF-A0A075B6Y9-F1', 'gene': 'TRAJ42', 'tax_id': 9606},
 {'entry': 'AF-A0A075B700-F1', 'gene': 'TRAJ31', 'tax_id': 9606},
 {'entry': 'AF-A0A075B706-F1', 'gene': 'TRDJ1', 'tax_id': 9606},
 {'entry': 'AF-A0A0A0MT70-F1', 'gene': 'TRBJ2-6', 'tax_id': 9606},
 {'entry': 'AF-A0A0A0MT87-F1', 'gene': 'TRBJ2-4', 'tax_id': 9606},
 {'entry': 'AF-A0A0A0MT94-F1', 'gene': 'TRBJ2-2', 'tax_id': 9606},
 {'entry': 'AF-A0A0A0MTA7-F1', 'gene': 'TRBJ2-1', 'tax_id': 9606},
 {'entry': 'AF-A0A0B4J200-F1', 'gene': 'TRBJ2-3', 'tax_id': 9606},
 {'entry': 'AF-A0A0C4DH62-F1', 'gene': 'IGHJ1', 'tax_id': 9606},
 {'entry': 'AF-A0A0C5B5G6-F1', 'gene': 'MT-RNR1', 'tax_id': 9606},
 {'entry': 'AF-A0A0J9YWP8-F1', 'gene': 'TRBJ1-3', 'tax_id': 9606},
 {'entry': 'AF-A0A0J9YWX3-F1', 'gene': 'TRBJ1-6', 'tax_id': 9606},
 {'entry': 'AF-A0A0J9YXG5-F1', 'gene': 'TRBJ1-4', 'tax_id': 9606},
 {'en

# Interface

In [3]:
import ipywidgets as widgets
from IPython.display import display, JSON, clear_output
from src.interpro import InterproInstance, data_types, entry_integration_types, filter_types, db_types

In [6]:
filters = {
    "protein": [("UniProtKB/Swiss-Prot", "reviewed"),
                ("UniProtKB", "UniProt"),
                ("UniProtKB/TrEMBL", "unreviewed")],
    "structure": [("PDB", "PDB")],
    "taxonomy": [("UniProtKB", "uniprot")],
    "proteome": [("UniProtKB", "uniprot")],
    "set": [("CDD", "cdd"),
            ("Pfam", "pfam"),
            ("PIRSF", "pirsf")],
}

def interface_interpro():
    output = widgets.Output()
    dynamic_output = widgets.Output()
    active_sections = {}

    # ------------------------
    # Sección de configuración base
    # ------------------------

    method_dropdown = widgets.Dropdown(
        options=data_types,
        value="entry",
        description="Choose a main data type:"
    )

    db_dropdown = widgets.Dropdown(
        options=db_types["entry"],
        value="InterPro",
        description="DB:"
    )

    # Actualiza db_dropdown en base a method_dropdown
    def update_db_options(change):
        new_type = change['new']
        db_dropdown.options = db_types.get(new_type, [])

    method_dropdown.observe(update_db_options, names='value')

    # ------------------------
    # Sección de filtros dinámicos
    # ------------------------

    container = widgets.VBox()
    button_box = widgets.HBox()
    add_buttons = {}

    # Crear sección individual de filtro
    def create_filter_section(filter_type):
        options = filters[filter_type]
        labels = [label for label, _ in options]
        label_to_value = {label: val for label, val in options}

        db_dropdown_f = widgets.Dropdown(
            options=labels,
            description=f"{filter_type} DB:"
        )

        value_input = widgets.Text(
            placeholder=f"{filter_type} accession...",
            description=f"{filter_type} accession:"
        )

        def clear_value(b):
            value_input.value = ''

        clear_btn = widgets.Button(description="Clear", button_style='info')
        clear_btn.on_click(clear_value)

        def remove_section(b):
            container.children = [c for c in container.children if c != section]
            add_buttons[filter_type].disabled = False
            del active_sections[filter_type]

        remove_btn = widgets.Button(description="Remove", button_style='danger')
        remove_btn.on_click(remove_section)

        section = widgets.VBox([
            widgets.HTML(f"<b>{filter_type}</b>"),
            widgets.HBox([widgets.Label("Filter type:"), widgets.Label(filter_type)]),
            widgets.HBox([db_dropdown_f, clear_btn]),
            value_input,
            remove_btn
        ])
        active_sections[filter_type] = (db_dropdown_f, value_input, label_to_value)
        return section

    # Botones de filtros
    for key in filters:
        btn = widgets.Button(description=key.capitalize(), button_style='info', layout=widgets.Layout(width='auto'))
        add_buttons[key] = btn

        def make_handler(k):
            def handler(b):
                section = create_filter_section(k)
                container.children += (section,)
                add_buttons[k].disabled = True
            return handler

        btn.on_click(make_handler(key))

    button_box.children = list(add_buttons.values())

    # ------------------------
    # Botón para generar query
    # ------------------------

    generate_button = widgets.Button(
        description="Generar Query",
        button_style="success",
        icon="check"
    )

    def on_generate_click(b):
        with output:
            clear_output()
            query = {
                "db": db_dropdown.value,
                "filters": []
            }
            for ftype, (db_dd, val_input, label_map) in active_sections.items():
                db_label = db_dd.value
                db_value = label_map[db_label]
                value = val_input.value.strip()
                if value:
                    query["filters"].append({
                        "type": ftype,
                        "db": db_value,
                        "value": value
                    })
            display(JSON(query, expanded=True))
            # Aquí puedes llamar a la función de InterPro para enviar la consulta
            interpro_instance = InterproInstance()
            response = interpro_instance.fetch_single(
                query=query,
                method=method_dropdown.value,
                pages_to_fetch=1,
                parse=True
            )
            if response:
                display(JSON(response, expanded=True))
            else:
                print("No response received from InterPro API.")

    generate_button.on_click(on_generate_click)

    # ------------------------
    # Render
    # ------------------------

    display(widgets.VBox([
        widgets.HTML("<h3>InterPro Query Interface</h3>"),
        widgets.HTML("<b>Configuración principal:</b>"),
        method_dropdown,
        db_dropdown,
        widgets.HTML("<hr><b>Filtros dinámicos:</b>"),
        widgets.Label("Agregar un filtro:"),
        button_box,
        container,
        generate_button,
        output
    ]))

In [None]:
interface_interpro()

VBox(children=(HTML(value='<h3>InterPro Query Interface</h3>'), HTML(value='<b>Configuración principal:</b>'),…