In [None]:
from src.uniprot import UniprotInterface
import pandas as pd

In [None]:
def replace_char_at_index(s, i, new_char):
    if i < 0 or i >= len(s):
        raise IndexError("Index out of range.")
    return s[:i] + new_char + s[i+1:]

In [None]:
ids = ["Q75UA4"]
from_db = 'UniProtKB_AC-ID'
to_db = 'UniProtKB'
disease = "CRC"

In [None]:
downloader = UniprotInterface()

job_id = downloader.submit_id_mapping(from_db=from_db, to_db=to_db, ids=ids)

In [15]:
if downloader.check_id_mapping_results_ready(job_id):
    link = downloader.get_id_mapping_results_link(job_id)
    results = downloader.get_id_mapping_results_search(link)

Fetched: 1 / 1


In [16]:
with open("results.json", "w") as f:
    json.dump(results, f)

NameError: name 'json' is not defined

In [None]:
results['results'][0]['to']['sequence']['value']

'MKFGKFVLLAASTALAVVGLGGPAAADSTPQAQPSIIGGSNATSGPWAARLFVNGRQNCTATIIAPQYILTAKHCVSSSGTYTFRIGSLDQTSGGTMATGSTITRYPGSADLAIVRLTTSVNATYSPLGSVGDVSVGQNVSVYGWGATSQCGSEINCQSRYLKVATVRVNSISCSDYTGGVAVCANRVNGITAGGDSGGPMFASGRQVGVASTSDRVNNTAYTNITRYRSWISQVAGV'

In [None]:
for result in results['results']:
    print(result['from'])

Q75UA4


In [None]:
export_data = []
sequence = results['results'][0]['to']['sequence']['value']
for feature in results['results'][0]['to']['features']:
    row = []
    if feature['type'] == 'Natural variant' and disease in feature['description']:     
        row.append(feature['featureId'])
        location_start = feature['location']['start']['value']
        location_end = feature['location']['end']['value']
        if location_start == location_end:
            row.append(location_start)
            original_sequence = feature['alternativeSequence']['originalSequence']
            new_sequence = feature['alternativeSequence']['alternativeSequences'][0]
            row.append(f"{original_sequence}->{new_sequence}")
            row.append(replace_char_at_index(sequence, int(location_start)-1, new_sequence))
        else:
            row.append(f"{location_start}-{location_end}")
            row.append("missing")
            row.append(sequence[:int(location_start)-1] + sequence[int(location_end)-1:])
        export_data.append(row)
export_data

In [None]:
df = pd.DataFrame(export_data, columns=["variant id", "position", "change", "sequence"])
df

In [None]:
df.to_csv("results.csv", index=False)

In [None]:
result = results['results'][0]
for reference in result['to']['references']:
    print(reference['citation']['citationCrossReferences'])

[{'database': 'PubMed', 'id': '11133465'}, {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}]
[{'database': 'PubMed', 'id': '16237016'}, {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}]


In [None]:
references_list = []
result = results['results'][0]                 

try:
    for r in result['to']['references']:
        tmp = {}
        tmp["citacionCrossReferences"] = r['citation']['citationCrossReferences']
        tmp.update({"title": r['citation']['title']})
        references_list.append(tmp)
except KeyError:
    pass

In [None]:
references_list

[{'citacionCrossReferences': [{'database': 'PubMed', 'id': '11133465'},
   {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}],
  'title': 'Purification and characterization of an extracellular poly(L-lactic acid) depolymerase from a soil isolate, Amycolatopsis sp. strain K104-1.'},
 {'citacionCrossReferences': [{'database': 'PubMed', 'id': '16237016'},
   {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}],
  'title': 'Gene cloning and molecular characterization of an extracellular poly(L-lactic acid) depolymerase from Amycolatopsis sp. strain K104-1.'}]

## Blast

In [None]:
import os, argparse
import shutil
import subprocess
import tarfile
from pathlib import Path
from urllib.request import urlopen
import re
from typing import List

import pandas as pd

DB_DIR = os.path.join("scripts", "db")
BLAST_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/"
UNIPROT_BASE_URL = "https://ftp.uniprot.org/pub/databases/uniprot/current_release"
BLAST_DIR = Path("blast_bin")
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref100/uniref.xsd
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz

databases = {
    "uniprotkb_reviewed": "knowledgebase/complete/uniprot_sprot",
    "uniprotkb_unreviewed": "knowledgebase/complete/uniprot_trembl",
    "uniref100": "uniref/niref100/uniref100",
    "uniref90": "uniref/uniref90/uniref90",
    "uniref50": "uniref/uniref50/uniref50",
}

def download_uniprot_database(db_name: str, extension: str = "xml"):
    """ Download a Uniprot database from the Uniprot FTP server.
    Args:
        db_name (str): Name of the database to download.
        extension (str): File extension of the database. Default is "xml".
    """

    if db_name not in databases:
        raise ValueError(f"Database {db_name} is not supported. Supported databases are: {', '.join(databases.keys())}.")
    
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    
    if not os.path.exists(db_path):
        os.makedirs(DB_DIR, exist_ok=True)
        url = f"{UNIPROT_BASE_URL}/{databases[db_name]}.{extension}.gz"
        os.system(f"wget {url} -O {db_path}.gz")
        print(f"Unzipping {db_path}...")
        subprocess.run(["gunzip", db_path], check=True)
    else:
        print(f"Database {db_name} already exists at {db_path}.")

def get_latest_version_url():
    """Retrieve the latest BLAST+ tarball URL from the NCBI FTP site."""
    with urlopen(BLAST_BASE_URL) as response:
        html = response.read().decode("utf-8")
    # Look for something like: ncbi-blast-2.16.0+-x64-linux.tar.gz
    match = re.search(r'ncbi-blast-(\d+\.\d+\.\d+\+)-x64-linux\.tar\.gz', html)
    if match:
        version = match.group(1)
        tar_name = f"ncbi-blast-{version}-x64-linux.tar.gz"
        return version, BLAST_BASE_URL + tar_name
    else:
        raise RuntimeError("Could not find the latest BLAST version from NCBI.")

def is_blast_installed():
    """Check if 'blastp' is available in the system PATH."""
    try:
        subprocess.run(["blastp", "-version"], check=True, stdout=subprocess.DEVNULL)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False


def download_and_extract_blast(version: str, url: str):
    """Download and extract the BLAST+ tarball."""
    tarball_name = url.split("/")[-1]
    if not Path(tarball_name).exists():
        print(f"Downloading BLAST+ {version}...")
        subprocess.run(["wget", url], check=True)

    print("Extracting BLAST+...")
    with tarfile.open(tarball_name, "r:gz") as tar:
        tar.extractall(BLAST_DIR)
    print(f"BLAST extracted to: {BLAST_DIR.resolve()}")


def get_local_blastp_path(version: str):
    """Return the path to local blastp binary."""
    return BLAST_DIR / f"ncbi-blast-{version}" / "bin" / "blastp"


def check_blast():
    """Ensure BLAST is installed. Return path to `blastp` binary."""
    if is_blast_installed():
        print("System-wide BLAST is installed.")
        return shutil.which("blastp")
    else:
        version, url = get_latest_version_url()
        local_blastp = get_local_blastp_path(version)
        if not local_blastp.exists():
            print(f"BLAST {version} not found locally. Installing...")
            BLAST_DIR.mkdir(exist_ok=True)
            download_and_extract_blast(version, url)
        else:
            print(f"Using already downloaded BLAST {version}.")
        return str(local_blastp)

def make_blast_database(db_name: str, db_type: str = "prot", extension: str = "xml"):
    """Create a BLAST database from the Uniprot database."""
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    if not os.path.exists(db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {db_path}. Please download it first.")
    
    # Check if the database is already created
    blast_db_path = os.path.join(DB_DIR, db_name)
    extensions = [".pdb", ".phr", ".pin", ".psq", ".pot", ".psq", ".ptf", ".pto"]
    makedb = False
    # For all extensions check if exists if there is one failing makedb again
    for ext in extensions:
        if not os.path.exists(blast_db_path + "/db" + ext):
            makedb = True
            break
    if makedb:
        print(f"Creating BLAST database for {db_name}...")
        blast_db_cmd = [
            "makeblastdb",
            "-in", db_path,
            "-dbtype", db_type,
            "-out", os.path.join(DB_DIR, db_name) + "/db",
        ]
    
        subprocess.run(blast_db_cmd, check=True)
        print(f"BLAST database created at: {os.path.join(DB_DIR, databases[db_name])}")
    else:
        print(f"BLAST database already exists at {blast_db_path}. No need to create it again.")

def run_blast(sequences: List[str], db_name: str, blast_type: str = "blastp", evalue: float = 0.001):
    """Run BLAST search."""
    blast_db_path = os.path.join(DB_DIR, db_name)
    if not os.path.exists(blast_db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {blast_db_path}. Please download it first.")

    # Make tmp directory if it does not exist
    os.makedirs("tmp", exist_ok=True)

    # Write sequences to a temporary file
    with open("tmp/sequences.fasta", "w") as f:
        for i, seq in enumerate(sequences):
            f.write(f">{i}\n{seq}\n")
    
    blast_cmd = [
        blast_type,
        "-query", "tmp/sequences.fasta",
        "-db", blast_db_path + "/db",
        "-outfmt", "6",
        "-evalue", str(evalue),
    ]
    
    print(f"Running BLAST search...")
    with open("tmp/blast_results.txt", "w") as f:
        subprocess.run(blast_cmd, stdout=f, check=True)
    print(f"BLAST results saved to tmp/blast_results.txt")
    # Clean up temporary file
    os.remove("tmp/sequences.fasta")

def parse_blast_results(file_path: str, identity_threshold: float = 90.0):
    """Parse BLAST results from a file."""
    with open(file_path, "r") as f:
        results = f.readlines()
    
    parsed_results = []
    for line in results:
        fields = line.strip().split("\t")
        identity = float(fields[2])
        if identity >= identity_threshold:
            parsed_results.append({
                "query": fields[0],
                "subject": fields[1],
                "identity": fields[2],
                "alignment_length": fields[3],
                "evalue": fields[4],
                "bit_score": fields[5],
            })
    
    return parsed_results

In [None]:
df = pd.read_csv("data/test.csv")
sequences = df["sequences"].dropna().unique().tolist()
    
download_uniprot_database("uniprotkb_reviewed", "fasta")
    
blastp_path = check_blast()
print(f"Using blastp at: {blastp_path}")

make_blast_database("uniprotkb_reviewed", extension="fasta")

run_blast(sequences, "uniprotkb_reviewed", blast_type="blastp", evalue=0.0001)

results = parse_blast_results("tmp/blast_results.txt")

# Convert to DataFrame
sequences_df = pd.DataFrame(sequences, columns=["sequences"])
sequences_df["id"] = sequences_df.index

sequences_df

Database uniprotkb_reviewed already exists at scripts/db/uniprotkb_reviewed.fasta.
System-wide BLAST is installed.
Using blastp at: /home/diego/micromamba/envs/bioseqdownloader/bin/blastp
BLAST database already exists at scripts/db/uniprotkb_reviewed. No need to create it again.
Running BLAST search...
BLAST results saved to tmp/blast_results.txt


Unnamed: 0,sequences,id
0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,0
1,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,1
2,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,2


In [None]:
df_blast = pd.DataFrame(results)

df_blast = df_blast.rename(columns={"query": "id", "subject": "subject_id"})
df_blast["id"] = df_blast["id"].astype(int)
df_blast = df_blast.merge(sequences_df, on="id", how="left")
df_blast = df_blast.drop(columns=["id"])
df_blast = df_blast.rename(columns={"sequences": "sequence"})

# Separate subject into source, accession, entry_name
df_blast["source"] = df_blast["subject_id"].apply(lambda x: x.split("|")[0])
df_blast["accession"] = df_blast["subject_id"].apply(lambda x: x.split("|")[1])
df_blast["entry_name"] = df_blast["subject_id"].apply(lambda x: x.split("|")[2])
df_blast = df_blast.drop(columns=["subject_id"])


In [None]:
df_blast

Unnamed: 0,identity,alignment_length,evalue,bit_score,sequence,source,accession,entry_name
0,100.0,438,0,0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,sp,Q6GZX2,003R_FRG3G
1,100.0,180,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q197F2,008L_IIV3
2,100.0,50,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q6GZW6,009L_FRG3G
3,100.0,345,0,0,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,sp,Q6GZW6,009L_FRG3G


## GO

In [None]:
from src.description_go import *
import os, ast
import pandas as pd
from tqdm import tqdm

In [None]:
DOCKER_IMAGE_NAME = "metastudent"
DOCKER_CONTAINER_NAME = "metastudent_container"
HOST_INPUT_FILE = os.path.abspath("tmp/sequences.fasta")
HOST_OUTPUT_DIR = os.path.abspath("tmp/")
CONTAINER_INPUT_FILE = "/app/input.fasta"
CONTAINER_OUTPUT_FILE = "/app/output.result"


print("[DESCRIPTION_GO] Getting Gen Ontology")
tqdm.pandas()

if not check_dependencies(DOCKER_IMAGE_NAME):
    print("[DESCRIPTION_GO] Metastudent not found. Installing...")
    install_dependencies(DOCKER_IMAGE_NAME)
else:
    print("[DESCRIPTION_GO] Metastudent found.")

input_df = pd.read_csv("results/umami_uniprot.csv")
obsolete_df = pd.read_csv("scripts/resources/amiGO_data.csv", sep="\t", names=["id_go", "description", "is_obsolete"])
input_df['go_terms'] = input_df['go_terms'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

parsed_df = pd.DataFrame()
if os.path.isfile(f"{HOST_OUTPUT_DIR}/output.BPO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.CCO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.MFO.txt"):
    print("[DESCRIPTION_GO] Metastudent results found.")
    parsed_df = parse_outputs("uniprot_id")

# Filter input_df with go_terms ~= null
input_df_with_go_terms = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) > 0)]
input_df = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) == 0)]

if not input_df_with_go_terms.empty:
    print("[DESCRIPTION_GO] Go terms found in input data.")
    input_df_with_go_terms = input_df_with_go_terms[["uniprot_id", "go_terms"]]
    input_df_with_go_terms = input_df_with_go_terms.explode("go_terms")
    parsed_df = pd.concat(
        [
            parsed_df,
            pd.merge(
                input_df_with_go_terms, 
                obsolete_df, 
                left_on="go_terms", 
                right_on="id_go", 
                how="left"
            )
            .drop(columns=["go_terms"])
            .rename(columns={"id_go": "go"})  
        ]
    )

input_df

[DESCRIPTION_GO] Getting Gen Ontology
Docker version 28.0.0, build f9ced58158
[DESCRIPTION_GO] Metastudent found.
[DESCRIPTION_GO] Go terms found in input data.


Unnamed: 0,uniprot_id,entry_type,protein_name,ec_numbers,organism,taxon_id,sequence,length,go_terms,pfam_ids,references,features,keywords,source_db


In [None]:
if not parsed_df.empty:
    # Check if all sequences have been processed
    parsed_ids = parsed_df["uniprot_id"].unique()
    input_ids = input_df["uniprot_id"].unique()
    if len(parsed_ids) == len(input_ids):
        print("[DESCRIPTION_GO] All sequences have been processed.")
        input_df = pd.DataFrame()
    else:
        input_df = input_df[~input_df["uniprot_id"].isin(parsed_ids)]
        print(f"[DESCRIPTION_GO] {len(input_df)} sequences have not been processed.")

[DESCRIPTION_GO] 0 sequences have not been processed.


In [None]:
os.makedirs(HOST_OUTPUT_DIR, exist_ok=True)
if not input_df.empty:
    print("[DESCRIPTION_GO] Running in batches of 50...")
    for i in tqdm(range(0, len(input_df), 50)):
        run_in_batches(input_df[i:i+50], HOST_OUTPUT_DIR)

In [None]:
test = pd.concat(
    [
        parsed_df,
        parse_outputs("uniprot_id")
    ]
)
    
test = test.sort_values(by="uniprot_id")
test = test.merge(obsolete_df, left_on="go", right_on="id_go", how="left")
test = test.drop(columns=["id_go"])

test

File '/home/diego/Documents/PythonProjects/BioSeqDownloader/tmp/output.BPO.txt' not found.


NameError: name 'exit' is not defined

## Uniprot query

In [None]:
import os
os.chdir("src")

In [None]:
query="organism_name:homo sapiens (human) AND length:[15 TO 30] AND reviewed:true"
fields="accession,protein_name,sequence,ec,lineage,organism_name,xref_pfam,xref_alphafolddb,xref_pdb,go_id"
sort="accession asc"
download=True
format="json"

In [None]:
query="P05067"
fields="accession,xref_interpro"
sort="accession asc"
download=True
format="json"

In [None]:
from src.uniprot import UniprotInterface

uniprot = UniprotInterface()
response = uniprot.submit_stream(
    query=query,
    fields=fields,
    sort=sort,
    include_isoform=True,
    download=download,
    format=format
)

In [None]:
response.json()

{'results': [{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
   'primaryAccession': 'P05067',
   'uniProtKBCrossReferences': [{'database': 'InterPro',
     'id': 'IPR036669',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_Cu-bd_sf'}]},
    {'database': 'InterPro',
     'id': 'IPR008155',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco'}]},
    {'database': 'InterPro',
     'id': 'IPR013803',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_Abeta'}]},
    {'database': 'InterPro',
     'id': 'IPR037071',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_Abeta_sf'}]},
    {'database': 'InterPro',
     'id': 'IPR011178',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_Cu-bd'}]},
    {'database': 'InterPro',
     'id': 'IPR024329',
     'properties': [{'key': 'EntryName', 'value': 'Amyloid_glyco_E2_domain'}]},
    {'database': 'InterPro',
     'id': 'IPR008154',
     'properties': [{'key': 'EntryName', 'value': 'A

In [None]:
parsed = uniprot.parse_stream_response(
    query=query,
    response=response
)

In [None]:
parsed

Unnamed: 0,query,accession,alphafold_ids,biogrid_ids,brenda_ids,go_terms,interpro_ids,kegg_ids,pdb_ids,pfam_ids,reactome_ids,refseq_ids,string_ids
0,P05067,P05067,[],[],[],[],"[IPR036669, IPR008155, IPR013803, IPR037071, I...",[],[],[],[],[],[]
1,P05067,P05067-10,,,,,,,,,,,
2,P05067,P05067-11,,,,,,,,,,,
3,P05067,P05067-2,,,,,,,,,,,
4,P05067,P05067-3,,,,,,,,,,,
5,P05067,P05067-4,,,,,,,,,,,
6,P05067,P05067-5,,,,,,,,,,,
7,P05067,P05067-6,,,,,,,,,,,
8,P05067,P05067-7,,,,,,,,,,,
9,P05067,P05067-8,,,,,,,,,,,


## Activity search

In [None]:
import os
import pandas as pd

In [None]:
uniprot_search_files = os.listdir("uniprot_search")
uniprot_search_files

['uniprot_celiac-toxic.csv',
 'uniprot_embryotoxic.csv',
 'uniprot_ace-inhibitor.csv',
 'uniprot_anuran-defense.csv',
 'uniprot_campde-inhibitor.csv',
 'uniprot_anti-neurotensive.csv',
 'uniprot_antitrypanosomic.csv',
 'uniprot_anticancer.csv',
 'uniprot_anorectic.csv',
 'uniprot_chemotactic.csv',
 'uniprot_targeting-GP.csv',
 'uniprot_Blood-Brain-Barrier.csv',
 'uniprot_antitumor.csv',
 'uniprot_cytotoxic.csv',
 'uniprot_activating-ubiquitin-mediated-proteolysis.csv',
 'uniprot_protein-kinase-c-inhibitor.csv',
 'uniprot_antihiv.csv',
 'uniprot_antidiabetic.csv',
 'uniprot_calpain-2-inhibitor.csv',
 'uniprot_antileishmania.csv',
 'uniprot_inhibitor.csv',
 'uniprot_antimicrobial.csv',
 'uniprot_antituberculosis.csv',
 'uniprot_antiviral.csv',
 'uniprot_wound-healing.csv',
 'uniprot_targeting-GN.csv',
 'uniprot_hmg-coa-reductase-inhibitor.csv',
 'uniprot_opioid-agonist.csv',
 'uniprot_toxicology.csv',
 'uniprot_antibacterial.csv',
 'uniprot_antibiofilm.csv',
 'uniprot_hypocholesterolemic

In [None]:
# Create an empty list to store the results
activity_data = []

# Iterate over each file in uniprot_search_files
for file in uniprot_search_files:
    # Load the CSV file into a DataFrame
    df = pd.read_csv(f"uniprot_search/{file}")
    
    # Extract the activity name from the file name
    activity = file.replace("uniprot_", "").replace(".csv", "").replace("-", " ").capitalize()
    
    # Count the number of sequences in the DataFrame
    sequence_count = len(df)
    
    # Append the activity and sequence count to the list
    activity_data.append({"activity": activity, "sequence_count": sequence_count})

# Create a DataFrame from the activity data
activity_df = pd.DataFrame(activity_data)

# Sort the DataFrame by sequence count in descending order
activity_df = activity_df.sort_values(by="sequence_count", ascending=False)

# Calculate the total number of sequences
total_sequences = activity_df["sequence_count"].sum()

# Add a row for the total sequences
activity_df = pd.concat([activity_df, pd.DataFrame([{"activity": "Total", "sequence_count": total_sequences}])], ignore_index=True)

# Display the resulting DataFrame
activity_df

Unnamed: 0,activity,sequence_count
0,Binding,411281
1,Inhibitor,18950
2,Surface binding,15663
3,Regulating,11361
4,Antimicrobial,5945
...,...,...
85,Dipeptidyl peptidaseiv,0
86,Antiamnestic,0
87,Edema inducer,0
88,Antiendotoxin,0


In [None]:
anti_activities_df = activity_df[activity_df['activity'].str.startswith('Anti', na=False)]
anti_activities_df

Unnamed: 0,activity,sequence_count
4,Antimicrobial,5945
6,Antiviral,3835
10,Antibacterial,2366
17,Antifungal,1209
19,Antitoxin,832
24,Antitumor,479
25,Anticancer,393
32,Antiparasitic,140
37,Antiangiogenic,86
41,Antioxidative,55


## Alphafold

In [6]:
from src.alphafold import AlphafoldInterface

In [7]:
instance = AlphafoldInterface(
    structures=['pdb'],
    output_dir="results",
)

In [8]:
print(instance.query_usage())

ValueError: Unknown method 'None'

In [9]:
instance.fetch_single(
    query="P02666",
    method="prediction",
    parse=True,
    to_dataframe=True,
)

Cache_key: P02666_prediction
Prepared request: https://alphafold.ebi.ac.uk/api/prediction/P02666
Error fetching prediction for {'qualifier': 'P02666'}: HTTPSConnectionPool(host='alphafold.ebi.ac.uk', port=443): Max retries exceeded with url: /api/prediction/P02666 (Caused by ResponseError('too many 500 error responses'))


Unnamed: 0,entry,gene,tax_id,organism,is_reviewed,is_reference,pdbUrl
0,,,,,,,


In [None]:
instance.fetch_batch(
    queries=["P02666", "Q9TSI0", "P33048", "P11839", "O15552", "P76011"],
    parse=False,
    to_dataframe=True,
)

Cache_key: P02666
Cache_key: Q9TSI0
Cache_key: P33048
Cache_key: P11839
Cache_key: O15552
Cache_key: P76011


Unnamed: 0,entryId,gene,sequenceChecksum,sequenceVersionDate,uniprotAccession,uniprotId,uniprotDescription,taxId,organismScientificName,uniprotStart,...,bcifUrl,cifUrl,pdbUrl,paeImageUrl,paeDocUrl,amAnnotationsUrl,amAnnotationsHg19Url,amAnnotationsHg38Url,isReviewed,isReferenceProteome
0,AF-P02666-F1,CSN2,F0BBDD8148A238AE,1989-07-01,P02666,CASB_BOVIN,Beta-casein,9913,Bos taurus,1,...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,https://alphafold.ebi.ac.uk/files/AF-P02666-F1...,,,,True,True
1,AF-Q9TSI0-F1,CSN2,14FD3687DD17C5A9,2000-05-01,Q9TSI0,CASB_BUBBU,Beta-casein,89462,Bubalus bubalis,1,...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,https://alphafold.ebi.ac.uk/files/AF-Q9TSI0-F1...,,,,True,False
2,AF-P33048-F1,CSN2,96AE17746A01CD05,1993-10-01,P33048,CASB_CAPHI,Beta-casein,9925,Capra hircus,1,...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,https://alphafold.ebi.ac.uk/files/AF-P33048-F1...,,,,True,True
3,AF-P11839-F1,CSN2,061B4424DCB49EB1,1995-11-01,P11839,CASB_SHEEP,Beta-casein,9940,Ovis aries,1,...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,https://alphafold.ebi.ac.uk/files/AF-P11839-F1...,,,,True,True
4,AF-O15552-F1,FFAR2,F4A8AC6AFBDF1E90,1998-01-01,O15552,FFAR2_HUMAN,Free fatty acid receptor 2,9606,Homo sapiens,1,...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,https://alphafold.ebi.ac.uk/files/AF-O15552-F1...,True,True
5,AF-P76011-F1,ymgE,78ED929220264E62,2001-07-11,P76011,YMGE_ECOLI,UPF0410 protein YmgE,83333,Escherichia coli (strain K12),1,...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,https://alphafold.ebi.ac.uk/files/AF-P76011-F1...,,,,True,True


In [None]:
instance.save(
    data = result,
    filename = "alphafold2_results",
    extension= "csv"
)

NameError: name 'result' is not defined

## BioDBNet

In [13]:
from src.biodbnet import BioDBNetInterface

In [14]:
instance = BioDBNetInterface()

In [15]:
instance.fetch_single(
    query={
        "input": "genesymbol",
        "inputValues": ["APP", "SLC9A9"],
        "outputs": ["genesymbol", "affyid", "go-biologicalprocess"],
        "taxonId": "9606"
    },
    method="db2db",
    parse=True,
    to_dataframe=True
).dropna(axis=1, how='all')

Cache_key: APP_genesymbol_genesymbol_9606_db2db
Cache_key: APP_affyid_genesymbol_9606_db2db
Cache_key: APP_go-biologicalprocess_genesymbol_9606_db2db
Cache_key: SLC9A9_genesymbol_genesymbol_9606_db2db
Cache_key: SLC9A9_affyid_genesymbol_9606_db2db
Cache_key: SLC9A9_go-biologicalprocess_genesymbol_9606_db2db
Prepared request: https://biodbnet.abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json?inputValues=APP%2CSLC9A9&outputs=genesymbol%2Caffyid%2Cgo-biologicalprocess&input=genesymbol&taxonId=9606&method=db2db
Cache_key: APP_genesymbol_genesymbol_9606_db2db
Cache_key: APP_affyid_genesymbol_9606_db2db
Cache_key: APP_go-biologicalprocess_genesymbol_9606_db2db
Cache_key: SLC9A9_genesymbol_genesymbol_9606_db2db
Cache_key: SLC9A9_affyid_genesymbol_9606_db2db
Cache_key: SLC9A9_go-biologicalprocess_genesymbol_9606_db2db


Unnamed: 0,affy_id,gene_symbol,go_biological_process
0,"[11738008_s_at [Chip: PrimeView], 11746866_a_a...",APP,[GO:0006357 [Name: regulation of transcription...
1,"[11738008_s_at [Chip: PrimeView], 11746866_a_a...",APP,[GO:0006357 [Name: regulation of transcription...
2,"[11738008_s_at [Chip: PrimeView], 11746866_a_a...",APP,[GO:0006357 [Name: regulation of transcription...
3,"[227791_at [Chip: U133_Plus_2], 43140_at [Chip...",SLC9A9,[GO:0042742 [Name: defense response to bacteri...
4,"[227791_at [Chip: U133_Plus_2], 43140_at [Chip...",SLC9A9,[GO:0042742 [Name: defense response to bacteri...
5,"[227791_at [Chip: U133_Plus_2], 43140_at [Chip...",SLC9A9,[GO:0042742 [Name: defense response to bacteri...
6,"[11738008_s_at [Chip: PrimeView], 11746866_a_a...",APP,[GO:0006357 [Name: regulation of transcription...
7,"[227791_at [Chip: U133_Plus_2], 43140_at [Chip...",SLC9A9,[GO:0042742 [Name: defense response to bacteri...


In [16]:
instance.fetch_single(
    query={
        "pathways": "1",
        "taxonId": "9606"
    },
    method="getpathways",
    parse=True,
    to_dataframe=True
)

Cache_key: 1_9606_getpathways
Prepared request: https://biodbnet.abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json?pathways=1&taxonId=9606&method=getpathways


Unnamed: 0,name,title,description,source_database
0,NS1 Mediated Effects on Host Pathways,NS1 Mediated Effects on Host Pathways,NS1 Mediated Effects on Host Pathways,REACTOME
1,2-LTR circle formation,2-LTR circle formation,2-LTR circle formation,REACTOME
2,3' -UTR-mediated translational regulation,3' -UTR-mediated translational regulation,3' -UTR-mediated translational regulation,REACTOME
3,5-Phosphoribose 1-diphosphate biosynthesis,PRPP biosynthesis,PRPP biosynthesis,REACTOME
4,ABH2 mediated Reversal of Alkylation Damage,ABH2 mediated Reversal of Alkylation Damage,ABH2 mediated Reversal of Alkylation Damage,REACTOME
...,...,...,...,...
1751,hsa05020,Prion diseases,Prion diseases,KEGG
1752,hsa03320,PPAR signaling pathway,PPAR signaling pathway,KEGG
1753,hsa00620,Pyruvate metabolism,Pyruvate metabolism,KEGG
1754,hsa04070,Phosphatidylinositol signaling system,Phosphatidylinositol signaling system,KEGG


## BioGRID

In [17]:
from src.biogrid import BioGRIDInterface
import os

In [18]:
from dotenv import load_dotenv
load_dotenv()
biogrid_api_key = os.getenv("biogrid_api_key")

In [19]:
instance = BioGRIDInterface(
)

In [20]:
instance.fetch_single(
    query={
        "accessKey": biogrid_api_key,
        "geneList": ["P53", "CDK2", "BRCA1"],
    },
    method="interactions",
    parse=True,
    to_dataframe=True
)

Cache_key: P53_interactions
Cache_key: CDK2_interactions
Cache_key: BRCA1_interactions
Prepared request: https://webservice.thebiogrid.org/interactions?accessKey=87aee10241ab2fcc6e497352e5024380&start=0&max=10000&interSpeciesExclude=False&selfInteractionsExclude=False&includeEvidence=False&geneList=P53%7CCDK2%7CBRCA1&searchBiogridIds=False&taxId=All&searchIds=False&format=json
Cache_key: P53_interactions
Cache_key: CDK2_interactions
Cache_key: BRCA1_interactions


Unnamed: 0,interaction_b,synonyms_a,synonyms_b,organism_a,organism_b
0,WWOX,BCC7|LFS1|P53|TRP53,D16S432E|FOR|FRA16D|HHCMA56|PRO0128|SCAR12|SDR...,9606,9606
1,CDC14A,BCC7|LFS1|P53|TRP53,cdc14|hCDC14,9606,9606
2,betaTub60D,Dmel\CG15450,143391_i_at|3t|B3t|BETA 60D|CG3401|D.m.BETA-60...,7227,7227
3,tamo,1323/07|1422/04|CG17117|Dm-HTH|Dmel\CG17117|Me...,60B|CG4057|Dmel\CG4057,7227,7227
4,exd,1323/07|1422/04|CG17117|Dm-HTH|Dmel\CG17117|Me...,CG8933|DExd|Dm-EXD|Dmel\CG8933|Dpbx|Pbx1|anon-...,7227,7227
...,...,...,...,...,...
10035,PRR5,BRCAI|BRCC1|BROVCA1|FANCS|IRIS|PNCA4|PPP1R53|P...,FLJ20185k|PP610|PROTOR-1|PROTOR1,9606,9606
10036,RICTOR,BRCAI|BRCC1|BROVCA1|FANCS|IRIS|PNCA4|PPP1R53|P...,AVO3|PIA|hAVO3,9606,9606
10037,BRCA1,JC310|MIP1|SIN1|SIN1b|SIN1g,BRCAI|BRCC1|BROVCA1|FANCS|IRIS|PNCA4|PPP1R53|P...,9606,9606
10038,BRCA1,FLJ20185k|PP610|PROTOR-1|PROTOR1,BRCAI|BRCC1|BROVCA1|FANCS|IRIS|PNCA4|PPP1R53|P...,9606,9606


In [21]:
print(instance.query_usage())

Usage: To fetch interactions, use the BioGRID API with the following parameters.
        Example:
            - fetch_single(method="interactions", query={})
        Available methods: interactions



Example Query:

        {
            "accessKey": "YOUR_ACCESS_KEY",
            "geneList": ["P53"],
            "max": 10,
            "format": "json"
        }
        

Response Format:

        {
            "BIOGRID_INTERACTION_ID": "int",
            "ENTREZ_GENE_A": "str",
            "ENTREZ_GENE_B": "str",
            "BIOGRID_ID_A": "int",
            "BIOGRID_ID_B": "int",
            "SYSTEMATIC_NAME_A": "str",
            "SYSTEMATIC_NAME_B": "str",
            "OFFICIAL_SYMBOL_A": "str",
            "OFFICIAL_SYMBOL_B": "str",
            "SYNONYMS_A": "str",
            "SYNONYMS_B": "str",
            "EXPERIMENTAL_SYSTEM": "str",
            "EXPERIMENTAL_SYSTEM_TYPE": "str",
            "PUBMED_AUTHOR": "str",
            "PUBMED_ID": "int",
            "ORGANISM_A": 

In [22]:
instance.fetch_batch(
    queries=[
        {
            "accessKey": biogrid_api_key,
            "geneList": ["P53", "CDK2", "BRCA1"],
        },
        {
            "accessKey": biogrid_api_key,
            "geneList": ["BRCA2", "ATM", "CHEK2"],
        }
    ],
    method="interactions",
    parse=True,
    to_dataframe=True
)

Cache_key: P53_interactions
Cache_key: CDK2_interactions
Cache_key: BRCA1_interactions
Cache_key: BRCA2_interactions
Cache_key: ATM_interactions
Cache_key: CHEK2_interactions
Cache_key: BRCA2_interactions
Cache_key: ATM_interactions
Cache_key: CHEK2_interactions
Prepared request: https://webservice.thebiogrid.org/interactions?accessKey=87aee10241ab2fcc6e497352e5024380&start=0&max=10000&interSpeciesExclude=False&selfInteractionsExclude=False&includeEvidence=False&geneList=BRCA2%7CATM%7CCHEK2&searchBiogridIds=False&taxId=All&searchIds=False&format=json
Cache_key: BRCA2_interactions
Cache_key: ATM_interactions
Cache_key: CHEK2_interactions


Unnamed: 0,interaction_b,synonyms_a,synonyms_b,organism_a,organism_b
0,WWOX,BCC7|LFS1|P53|TRP53,D16S432E|FOR|FRA16D|HHCMA56|PRO0128|SCAR12|SDR...,9606,9606
1,CDC14A,BCC7|LFS1|P53|TRP53,cdc14|hCDC14,9606,9606
2,betaTub60D,Dmel\CG15450,143391_i_at|3t|B3t|BETA 60D|CG3401|D.m.BETA-60...,7227,7227
3,tamo,1323/07|1422/04|CG17117|Dm-HTH|Dmel\CG17117|Me...,60B|CG4057|Dmel\CG4057,7227,7227
4,exd,1323/07|1422/04|CG17117|Dm-HTH|Dmel\CG17117|Me...,CG8933|DExd|Dm-EXD|Dmel\CG8933|Dpbx|Pbx1|anon-...,7227,7227
...,...,...,...,...,...
12049,CHEK2,MYL|PP8675|RNF71|TRIM19,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,9606,9606
12050,CHEK2,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,9606,9606
12051,CHEK2,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,9606,9606
12052,CDC25C,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,CDC25|PPP1R60,9606,9606


In [23]:
instance.get_dummy(biogrid_api_key, parse=False)

Cache_key: cdc27_1_559292_interactions
Cache_key: apc1_1_559292_interactions
Cache_key: apc2_1_559292_interactions
Prepared request: https://webservice.thebiogrid.org/interactions?accessKey=87aee10241ab2fcc6e497352e5024380&start=0&max=1&interSpeciesExclude=False&selfInteractionsExclude=False&includeEvidence=False&geneList=cdc27%7Capc1%7Capc2&searchBiogridIds=False&taxId=559292&searchIds=False&format=json
Cache_key: cdc27_1_559292_interactions
Cache_key: apc1_1_559292_interactions
Cache_key: apc2_1_559292_interactions


{'interactions': {'BIOGRID_INTERACTION_ID': 'int',
  'ENTREZ_GENE_A': 'str',
  'ENTREZ_GENE_B': 'str',
  'BIOGRID_ID_A': 'int',
  'BIOGRID_ID_B': 'int',
  'SYSTEMATIC_NAME_A': 'str',
  'SYSTEMATIC_NAME_B': 'str',
  'OFFICIAL_SYMBOL_A': 'str',
  'OFFICIAL_SYMBOL_B': 'str',
  'SYNONYMS_A': 'str',
  'SYNONYMS_B': 'str',
  'EXPERIMENTAL_SYSTEM': 'str',
  'EXPERIMENTAL_SYSTEM_TYPE': 'str',
  'PUBMED_AUTHOR': 'str',
  'PUBMED_ID': 'int',
  'ORGANISM_A': 'int',
  'ORGANISM_B': 'int',
  'THROUGHPUT': 'str',
  'QUANTITATION': 'str',
  'MODIFICATION': 'str',
  'ONTOLOGY_TERMS': 'dict',
  'QUALIFICATIONS': 'str',
  'TAGS': 'str',
  'SOURCEDB': 'str'}}

In [24]:
instance.fetch_single(
    query={
        "accessKey": biogrid_api_key,
        "geneList": ["cdc27", "apc1", "apc2"],
        "taxId": "559292",
    },
    method="interactions",
    parse=True,
    to_dataframe=True
)

Cache_key: cdc27_559292_interactions
Cache_key: apc1_559292_interactions
Cache_key: apc2_559292_interactions
Prepared request: https://webservice.thebiogrid.org/interactions?accessKey=87aee10241ab2fcc6e497352e5024380&start=0&max=10000&interSpeciesExclude=False&selfInteractionsExclude=False&includeEvidence=False&geneList=cdc27%7Capc1%7Capc2&searchBiogridIds=False&taxId=559292&searchIds=False&format=json
Cache_key: cdc27_559292_interactions
Cache_key: apc1_559292_interactions
Cache_key: apc2_559292_interactions


Unnamed: 0,interaction_b,synonyms_a,synonyms_b,organism_a,organism_b
0,APC1,RSI1|TID2|anaphase promoting complex subunit 2...,anaphase promoting complex subunit 1|L000004053,559292,559292
1,CDC16,RSI1|TID2|anaphase promoting complex subunit 2...,anaphase promoting complex subunit CDC16|L0000...,559292,559292
2,CDC23,RSI1|TID2|anaphase promoting complex subunit 2...,anaphase promoting complex subunit CDC23|L0000...,559292,559292
3,CDC27,RSI1|TID2|anaphase promoting complex subunit 2...,APC3|SNB1|anaphase promoting complex subunit C...,559292,559292
4,APC2,APC10|anaphase promoting complex subunit DOC1|...,RSI1|TID2|anaphase promoting complex subunit 2...,559292,559292
...,...,...,...,...,...
4423,APC1,CDK1|HSL5|SRM5|cyclin-dependent serine/threoni...,anaphase promoting complex subunit 1|L000004053,559292,559292
4424,APC2,CDK1|HSL5|SRM5|cyclin-dependent serine/threoni...,RSI1|TID2|anaphase promoting complex subunit 2...,559292,559292
4425,CDC27,CDK1|HSL5|SRM5|cyclin-dependent serine/threoni...,APC3|SNB1|anaphase promoting complex subunit C...,559292,559292
4426,APC1,MIND complex subunit DSN1|L000004645,anaphase promoting complex subunit 1|L000004053,559292,559292


In [25]:
instance.fetch_single(
    query={
        "accessKey": biogrid_api_key,
        "geneList": ["cdc27", "apc1", "apc2"],
        "taxId": "559292",
    },
    method="interactions",
    parse=True,
    to_dataframe=True
)

Cache_key: cdc27_559292_interactions
Cache_key: apc1_559292_interactions
Cache_key: apc2_559292_interactions


Unnamed: 0,interaction_b,synonyms_a,synonyms_b,organism_a,organism_b
0,APC1,RSI1|TID2|anaphase promoting complex subunit 2...,anaphase promoting complex subunit 1|L000004053,559292,559292
1,CDC16,RSI1|TID2|anaphase promoting complex subunit 2...,anaphase promoting complex subunit CDC16|L0000...,559292,559292
2,CDC23,RSI1|TID2|anaphase promoting complex subunit 2...,anaphase promoting complex subunit CDC23|L0000...,559292,559292
3,CDC27,RSI1|TID2|anaphase promoting complex subunit 2...,APC3|SNB1|anaphase promoting complex subunit C...,559292,559292
4,APC2,APC10|anaphase promoting complex subunit DOC1|...,RSI1|TID2|anaphase promoting complex subunit 2...,559292,559292
...,...,...,...,...,...
4423,APC1,CDK1|HSL5|SRM5|cyclin-dependent serine/threoni...,anaphase promoting complex subunit 1|L000004053,559292,559292
4424,APC2,CDK1|HSL5|SRM5|cyclin-dependent serine/threoni...,RSI1|TID2|anaphase promoting complex subunit 2...,559292,559292
4425,CDC27,CDK1|HSL5|SRM5|cyclin-dependent serine/threoni...,APC3|SNB1|anaphase promoting complex subunit C...,559292,559292
4426,APC1,MIND complex subunit DSN1|L000004645,anaphase promoting complex subunit 1|L000004053,559292,559292


In [26]:
instance.fetch_single(
    query={
        "accessKey": biogrid_api_key,
        "geneList": ['MFS1'],
        "taxId": "9606",
    },
    method="interactions",
    parse=True,
    to_dataframe=True
)

Cache_key: MFS1_9606_interactions
Prepared request: https://webservice.thebiogrid.org/interactions?accessKey=87aee10241ab2fcc6e497352e5024380&start=0&max=10000&interSpeciesExclude=False&selfInteractionsExclude=False&includeEvidence=False&geneList=MFS1&searchBiogridIds=False&taxId=9606&searchIds=False&format=json
Cache_key: MFS1_9606_interactions


Unnamed: 0,interaction_b,synonyms_a,synonyms_b,organism_a,organism_b
0,MFAP2,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,MAGP|MAGP-1|MAGP1,9606,9606
1,FBN1,-,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
2,FBN1,GLC1A|GPOA|JOAG|JOAG1|TIGR|myocilin,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
3,FBN1,CSPG2|ERVR|GHAP|PG-M|WGN|WGN1,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
4,FBN1,SVAS|WBS|WS,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
5,FBN1,ADCAII|OPCA3|SCA7,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
6,FBN1,hSPRY2,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
7,FBN1,ZNF753|zfp-41,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
8,FBN1,-,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606
9,FBN1,ZNF753|zfp-41,ACMICD|ECTOL1|FBN|GPHYSD2|MASS|MFS1|OCTD|SGS|S...,9606,9606


In [27]:
instance.fetch_single(
    query={
        "id" : "103",
        "accessKey": biogrid_api_key,
    },
    method="interactions",
    parse=True,
    to_dataframe=True
)

Cache_key: 103_interactions
Prepared request: https://webservice.thebiogrid.org/interactions?accessKey=87aee10241ab2fcc6e497352e5024380&id=103&start=0&max=10000&interSpeciesExclude=False&selfInteractionsExclude=False&includeEvidence=False&searchBiogridIds=False&taxId=All&searchIds=False&format=json


Unnamed: 0,interaction_b,synonyms_a,synonyms_b,organism_a,organism_b
0,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,ABP-280|ABP280A|ABPA|ABPL|FLN2|MFM5|MPD4,9606,9606
1,ACTN2,CMD1DD|CMH22|MYOP|RCM4,CMD1AA,9606,9606
2,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,FPTA|PGGT1A|PTAR2,9606,9606
3,PML,DCML|IMD21|MONOMAC|NFE1B,MYL|PP8675|RNF71|TRIM19,9606,9606
4,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,ADMIO|APRF|HIES,9606,9606
...,...,...,...,...,...
9995,RpS17,CG13167|Dmel\CG13167,BcDNA:RE44119|CG3922|D9|Dmel\CG3922|M|M(3)67|M...,7227,7227
9996,RpL15,CG13167|Dmel\CG13167,80Fi|CG17420|CG40199|DmRpL15|Dmel\CG17420|L15|...,7227,7227
9997,CG15423,Dmel\CG13168,Dmel\CG15423,7227,7227
9998,CG15549,Dmel\CG13168,BcDNA:AT07554|Dmel\CG15549,7227,7227


## Brenda

In [28]:
from src.brenda import BrendaInstance
import os

In [29]:
from dotenv import load_dotenv
load_dotenv()
brenda_email = os.getenv("brenda_email")
brenda_password = os.getenv("brenda_password")


In [30]:
instance = BrendaInstance(
    email=brenda_email,
    password=brenda_password,
    #use_config=False,  # Set to True if you want to use a config file
)

In [31]:
instance.fetch_single(
    query={
            "ecNumber": "1.1.1.10",
            "organism": "Homo sapiens",
            #"kcatKmValue": "0.5"
        }, 
    method="getKmValue",
    parse=True,
    to_dataframe=True
)

Cache_key: 1.1.1.10_Homo sapiens_getKmValue


Unnamed: 0,ec,organism,km,km_max,substrate
0,1.1.1.10,Homo sapiens,0.001,,NADP+
1,1.1.1.10,Homo sapiens,0.002,,NADPH
2,1.1.1.10,Homo sapiens,0.002,,NADPH
3,1.1.1.10,Homo sapiens,0.077,,diacetyl
4,1.1.1.10,Homo sapiens,0.077,,diacetyl
5,1.1.1.10,Homo sapiens,0.0945,,diacetyl
6,1.1.1.10,Homo sapiens,0.17,,NADH
7,1.1.1.10,Homo sapiens,0.21,,L-xylulose
8,1.1.1.10,Homo sapiens,0.22,,diacetyl
9,1.1.1.10,Homo sapiens,0.85,,NAD+


In [32]:
instance.fetch_single(
    query=
        {
            #"ecNumber": "1.1.1.10",
            "organism": "Homo sapiens",
            "temperatureRange": "25-37"
        }, 
    method="getTemperatureRange",
    parse=True,
    to_dataframe=True
)

Cache_key: 25-37_Homo sapiens_getTemperatureRange


Unnamed: 0,ec,organism,temperature_range,temperature_range_max
0,1.1.1.188,Homo sapiens,25,37
1,2.1.1.77,Homo sapiens,25,65
2,2.4.1.38,Homo sapiens,25,45
3,2.4.1.90,Homo sapiens,25,45
4,2.7.1.21,Homo sapiens,25,37
5,2.8.2.2,Homo sapiens,25,50
6,3.1.8.1,Homo sapiens,25,50
7,3.1.11.1,Homo sapiens,25,37
8,3.4.21.108,Homo sapiens,25,55
9,3.5.1.1,Homo sapiens,25,60


In [8]:
print(instance.query_usage())

Usage: To fetch data from BRENDA, use the following parameters.
        Example:
            - fetch(query={}, methods=["getKmValue", "getIc50Value"])
        Available methods: getKmValue, getIc50Value, getKcatKmValue, getKiValue, getPhRange, getPhOptimum, getPhStability, getCofactor, getTemperatureOptimum, getTemperatureStability, getTemperatureRange

For more information about each method, please refer to the BRENDA documentation.
Or use `show_method({method_name})` to see the parameters required for each method.


In [10]:
instance.get_dummy()

Cache_key: 1.1.1.1_Escherichia coli_getKmValue
type(response)=[{'literature': [660650], 'substrate': 'ethanol', 'kmValue': '10.6', 'kmValueMaximum': None, 'commentary': None, 'organism': 'Escherichia coli', 'ecNumber': '1.1.1.1', 'ligandStructureId': 65}]
Cache_key: 1.1.1.1_Escherichia coli_getIc50Value
type(response)=[]
Cache_key: 1.1.1.1_Escherichia coli_getKcatKmValue
type(response)=[]
Cache_key: 1.1.1.1_Escherichia coli_getKiValue
type(response)=[{'kiValueMaximum': None, 'literature': [660650], 'kiValue': '18.26', 'commentary': None, 'inhibitor': '4-Methylpyrazole', 'organism': 'Escherichia coli', 'ecNumber': '1.1.1.1', 'ligandStructureId': 1804}]
Cache_key: 1.1.1.1_Escherichia coli_getPhRange
type(response)=[]
Cache_key: 1.1.1.1_Escherichia coli_getPhOptimum
type(response)=[]
Cache_key: 1.1.1.1_Escherichia coli_getPhStability
type(response)=[]
Cache_key: 1.1.1.1_Escherichia coli_getCofactor
type(response)=[{'literature': [660650], 'commentary': 'strongly preferred as cofactor', 'e

{'getKmValue': {'literature': 'list(int)',
  'substrate': 'str',
  'kmValue': 'str',
  'kmValueMaximum': 'NoneType',
  'commentary': 'NoneType',
  'organism': 'str',
  'ecNumber': 'str',
  'ligandStructureId': 'int'},
 'getIc50Value': {},
 'getKcatKmValue': {},
 'getKiValue': {'kiValueMaximum': 'NoneType',
  'literature': 'list(int)',
  'kiValue': 'str',
  'commentary': 'NoneType',
  'inhibitor': 'str',
  'organism': 'str',
  'ecNumber': 'str',
  'ligandStructureId': 'int'},
 'getPhRange': {},
 'getPhOptimum': {},
 'getPhStability': {},
 'getCofactor': {'literature': 'list(int)',
  'commentary': 'str',
  'ecNumber': 'str',
  'organism': 'str',
  'cofactor': 'str',
  'ligandStructureId': 'int'},
 'getTemperatureOptimum': {},
 'getTemperatureStability': {},
 'getTemperatureRange': {}}

## Chembl

In [1]:
from src.chembl import ChEMBLInterface

In [2]:
instance = ChEMBLInterface()

In [3]:
instance.fetch_single(
    query={"target_chembl_id": "CHEMBL1824", "pchembl_value": 5.62}, 
    method="activity",
    pages_to_fetch=1, 
    parse=True,
    to_dataframe=True
)

Cache_key: CHEMBL1824_activity


Unnamed: 0,assay_id,type,units,value
0,CHEMBL650385,IC50,uM,2.4
1,CHEMBL880259,IC50,uM,2.4
2,CHEMBL1073381,IC50,uM,2.4
3,CHEMBL1932748,IC50,nM,2421.0
4,CHEMBL2320919,IC50,uM,2.38
5,CHEMBL3993804,IC50,nM,2400.0
6,CHEMBL5444475,IC50,nM,2400.0


In [3]:
instance.fetch_single(
    query={"target_chembl_id": "CHEMBL1824"}, 
    method="binding_site",
    pages_to_fetch=1, 
    parse=True,
    to_dataframe=True
)

Cache_key: CHEMBL1824_binding_site
Fetching page: https://www.ebi.ac.uk/chembl/api/data/binding_site?target_chembl_id=CHEMBL1824&format=json for method binding_site with pages_to_fetch=1


Unnamed: 0,site_components,site_id,site_name
0,"[{'component_id': 4909, 'domain': {'domain_des...",2,"UDP-glucuronosyltransferase 1-10, UDPGT domain"
1,"[{'component_id': 5575, 'domain': {'domain_des...",3,"Mitogen-activated protein kinase 8, Pkinase do..."
2,"[{'component_id': 118, 'domain': {'domain_desc...",4,"Inosine-5'-monophosphate dehydrogenase 1, IMPD..."
3,"[{'component_id': 397, 'domain': {'domain_desc...",5,"Dopamine D1 receptor, 7tm_1 domain"
4,"[{'component_id': 218, 'domain': {'domain_desc...",6,"GABA transporter 1, SNF domain"
5,"[{'component_id': 1835, 'domain': {'domain_des...",7,"LDL-associated phospholipase A2, PAF-AH_p_II d..."
6,"[{'component_id': 2632, 'domain': {'domain_des...",8,"Purinergic receptor P2Y1, 7tm_1 domain"
7,"[{'component_id': 873, 'domain': {'domain_desc...",9,"3-phosphoinositide dependent protein kinase-1,..."
8,"[{'component_id': 970, 'domain': {'domain_desc...",10,"Tyrosine-protein kinase CSK, Pkinase_Tyr domain"
9,"[{'component_id': 3106, 'domain': {'domain_des...",11,"Carbonic anhydrase VA, Carb_anhydrase domain"


In [36]:
instance.get_dummy()

Cache_key: CHEMBL1824_activity


{'action_type': 'NoneType',
 'activity_comment': 'NoneType',
 'activity_id': 'int',
 'activity_properties': 'list',
 'assay_chembl_id': 'str',
 'assay_description': 'str',
 'assay_type': 'str',
 'assay_variant_accession': 'NoneType',
 'assay_variant_mutation': 'NoneType',
 'bao_endpoint': 'str',
 'bao_format': 'str',
 'bao_label': 'str',
 'canonical_smiles': 'str',
 'data_validity_comment': 'NoneType',
 'data_validity_description': 'NoneType',
 'document_chembl_id': 'str',
 'document_journal': 'str',
 'document_year': 'int',
 'ligand_efficiency': 'NoneType',
 'molecule_chembl_id': 'str',
 'molecule_pref_name': 'NoneType',
 'parent_molecule_chembl_id': 'str',
 'pchembl_value': 'str',
 'potential_duplicate': 'int',
 'qudt_units': 'str',
 'record_id': 'int',
 'relation': 'str',
 'src_id': 'int',
 'standard_flag': 'int',
 'standard_relation': 'str',
 'standard_text_value': 'NoneType',
 'standard_type': 'str',
 'standard_units': 'str',
 'standard_upper_value': 'NoneType',
 'standard_value':

## Chebi

In [1]:
from src.chebi import ChEBIInterface

In [2]:
instance = ChEBIInterface()

In [3]:
instance.fetch_single(
    query=["CHEBI:18357","CHEBI:29033"], 
    method="compounds", 
    parse=True,
    to_dataframe=True
)

Cache_key: ['CHEBI:18357', 'CHEBI:29033']_compounds
Prepared url: https://www.ebi.ac.uk/chebi/beta/api/public/compounds/?chebi_ids=CHEBI%3A18357%2CCHEBI%3A29033


Unnamed: 0,id
0,CHEBI:18357
1,CHEBI:29033


In [4]:
instance.fetch_batch(
    queries=[
        {"chebi_ids": ["CHEBI:18357", "CHEBI:29033"]}
    ],
    method="compounds",
    parse=True,
    to_dataframe=True
)

Cache_key: {"chebi_ids": ["CHEBI:18357", "CHEBI:29033"]}_compounds
Cache_key: ['CHEBI:18357', 'CHEBI:29033']_compounds


Unnamed: 0,id
0,CHEBI:18357
1,CHEBI:29033


In [5]:
instance.fetch_batch(
    queries=[
        {"term": "paracetamol"}
    ],
    method="es_search",
    parse=True,
    to_dataframe=True
)

Cache_key: {"term": "paracetamol"}_es_search
Cache_key: paracetamol_es_search


Unnamed: 0,_index,_type,_id,_score,_source
0,chebi-prod-compounds-16042025,_doc,46195,66.14483,"{'chebi_accession': 'CHEBI:46195', 'name': 'pa..."
1,chebi-prod-compounds-16042025,_doc,74529,58.1175,"{'chebi_accession': 'CHEBI:74529', 'name': 'an..."
2,chebi-prod-compounds-16042025,_doc,32635,32.58668,"{'chebi_accession': 'CHEBI:32635', 'name': 'pa..."
3,chebi-prod-compounds-16042025,_doc,32636,31.397276,"{'chebi_accession': 'CHEBI:32636', 'name': 'ac..."
4,chebi-prod-compounds-16042025,_doc,139476,31.277231,"{'chebi_accession': 'CHEBI:139476', 'name': '3..."
5,chebi-prod-compounds-16042025,_doc,180460,29.612461,"{'chebi_accession': 'CHEBI:180460', 'name': 'P..."
6,chebi-prod-compounds-16042025,_doc,133066,26.95681,"{'chebi_accession': 'CHEBI:133066', 'name': '<..."
7,chebi-prod-compounds-16042025,_doc,133067,25.853018,"{'chebi_accession': 'CHEBI:133067', 'name': '<..."
8,chebi-prod-compounds-16042025,_doc,133435,20.886192,"{'chebi_accession': 'CHEBI:133435', 'name': '<..."
9,chebi-prod-compounds-16042025,_doc,133438,20.886192,"{'chebi_accession': 'CHEBI:133438', 'name': '<..."


In [7]:
instance.fetch_batch(
    queries=[
        {"chebi_id": "CHEBI:18357"}
    ],
    method="ontology-children",
    parse=True,
    to_dataframe=True
)

Cache_key: {"chebi_id": "CHEBI:18357"}_ontology-children
Cache_key: CHEBI:18357_ontology-children
Prepared url: https://www.ebi.ac.uk/chebi/beta/api/public/ontology/children/CHEBI%3A18357


Unnamed: 0,id,chebi_accession,ontology_relations
0,18357,CHEBI:18357,"{'incoming_relations': [{'init_id': 167178, 'i..."


In [5]:
instance.fetch_batch(
    queries=[
        {"chebi_id": "CHEBI:18357"}
    ],
    method="ontology-parents",
    parse=True,
    to_dataframe=True
)

Cache_key: {"chebi_id": "CHEBI:18357"}_ontology-parents
Cache_key: CHEBI:18357_ontology-parents
Prepared url: https://www.ebi.ac.uk/chebi/beta/api/public/ontology/parents/CHEBI%3A18357


Unnamed: 0,id,chebi_accession,ontology_relations
0,18357,CHEBI:18357,"{'outgoing_relations': [{'init_id': 18357, 'in..."


## Gen Ontology

In [43]:
from src.genontology import GenOntologyInterface

In [44]:
instance = GenOntologyInterface(
    #fields_to_extract = ["goid", "label"]
)

In [45]:
instance.get_dummy()

Cache_key: GO:0008150_ontology-term_default
Prepared request: https://api.geneontology.org/api/ontology/term/GO%3A0008150
Cache_key: GO:0008150_ontology-term_graph
Prepared request: https://api.geneontology.org/api/ontology/term/GO%3A0008150/graph
Cache_key: GO:0008150_go_default
Prepared request: https://api.geneontology.org/api/go/GO%3A0008150
Cache_key: GO:0008150_bioentity-function_default
Prepared request: https://api.geneontology.org/api/bioentity/function/GO%3A0008150


{'ontology-term_default': {'goid': 'str',
  'label': 'str',
  'definition': 'str',
  'comment': 'str',
  'creation_date': 'str',
  'synonyms': 'list(str)',
  'relatedSynonyms': 'list(str)',
  'alternativeIds': 'list(str)',
  'xrefs': 'list(str)',
  'subsets': 'list(str)'},
 'ontology-term_graph': {'topology_graph_json.nodes.id': 'dict(str)',
  'topology_graph_json.nodes.lbl': 'dict(str)',
  'topology_graph_json.edges.sub': 'dict(str)',
  'topology_graph_json.edges.obj': 'dict(str)',
  'topology_graph_json.edges.pred': 'dict(str)',
  'topology_graph_json.meta': 'dict(dict)',
  'topology_graph_json': 'dict'},
 'go_default': {'goid': 'str',
  'label': 'str',
  'definition': 'str',
  'comment': 'str',
  'creation_date': 'str',
  'synonyms': 'list(str)',
  'relatedSynonyms': 'list(str)',
  'alternativeIds': 'list(str)',
  'xrefs': 'list(str)',
  'subsets': 'list(str)'},
 'bioentity-function_default': {'bioentity_label': 'str',
  'bioentity_name': 'str',
  'date': 'str',
  'assigned_by': 'st

In [46]:
instance.fetch_single(
    method="bioentity-function",
    query="GO:0006915",
    parse=True,
    to_dataframe=True
)

Cache_key: GO:0006915_bioentity-function_default
Prepared request: https://api.geneontology.org/api/bioentity/function/GO%3A0006915


Unnamed: 0,label,name,tax_id,organism,evidence_type,evidence,synonym
0,Lhx4,LIM homeobox protein 4,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,"[A330062J17Rik, Gsh-4, Gsh4]"
1,Cflar,CASP8 and FADD-like apoptosis regulator,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,"[2310024N18Rik, A430105C05Rik, Cash, Casper, F..."
2,Aipl1,aryl hydrocarbon receptor-interacting protein-...,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,A930007I01Rik
3,Gli3,GLI-Kruppel family member GLI3,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,"[Bph, brachyphalangy]"
4,Aldh1a3,"aldehyde dehydrogenase family 1, subfamily A3",NCBITaxon:10090,Mus musculus,IGI,ECO:0000316,"[ALDH6, RALDH3, V1, retinaldehyde dehydrogenas..."
...,...,...,...,...,...,...,...
95,Nf1,neurofibromin 1,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,"[Dsk9, Mhdadsk9, Nf-1, neurofibromin]"
96,Pim2,proviral integration site 2,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,"[DXCch3, Pim-2]"
97,Palb2,partner and localizer of BRCA2,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,
98,Syce3,synaptonemal complex central element protein 3,NCBITaxon:10090,Mus musculus,IDA,ECO:0000314,1700007E06Rik


In [47]:
instance.fetch_batch(
    method="bioentity-function",
    queries=["GO:0008150", "GO:0006915"],
    parse=True,
    to_dataframe=True
)

Cache_key: GO:0008150_bioentity-function_default
Cache_key: GO:0006915_bioentity-function_default


Unnamed: 0,label,name,tax_id,organism,evidence_type,evidence,synonym
0,Rapgef4os3,Rap guanine nucleotide exchange factor (GEF) 4...,NCBITaxon:10090,Mus musculus,ND,ECO:0000307,4732447D17Rik
1,Nxpe3,"neurexophilin and PC-esterase domain family, m...",NCBITaxon:10090,Mus musculus,ND,ECO:0000307,"[Fam55c, LOC208684, LOC385658]"
2,Lsm14b,LSM family member 14B,NCBITaxon:10090,Mus musculus,ND,ECO:0000307,
3,BC039771,cDNA sequence BC039771,NCBITaxon:10090,Mus musculus,ND,ECO:0000307,
4,Gm14661,predicted gene 14661,NCBITaxon:10090,Mus musculus,ND,ECO:0000307,ENSMUSG00000067835
...,...,...,...,...,...,...,...
195,Nf1,neurofibromin 1,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,"[Dsk9, Mhdadsk9, Nf-1, neurofibromin]"
196,Pim2,proviral integration site 2,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,"[DXCch3, Pim-2]"
197,Palb2,partner and localizer of BRCA2,NCBITaxon:10090,Mus musculus,IMP,ECO:0000315,
198,Syce3,synaptonemal complex central element protein 3,NCBITaxon:10090,Mus musculus,IDA,ECO:0000314,1700007E06Rik


In [48]:
instance.fetch_batch(
    method="ontology-term",
    queries=["GO:0008150", "GO:0006915"],
    parse=True,
    to_dataframe=True
)

Cache_key: GO:0008150_ontology-term_default
Cache_key: GO:0006915_ontology-term_default
Cache_key: GO:0006915_ontology-term_default
Prepared request: https://api.geneontology.org/api/ontology/term/GO%3A0006915


Unnamed: 0,go_term,label,definition
0,GO:0008150,biological_process,A biological process is the execution of a gen...
1,GO:0006915,apoptotic process,A programmed cell death process which begins w...


## Interpro

In [49]:
from src.interpro import InterproInstance
from typing import List, Dict, Any, Union

In [50]:
instance = InterproInstance()

### Search Interpro Id IPR036669

In [51]:
query = {
    "id": "IPR002223",
    "db" : "InterPro",
}

### Search uniProt id Q29537

In [5]:
query = {
    "db" : "InterPro",
    "modifiers" : {},
    "filters" : [
        {
            "type": "protein",
            "db": "reviewed",
            "value": "Q29537"
        }
    ]
}

In [7]:
query = {
    "db" : "InterPro",
    "modifiers" : {},
    "filters" : [
        {
            "type": "protein",
            "db": "reviewed",
            "value": "P05067"
        }
    ]
}

In [52]:
instance.fetch_single(query=query, method="entry", pages_to_fetch=1, parse=True, to_dataframe=True)

Cache_key: IPR002223_InterPro_entry
Fetching data from InterPro API with URL: https://www.ebi.ac.uk:443/interpro/api/entry/InterPro/IPR002223/


Unnamed: 0,accession,length,organism_id,protein_source_db,in_alphafold,interpro_accession,name,type,source_database,integrated,...,prosite,smart,prints,panther,cathgene3d,ssf,go_identifier,go_name,go_category,locations
0,,,,,,IPR002223,{'name': 'Pancreatic trypsin inhibitor Kunitz ...,domain,interpro,,...,,{'SM00131': 'BPTI/Kunitz family of serine prot...,{'PR00759': 'BASICPTASE'},,,,GO:0004867,serine-type endopeptidase inhibitor activity,molecular_function,


### Search in Batches

In [53]:
queries: List[Union[str, Dict[str, Any]]] = [
    {
        "db" : "InterPro",
        "modifiers" : {},
        "filters" : [
            {
                "type": "protein",
                "db": "reviewed",
                "value": "Q29537"
            }
        ]
    },
    {
        "db": "InterPro",
        "modifiers": {},
        "filters": [
            {
                "type": "protein",
                "db": "reviewed",
                "value": "P05067"
            }
        ]
    }
]
instance.fetch_batch(queries=queries, method="entry", pages_to_fetch=1, parse=True, to_dataframe=True)

Cache_key: {"db": "InterPro", "filters": [{"db": "reviewed", "type": "protein", "value": "Q29537"}], "modifiers": {}}_entry
Cache_key: {"db": "InterPro", "filters": [{"db": "reviewed", "type": "protein", "value": "P05067"}], "modifiers": {}}_entry
Cache_key: InterPro_{}_[{'type': 'protein', 'db': 'reviewed', 'value': 'Q29537'}]_entry
Fetching data from InterPro API with URL: https://www.ebi.ac.uk:443/interpro/api/entry/InterPro/protein/reviewed/Q29537/
Cache_key: InterPro_{}_[{'type': 'protein', 'db': 'reviewed', 'value': 'P05067'}]_entry
Fetching data from InterPro API with URL: https://www.ebi.ac.uk:443/interpro/api/entry/InterPro/protein/reviewed/P05067/


Unnamed: 0,accession,length,organism_id,protein_source_db,in_alphafold,interpro_accession,name,type,source_database,integrated,...,prosite,smart,prints,panther,cathgene3d,ssf,go_identifier,go_name,go_category,locations
0,q29537,381,9615,reviewed,True,IPR002117,p53 tumour suppressor family,family,interpro,,...,,,{'PR00386': 'P53SUPPRESSR'},{'PTHR11447': 'CELLULAR TUMOR ANTIGEN P53'},,,"[GO:0003677, GO:0003700, GO:0006355, GO:000691...","[DNA binding, DNA-binding transcription factor...","[molecular_function, molecular_function, biolo...","{'fragments': [{'start': 3, 'end': 354, 'dc-st..."
1,q29537,381,9615,reviewed,True,IPR008967,"p53-like transcription factor, DNA-binding dom...",homologous_superfamily,interpro,,...,,,,,,{'SSF49417': 'p53-like transcription factors'},"[GO:0003700, GO:0006355]","[DNA-binding transcription factor activity, re...","[molecular_function, biological_process]","{'fragments': [{'start': 84, 'end': 275, 'dc-s..."
2,q29537,381,9615,reviewed,True,IPR010991,"p53, tetramerisation domain",domain,interpro,,...,,,,,,,GO:0051262,protein tetramerization,biological_process,"{'fragments': [{'start': 307, 'end': 345, 'dc-..."
3,q29537,381,9615,reviewed,True,IPR011615,"p53, DNA-binding domain",domain,interpro,,...,,,,,,,GO:0000976,transcription cis-regulatory region binding,molecular_function,"{'fragments': [{'start': 88, 'end': 277, 'dc-s..."
4,q29537,381,9615,reviewed,True,IPR012346,"p53/RUNT-type transcription factor, DNA-bindin...",homologous_superfamily,interpro,,...,,,,,{'G3DSA:2.60.40.720': 'G3DSA:2.60.40.720'},,"[GO:0003677, GO:0003700, GO:0006355, GO:0005634]","[DNA binding, DNA-binding transcription factor...","[molecular_function, molecular_function, biolo...","{'fragments': [{'start': 82, 'end': 282, 'dc-s..."
5,q29537,381,9615,reviewed,True,IPR013872,"p53, transactivation domain",domain,interpro,,...,,,,,,,GO:0005515,protein binding,molecular_function,"{'fragments': [{'start': 6, 'end': 30, 'dc-sta..."
6,q29537,381,9615,reviewed,True,IPR036674,p53-like tetramerisation domain superfamily,homologous_superfamily,interpro,,...,,,,,{'G3DSA:4.10.170.10': 'p53-like tetramerisatio...,{'SSF47719': 'p53 tetramerization domain'},GO:0051262,protein tetramerization,biological_process,"{'fragments': [{'start': 307, 'end': 348, 'dc-..."
7,q29537,381,9615,reviewed,True,IPR057064,"p53, central conserved site",conserved_site,interpro,,...,{'PS00348': 'p53 family signature'},,,,,,,,,"{'fragments': [{'start': 225, 'end': 237, 'dc-..."
8,p05067,770,9606,reviewed,True,IPR002223,Pancreatic trypsin inhibitor Kunitz domain,domain,interpro,,...,,{'SM00131': 'BPTI/Kunitz family of serine prot...,{'PR00759': 'BASICPTASE'},,,,GO:0004867,serine-type endopeptidase inhibitor activity,molecular_function,"{'fragments': [{'start': 288, 'end': 342, 'dc-..."
9,p05067,770,9606,reviewed,True,IPR008154,"Amyloidogenic glycoprotein, extracellular",domain,interpro,,...,,{'SM00006': 'amyloid A4'},,,,,,,,"{'fragments': [{'start': 24, 'end': 189, 'dc-s..."


In [54]:
instance.fetch_to_dataframe(
    [
        {
            "type": "entry",
            "db": "InterPro",
            "entry_integration": "",
            "modifiers": {
                "go_term": "GO:0004867"
            },
            "filter_type": "protein",
            "filter_db": "UniProt",
            "filter_value": "P05067"
        }
    ]
)

AttributeError: 'InterproInstance' object has no attribute 'fetch_to_dataframe'

## KEGG

In [55]:
from src.kegg import KEGGInterface

In [56]:
instance = KEGGInterface()

In [57]:
instance.fetch_single(
    query={
        "entries": ["hsa:10458", "ece:Z5100"]
    },
    method="get",
    to_dataframe=True,
    parse=True,
    #fields_to_extract=["ENTRY", "NAME", "PATHWAY"]
)

Cache_key: hsa:10458_get
Cache_key: ece:Z5100_get
Cache_key: hsa:10458_get
Cache_key: ece:Z5100_get


Unnamed: 0,entry,name
0,10458 CDS T01001,(RefSeq) BAR/IMD domain containing adaptor pro...
1,Z5100 CDS T00044,(GenBank) espF


## PathwayCommons

In [9]:
from src.pathwaycommons import PathwayCommonsInterface

instance = PathwayCommonsInterface()

In [10]:
instance.fetch_single(
    query = {
        "uri": ["R-ALL-444824"],
        "pattern": ["interacts-with", "used-to-produce"]
    },
    method="fetch",
    parse=True,
    to_dataframe=True
)

Cache_key: ['R-ALL-444824']_['interacts-with', 'used-to-produce']_fetch
Prepared request: https://www.pathwaycommons.org/pc2/v2/fetch
Fetching data with parameters: {'uri': ['R-ALL-444824'], 'format': 'jsonld', 'pattern': ['interacts-with', 'used-to-produce'], 'subpw': False}


Unnamed: 0,id,name,db,cellular_location,chemical_formula,comment,component,component_stoichiometry,data_source,display_name,...,sequence_interval_begin,sequence_interval_end,sequence_position,standard_name,stoichiometric_coefficient,structure,structure_data,structure_format,term,xref
0,CHEBI:15551,,chebi,,,,,,,,...,,,,,,,,,,
1,CHEBI:15552,,chebi,,,,,,,,...,,,,,,,,,,
2,CHEBI:15555,,chebi,,,,,,,,...,,,,,,,,,,
3,CHEBI:16335,,chebi,,,,,,,,...,,,,,,,,,,
4,CHEBI:18397,,chebi,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822,Q9P1P5,,uniprot,,,,,,,,...,,,,,,,,,,
1823,Q9P2W3,,uniprot,,,,,,,,...,,,,,,,,,,
1824,Q9UBI6,,uniprot,,,,,,,,...,,,,,,,,,,
1825,Q9UK08,,uniprot,,,,,,,,...,,,,,,,,,,


In [None]:
instance.fetch_single(
    query = {
        "q": "APP",
        "organism": [
            "9606"
        ],
        "datasource": [
            "reactome",
            "uniprot"
        ]
    },
    method="top_pathways",
    parse=True,
    to_dataframe=True
)

Cache_key: APP_['9606']_['reactome', 'uniprot']_top_pathways
Prepared request: https://www.pathwaycommons.org/pc2/v2/top_pathways
Fetching data with parameters: {'q': 'APP', 'organism': ['9606'], 'datasource': ['reactome', 'uniprot']}


Unnamed: 0,uri,biopax_class,name,data_source,organism,pathway,excerpt,num_participants,num_processes
0,http://bioregistry.io/reactome:R-HSA-392499,Pathway,Metabolism of proteins,pc14:reactome,http://bioregistry.io/ncbitaxon:9606,[],,0,9
1,http://bioregistry.io/reactome:R-HSA-162582,Pathway,Signaling Pathways,pc14:reactome,http://bioregistry.io/ncbitaxon:9606,[],,0,17


In [11]:
instance.fetch_single(
    query = {
        "source": [
            "Q16602"
        ],
        "limit": 1,
        "format": "jsonld",
        "organism": [
            "9606"
        ],
        "datasource": [
            "reactome"
        ],
        "pattern": [
            "interacts-with",
            "used-to-produce"
        ],
        "subpw": True,
        "direction": "undirected"
    },
    method="neighborhood",
    parse=True,
    to_dataframe=True
)

Cache_key: ['Q16602']_1_jsonld_['9606']_['reactome']_['interacts-with', 'used-to-produce']_True_undirected_neighborhood
Prepared request: https://www.pathwaycommons.org/pc2/v2/neighborhood
Fetching data with parameters: {'source': ['Q16602'], 'limit': 1, 'format': 'jsonld', 'organism': ['9606'], 'datasource': ['reactome'], 'pattern': ['interacts-with', 'used-to-produce'], 'subpw': True, 'direction': 'undirected'}


Unnamed: 0,id,name,db,cellular_location,chemical_formula,comment,component,component_stoichiometry,control_type,controlled,...,standard_name,stoichiometric_coefficient,structure,structure_data,structure_format,term,title,url,xref,year
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,CHEBI:10100,,chebi,,,,,,,,...,,,,,,,,,,
4,CHEBI:15354,,chebi,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6454,Q9UNW8,,uniprot,,,,,,,,...,,,,,,,,,,
6455,Q9Y271,,uniprot,,,,,,,,...,,,,,,,,,,
6456,Q9Y287,,uniprot,,,,,,,,...,,,,,,,,,,
6457,Q9Y5X5,,uniprot,,,,,,,,...,,,,,,,,,,


## PANTHER

In [62]:
from src.panther import PantherInterface

In [63]:
instance = PantherInterface()

In [64]:
instance.fetch_single(
    query={
        "family": "PTHR10000"
    },
    method="familyortholog",
    parse=True,
    to_dataframe=True
)

Cache_key: PTHR10000_familyortholog
Inputs for method 'familyortholog': {'family': 'PTHR10000'}
Prepared request: https://pantherdb.org/services/oai/pantherdb/familyortholog?family=PTHR10000


Unnamed: 0,pid,sid
0,PTN000796677,PTN008492220
1,PTN000796677,PTN008492221
2,PTN000796677,PTN000000088
3,PTN000796677,PTN000796679
4,PTN000796677,PTN001087724
...,...,...
3652,PTN001597048,PTN007416238
3653,PTN001597048,PTN001596985
3654,PTN001597047,PTN007416238
3655,PTN001597047,PTN001596985


In [65]:
instance.fetch_single(
    query="PTHR10000",
    method="familymsa",
    parse=True,
    to_dataframe=True
)

Cache_key: PTHR10000_familymsa
Inputs for method 'familymsa': {'family': 'PTHR10000'}
Prepared request: https://pantherdb.org/services/oai/pantherdb/familymsa?family=PTHR10000


Unnamed: 0,sequence,persistent_id
0,ml...............................................,PTN007416230
1,.................................................,PTN001597030
2,.................................................,PTN001597042
3,.................................................,PTN000000095
4,ml.................xx..............xxnxs.........,PTN004118870
...,...,...
181,.................................................,PTN001597027
182,.................................................,PTN001087692
183,.................................................,PTN001087691
184,.................................................,PTN002558009


In [66]:
instance.fetch_single(
    query={
        "geneInputList": ["BRCA1", "CIROP"],
        "organism": "9606"
    },
    method="geneinfo",
    parse=True,
    to_dataframe=True,
)

Cache_key: BRCA1_9606_geneinfo
Cache_key: CIROP_9606_geneinfo
Inputs for method 'geneinfo': {'geneInputList': 'BRCA1,CIROP', 'organism': '9606'}
Prepared request: https://pantherdb.org/services/oai/pantherdb/geneinfo?geneInputList=BRCA1%2CCIROP&organism=9606
Cache_key: BRCA1_9606_geneinfo
Cache_key: CIROP_9606_geneinfo


Unnamed: 0,id
0,PTHR13763
1,PTHR10942


In [67]:
instance.fetch_batch(
    queries=[{
        "geneInputList": ["BRCA1", "CIROP"],
        "organism": "9606"
    }],
    method="geneinfo",
    parse=True,
    to_dataframe=True,
)

Cache_key: BRCA1_9606_geneinfo
Cache_key: CIROP_9606_geneinfo


Unnamed: 0,id
0,PTHR13763
1,PTHR10942


## PDB

In [68]:
from src.proteindatabank import PDBInterface

In [69]:
instance = PDBInterface(
    download_structures=True,
    return_data_list=["rcsb_id", "rcsb_comp_model_provenance", "rcsb_entry_info"],
    output_dir="results"
)

In [70]:
instance.fetch_single(query="4HHB", method="entry", parse=True, to_dataframe=True)

Info: Structure for 4HHB already exists in pdb format.
Cache_key: 4HHB_entry
Prepared request: https://data.rcsb.org/rest/v1/core/entry/4HHB


Unnamed: 0,rcsb_id,model_provenance,branched_molecular_weight_minimum,resolution_combined,experimental_method,diffrn_resolution_high
0,4HHB,,,1.74,X-ray,1.74


In [71]:
instance.fetch_batch(
    queries=["4HHB", "1A8I", "1A8J", "1A8K", "1A8L", "1A8M"],
    method="entry",
    parse=True,
    to_dataframe=True
)

Cache_key: 4HHB_entry
Cache_key: 1A8I_entry
Cache_key: 1A8J_entry
Cache_key: 1A8K_entry
Cache_key: 1A8L_entry
Cache_key: 1A8M_entry
Info: Structure for 1A8I already exists in pdb format.
Cache_key: 1A8I_entry
Prepared request: https://data.rcsb.org/rest/v1/core/entry/1A8I
Info: Structure for 1A8J already exists in pdb format.
Cache_key: 1A8J_entry
Prepared request: https://data.rcsb.org/rest/v1/core/entry/1A8J
Info: Structure for 1A8K already exists in pdb format.
Cache_key: 1A8K_entry
Prepared request: https://data.rcsb.org/rest/v1/core/entry/1A8K
Info: Structure for 1A8L already exists in pdb format.
Cache_key: 1A8L_entry
Info: Structure for 1A8M already exists in pdb format.
Cache_key: 1A8M_entry
Prepared request: https://data.rcsb.org/rest/v1/core/entry/1A8L
Prepared request: https://data.rcsb.org/rest/v1/core/entry/1A8M
Info: Structure for 4HHB already exists in pdb format.
Info: Structure for 1A8I already exists in pdb format.
Info: Structure for 1A8J already exists in pdb format

Unnamed: 0,rcsb_id,model_provenance,branched_molecular_weight_minimum,resolution_combined,experimental_method,diffrn_resolution_high
0,4HHB,,,1.74,X-ray,1.74
1,1A8I,,,1.78,X-ray,1.78
2,1A8J,,,2.7,X-ray,2.7
3,1A8K,,,2.0,X-ray,2.0
4,1A8L,,,1.9,X-ray,1.9
5,1A8M,,,2.3,X-ray,2.3


In [5]:
instance.get_dummy()

Info: Structure for 4HHB already exists in pdb format.
Cache_key: 4HHB_entry


{'audit_author.name': 'str',
 'audit_author.pdbx_ordinal': 'int',
 'cell.angle_alpha': 'dict(float)',
 'cell.angle_beta': 'dict(float)',
 'cell.angle_gamma': 'dict(float)',
 'cell.length_a': 'dict(float)',
 'cell.length_b': 'dict(float)',
 'cell.length_c': 'dict(float)',
 'cell.zpdb': 'dict(int)',
 'cell': 'dict',
 'citation.country': 'str',
 'citation.id': 'str',
 'citation.journal_abbrev': 'str',
 'citation.journal_id_astm': 'str',
 'citation.journal_id_csd': 'str',
 'citation.journal_id_issn': 'str',
 'citation.journal_volume': 'str',
 'citation.page_first': 'str',
 'citation.page_last': 'str',
 'citation.pdbx_database_id_doi': 'str',
 'citation.pdbx_database_id_pub_med': 'int',
 'citation.rcsb_authors': 'list(str)',
 'citation.rcsb_is_primary': 'str',
 'citation.rcsb_journal_abbrev': 'str',
 'citation.title': 'str',
 'citation.year': 'int',
 'database2.database_code': 'str',
 'database2.database_id': 'str',
 'database2.pdbx_doi': 'str',
 'database2.pdbx_database_accession': 'str',


In [72]:
instance.fetch_single(
    query="4HHB",
    method="entry",
    parse=True
)

Info: Structure for 4HHB already exists in pdb format.
Cache_key: 4HHB_entry


{'rcsb_id': '4HHB',
 'model_provenance': None,
 'branched_molecular_weight_minimum': None,
 'resolution_combined': 1.74,
 'experimental_method': 'X-ray',
 'diffrn_resolution_high': 1.74}

## Pride

In [73]:
from src.pride import PrideInterface

In [74]:
instance = PrideInterface()

In [75]:
instance.fetch_single(
    query={
        "keyword": "cancer",
    },
    method="search",
    option="projects",
    parse=True,
    to_dataframe=True
)

Cache_key: cancer_search_projects
Prepared request: https://www.ebi.ac.uk/pride/ws/archive/v3/search/projects?keyword=cancer&page=0&sortDirection=DESC&sortFields=submissionDate
Fetching data with parameters: {'keyword': 'cancer', 'page': 0, 'sortDirection': 'DESC', 'sortFields': 'submissionDate'}


Unnamed: 0,accession,title,additional_attributes,project_description,sample_processing_protocol,data_processing_protocol,project_tags,keywords,doi,submission_type,...,experiment_types,quantification_methods,countries,sample_attributes,organisms,organism_parts,diseases,references,identified_ptm_strings,total_file_downloads
0,PXD066683,New proteomic signature in circulating extrace...,,The identification of noninvasive prognostic b...,Sample preparation Total protein lysates were ...,MS data processing. Proteins were identified w...,Human proteome project,"[Tumor-draining vein, Proteomic, Extracellular...",,PARTIAL,...,Bottom-up proteomics,Relative quantification,,"[blood cell, lung cancer, blood plasma, Homo s...",Homo sapiens (human),,Lung cancer,[],,
1,PXD066623,Exon inclusion signatures enable accurate esti...,,Splicing factors control exon inclusion in mes...,Cell culture BJ fibroblasts from the Danielsso...,Analysis of mass spectra Acquired spectra were...,[],Cancer; alternative splicing; splicing factor;...,,PARTIAL,...,Data-independent acquisition,label-free quantification,,"[fibroblast, cell culture, Homo sapiens (Human...","[Homo sapiens (human), Characteristics[organism]]",,"[Carcinoma, Characteristics[disease]]","Anglada-Girotto M, Moakley DF, Zhang C, Mirave...",,
2,PXD066297,Genome-scale CRISPR screens identify PTGES3 as...,,The androgen receptor (AR) is a critical drive...,Approximately 10 million cells were lyesed usi...,The MS/MS raw data (.raw files) were processed...,[],"[Ar, Crispr screens, Ptges3, Prostate cancer]",,PARTIAL,...,Bottom-up proteomics,[],,"[cell culture, Homo sapiens (Human)]",Homo sapiens (human),,[],[],,
3,PXD066316,Design of combination therapeutics from protei...,,High-grade serous ovarian cancer (HGSOC) remai...,Sample processing To harvest cells for mass sp...,The protein expression matrix contained protei...,[],"[Cell culture, Drug pertubation, Hgsoc, Ovaria...",,PARTIAL,...,"[diaPASEF, Bottom-up proteomics]","[label-free quantification, Relative quantific...",,"[malignant neoplasm of ovary, cell culture, Ho...",Homo sapiens (human),,Malignant neoplasm of ovary,[],,
4,PXD066069,Radiation induced extracellular matrix changes...,,Muscle-invasive bladder cancer (MIBC) is a pre...,cells were grown for 7 days in normal tissue c...,MS analysis was performed following previously...,[],"[Radiation, Extracellular matrix, In vitro, Bl...",,PARTIAL,...,Gel-based experiment,Normalized Spectral Abundance Factor - NSAF,,"[cell culture, Homo sapiens (Human), permanent...",Homo sapiens (human),,Urinary bladder cancer,null--pubMed:0--doi: 10.3389/FONC.2025.1616943,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,PXD061698,Astral-DIA Proteomics Deciphers the Molecular ...,,Male infertility is fundamentally rooted in de...,"Leveraging the principles of DIA technology, w...",This study leverages state-of-the-art proteomi...,[],Astral-dia; proteomics deciphers; molecular la...,,PARTIAL,...,Data-independent acquisition,[],,"[Homo sapiens (Human), spermatozoon]",Homo sapiens (human),,[],"Perez-Riverol Y, Bandla C, Kundu DJ, Kamatchin...",,
96,PXD061725,Astral-DIA Proteomics Deciphers the Molecular ...,,Male infertility is fundamentally rooted in de...,To analyze the aberrant regulatory events in h...,"Leveraging the principles of DIA technology, w...",[],Astral-dia; proteomics; human spermatozoa,,PARTIAL,...,Bottom-up proteomics,[],,"[Homo sapiens (Human), spermatozoon]",Homo sapiens (human),,[],"Perez-Riverol Y, Bandla C, Kundu DJ, Kamatchin...",,
97,PXD061710,Overcoming preservation challenges to enable s...,,Recent advances in single-cell proteomics (SCP...,Single cell seeding and processing was perform...,Spectronaut 18 was used for data analysis sear...,[],"[Human, Scp, Mouse, Lc-ms/ms, Pancreas]",,PARTIAL,...,"[Data-independent acquisition, Bottom-up prote...",label-free quantification,,"[cell culture, Homo sapiens (Human), pancreas,...","[Homo sapiens (human), Mus musculus (mouse)]",,Cancer,[],,
98,PXD061696,Recombinant Protein Spectral Library (rPSL) DI...,,Data-independent acquisition mass spectrometry...,Frozen tissues were cryogenically homogenized ...,DIA data processing using DIA-NN: Raw DIA-MS d...,[],"[Snap frozen tissues, Rpsl, Cancer- associated...",,PARTIAL,...,"[Data-dependent acquisition, Data-independent ...",Relative quantification,,"[colorectal cancer cell, Homo sapiens (Human),...",Homo sapiens (human),,"[Colon cancer, Breast cancer]","Krishnamurthy S, Gunasegaran B, Paul-Heng M, M...",,


In [76]:
instance.fetch_single(
    query={
        "projectAccession": "PXD066763",
    },
    method="projects",
    parse=True,
    to_dataframe=True
)

Cache_key: PXD066763_projects_default
Prepared request: https://www.ebi.ac.uk/pride/ws/archive/v3/projects/PXD066763
Fetching data with parameters: {}


Unnamed: 0,accession,title,additional_attributes,project_description,sample_processing_protocol,data_processing_protocol,project_tags,keywords,doi,submission_type,...,experiment_types,quantification_methods,countries,sample_attributes,organisms,organism_parts,diseases,references,identified_ptm_strings,total_file_downloads
0,PXD066763,Stereo-random Oligonucleotides Enable Efficien...,[],We used our previously published isASO-ID prot...,Proximity biotinylation experiment. Cells were...,MS spectra were processed with MaxQuant softwa...,[],"[Antisense oligonucleotide, Basu, Drug protein...",,PARTIAL,...,"{'@type': 'CvParam', 'cvLabel': 'PRIDE', 'acce...","{'@type': 'CvParam', 'cvLabel': 'PRIDE', 'acce...",Germany,"[{'@type': 'Tuple', 'key': {'cvLabel': 'EFO', ...","{'@type': 'CvParam', 'cvLabel': 'NEWT', 'acces...","{'@type': 'CvParam', 'cvLabel': 'CL', 'accessi...",[],[],"[{'@type': 'CvParam', 'cvLabel': 'MOD', 'acces...",0


In [77]:
instance.fetch_single(
    query={
        "accession": "PXD066763",
    },
    method="projects",
    option="similarProjects",
    parse=True,
    to_dataframe=True
)

Cache_key: PXD066763_projects_similarProjects
Prepared request: https://www.ebi.ac.uk/pride/ws/archive/v3/projects/PXD066763/similarProjects?page=0&pageSize=10
Fetching data with parameters: {'page': 0, 'pageSize': 10}


Unnamed: 0,accession,affiliations,avg_wonloads_per_file,data_processing_protocol,diseases,doi,download_count,experiment_types,instruments,keywords,...,sample_attributes,sample_processing_protocol,sdrf,softwares,submission_date,submission_type,submitters,title,updated_date,yearly_downloads
0,PXD045992,"Interfaculty Institute of Biochemistry, Univer...",3.333333,MS spectra were processed with MaxQuant softwa...,[],,300,Affinity purification coupled with mass spectr...,Q Exactive HF,"[Live cell, Bioid, Proximity biotinylation, Be...",...,"[Homo sapiens (Human), cervical cancer cell li...",Proximity biotinylation experiment. Hek293T Fl...,,,2023-10-09,PARTIAL,Mirita Franz-Wachtel,Profiling the interactome of oligonucleotide d...,2023-10-09,"[{'year': '2024', 'count': 79}, {'year': '2025..."
1,PXD013215,Interfaculty Institute of Biochemistry Univers...,37.46154,Spectra were processed with MaxQuant software ...,[],,487,[Affinity purification coupled with mass spect...,LTQ Orbitrap Elite,"[Budding yeast yer156c interactome, Chromatin,...",...,Saccharomyces cerevisiae (Baker's yeast),For SILAC APEX2 experiments the mitochondrial ...,,MaxQuant,2019-03-25,PARTIAL,Nicolas Nalpas,APEX2-mediated proximity labeling of mitochond...,2019-03-25,"[{'year': '2021', 'count': 84}, {'year': '2022..."
2,PXD011594,Center for Plant Molecular Biology (ZMBP) Univ...,22.2,Spectra were processed with MaxQuant software ...,[],,111,Affinity purification coupled with mass spectr...,LTQ Orbitrap XL,"[Membrane traffic, Arf gtpases, Arabidopsis, A...",...,"[Arabidopsis thaliana (Mouse-ear cress), seedl...",Protein extracts from transgenic seedlings exp...,,MaxQuant,2018-11-07,PARTIAL,Nicolas Nalpas,Arabidopsis ARF-ARFGEF interaction analyzed by...,2018-11-07,"[{'year': '2021', 'count': 48}, {'year': '2022..."
3,PXD032366,Faculty of Biology & Biotechnology Ruhr-Univer...,1.473684,MS spectra were processed with MaxQuant softwa...,[],,28,Affinity purification coupled with mass spectr...,Q Exactive,"[Lc-msms, Bacterial effectors, Plants]",...,"[Nicotiana benthamiana, leaf]",Proteins were purified by SDS-PAGE (short run)...,,"[Andromeda, MaxQuant]",2022-03-17,PARTIAL,Mirita Franz-Wachtel,Characterization of Xanthomonas effector prote...,2022-03-17,"[{'year': '2023', 'count': 17}, {'year': '2024..."
4,PXD010694,University of Tuebingen - Interfaculty Institu...,24.1,Acquired MS spectra were processed with MaxQua...,[],,241,Affinity purification coupled with mass spectr...,LTQ Orbitrap Elite,"[Rna localization, Bioid, Mouse, Rbps]",...,"[embryonic fibroblast, Mus musculus (Mouse)]","For RNA-BioID, cells were incubated with 50 µM...",,"[Andromeda, MaxQuant]",2018-08-06,PARTIAL,Mirita Franz-Wachtel,β-actin RNA-BioID,2018-08-06,"[{'year': '2021', 'count': 40}, {'year': '2022..."
5,PXD031610,"University of Tuebingen, Interfaculty Institut...",14.25,MS spectra were processed with MaxQuant softwa...,[],,57,Affinity purification coupled with mass spectr...,Q Exactive,"[Mitochondria, Yeast, Signal-anchored proteins...",...,"[cell culture, Saccharomyces cerevisiae (Baker...",Pull-down assays were performed with extract o...,,"[Andromeda, MaxQuant]",2022-02-11,PARTIAL,Mirita Franz-Wachtel,Searching for the interaction partners in yeas...,2022-02-11,"[{'year': '2022', 'count': 12}, {'year': '2023..."
6,PXD045484,University of Halle-Wittenberg Institute for B...,10.615385,The MS data were processed with MaxQuant softw...,[],,138,Affinity purification coupled with mass spectr...,Q Exactive HF,[Comprehensive isolation by rna binding protei...,...,"[plant cell, Arabidopsis thaliana (Mouse-ear c...",LC-MS/MS analyses of eluted samples were perfo...,,,2023-09-19,PARTIAL,Irina Droste-Borel,Identification of U1 snRNA associated proteins,2023-09-19,"[{'year': '2024', 'count': 116}, {'year': '202..."
7,PXD033585,Faculty of Biology & Biotechnology Ruhr-Univer...,1.2,MS spectra were processed with MaxQuant softwa...,[],,24,Affinity purification coupled with mass spectr...,Q Exactive,"[Lc-msms, Plants]",...,"[Nicotiana benthamiana, leaf]",: Proteins were purified by SDS-PAGE (short ru...,,"[Andromeda, MaxQuant]",2022-05-04,PARTIAL,Mirita Franz-Wachtel,Functional characterization of SH3P2 from Arab...,2022-05-04,"[{'year': '2022', 'count': 5}, {'year': '2023'..."
8,PXD005597,Junior Research Group Infection Biology of Sal...,3.714286,The MS data were processed with MaxQuant softw...,[],,104,Affinity purification coupled with mass spectr...,LTQ Orbitrap,"[T3ss, Flip, Flio, Export gate complex, Assembly]",...,Salmonella enterica subsp. enterica serovar Ty...,LC-MS/MS analyses were performed on an Easy na...,,MaxQuant,2016-12-20,PARTIAL,Nicolas Nalpas,FliP subcomplex analysis - A flagellum-specif...,2016-12-20,"[{'year': '2021', 'count': 52}, {'year': '2022..."
9,PXD036942,"University Tübingen, Center for Plant Molecula...",14.523809,Acquired MS spectra were processed with MaxQua...,[],,305,Affinity purification coupled with mass spectr...,Orbitrap Exploris 480,"[Pti, Arabidopsis, Cell death, Bir3, Plant imm...",...,"[Arabidopsis thaliana (Mouse-ear cress), leaf]",35S-BIR3-YFP expressing plants were used to im...,,"[Andromeda, MaxQuant]",2022-09-22,PARTIAL,Mirita Franz-Wachtel,Interactome of Arabidopsis BIR3 protein,2022-09-22,"[{'year': '2022', 'count': 5}, {'year': '2023'..."


## Pubchem

In [6]:
from src.pubchem import PubChemInterface

In [7]:
instance = PubChemInterface()

In [80]:
instance.fetch_single(
    query={
        "genesymbol": ["APP", "ECD"],
    },
    method="gene",
    option="summary",
    parse=True,
    to_dataframe=True
)

Cache_key: APP_gene_summary
Cache_key: ECD_gene_summary
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/gene/genesymbol/APP,ECD/summary/json
Cache_key: APP_gene_summary
Cache_key: ECD_gene_summary


Unnamed: 0,gene_id,symbol,name,taxonomy_id,taxonomy,description,synonym
0,351,APP,amyloid beta precursor protein,9606,Homo sapiens (human),This gene encodes a cell surface receptor and ...,"[AAA, ABETA, ABPP, AD1, APPI, CTFgamma, CVAP, ..."
1,11319,ECD,ecdysoneless cell cycle regulator,9606,Homo sapiens (human),Enables histone acetyltransferase binding acti...,"[GCR2, HSGT1, SGT1, protein ecdysoneless homol..."


In [81]:
# For more properties check the documentation: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest#section=Compound-Property-Tables
instance.fetch_single(
    query={
        #"cid": ["2244", "2245"],
        #"name": "aspirin",
        "smiles": "CC(=O)OC1=CC=CC=C1C(=O)O",
        "property": ["molecularformula", "smiles", "hbonddonorcount", "hbondacceptorcount"],
    },
    method="compound",
    #option="sids",
    parse=True,
    to_dataframe=True
)

Cache_key: CC(=O)OC1=CC=CC=C1C(=O)O_['molecularformula', 'smiles', 'hbonddonorcount', 'hbondacceptorcount']_compound_default
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/CC(=O)OC1=CC=CC=C1C(=O)O/property/molecularformula,smiles,hbonddonorcount,hbondacceptorcount/json


Unnamed: 0,cid,molecular_formula,molecular_weight,smiles,connectivity_smiles,inchi,inchikey,iupac_name,title,xlogp,...,feature_acceptor_count_3d,feature_donor_count_3d,feature_anion_count_3d,feature_cation_count_3d,feature_ring_count_3d,feature_hydrophobe_count_3d,conformer_model_rmsd_3d,effective_rotor_count_3d,conformer_count_3d,fingerprint_2d
0,2244,C9H8O4,,CC(=O)OC1=CC=CC=C1C(=O)O,,,,,,,...,,,,,,,,,,


In [82]:
instance.fetch_single(
    query={
        "smiles": "CC(=O)OC1=CC=CC=C1C(=O)O",
    },
    method="compound",
    option="cids",
    parse=True,
    to_dataframe=True
)

Cache_key: CC(=O)OC1=CC=CC=C1C(=O)O_compound_cids
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/CC(=O)OC1=CC=CC=C1C(=O)O/cids/json


Unnamed: 0,cid
0,2244


In [83]:
instance.fetch_single(
    query={
        "smiles": "CC(=O)OC1=CC=CC=C1C(=O)O",
    },
    method="compound",
    option="description",
    parse=True,
    to_dataframe=True
)

Cache_key: CC(=O)OC1=CC=CC=C1C(=O)O_compound_description
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/CC(=O)OC1=CC=CC=C1C(=O)O/description/json


Unnamed: 0,cid,title,description,description_source_name,description_url
0,2244,Aspirin,,,
1,2244,,Aspirin can cause developmental toxicity and f...,California Office of Environmental Health Haza...,https://oehha.ca.gov/proposition-65/chemicals/...
2,2244,,Acetylsalicylic acid is a member of the class ...,ChEBI,https://www.ebi.ac.uk/chebi/searchId.do?chebiI...


In [84]:
instance.fetch_single(
    query={
        "accession": "P00533",
    },
    method="protein",
    option="summary",
    parse=True,
    to_dataframe=True
)

Cache_key: P00533_protein_summary
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/protein/accession/P00533/summary/json


Unnamed: 0,protein_accession,name,taxonomy_id,taxonomy,synonym
0,P00533,Epidermal growth factor receptor,9606,Homo sapiens (human),"[EC 2.7.10.1, Proto-oncogene c-ErbB-1, Recepto..."


In [85]:
instance.fetch_single(
    query={
        "accession": "P00533",
    },
    method="protein",
    option="aids",
    parse=True,
    to_dataframe=True
)

Cache_key: P00533_protein_aids
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/protein/accession/P00533/aids/json


Unnamed: 0,protein_accession,aid
0,P00533,"[1433, 1726, 1727, 1729, 1731, 1742, 1982, 336..."


In [86]:
instance.fetch_single(
    query={
        "accession": "P00533",
    },
    method="protein",
    option="concise",
    parse=True,
    to_dataframe=True
)

Cache_key: P00533_protein_concise
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/protein/accession/P00533/concise/json


Unnamed: 0,aid,sid,cid,activity_outcome,target_accession,target_geneid,activity_name,activity_qualifier,activity_value_um,assay_name,assay_type,pubmed_id,rna_i
0,3364,103262069,5329249,Active,,1956,IC50,=,4.32,Inhibition of ligand-induced proliferation in ...,Confirmatory,12127526,
1,3364,103262170,10300451,Active,,1956,IC50,=,0.62,Inhibition of ligand-induced proliferation in ...,Confirmatory,12127526,
2,3364,103262784,5329232,Active,,1956,IC50,=,7.82,Inhibition of ligand-induced proliferation in ...,Confirmatory,12127526,
3,3364,103262821,5329244,Active,,1956,IC50,=,1.29,Inhibition of ligand-induced proliferation in ...,Confirmatory,12127526,
4,3364,103262822,5329243,Active,,1956,IC50,=,7.5,Inhibition of ligand-induced proliferation in ...,Confirmatory,12127526,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46762,2061069,513841252,154674422,Inactive,,1956,IC50,>,20,WT-EGFR Kinase Assay from US Patent US12227501...,Confirmatory,,
46763,2061069,513841253,154674465,Inactive,,1956,IC50,>,20,WT-EGFR Kinase Assay from US Patent US12227501...,Confirmatory,,
46764,2061069,513841254,154674465,Inactive,,1956,IC50,>,20,WT-EGFR Kinase Assay from US Patent US12227501...,Confirmatory,,
46765,2061069,513841255,154674465,Inactive,,1956,IC50,>,20,WT-EGFR Kinase Assay from US Patent US12227501...,Confirmatory,,


In [87]:
instance.fetch_single(
    query={
        "accession": "P00533",
    },
    method="protein",
    option="pwaccs",
    parse=True,
    to_dataframe=True
)

Cache_key: P00533_protein_pwaccs
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/protein/accession/P00533/pwaccs/json


Unnamed: 0,protein_accession,pathway_accession
0,P00533,"[PathBank:SMP0000472, PathBank:SMP0000473, Pat..."


In [88]:
instance.fetch_single(
    query={
        "genesymbol": ["APP"]
    },
    method="gene",
    option="summary",
    parse=True,
)

Cache_key: APP_gene_summary


[[{'gene_id': 351,
   'symbol': 'APP',
   'name': 'amyloid beta precursor protein',
   'taxonomy_id': 9606,
   'taxonomy': 'Homo sapiens (human)',
   'description': 'This gene encodes a cell surface receptor and transmembrane precursor protein that is cleaved by secretases to form a number of peptides. Some of these peptides are secreted and can bind to the acetyltransferase complex APBB1/TIP60 to promote transcriptional activation, while others form the protein basis of the amyloid plaques found in the brains of patients with Alzheimer disease. In addition, two of the peptides are antimicrobial peptides, having been shown to have bacteriocidal and antifungal activities. Mutations in this gene have been implicated in autosomal dominant Alzheimer disease and cerebroarterial amyloidosis (cerebral amyloid angiopathy). Multiple transcript variants encoding several different isoforms have been found for this gene. [provided by RefSeq, Aug 2014]',
   'synonym': ['AAA',
    'ABETA',
    'ABPP

In [89]:
instance.fetch_single(
    query={
        "genesymbol": ["APP", "ECD"],
    },
    method="gene",
    option="summary",
    parse=True,
    to_dataframe=True
)

Cache_key: APP_gene_summary
Cache_key: ECD_gene_summary


Unnamed: 0,gene_id,symbol,name,taxonomy_id,taxonomy,description,synonym
0,351,APP,amyloid beta precursor protein,9606,Homo sapiens (human),This gene encodes a cell surface receptor and ...,"[AAA, ABETA, ABPP, AD1, APPI, CTFgamma, CVAP, ..."
1,11319,ECD,ecdysoneless cell cycle regulator,9606,Homo sapiens (human),Enables histone acetyltransferase binding acti...,"[GCR2, HSGT1, SGT1, protein ecdysoneless homol..."


In [90]:
instance.fetch_single(
    query={
        "genesymbol": ["APP"],
    },
    method="gene",
    option="concise",
    parse=True,
    to_dataframe=True
)

Cache_key: APP_gene_concise
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/gene/genesymbol/APP/concise/json
Cache_key: APP_gene_concise


Unnamed: 0,aid,sid,cid,activity_outcome,target_accession,target_geneid,activity_name,activity_qualifier,activity_value_um,assay_name,assay_type,pubmed_id,rna_i
0,374278,103655342,42603587,Active,P05067,,,,,Binding affinity to beta-amyloid plaques in su...,Other,18992967,
1,374278,103655344,42603588,Active,P05067,,,,,Binding affinity to beta-amyloid plaques in su...,Other,18992967,
2,374282,103655343,42603729,Active,P05067,,,,,Binding affinity to vascular beta-amyloid plaq...,Other,18992967,
3,473568,103747372,11574638,Unspecified,P05067,,EC50,=,16,Inhibition of human recombinant APP in CHO cel...,Confirmatory,20223661,
4,473568,103747583,11596660,Active,P05067,,EC50,=,4.8,Inhibition of human recombinant APP in CHO cel...,Confirmatory,20223661,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,1831909,242634277,66665322,Active,P05067,,IC50,=,0.003,Inhibition of wild type human APP expressed in...,Confirmatory,33844524,
124,1515584,103218648,656629,Active,P05067,,IC50,=,0.19953,Inhibition of APP (unknown origin),Confirmatory,30655952,
125,1515584,440111474,45102601,Unspecified,P05067,,IC50,>,10,Inhibition of APP (unknown origin),Confirmatory,30655952,
126,1831886,242634277,66665322,Active,P05067,,IC50,=,0.004,Inhibition of wild type human APP expressed in...,Confirmatory,33844524,


In [91]:
instance.fetch_single(
    query={
        "genesymbol": ["APP"],
    },
    method="gene",
    option="pwaccs",
    parse=True,
    to_dataframe=True
)

Cache_key: APP_gene_pwaccs
Prepared request: https://pubchem.ncbi.nlm.nih.gov/rest/pug/gene/genesymbol/APP/pwaccs/json
Cache_key: APP_gene_pwaccs


Unnamed: 0,gene_id,pathway_accession
0,351,"[COVID-19 Disease Map:718, PANTHER:P00003, Pat..."


## Reactome

In [92]:
from src.reactome import ReactomeInstance

In [93]:
instance = ReactomeInstance()

In [94]:
instance.fetch_single(query="R-DME-1834941", parse=True, method="data-discover", to_dataframe=True)

Cache_key: R-DME-1834941_data-discover


Unnamed: 0,name,description,url,version,keywords,includedInDataCatalog,distribution,license,@context,@type
0,STING mediated induction of host immune responses,This event has been computationally inferred f...,https://reactome.org/PathwayBrowser/#/R-DME-18...,93,[Pathway],"{'name': 'Reactome', 'url': 'https://reactome....",[{'contentUrl': 'https://reactome.org/ContentS...,https://creativecommons.org/licenses/by/4.0/,http://schema.org/,DataSet


## RHEA

In [95]:
from src.rhea import RheaInterface

In [96]:
instance = RheaInterface()

In [97]:
instance.fetch_single(
    query={
        "query": "uniprot:*"
    },
    method="rhea",
    parse=True,
    to_dataframe=True
)

Cache_key: uniprot:*_rhea
Prepared request: https://www.rhea-db.org/rhea/?query=uniprot%3A%2A&columns=rhea-id%2Cequation%2Cchebi%2Cchebi-id%2Cec%2Cuniprot%2Cgo&format=json&limit=100


Unnamed: 0,id,equation,status,htmlequation,comment,balanced,transport
0,21252,(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2,approved,"<span class=""participant""><span class=""stoichi...",,True,False
1,21256,3-phosphoshikimate + phosphoenolpyruvate = 5-O...,approved,"<span class=""participant""><span class=""stoichi...",,True,False
2,21260,[thioredoxin]-disulfide + L-methionine + H2O =...,approved,"<span class=""participant""><span class=""stoichi...",,True,False
3,21264,glycolate + A = glyoxylate + AH2,approved,"<span class=""participant""><span class=""stoichi...",,True,False
4,21272,spermidine + glutathione + ATP = glutathionyls...,approved,"<span class=""participant""><span class=""stoichi...",,True,False
...,...,...,...,...,...,...,...
95,21800,all-trans-antheraxanthin + L-ascorbate = all-t...,approved,"<span class=""participant""><span class=""stoichi...",RHEA:21800 part of RHEA:32371 \n\nPublished in...,True,False
96,21804,myo-inositol + ATP = 1D-myo-inositol 3-phospha...,approved,"<span class=""participant""><span class=""stoichi...",,True,False
97,21812,cholesterol + reduced [NADPH--hemoprotein redu...,approved,"<span class=""participant""><span class=""stoichi...",,True,False
98,21816,a D-alpha-amino acid + O2 + H2O = a 2-oxocarbo...,approved,"<span class=""participant""><span class=""stoichi...",Multi-step reaction: RHEA:78799 + RHEA:78803,True,False


## Refseq

In [98]:
from src.refseq import RefSeqInterface

In [99]:
instance = RefSeqInterface()

In [100]:
instance.fetch_single(
    query=["XP_010804480.1", "XP_010804481.1", "XP_010804482.1"],
    method="protein",
    parse=True,
    to_dataframe=True,
)

Cache_key: ['XP_010804480.1', 'XP_010804481.1', 'XP_010804482.1']_protein


Unnamed: 0,locus,project_id
0,XP_010804480,PRJNA33843
1,XP_010804481,PRJNA33843
2,XP_010804482,PRJNA33843


In [101]:
instance.fetch_batch(
    queries=["XP_010804480.1", "XP_010804481.1", "XP_010804482.1"],
    method="protein",
    parse=True,
    to_dataframe=True,
)

Cache_key: XP_010804480.1_protein
Cache_key: XP_010804481.1_protein
Cache_key: XP_010804482.1_protein
Cache_key: XP_010804480.1_protein
Cache_key: XP_010804481.1_protein
Cache_key: XP_010804482.1_protein


Unnamed: 0,locus,project_id
0,XP_010804480,PRJNA33843
1,XP_010804481,PRJNA33843
2,XP_010804482,PRJNA33843


## STRING

In [102]:
from src.stringdb import StringInterface

In [103]:
instance = StringInterface()

In [104]:
instance.fetch_single(
    query={
        "identifiers": ["p53"],
    },
    method="get_string_ids",
    parse=True,
    to_dataframe=True
)

Cache_key: p53_get_string_ids
Prepared request URL: https://string-db.org/api/json/get_string_ids?identifiers=p53&echo_query=0
Cache_key: p53_get_string_ids


Unnamed: 0,query_index,query_item,str_id
0,0,p53,9606.ENSP00000269305


In [105]:
instance.fetch_single(
    method="interaction_partners",
    query={
        "identifiers": ["p53", "cdk2"],
        "species": 9606,
    },
    parse=True,
    to_dataframe=True
)

Cache_key: p53_interaction_partners
Cache_key: cdk2_interaction_partners
Prepared request URL: https://string-db.org/api/json/interaction_partners?identifiers=p53%250dcdk2&species=9606&network_type=functional
Cache_key: p53_interaction_partners
Cache_key: cdk2_interaction_partners


Unnamed: 0,id_a,id_b,name_a,name_b,score,nscore,fscore,pscore,ascore,escore,dscore,tscore
0,9606.ENSP00000266970,9606.ENSP00000481380,CDK2,CCNA2,0.999,0,0.003,0.0,0.453,0.999,0.9,0.999
1,9606.ENSP00000266970,9606.ENSP00000413720,CDK2,CDKN1C,0.999,0,0.000,0.0,0.085,0.859,0.9,0.970
2,9606.ENSP00000266970,9606.ENSP00000228872,CDK2,CDKN1B,0.999,0,0.000,0.0,0.085,0.999,0.9,0.999
3,9606.ENSP00000266970,9606.ENSP00000429089,CDK2,CCNE2,0.999,0,0.000,0.0,0.200,0.995,0.9,0.996
4,9606.ENSP00000266970,9606.ENSP00000255465,CDK2,CCNA1,0.999,0,0.000,0.0,0.292,0.942,0.9,0.999
...,...,...,...,...,...,...,...,...,...,...,...,...
5917,9606.ENSP00000269305,9606.ENSP00000310928,TP53,PPARD,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
5918,9606.ENSP00000269305,9606.ENSP00000419945,TP53,ERVW-1,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
5919,9606.ENSP00000269305,9606.ENSP00000462980,TP53,TAF4B,0.400,0,0.000,0.0,0.000,0.000,0.4,0.000
5920,9606.ENSP00000269305,9606.ENSP00000431885,TP53,TYK2,0.400,0,0.000,0.0,0.083,0.000,0.0,0.372


In [106]:
instance.fetch_batch(
    outfmt="json",
    method="interaction_partners",
    queries=[{
        "identifiers": ["p53", "cdk2"],
        "species": 9606,
    }],
    parse=True,
    to_dataframe=True
)

Cache_key: p53_interaction_partners
Cache_key: cdk2_interaction_partners


Unnamed: 0,id_a,id_b,name_a,name_b,score,nscore,fscore,pscore,ascore,escore,dscore,tscore
0,9606.ENSP00000266970,9606.ENSP00000481380,CDK2,CCNA2,0.999,0,0.003,0.0,0.453,0.999,0.9,0.999
1,9606.ENSP00000266970,9606.ENSP00000413720,CDK2,CDKN1C,0.999,0,0.000,0.0,0.085,0.859,0.9,0.970
2,9606.ENSP00000266970,9606.ENSP00000228872,CDK2,CDKN1B,0.999,0,0.000,0.0,0.085,0.999,0.9,0.999
3,9606.ENSP00000266970,9606.ENSP00000429089,CDK2,CCNE2,0.999,0,0.000,0.0,0.200,0.995,0.9,0.996
4,9606.ENSP00000266970,9606.ENSP00000255465,CDK2,CCNA1,0.999,0,0.000,0.0,0.292,0.942,0.9,0.999
...,...,...,...,...,...,...,...,...,...,...,...,...
5917,9606.ENSP00000269305,9606.ENSP00000310928,TP53,PPARD,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
5918,9606.ENSP00000269305,9606.ENSP00000419945,TP53,ERVW-1,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
5919,9606.ENSP00000269305,9606.ENSP00000462980,TP53,TAF4B,0.400,0,0.000,0.0,0.000,0.000,0.4,0.000
5920,9606.ENSP00000269305,9606.ENSP00000431885,TP53,TYK2,0.400,0,0.000,0.0,0.083,0.000,0.0,0.372


### Show graph test

In [None]:
import json

# Convierte a formato Cytoscape.js
def convert_to_cytoscape_format(graph_json):
    elements = []

    for node in graph_json['nodes']:
        elements.append({
            "data": {
                "id": node["id"],
                "label": node["lbl"]
            }
        })

    for edge in graph_json['edges']:
        elements.append({
            "data": {
                "source": edge["sub"],
                "target": edge["obj"],
                "label": edge["pred"]
            }
        })

    return elements

# Genera el HTML
def create_cytoscape_html(graph_json):
    elements = convert_to_cytoscape_format(graph_json)

    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Grafo GO Interactivo</title>
        <script src="https://unpkg.com/cytoscape@3.19.0/dist/cytoscape.min.js"></script>
        <script src="https://unpkg.com/dagre@0.8.5/dist/dagre.min.js"></script>
        <script src="https://unpkg.com/cytoscape-dagre@2.3.2/cytoscape-dagre.js"></script>
        <style>
            html, body {{
                margin: 0;
                padding: 0;
                height: 100%;
                width: 100%;
                font-family: Arial, sans-serif;
            }}
            #cy {{
                height: 100%;
                width: 100%;
                display: block;
            }}
        </style>
    </head>
    <body>
        <div id="cy"></div>
        <script>
            cytoscape.use(cytoscapeDagre);

            var cy = cytoscape({{
                container: document.getElementById('cy'),
                elements: {json.dumps(elements)},
                style: [
                    {{
                        selector: 'node',
                        style: {{
                            'shape': 'roundrectangle',
                            'label': 'data(label)',
                            'text-valign': 'center',
                            'text-halign': 'center',
                            'background-color': '#AED6F1',
                            'color': '#1B2631',
                            'font-size': '8px',
                            'width': 'label',
                            'height': 'label',
                            'padding': '6px',
                            'border-width': 1,
                            'border-color': '#2980B9'
                        }}
                    }},
                    {{
                        selector: 'edge',
                        style: {{
                            'width': 2,
                            'label': 'data(label)',
                            'line-color': '#B2BABB',
                            'target-arrow-color': '#B2BABB',
                            'target-arrow-shape': 'triangle',
                            'curve-style': 'bezier',
                            'font-size': '7px',
                            'color': '#5D6D7E',
                            'text-background-opacity': 1,
                            'text-background-color': '#fff',
                            'text-background-shape': 'roundrectangle',
                            'text-background-padding': 2
                        }}
                    }}
                ],
                layout: {{
                    name: 'dagre',
                    rankDir: 'TB',
                    nodeSep: 30,
                    edgeSep: 10,
                    rankSep: 50
                }},
                zoomingEnabled: true,
                userZoomingEnabled: true,
                boxSelectionEnabled: false
            }});
        </script>
    </body>
    </html>
    """
    return html

# Guardar archivo HTML
def save_graph_to_html(graph_json, output_file='cytoscape_graph.html'):
    html = create_cytoscape_html(graph_json)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f"✅ Grafo guardado en: {output_file}")

# Ejecutar
save_graph_to_html(instance.parse(response)["topology_graph_json"], 'mi_grafo.html')

✅ Grafo guardado en: mi_grafo.html


# Uniprot + Databases union

In [None]:
import ast
import os

from dotenv import load_dotenv
import pandas as pd 
from src.uniprot import UniprotInterface
from src.alphafold import AlphafoldInterface
from src.brenda import BrendaInstance
from src.brenda import methods as brenda_methods
from src.biogrid import BioGRIDInterface
from src.interpro import InterproInstance
from src.genontology import GenOntologyInterface
from src.kegg import KEGGInterface
from src.proteindatabank import PDBInterface
from src.reactome import ReactomeInstance
from src.refseq import RefSeqInterface
from src.stringdb import StringInterface

In [None]:
load_dotenv()

# Retrieve the BioGRID API key
biogrid_api_key = os.getenv("biogrid_api_key")
# Retrieve the Brenda email and password
brenda_email = os.getenv("brenda_email")
brenda_password = os.getenv("brenda_password")

In [None]:
# Argparse arguments
output = "results/output.csv"
#query = "antimicrobial peptide"
query = "organism_name:homo sapiens (human) AND length:[15 TO 30] AND reviewed:true"
fields = "accession,protein_name,sequence,ec,lineage,organism_name,gene_primary"
out_db = "brenda"
sort = "accession asc"
fmt = "json"

include_isoform = False
download = False

# Outside db argument
outside_db = {
    "alphafold": "xref_alphafolddb",
    "biogrid": "xref_biogrid",
    "brenda": "xref_brenda",
    "go": "go_id",
    "interpro": "xref_interpro",
    "kegg": "xref_kegg",
    "pfam": "xref_pfam",
    "pdb": "xref_pdb",
    "reactome": "xref_reactome",
    "refseq": "xref_refseq",   
    "string": "xref_string",
}

interfaces = {
    "alphafold": AlphafoldInterface(
        structures=['pdb'],
        output_dir="results"
    ),
    "biogrid": BioGRIDInterface(),
    "brenda": BrendaInstance(
        email=brenda_email,
        password=brenda_password
    ),
    "genontology": GenOntologyInterface(),
    "interpro": InterproInstance(),
    "kegg": KEGGInterface(),
    "pdb": PDBInterface(
        download_structures=True,
        return_data_list=["rcsb_id", "rcsb_comp_model_provenance"],
        output_dir=output.split("/")[0]
    ),
    "reactome": ReactomeInstance(),
    "refseq": RefSeqInterface(),
    "string": StringInterface(),
}

### Download external db

In [None]:
export_df = pd.read_csv("results/output.csv")
export_df.head()

Unnamed: 0,query,accession,protein_name,organism_name,gene_primary,taxon_id,ineage,sequence,length,alphafold_ids,biogrid_ids,brenda_ids,go_terms,interpro_ids,kegg_ids,pdb_ids,pfam_ids,reactome_ids,refseq_ids,string_ids
0,organism_name:homo sapiens (human) AND length:...,A0A075B6S0,T cell receptor gamma joining 1,Homo sapiens,['TRGJ1'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",NYYKKLFGSGTTLVVT,16,['A0A075B6S0'],[],[],[],[],[],[],[],[],[],[]
1,organism_name:homo sapiens (human) AND length:...,A0A075B6Y3,T cell receptor alpha joining 3,Homo sapiens,['TRAJ3'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",GYSSASKIIFGSGTRLSIRP,20,['A0A075B6Y3'],[],[],[],[],[],[],[],[],[],[]
2,organism_name:homo sapiens (human) AND length:...,A0A075B6Y9,T cell receptor alpha joining 42,Homo sapiens,['TRAJ42'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",YGGSQGNLIFGKGTKLSVKP,20,['A0A075B6Y9'],[],[],[],[],[],[],[],[],[],[]
3,organism_name:homo sapiens (human) AND length:...,A0A075B700,T cell receptor alpha joining 31,Homo sapiens,['TRAJ31'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",NNNARLMFGDGTQLVVKP,18,['A0A075B700'],[],[],[],[],[],[],[],[],[],[]
4,organism_name:homo sapiens (human) AND length:...,A0A075B706,T cell receptor delta joining 1,Homo sapiens,['TRDJ1'],9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",TDKLIFGKGTRVTVEP,16,['A0A075B706'],[],[],[],[],[],[],[],[],[],[]


In [None]:
def parse_ids(df, db_name):
    raw_ids = df.get(f"{db_name}_ids", pd.Series(dtype=str)).dropna().unique().tolist()
    ids = []
    for entry in raw_ids:
        parsed = ast.literal_eval(entry) if isinstance(entry, str) and entry.startswith("[") else [entry]
        ids.extend(parsed)

    # Cleaning: remove "[]"
    return [id for id in ids if id != "[]"]

# Fetch data from alphafold given a ID from xref_alphafolddb
# Those without alphafold_ids are skipped
def fetch_alphafold(df):
    instance = AlphafoldInterface(
        structures=['pdb'],
        output_dir="results"
    )

    ids = parse_ids(df, "alphafold")
    if not ids:
        return []
    # TODO: Remove Limit
    print(f"{len(ids)} IDs to fetch from alphafold")
    return instance.fetch_batch(queries=ids[:4], parse=True)

# Fetch data from biogrid given a list of IDs from xref_biogrid or gene_primary and taxon_id fields
def fetch_biogrid(df):
    instance = BioGRIDInterface()
    queries = []

    # Part 1: by ID if they exist
    ids = parse_ids(df, "biogrid")
    if ids:
        queries.extend([{
            "id": id_,
            "accessKey": biogrid_api_key
        } for id_ in ids])

    # Part 2: build queries by taxId and geneList
    tmp_df = df[df["biogrid_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    tmp_df = tmp_df.dropna(subset=["gene_primary", "taxon_id"])
    tmp_df = tmp_df[["gene_primary", "taxon_id"]].drop_duplicates()

    tmp_df["gene_primary"] = tmp_df["gene_primary"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
    )

    tmp_df = tmp_df.explode("gene_primary")
    grouped = tmp_df.groupby("taxon_id")["gene_primary"].agg(list).reset_index()

    queries.extend([
        {
            "accessKey": biogrid_api_key,
            "geneList": row["gene_primary"],
            "taxId": row["taxon_id"],
            "format": "json"
        }
        for _, row in grouped.iterrows()
    ])

    print(f"{len(queries)} queries to fetch from biogrid")

    return instance.fetch_batch(
        queries=queries,
        method="interactions",
        parse=True
    )

# Those without brenda_ids are skipped
def fetch_brenda(df, method: str):
    
    instance = BrendaInstance(
        email=brenda_email,
        password=brenda_password
    )
    queries = []
    results = []

    ids = df[~df["brenda_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    ids.loc[:, 'brenda_ids'] = ids["brenda_ids"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
    )
    ids = ids.explode("brenda_ids").drop_duplicates(subset=["brenda_ids"])
    # TODO Remove Limit
    ids = ids[:4]  # Limit to 4 for testing purposes

    if not ids.empty:
        queries.extend([{
            "ecNumber": id["brenda_ids"],
            "organism": id["organism_name"]
        } for _, id in ids.iterrows() if isinstance(id["brenda_ids"], str) and id["brenda_ids"] != "[]"])

    results.append(
        {
            "method": method,
            "data": instance.fetch_batch(
                queries=queries,
                method=method,
                parse=True
                to_dataframe=True
            )
        })
    
    return results

# Those without go_terms are skipped
# TODO: Retorna una lista anidada y por eso no puedo hacerlo un dataframe ver que pasa
def fetch_genontology(df):
    queries = []
    ids = df["go_terms"].dropna().unique().tolist()
    ids = [ast.literal_eval(id) if isinstance(id, str) and id.startswith("[") else [id] for id in ids]
    ids = [item for sublist in ids for item in sublist]  # Flatten
    ids = list(set(ids))  # Remove duplicates

    # TODO Remove Limit
    ids = ids[:4]  # Limit to 4 for testing purposes
    
    # TODO It can be also added more methods to fetch
    return interface.fetch_batch(
        method="ontology-term",
        queries=ids,
        option=None,
        parse=True
    )

def fetch_interpro(df):
    queries = []

    ids = df[~df["interpro_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    ids.loc[:, 'interpro_ids'] = ids["interpro_ids"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
    )
    ids = ids.explode("interpro_ids").drop_duplicates(subset=["interpro_ids"])

    if not ids.empty:
        queries.extend([{
            "id": id["interpro_ids"],
            "db": "InterPro",
            "filters": {
                "type": "protein",
                "db": "reviewed",
                "value": id["accession"]
            }
        } for _, id in ids.iterrows() if isinstance(id["interpro_ids"], str) and id["interpro_ids"] != "[]"])


    tmp_df = df[df["interpro_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    tmp_df = df[["accession", "taxon_id"]].drop_duplicates()
    tmp_df = tmp_df.dropna(subset=["accession", "taxon_id"])

    if not tmp_df.empty:
        queries.extend([
            {
                "db": "InterPro",
                "filters": [
                    {
                        "type": "protein",
                        "db": "reviewed",
                        "value": row["accession"]
                    },
                    {
                        "type": "taxonomy",
                        "db": "uniprot",
                        "value": row["taxon_id"]
                    }
                ]

            }
            for _, row in tmp_df.iterrows()
        ])
    
    print(len(queries), "queries to fetch from interpro")

    return interface.fetch_batch(
        queries=queries,
        method="entry",
        pages_to_fetch=1,
        parse=True
    )

# Those without kegg_ids are skipped
def fetch_kegg(df):
    queries = []
    ids = parse_ids(df, "kegg")

    if not ids:
        return []
    
    for id in ids:
        queries.append({
            "entries": [id]
        })

    return interface.fetch_batch(
        method="get",
        queries=queries[:4],  # TODO Remove Limit
        parse=True
    )

# TODO: PDB retorna una lista anidada y por eso no puedo hacerlo un dataframe ver que pasa
def fetch_pdb(df):
    queries = []
    ids = parse_ids(df, "pdb")
    
    if not ids:
        return []
    
    queries = ids[:4]  # TODO Remove Limit

    return interface.fetch_batch(
        queries=queries,
        parse=True
    )

def fetch_reactome(df):
    queries = []
    ids = parse_ids(df, "reactome")

    if not ids:
        return []

    return interface.fetch_batch(
        queries=ids[:4],  # TODO Remove Limit
        parse=True,
        method="data/discover",
        option=""
    )

def fetch_refseq(df):
    queries = []
    ids = parse_ids(df, "refseq")

    if not ids:
        return []

    # TODO Remove Limit
    queries = ids[:4]

    return interface.fetch_batch(
        queries=queries,
        method="protein",
        parse=True
    )

# No string_ids this time, only gene_primary and taxon_id
# TODO make sure string_ids are working
def fetch_string(df):
    queries = []
    
    # 1. Fetch by string_ids if they exist
    tmp_df = df[~df["string_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    if not tmp_df.empty:
        tmp_df = tmp_df.dropna(subset=["string_ids"])
        tmp_df = tmp_df[["string_ids", "taxon_id"]].drop_duplicates
        tmp_df["string_ids"] = tmp_df["string_ids"].apply(
            lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
        )
        tmp_df = tmp_df.explode("string_ids")
        grouped = tmp_df.groupby("taxon_id")["string_ids"].agg(list).reset_index()
        queries.extend([
            {
                "identifiers": row["string_ids"],
                "species": row["taxon_id"]
            }
            for _, row in grouped.iterrows()
        ])
    
    # 2. Build queries by taxon_id and gene_primary
    tmp_df = df[df["string_ids"].astype(str).isin(["[]", "nan", "NaN", ""])]
    tmp_df = tmp_df.dropna(subset=["gene_primary", "taxon_id"])
    tmp_df = tmp_df[["gene_primary", "taxon_id"]].drop_duplicates()

    tmp_df["gene_primary"] = tmp_df["gene_primary"].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
    )

    tmp_df = tmp_df.explode("gene_primary")
    grouped = tmp_df.groupby("taxon_id")["gene_primary"].agg(list).reset_index()

    queries.extend([
        {
            "identifiers": row["gene_primary"],
            "species": row["taxon_id"]
        }
        for _, row in grouped.iterrows()
    ])

    return interface.fetch_batch(
        outfmt="json",
        method="interaction_partners",
        queries=queries[:4],  # TODO Remove Limit
        parse=True
    )

# TODO Add a parameter to convert to dataframe.
# fetch_batch(..., parse=True, to_dataframe=True)

def fetch_from_external(db_name, df):
    match db_name:
        case "alphafold":
            return fetch_alphafold(df)
        case "biogrid":
            return fetch_biogrid(df)
        case brenda if brenda.startswith("brenda_"):
            print(f"aa")
            method = brenda.split("_", 1)[1]
            return fetch_brenda(df, method)
        case "genontology":
            return fetch_genontology(df)
        case "interpro":
            return fetch_interpro(df)
        case "kegg":
            return fetch_kegg(df)
        case "pdb":
            return fetch_pdb(df)
        case "reactome":
            return fetch_reactome(df)
        case "refseq":
            return fetch_refseq(df)
        case "string":
            return fetch_string(df)
        case _:
            raise ValueError(f"Unsupported database: {db_name}")

In [None]:
export_dfs = {}
databases = out_db.split(",")

if "biogrid" in databases:
    biogrid_api_key = os.getenv("biogrid_api_key")
    if not biogrid_api_key:
        raise ValueError("Please set the 'biogrid_api_key' environment variable.")
    

if "brenda" in databases:
    databases.remove("brenda")
    # Add all brenda methods to the databases list
    databases.extend([f"brenda_{method}" for method in brenda_methods.keys()])
    if not brenda_email or not brenda_password:
        raise ValueError("Please set the 'brenda_email' and 'brenda_password' environment variables.")

for db in databases:
    print(f"Fetching data from {db}...")
    test = fetch_from_external(db, export_df)
    export_dfs[db] = test

export_dfs

Fetching data from brenda_getKmValue...
aa
Fetching data from brenda_getIc50Value...
aa
Fetching data from brenda_getKcatKmValue...
aa
Fetching data from brenda_getKiValue...
aa
Fetching data from brenda_getPhRange...
aa
Fetching data from brenda_getPhOptimum...
aa
Fetching data from brenda_getPhStability...
aa
Fetching data from brenda_getCofactor...
aa
Fetching data from brenda_getTemperatureOptimum...
aa
Fetching data from brenda_getTemperatureStability...
aa
Fetching data from brenda_getTemperatureRange...
aa


{'brenda_getKmValue': [{'method': 'getKmValue', 'data': []}],
 'brenda_getIc50Value': [{'method': 'getIc50Value', 'data': []}],
 'brenda_getKcatKmValue': [{'method': 'getKcatKmValue', 'data': []}],
 'brenda_getKiValue': [{'method': 'getKiValue', 'data': []}],
 'brenda_getPhRange': [{'method': 'getPhRange', 'data': []}],
 'brenda_getPhOptimum': [{'method': 'getPhOptimum', 'data': []}],
 'brenda_getPhStability': [{'method': 'getPhStability', 'data': []}],
 'brenda_getCofactor': [{'method': 'getCofactor', 'data': []}],
 'brenda_getTemperatureOptimum': [{'method': 'getTemperatureOptimum',
   'data': []}],
 'brenda_getTemperatureStability': [{'method': 'getTemperatureStability',
   'data': []}],
 'brenda_getTemperatureRange': [{'method': 'getTemperatureRange', 'data': []}]}

# Interface

In [None]:
import ipywidgets as widgets
from IPython.display import display, JSON, clear_output
from src.interpro import InterproInstance, data_types, entry_integration_types, filter_types, db_types

In [None]:
filters = {
    "protein": [("UniProtKB/Swiss-Prot", "reviewed"),
                ("UniProtKB", "UniProt"),
                ("UniProtKB/TrEMBL", "unreviewed")],
    "structure": [("PDB", "PDB")],
    "taxonomy": [("UniProtKB", "uniprot")],
    "proteome": [("UniProtKB", "uniprot")],
    "set": [("CDD", "cdd"),
            ("Pfam", "pfam"),
            ("PIRSF", "pirsf")],
}

def interface_interpro():
    output = widgets.Output()
    dynamic_output = widgets.Output()
    active_sections = {}

    # ------------------------
    # Sección de configuración base
    # ------------------------

    method_dropdown = widgets.Dropdown(
        options=data_types,
        value="entry",
        description="Choose a main data type:"
    )

    db_dropdown = widgets.Dropdown(
        options=db_types["entry"],
        value="InterPro",
        description="DB:"
    )

    # Actualiza db_dropdown en base a method_dropdown
    def update_db_options(change):
        new_type = change['new']
        db_dropdown.options = db_types.get(new_type, [])

    method_dropdown.observe(update_db_options, names='value')

    # ------------------------
    # Sección de filtros dinámicos
    # ------------------------

    container = widgets.VBox()
    button_box = widgets.HBox()
    add_buttons = {}

    # Crear sección individual de filtro
    def create_filter_section(filter_type):
        options = filters[filter_type]
        labels = [label for label, _ in options]
        label_to_value = {label: val for label, val in options}

        db_dropdown_f = widgets.Dropdown(
            options=labels,
            description=f"{filter_type} DB:"
        )

        value_input = widgets.Text(
            placeholder=f"{filter_type} accession...",
            description=f"{filter_type} accession:"
        )

        def clear_value(b):
            value_input.value = ''

        clear_btn = widgets.Button(description="Clear", button_style='info')
        clear_btn.on_click(clear_value)

        def remove_section(b):
            container.children = [c for c in container.children if c != section]
            add_buttons[filter_type].disabled = False
            del active_sections[filter_type]

        remove_btn = widgets.Button(description="Remove", button_style='danger')
        remove_btn.on_click(remove_section)

        section = widgets.VBox([
            widgets.HTML(f"<b>{filter_type}</b>"),
            widgets.HBox([widgets.Label("Filter type:"), widgets.Label(filter_type)]),
            widgets.HBox([db_dropdown_f, clear_btn]),
            value_input,
            remove_btn
        ])
        active_sections[filter_type] = (db_dropdown_f, value_input, label_to_value)
        return section

    # Botones de filtros
    for key in filters:
        btn = widgets.Button(description=key.capitalize(), button_style='info', layout=widgets.Layout(width='auto'))
        add_buttons[key] = btn

        def make_handler(k):
            def handler(b):
                section = create_filter_section(k)
                container.children += (section,)
                add_buttons[k].disabled = True
            return handler

        btn.on_click(make_handler(key))

    button_box.children = list(add_buttons.values())

    # ------------------------
    # Botón para generar query
    # ------------------------

    generate_button = widgets.Button(
        description="Generar Query",
        button_style="success",
        icon="check"
    )

    def on_generate_click(b):
        with output:
            clear_output()
            query = {
                "db": db_dropdown.value,
                "filters": []
            }
            for ftype, (db_dd, val_input, label_map) in active_sections.items():
                db_label = db_dd.value
                db_value = label_map[db_label]
                value = val_input.value.strip()
                if value:
                    query["filters"].append({
                        "type": ftype,
                        "db": db_value,
                        "value": value
                    })
            display(JSON(query, expanded=True))
            # Aquí puedes llamar a la función de InterPro para enviar la consulta
            interpro_instance = InterproInstance()
            response = interpro_instance.fetch_single(
                query=query,
                method=method_dropdown.value,
                pages_to_fetch=1,
                parse=True
            )
            if response:
                display(JSON(response, expanded=True))
            else:
                print("No response received from InterPro API.")

    generate_button.on_click(on_generate_click)

    # ------------------------
    # Render
    # ------------------------

    display(widgets.VBox([
        widgets.HTML("<h3>InterPro Query Interface</h3>"),
        widgets.HTML("<b>Configuración principal:</b>"),
        method_dropdown,
        db_dropdown,
        widgets.HTML("<hr><b>Filtros dinámicos:</b>"),
        widgets.Label("Agregar un filtro:"),
        button_box,
        container,
        generate_button,
        output
    ]))

In [None]:
interface_interpro()

VBox(children=(HTML(value='<h3>InterPro Query Interface</h3>'), HTML(value='<b>Configuración principal:</b>'),…

# Random Test

In [None]:
import pandas as pd
import ast

In [None]:
df = pd.read_csv("test.csv")
df.shape

(67027, 21)

In [None]:
# Search CDK2 and CCNA2 in gene primary use is in or something like that
df["gene_primary"] = df["gene_primary"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [x]
)
df = df.explode("gene_primary")
df = df[df["gene_primary"].isin(["CDK2", "CCNA2"])]
df

Unnamed: 0,query,accession,protein_name,ec_numbers,organism_name,gene_primary,taxon_id,ineage,sequence,length,...,biogrid_ids,brenda_ids,go_terms,interpro_ids,kegg_ids,pdb_ids,pfam_ids,reactome_ids,refseq_ids,string_ids
19934,(go:DNA binding OR go:ATP binding OR keyword:A...,O55076,Cyclin-dependent kinase 2,['2.7.11.22'],Cricetulus griseus,CDK2,10029,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",MENFQKVEKIGEGTYGVVYKAKNKLTGEVVALKKIRLDTETEGVPS...,298,...,[],['2.7.11.22'],"['GO:0015030', 'GO:0005813', 'GO:0000307', 'GO...","['IPR050108', 'IPR011009', 'IPR000719', 'IPR01...",[],[],['PF00069'],[],[],[]
26084,(go:DNA binding OR go:ATP binding OR keyword:A...,P20248,Cyclin-A2,,Homo sapiens,CCNA2,9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",MLGNSAPGPATREAGSALLALQQTALQEDQENINPEKAAPVQQPRT...,432,...,[],[],"['GO:0097122', 'GO:0097124', 'GO:0000307', 'GO...","['IPR039361', 'IPR032447', 'IPR013763', 'IPR03...",['hsa:890'],"['1E9H', '1FIN', '1FVV', '1GY3', '1H1P', '1H1Q...","['PF02984', 'PF00134', 'PF16500']","['R-HSA-1362300', 'R-HSA-1538133', 'R-HSA-1701...",['NP_001228.2'],['9606.ENSP00000481380']
26805,(go:DNA binding OR go:ATP binding OR keyword:A...,P24941,Cyclin-dependent kinase 2,['2.7.11.22'],Homo sapiens,CDK2,9606,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPS...,298,...,[],['2.7.11.22'],"['GO:0015030', 'GO:0005813', 'GO:0000781', 'GO...","['IPR050108', 'IPR011009', 'IPR000719', 'IPR01...",['hsa:1017'],"['1AQ1', '1B38', '1B39', '1BUH', '1CKP', '1DI8...",['PF00069'],"['R-HSA-1538133', 'R-HSA-171319', 'R-HSA-17618...","['NP_001277159.1', 'NP_001789.2', 'NP_439892.2']",['9606.ENSP00000266970']
27540,(go:DNA binding OR go:ATP binding OR keyword:A...,P30274,Cyclin-A2,,Bos taurus,CCNA2,9913,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",MLGSSAHGPAAREAGSAVTLQQTAFQEDQENVNPEKAAPAQQPRTR...,430,...,[],[],"['GO:0097124', 'GO:0000307', 'GO:0005737', 'GO...","['IPR039361', 'IPR032447', 'IPR013763', 'IPR03...",['bta:281667'],"['1VIN', '2G9X', '3BHT', '3BHU', '3BHV', '3DDP...","['PF02984', 'PF00134', 'PF16500']",[],['NP_001068591.1'],['9913.ENSBTAP00000006503']
28601,(go:DNA binding OR go:ATP binding OR keyword:A...,P37881,Cyclin-A2,,Mesocricetus auratus,CCNA2,10036,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",MPGSSRQSGREAGSALLSLQQEDQENVNPEKAAPDQRARAALKTGN...,421,...,[],[],"['GO:0097122', 'GO:0097124', 'GO:0000307', 'GO...","['IPR039361', 'IPR032447', 'IPR013763', 'IPR03...",['maua:101835930'],[],"['PF02984', 'PF00134', 'PF16500']",[],['NP_001268563.1'],['10036.ENSMAUP00000020143']
30029,(go:DNA binding OR go:ATP binding OR keyword:A...,P48963,Cyclin-dependent kinase 2,['2.7.11.22'],Mesocricetus auratus,CDK2,10036,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",MENFQKVEKIGEGTYGVVYKAKNKLTGEVVALKKIRLDTETEGVPS...,298,...,[],['2.7.11.22'],"['GO:0015030', 'GO:0005813', 'GO:0000307', 'GO...","['IPR050108', 'IPR011009', 'IPR000719', 'IPR01...",[],[],['PF00069'],[],[],['10036.ENSMAUP00000021672']
46192,(go:DNA binding OR go:ATP binding OR keyword:A...,Q5E9Y0,Cyclin-dependent kinase 2,['2.7.11.22'],Bos taurus,CDK2,9913,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",MENFQKVEKIGEGTYGVVYKAKNKLTGEVVALKKIRLDTETEGVPS...,298,...,[],[],"['GO:0015030', 'GO:0005813', 'GO:0000307', 'GO...","['IPR050108', 'IPR011009', 'IPR000719', 'IPR01...",['bta:519217'],[],['PF00069'],"['R-BTA-1538133', 'R-BTA-171319', 'R-BTA-17618...",['NP_001014934.1'],['9913.ENSBTAP00000005252']


In [None]:
# Group and separate by taxon_id and get gene_primary
grouped = df.groupby("taxon_id").agg({
    "gene_primary": lambda x: list(
        set(
            item
            for sublist in x.dropna()
            for item in (ast.literal_eval(sublist) if isinstance(sublist, str) and sublist.startswith("[") else [sublist])
        )
    )
})
# Show those with the most gene_primary
grouped = grouped[grouped["gene_primary"].str.len() > 0].reset_index()
grouped = grouped.sort_values(by="gene_primary", key=lambda x: x.str.len(), ascending=False)
grouped

Unnamed: 0,taxon_id,gene_primary
720,9606,"[TTF1, ATN1, SS18, MNDA, NHEJ1, EFEMP1, NCOA4,..."
779,10090,"[Htatip2, Ddb1, Has3, Tbx19, Nsd1, Sohlh2, Ddi..."
359,3702,"[RH9, NFYB1, HMGA, GATA3, ETC2, NSN1, PCNA2, A..."
785,10116,"[Htatip2, Ddb1, Sohlh2, Ddit4, Pagr1, Mcidas, ..."
3153,559292,"[PSF3, ADF1, RPO31, KIN28, TPK1, RPT1, YAP7, N..."
...,...,...
1,25,[ssb]
3,61,[uvrA]
5,84,[sgaR]
7,141,[hbb]


In [None]:
# From tax_id 9606 get the gene_primary list
gene_primary_9606 = grouped[grouped["taxon_id"] == 9606]["gene_primary"].values
#convert to a flat list
gene_primary_9606 = [item for sublist in gene_primary_9606 for item in sublist]
gene_primary_9606 = list(set(gene_primary_9606))  # Remove duplicates
"%0d".join(gene_primary_9606[:100])


'TTF1%0dATN1%0dSS18%0dNHEJ1%0dMNDA%0dEFEMP1%0dNCOA4%0dDUX4L7%0dRHOG%0dLDB1%0dWNK1%0dGATA3%0dATXN7L3%0dZNF224%0dTGIF2LX%0dSMARCA4%0dTCF3%0dCCNE1%0dHLTF%0dZIC3%0dRALY%0dSMARCB1%0dTCF20%0dPLAGL2%0dTSPY1%0dZNF117%0dHOXC13%0dCC2D1A%0dTFEB%0dOTX1%0dLPIN1%0dNRG1%0dZNF638%0dIRX4%0dEHMT1%0dRUVBL2%0dUBTFL1%0dSFMBT1%0dMCM3%0dKAT7%0dHES5%0dZNF507%0dSOX21%0dMYO6%0dRPL6%0dKHDC3L%0dZNF721%0dRPRD1A%0dDNAJC1%0dGAS6%0dKAT2A%0dLRP6%0dCDC34%0dPRAP1%0dZFPM2%0dN4BP2%0dCRYM%0dHMGB4%0dTGFB1%0dARMCX3%0dHDAC9%0dHES4%0dFOXD3%0dSUV39H1%0dCASP8AP2%0dLITAF%0dUSPL1%0dTSC22D2%0dPALB2%0dPIF1%0dNOD2%0dPRDM5%0dSIM2%0dIFNB1%0dHES3%0dCXorf65%0dHOXC5%0dCASP2%0dESX1%0dMTERF3%0dHEMK1%0dENDOG%0dGTF2F2%0dZNF780A%0dSPIN2A%0dNPAS4%0dMAGED1%0dKAT2B%0dCHUK%0dPPP1R12A%0dTRIM34%0dNUDT15%0dHOPX%0dFGF9%0dCCT7%0dGEN1%0dTAF11L8%0dATR%0dPRNP%0dZNF280A'