In [6]:
from src.uniprot import UniprotInterface
import pandas as pd

ModuleNotFoundError: No module named 'utils'

In [None]:
def replace_char_at_index(s, i, new_char):
    if i < 0 or i >= len(s):
        raise IndexError("Index out of range.")
    return s[:i] + new_char + s[i+1:]

In [None]:
ids = ["Q75UA4"]
from_db = 'UniProtKB_AC-ID'
to_db = 'UniProtKB'
disease = "CRC"

In [None]:
downloader = UniprotInterface()

job_id = downloader.submit_id_mapping(from_db=from_db, to_db=to_db, ids=ids)

In [None]:
if downloader.check_id_mapping_results_ready(job_id):
    link = downloader.get_id_mapping_results_link(job_id)
    results = downloader.get_id_mapping_results_search(link)

Fetched: 1 / 1


In [None]:
with open("results.json", "w") as f:
    json.dump(results, f)

In [None]:
results['results'][0]['to']['sequence']['value']

'MKFGKFVLLAASTALAVVGLGGPAAADSTPQAQPSIIGGSNATSGPWAARLFVNGRQNCTATIIAPQYILTAKHCVSSSGTYTFRIGSLDQTSGGTMATGSTITRYPGSADLAIVRLTTSVNATYSPLGSVGDVSVGQNVSVYGWGATSQCGSEINCQSRYLKVATVRVNSISCSDYTGGVAVCANRVNGITAGGDSGGPMFASGRQVGVASTSDRVNNTAYTNITRYRSWISQVAGV'

In [None]:
for result in results['results']:
    print(result['from'])

Q75UA4


In [None]:
export_data = []
sequence = results['results'][0]['to']['sequence']['value']
for feature in results['results'][0]['to']['features']:
    row = []
    if feature['type'] == 'Natural variant' and disease in feature['description']:     
        row.append(feature['featureId'])
        location_start = feature['location']['start']['value']
        location_end = feature['location']['end']['value']
        if location_start == location_end:
            row.append(location_start)
            original_sequence = feature['alternativeSequence']['originalSequence']
            new_sequence = feature['alternativeSequence']['alternativeSequences'][0]
            row.append(f"{original_sequence}->{new_sequence}")
            row.append(replace_char_at_index(sequence, int(location_start)-1, new_sequence))
        else:
            row.append(f"{location_start}-{location_end}")
            row.append("missing")
            row.append(sequence[:int(location_start)-1] + sequence[int(location_end)-1:])
        export_data.append(row)
export_data

In [None]:
df = pd.DataFrame(export_data, columns=["variant id", "position", "change", "sequence"])
df

In [None]:
df.to_csv("results.csv", index=False)

In [None]:
result = results['results'][0]
for reference in result['to']['references']:
    print(reference['citation']['citationCrossReferences'])

[{'database': 'PubMed', 'id': '11133465'}, {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}]
[{'database': 'PubMed', 'id': '16237016'}, {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}]


In [None]:
references_list = []
result = results['results'][0]                 

try:
    for r in result['to']['references']:
        tmp = {}
        tmp["citacionCrossReferences"] = r['citation']['citationCrossReferences']
        tmp.update({"title": r['citation']['title']})
        references_list.append(tmp)
except KeyError:
    pass

In [None]:
references_list

[{'citacionCrossReferences': [{'database': 'PubMed', 'id': '11133465'},
   {'database': 'DOI', 'id': '10.1128/AEM.67.1.345-353.2001'}],
  'title': 'Purification and characterization of an extracellular poly(L-lactic acid) depolymerase from a soil isolate, Amycolatopsis sp. strain K104-1.'},
 {'citacionCrossReferences': [{'database': 'PubMed', 'id': '16237016'},
   {'database': 'DOI', 'id': '10.1128/JB.187.21.7333-7340.2005'}],
  'title': 'Gene cloning and molecular characterization of an extracellular poly(L-lactic acid) depolymerase from Amycolatopsis sp. strain K104-1.'}]

## Blast

In [None]:
import os, argparse
import shutil
import subprocess
import tarfile
from pathlib import Path
from urllib.request import urlopen
import re
from typing import List

import pandas as pd

DB_DIR = os.path.join("scripts", "db")
BLAST_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/"
UNIPROT_BASE_URL = "https://ftp.uniprot.org/pub/databases/uniprot/current_release"
BLAST_DIR = Path("blast_bin")
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref100/uniref.xsd
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
#https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz

databases = {
    "uniprotkb_reviewed": "knowledgebase/complete/uniprot_sprot",
    "uniprotkb_unreviewed": "knowledgebase/complete/uniprot_trembl",
    "uniref100": "uniref/niref100/uniref100",
    "uniref90": "uniref/uniref90/uniref90",
    "uniref50": "uniref/uniref50/uniref50",
}

def download_uniprot_database(db_name: str, extension: str = "xml"):
    """ Download a Uniprot database from the Uniprot FTP server.
    Args:
        db_name (str): Name of the database to download.
        extension (str): File extension of the database. Default is "xml".
    """

    if db_name not in databases:
        raise ValueError(f"Database {db_name} is not supported. Supported databases are: {', '.join(databases.keys())}.")
    
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    
    if not os.path.exists(db_path):
        os.makedirs(DB_DIR, exist_ok=True)
        url = f"{UNIPROT_BASE_URL}/{databases[db_name]}.{extension}.gz"
        os.system(f"wget {url} -O {db_path}.gz")
        print(f"Unzipping {db_path}...")
        subprocess.run(["gunzip", db_path], check=True)
    else:
        print(f"Database {db_name} already exists at {db_path}.")

def get_latest_version_url():
    """Retrieve the latest BLAST+ tarball URL from the NCBI FTP site."""
    with urlopen(BLAST_BASE_URL) as response:
        html = response.read().decode("utf-8")
    # Look for something like: ncbi-blast-2.16.0+-x64-linux.tar.gz
    match = re.search(r'ncbi-blast-(\d+\.\d+\.\d+\+)-x64-linux\.tar\.gz', html)
    if match:
        version = match.group(1)
        tar_name = f"ncbi-blast-{version}-x64-linux.tar.gz"
        return version, BLAST_BASE_URL + tar_name
    else:
        raise RuntimeError("Could not find the latest BLAST version from NCBI.")

def is_blast_installed():
    """Check if 'blastp' is available in the system PATH."""
    try:
        subprocess.run(["blastp", "-version"], check=True, stdout=subprocess.DEVNULL)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False


def download_and_extract_blast(version: str, url: str):
    """Download and extract the BLAST+ tarball."""
    tarball_name = url.split("/")[-1]
    if not Path(tarball_name).exists():
        print(f"Downloading BLAST+ {version}...")
        subprocess.run(["wget", url], check=True)

    print("Extracting BLAST+...")
    with tarfile.open(tarball_name, "r:gz") as tar:
        tar.extractall(BLAST_DIR)
    print(f"BLAST extracted to: {BLAST_DIR.resolve()}")


def get_local_blastp_path(version: str):
    """Return the path to local blastp binary."""
    return BLAST_DIR / f"ncbi-blast-{version}" / "bin" / "blastp"


def check_blast():
    """Ensure BLAST is installed. Return path to `blastp` binary."""
    if is_blast_installed():
        print("System-wide BLAST is installed.")
        return shutil.which("blastp")
    else:
        version, url = get_latest_version_url()
        local_blastp = get_local_blastp_path(version)
        if not local_blastp.exists():
            print(f"BLAST {version} not found locally. Installing...")
            BLAST_DIR.mkdir(exist_ok=True)
            download_and_extract_blast(version, url)
        else:
            print(f"Using already downloaded BLAST {version}.")
        return str(local_blastp)

def make_blast_database(db_name: str, db_type: str = "prot", extension: str = "xml"):
    """Create a BLAST database from the Uniprot database."""
    db_path = os.path.join(DB_DIR, f"{db_name}.{extension}")
    if not os.path.exists(db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {db_path}. Please download it first.")
    
    # Check if the database is already created
    blast_db_path = os.path.join(DB_DIR, db_name)
    extensions = [".pdb", ".phr", ".pin", ".psq", ".pot", ".psq", ".ptf", ".pto"]
    makedb = False
    # For all extensions check if exists if there is one failing makedb again
    for ext in extensions:
        if not os.path.exists(blast_db_path + "/db" + ext):
            makedb = True
            break
    if makedb:
        print(f"Creating BLAST database for {db_name}...")
        blast_db_cmd = [
            "makeblastdb",
            "-in", db_path,
            "-dbtype", db_type,
            "-out", os.path.join(DB_DIR, db_name) + "/db",
        ]
    
        subprocess.run(blast_db_cmd, check=True)
        print(f"BLAST database created at: {os.path.join(DB_DIR, databases[db_name])}")
    else:
        print(f"BLAST database already exists at {blast_db_path}. No need to create it again.")

def run_blast(sequences: List[str], db_name: str, blast_type: str = "blastp", evalue: float = 0.001):
    """Run BLAST search."""
    blast_db_path = os.path.join(DB_DIR, db_name)
    if not os.path.exists(blast_db_path):
        raise FileNotFoundError(f"Database {db_name} not found at {blast_db_path}. Please download it first.")

    # Make tmp directory if it does not exist
    os.makedirs("tmp", exist_ok=True)

    # Write sequences to a temporary file
    with open("tmp/sequences.fasta", "w") as f:
        for i, seq in enumerate(sequences):
            f.write(f">{i}\n{seq}\n")
    
    blast_cmd = [
        blast_type,
        "-query", "tmp/sequences.fasta",
        "-db", blast_db_path + "/db",
        "-outfmt", "6",
        "-evalue", str(evalue),
    ]
    
    print(f"Running BLAST search...")
    with open("tmp/blast_results.txt", "w") as f:
        subprocess.run(blast_cmd, stdout=f, check=True)
    print(f"BLAST results saved to tmp/blast_results.txt")
    # Clean up temporary file
    os.remove("tmp/sequences.fasta")

def parse_blast_results(file_path: str, identity_threshold: float = 90.0):
    """Parse BLAST results from a file."""
    with open(file_path, "r") as f:
        results = f.readlines()
    
    parsed_results = []
    for line in results:
        fields = line.strip().split("\t")
        identity = float(fields[2])
        if identity >= identity_threshold:
            parsed_results.append({
                "query": fields[0],
                "subject": fields[1],
                "identity": fields[2],
                "alignment_length": fields[3],
                "evalue": fields[4],
                "bit_score": fields[5],
            })
    
    return parsed_results

In [None]:
df = pd.read_csv("data/test.csv")
sequences = df["sequences"].dropna().unique().tolist()
    
download_uniprot_database("uniprotkb_reviewed", "fasta")
    
blastp_path = check_blast()
print(f"Using blastp at: {blastp_path}")

make_blast_database("uniprotkb_reviewed", extension="fasta")

run_blast(sequences, "uniprotkb_reviewed", blast_type="blastp", evalue=0.0001)

results = parse_blast_results("tmp/blast_results.txt")

# Convert to DataFrame
sequences_df = pd.DataFrame(sequences, columns=["sequences"])
sequences_df["id"] = sequences_df.index

sequences_df

Database uniprotkb_reviewed already exists at scripts/db/uniprotkb_reviewed.fasta.
System-wide BLAST is installed.
Using blastp at: /home/diego/micromamba/envs/bioseqdownloader/bin/blastp
BLAST database already exists at scripts/db/uniprotkb_reviewed. No need to create it again.
Running BLAST search...
BLAST results saved to tmp/blast_results.txt


Unnamed: 0,sequences,id
0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,0
1,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,1
2,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,2


In [None]:
df_blast = pd.DataFrame(results)

df_blast = df_blast.rename(columns={"query": "id", "subject": "subject_id"})
df_blast["id"] = df_blast["id"].astype(int)
df_blast = df_blast.merge(sequences_df, on="id", how="left")
df_blast = df_blast.drop(columns=["id"])
df_blast = df_blast.rename(columns={"sequences": "sequence"})

# Separate subject into source, accession, entry_name
df_blast["source"] = df_blast["subject_id"].apply(lambda x: x.split("|")[0])
df_blast["accession"] = df_blast["subject_id"].apply(lambda x: x.split("|")[1])
df_blast["entry_name"] = df_blast["subject_id"].apply(lambda x: x.split("|")[2])
df_blast = df_blast.drop(columns=["subject_id"])


In [None]:
df_blast

Unnamed: 0,identity,alignment_length,evalue,bit_score,sequence,source,accession,entry_name
0,100.0,438,0,0,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,sp,Q6GZX2,003R_FRG3G
1,100.0,180,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q197F2,008L_IIV3
2,100.0,50,0,0,MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQ...,sp,Q6GZW6,009L_FRG3G
3,100.0,345,0,0,NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPR...,sp,Q6GZW6,009L_FRG3G


## GO

In [None]:
from src.description_go import *
import os, ast
import pandas as pd
from tqdm import tqdm

In [None]:
DOCKER_IMAGE_NAME = "metastudent"
DOCKER_CONTAINER_NAME = "metastudent_container"
HOST_INPUT_FILE = os.path.abspath("tmp/sequences.fasta")
HOST_OUTPUT_DIR = os.path.abspath("tmp/")
CONTAINER_INPUT_FILE = "/app/input.fasta"
CONTAINER_OUTPUT_FILE = "/app/output.result"


print("[DESCRIPTION_GO] Getting Gen Ontology")
tqdm.pandas()

if not check_dependencies(DOCKER_IMAGE_NAME):
    print("[DESCRIPTION_GO] Metastudent not found. Installing...")
    install_dependencies(DOCKER_IMAGE_NAME)
else:
    print("[DESCRIPTION_GO] Metastudent found.")

input_df = pd.read_csv("results/umami_uniprot.csv")
obsolete_df = pd.read_csv("scripts/resources/amiGO_data.csv", sep="\t", names=["id_go", "description", "is_obsolete"])
input_df['go_terms'] = input_df['go_terms'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

parsed_df = pd.DataFrame()
if os.path.isfile(f"{HOST_OUTPUT_DIR}/output.BPO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.CCO.txt") and \
        os.path.isfile(f"{HOST_OUTPUT_DIR}/output.MFO.txt"):
    print("[DESCRIPTION_GO] Metastudent results found.")
    parsed_df = parse_outputs("uniprot_id")

# Filter input_df with go_terms ~= null
input_df_with_go_terms = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) > 0)]
input_df = input_df[input_df["go_terms"].apply(lambda x: isinstance(x, list) and len(x) == 0)]

if not input_df_with_go_terms.empty:
    print("[DESCRIPTION_GO] Go terms found in input data.")
    input_df_with_go_terms = input_df_with_go_terms[["uniprot_id", "go_terms"]]
    input_df_with_go_terms = input_df_with_go_terms.explode("go_terms")
    parsed_df = pd.concat(
        [
            parsed_df,
            pd.merge(
                input_df_with_go_terms, 
                obsolete_df, 
                left_on="go_terms", 
                right_on="id_go", 
                how="left"
            )
            .drop(columns=["go_terms"])
            .rename(columns={"id_go": "go"})  
        ]
    )

input_df

[DESCRIPTION_GO] Getting Gen Ontology
Docker version 28.0.0, build f9ced58158
[DESCRIPTION_GO] Metastudent found.
[DESCRIPTION_GO] Go terms found in input data.


Unnamed: 0,uniprot_id,entry_type,protein_name,ec_numbers,organism,taxon_id,sequence,length,go_terms,pfam_ids,references,features,keywords,source_db


In [None]:
if not parsed_df.empty:
    # Check if all sequences have been processed
    parsed_ids = parsed_df["uniprot_id"].unique()
    input_ids = input_df["uniprot_id"].unique()
    if len(parsed_ids) == len(input_ids):
        print("[DESCRIPTION_GO] All sequences have been processed.")
        input_df = pd.DataFrame()
    else:
        input_df = input_df[~input_df["uniprot_id"].isin(parsed_ids)]
        print(f"[DESCRIPTION_GO] {len(input_df)} sequences have not been processed.")

[DESCRIPTION_GO] 0 sequences have not been processed.


In [None]:
os.makedirs(HOST_OUTPUT_DIR, exist_ok=True)
if not input_df.empty:
    print("[DESCRIPTION_GO] Running in batches of 50...")
    for i in tqdm(range(0, len(input_df), 50)):
        run_in_batches(input_df[i:i+50], HOST_OUTPUT_DIR)

In [None]:
test = pd.concat(
    [
        parsed_df,
        parse_outputs("uniprot_id")
    ]
)
    
test = test.sort_values(by="uniprot_id")
test = test.merge(obsolete_df, left_on="go", right_on="id_go", how="left")
test = test.drop(columns=["id_go"])

test

File '/home/diego/Documents/PythonProjects/BioSeqDownloader/tmp/output.BPO.txt' not found.


NameError: name 'exit' is not defined

## Uniprot query

In [None]:
import os
os.chdir("src")

In [None]:
query="organism_name:homo sapiens (human) AND length:[15 TO 30] AND reviewed:true"
fields="accession,protein_name,sequence,ec,lineage,organism_name,xref_pfam,xref_alphafolddb,xref_pdb,go_id"
sort="accession asc"
download=True
format="json"

In [None]:
from uniprot import UniprotInterface

uniprot = UniprotInterface()
response = uniprot.submit_stream(
    query=query,
    fields=fields,
    sort=sort,
    include_isoform=True,
    download=download,
    format=format
)

In [None]:
uniprot.parse_stream_response(
    query=query,
    response=response
)

Unnamed: 0,query,accession,protein_name,organism_name,taxon_id,ineage,sequence,length,go_terms,pfam_ids,alphafold_ids,pdb_ids,kegg_ids,brenda_ids,reactome_ids,refseq_ids,interpro_ids
0,organism_name:homo sapiens (human) AND length:...,A0A075B6S0,T cell receptor gamma joining 1,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",NYYKKLFGSGTTLVVT,16,"[GO:0042101, GO:0002250]",[],[A0A075B6S0],[],[],[],[],[],[]
1,organism_name:homo sapiens (human) AND length:...,A0A075B6Y3,T cell receptor alpha joining 3,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",GYSSASKIIFGSGTRLSIRP,20,"[GO:0042101, GO:0002250]",[],[A0A075B6Y3],[],[],[],[],[],[]
2,organism_name:homo sapiens (human) AND length:...,A0A075B6Y9,T cell receptor alpha joining 42,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",YGGSQGNLIFGKGTKLSVKP,20,"[GO:0005886, GO:0002250]",[],[A0A075B6Y9],[],[],[],[],[],[]
3,organism_name:homo sapiens (human) AND length:...,A0A075B700,T cell receptor alpha joining 31,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",NNNARLMFGDGTQLVVKP,18,"[GO:0005886, GO:0002250]",[],[A0A075B700],[],[],[],[],[],[]
4,organism_name:homo sapiens (human) AND length:...,A0A075B706,T cell receptor delta joining 1,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",TDKLIFGKGTRVTVEP,16,"[GO:0042101, GO:0002250]",[],[A0A075B706],[],[],[],[],[],[]
5,organism_name:homo sapiens (human) AND length:...,A0A0A0MT70,T cell receptor beta joining 2-6,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",SGANVLTFGAGSRLTVL,17,"[GO:0042101, GO:0002250]",[],[A0A0A0MT70],[],[],[],[],[],[]
6,organism_name:homo sapiens (human) AND length:...,A0A0A0MT78,T cell receptor beta joining 2-7,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",SYEQYFGPGTRLTVT,15,"[GO:0042101, GO:0002250]",[],[],[],[],[],[],[],[]
7,organism_name:homo sapiens (human) AND length:...,A0A0A0MT87,T cell receptor beta joining 2-4,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",AKNIQYFGAGTRLSVL,16,"[GO:0042101, GO:0002250]",[],[A0A0A0MT87],[],[],[],[],[],[]
8,organism_name:homo sapiens (human) AND length:...,A0A0A0MT94,T cell receptor beta joining 2-2,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",NTGELFFGEGSRLTVL,16,"[GO:0042101, GO:0002250]",[],[A0A0A0MT94],[],[],[],[],[],[]
9,organism_name:homo sapiens (human) AND length:...,A0A0A0MTA4,T cell receptor beta joining 2-5,Homo sapiens,9606,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",QETQYFGPGTRLLVL,15,"[GO:0042101, GO:0002250]",[],[],[],[],[],[],[],[]


## Activity search

In [None]:
import os
import pandas as pd

In [None]:
uniprot_search_files = os.listdir("uniprot_search")
uniprot_search_files

['uniprot_celiac-toxic.csv',
 'uniprot_embryotoxic.csv',
 'uniprot_ace-inhibitor.csv',
 'uniprot_anuran-defense.csv',
 'uniprot_campde-inhibitor.csv',
 'uniprot_anti-neurotensive.csv',
 'uniprot_antitrypanosomic.csv',
 'uniprot_anticancer.csv',
 'uniprot_anorectic.csv',
 'uniprot_chemotactic.csv',
 'uniprot_targeting-GP.csv',
 'uniprot_Blood-Brain-Barrier.csv',
 'uniprot_antitumor.csv',
 'uniprot_cytotoxic.csv',
 'uniprot_activating-ubiquitin-mediated-proteolysis.csv',
 'uniprot_protein-kinase-c-inhibitor.csv',
 'uniprot_antihiv.csv',
 'uniprot_antidiabetic.csv',
 'uniprot_calpain-2-inhibitor.csv',
 'uniprot_antileishmania.csv',
 'uniprot_inhibitor.csv',
 'uniprot_antimicrobial.csv',
 'uniprot_antituberculosis.csv',
 'uniprot_antiviral.csv',
 'uniprot_wound-healing.csv',
 'uniprot_targeting-GN.csv',
 'uniprot_hmg-coa-reductase-inhibitor.csv',
 'uniprot_opioid-agonist.csv',
 'uniprot_toxicology.csv',
 'uniprot_antibacterial.csv',
 'uniprot_antibiofilm.csv',
 'uniprot_hypocholesterolemic

In [None]:
# Create an empty list to store the results
activity_data = []

# Iterate over each file in uniprot_search_files
for file in uniprot_search_files:
    # Load the CSV file into a DataFrame
    df = pd.read_csv(f"uniprot_search/{file}")
    
    # Extract the activity name from the file name
    activity = file.replace("uniprot_", "").replace(".csv", "").replace("-", " ").capitalize()
    
    # Count the number of sequences in the DataFrame
    sequence_count = len(df)
    
    # Append the activity and sequence count to the list
    activity_data.append({"activity": activity, "sequence_count": sequence_count})

# Create a DataFrame from the activity data
activity_df = pd.DataFrame(activity_data)

# Sort the DataFrame by sequence count in descending order
activity_df = activity_df.sort_values(by="sequence_count", ascending=False)

# Calculate the total number of sequences
total_sequences = activity_df["sequence_count"].sum()

# Add a row for the total sequences
activity_df = pd.concat([activity_df, pd.DataFrame([{"activity": "Total", "sequence_count": total_sequences}])], ignore_index=True)

# Display the resulting DataFrame
activity_df

Unnamed: 0,activity,sequence_count
0,Binding,411281
1,Inhibitor,18950
2,Surface binding,15663
3,Regulating,11361
4,Antimicrobial,5945
...,...,...
85,Dipeptidyl peptidaseiv,0
86,Antiamnestic,0
87,Edema inducer,0
88,Antiendotoxin,0


In [None]:
anti_activities_df = activity_df[activity_df['activity'].str.startswith('Anti', na=False)]
anti_activities_df

Unnamed: 0,activity,sequence_count
4,Antimicrobial,5945
6,Antiviral,3835
10,Antibacterial,2366
17,Antifungal,1209
19,Antitoxin,832
24,Antitumor,479
25,Anticancer,393
32,Antiparasitic,140
37,Antiangiogenic,86
41,Antioxidative,55


## Alphafold

In [1]:
from src.alphafold import AlphafoldInterface

In [2]:
fields = {
    "entry": "entryId",
    "gene": "gene",
    "tax_id": "taxId",
    "sequence": "uniprotSequence",
    "model_created": "modelCreatedDate",
    "latest_version": "latestVersion",
    "all_versions": "allVersions"
}

In [3]:
instance = AlphafoldInterface(
    structures=['pdb'],
    output_dir="results",
    fields_to_extract=fields,
)

In [4]:
print(instance.query_usage())

Usage: To fetch predictions, use the UniProt ID as the query.
        Example: 
            - fetch_single("P02666")
            - fetch_batch(["P02666", "P12345"])

        Also you can download structures by setting the `structures` parameter in the constructor.
        Example:
            - alphafold = AlphafoldInterface(structures=["pdb", "cif"])
            - prediction = alphafold.fetch_single("P02666")

        Available structures to download:
            - pdb: Protein Data Bank format
            - cif: Crystallographic Information File format
            - bcif: Binary Crystallographic Information File format
        

Example fields in the response:
	- entryId: str
	- gene: str
	- sequenceChecksum: str
	- sequenceVersionDate: str
	- uniprotAccession: str
	- uniprotId: str
	- uniprotDescription: str
	- taxId: int
	- organismScientificName: str
	- uniprotStart: int
	- uniprotEnd: int
	- uniprotSequence: str
	- modelCreatedDate: str
	- latestVersion: int
	- allVersions: list(

In [5]:
instance.fetch_single(
    query="P02666",
    parse=True
)

<class 'dict'>
{'entryId': 'AF-P02666-F1', 'gene': 'CSN2', 'sequenceChecksum': 'F0BBDD8148A238AE', 'sequenceVersionDate': '1989-07-01', 'uniprotAccession': 'P02666', 'uniprotId': 'CASB_BOVIN', 'uniprotDescription': 'Beta-casein', 'taxId': 9913, 'organismScientificName': 'Bos taurus', 'uniprotStart': 1, 'uniprotEnd': 224, 'uniprotSequence': 'MKVLILACLVALALARELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV', 'modelCreatedDate': '2022-06-01', 'latestVersion': 4, 'allVersions': [2, 3, 4], 'bcifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.bcif', 'cifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.cif', 'pdbUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.pdb', 'paeImageUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-predicted_aligned_error_v4.png', 'paeDocUrl': 'https://alphafold.e

[{'entry': 'AF-P02666-F1',
  'gene': 'CSN2',
  'tax_id': 9913,
  'sequence': 'MKVLILACLVALALARELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV',
  'model_created': '2022-06-01',
  'latest_version': 4,
  'all_versions': [2, 3, 4]}]

In [6]:
instance.fetch_batch(
    queries=["P02666", "Q9TSI0", "P33048", "P11839", "O15552", "P76011"],
    parse=True,
)

<class 'dict'>
{'entryId': 'AF-P02666-F1', 'gene': 'CSN2', 'sequenceChecksum': 'F0BBDD8148A238AE', 'sequenceVersionDate': '1989-07-01', 'uniprotAccession': 'P02666', 'uniprotId': 'CASB_BOVIN', 'uniprotDescription': 'Beta-casein', 'taxId': 9913, 'organismScientificName': 'Bos taurus', 'uniprotStart': 1, 'uniprotEnd': 224, 'uniprotSequence': 'MKVLILACLVALALARELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV', 'modelCreatedDate': '2022-06-01', 'latestVersion': 4, 'allVersions': [2, 3, 4], 'bcifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.bcif', 'cifUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.cif', 'pdbUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-model_v4.pdb', 'paeImageUrl': 'https://alphafold.ebi.ac.uk/files/AF-P02666-F1-predicted_aligned_error_v4.png', 'paeDocUrl': 'https://alphafold.e

[{'entry': 'AF-P02666-F1',
  'gene': 'CSN2',
  'tax_id': 9913,
  'sequence': 'MKVLILACLVALALARELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV',
  'model_created': '2022-06-01',
  'latest_version': 4,
  'all_versions': [2, 3, 4]},
 {'entry': 'AF-Q9TSI0-F1',
  'gene': 'CSN2',
  'tax_id': 89462,
  'sequence': 'MKVLILACLVALALARELEELNVPGEIVESLSSSEESITHINKKIEKFQSEEQQQMEDELQDKIHPFAQTQSLVYPFPGPIPKSLPQNIPPLTQTPVVVPPFLQPEIMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPPQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV',
  'model_created': '2022-06-01',
  'latest_version': 4,
  'all_versions': [2, 3, 4]},
 {'entry': 'AF-P33048-F1',
  'gene': 'CSN2',
  'tax_id': 9925,
  'sequence': 'MKVLILACLVALAIAREQEELNVVGETVESLSSSEESITHINKKIEKFQSEEQQQTEDELQDKIHPFAQAQSLVYPFTGPIPNSLPQNILPLTQTPVVVPPFLQPEIMGVPKVKETMVPKHKEMPFPKYPVEPFTESQS

In [None]:
instance.save(
    data = result,
    filename = "alphafold2_results",
    extension= "csv"
)

'results/alphafold2_results.csv'

## BioGRID

In [1]:
from src.biogrid import BioGRIDInterface

In [2]:
instance = BioGRIDInterface(
)

In [3]:
print(instance.query_usage())

Usage: To fetch interactions, use the BioGRID API with the following parameters.
        Example:
            - fetch_single(method="interactions", query={})
        Available methods: interactions, organisms, identifiers, evidence

Query Parameters:
	start: 0 (integer) - Start index for pagination
	max: 10000 (integer) - Maximum number of results to return
	interSpeciesExclude: False (boolean) - Include interactions between different species
	selfInteractionsExclude: False (boolean) - If ‘true’, interactions with one interactor will be excluded
	includeEvidence: False (boolean) - If ‘true’, evidence codes will be included in the results
	searchIds: False (boolean) - If ‘true’, the interactor ENTREZ_GENE, ORDERED LOCUS and SYSTEMATIC_NAME (orf) will be examined for a match with the geneList.
	format: tab2 (string) - Format of the response. Options are 'tab1','tab2', 'extendedTab2', 'count', 'json', 'jsonExtended'. Default is 'tab2'.

Example Query:

        {
            "accessKey": "

In [6]:
instance.get_dummy("87aee10241ab2fcc6e497352e5024380")

{'interactions': {'96795.BIOGRID_INTERACTION_ID': 'dict(int)',
  '96795.ENTREZ_GENE_A': 'dict(str)',
  '96795.ENTREZ_GENE_B': 'dict(str)',
  '96795.BIOGRID_ID_A': 'dict(int)',
  '96795.BIOGRID_ID_B': 'dict(int)',
  '96795.SYSTEMATIC_NAME_A': 'dict(str)',
  '96795.SYSTEMATIC_NAME_B': 'dict(str)',
  '96795.OFFICIAL_SYMBOL_A': 'dict(str)',
  '96795.OFFICIAL_SYMBOL_B': 'dict(str)',
  '96795.SYNONYMS_A': 'dict(str)',
  '96795.SYNONYMS_B': 'dict(str)',
  '96795.EXPERIMENTAL_SYSTEM': 'dict(str)',
  '96795.EXPERIMENTAL_SYSTEM_TYPE': 'dict(str)',
  '96795.PUBMED_AUTHOR': 'dict(str)',
  '96795.PUBMED_ID': 'dict(int)',
  '96795.ORGANISM_A': 'dict(int)',
  '96795.ORGANISM_B': 'dict(int)',
  '96795.THROUGHPUT': 'dict(str)',
  '96795.QUANTITATION': 'dict(str)',
  '96795.MODIFICATION': 'dict(str)',
  '96795.ONTOLOGY_TERMS': 'dict(dict)',
  '96795.QUALIFICATIONS': 'dict(str)',
  '96795.TAGS': 'dict(str)',
  '96795.SOURCEDB': 'dict(str)',
  '96795': 'dict'},
 'organisms': {'1140': 'str',
  '3055': 'str

In [7]:
instance.fetch_single(
    query={
        "accessKey": "87aee10241ab2fcc6e497352e5024380",
        "geneList": ["cdc27", "apc1", "apc2"],
        "taxId": "559292",
        "format": "json"
    },
    method="interactions",
    parse=True
)

[{'BIOGRID_INTERACTION_ID': 96795,
  'ENTREZ_GENE_A': '850818',
  'ENTREZ_GENE_B': '855549',
  'BIOGRID_ID_A': 31396,
  'BIOGRID_ID_B': 35658,
  'SYSTEMATIC_NAME_A': 'YLR127C',
  'SYSTEMATIC_NAME_B': 'YNL172W',
  'OFFICIAL_SYMBOL_A': 'APC2',
  'OFFICIAL_SYMBOL_B': 'APC1',
  'SYNONYMS_A': 'RSI1|TID2|anaphase promoting complex subunit 2|L000003970|L000004348',
  'SYNONYMS_B': 'anaphase promoting complex subunit 1|L000004053',
  'EXPERIMENTAL_SYSTEM': 'Affinity Capture-MS',
  'EXPERIMENTAL_SYSTEM_TYPE': 'physical',
  'PUBMED_AUTHOR': 'Gavin AC (2002)',
  'PUBMED_ID': 11805826,
  'ORGANISM_A': 559292,
  'ORGANISM_B': 559292,
  'THROUGHPUT': 'High Throughput',
  'QUANTITATION': '-',
  'MODIFICATION': '-',
  'ONTOLOGY_TERMS': {},
  'QUALIFICATIONS': '-',
  'TAGS': '-',
  'SOURCEDB': 'BIOGRID'},
 {'BIOGRID_INTERACTION_ID': 96796,
  'ENTREZ_GENE_A': '850818',
  'ENTREZ_GENE_B': '853846',
  'BIOGRID_ID_A': 31396,
  'BIOGRID_ID_B': 34109,
  'SYSTEMATIC_NAME_A': 'YLR127C',
  'SYSTEMATIC_NAME_B': 

In [None]:
instance.fetch_batch(
    queries=[
        {
            "accessKey": "",
            "geneList": ["P53", "CDK2", "BRCA1"],
            "format": "json"
        },
        {
            "accessKey": "",
            "geneList": ["BRCA2", "ATM", "CHEK2"],
            "format": "json"
        }
    ],
    method="interactions"
)

[{'2368': {'BIOGRID_INTERACTION_ID': 2368,
   'ENTREZ_GENE_A': '672',
   'ENTREZ_GENE_B': '466',
   'BIOGRID_ID_A': 107140,
   'BIOGRID_ID_B': 106956,
   'SYSTEMATIC_NAME_A': '-',
   'SYSTEMATIC_NAME_B': '-',
   'OFFICIAL_SYMBOL_A': 'BRCA1',
   'OFFICIAL_SYMBOL_B': 'ATF1',
   'SYNONYMS_A': 'BRCAI|BRCC1|BROVCA1|FANCS|IRIS|PNCA4|PPP1R53|PSCP|RNF53',
   'SYNONYMS_B': 'EWS-ATF1|FUS/ATF-1|TREB36',
   'EXPERIMENTAL_SYSTEM': 'Two-hybrid',
   'EXPERIMENTAL_SYSTEM_TYPE': 'physical',
   'PUBMED_AUTHOR': 'Houvras Y (2000)',
   'PUBMED_ID': 10945975,
   'ORGANISM_A': 9606,
   'ORGANISM_B': 9606,
   'THROUGHPUT': 'Low Throughput',
   'QUANTITATION': '-',
   'MODIFICATION': '-',
   'ONTOLOGY_TERMS': {},
   'QUALIFICATIONS': '-',
   'TAGS': '-',
   'SOURCEDB': 'BIOGRID'},
  '2398': {'BIOGRID_INTERACTION_ID': 2398,
   'ENTREZ_GENE_A': '672',
   'ENTREZ_GENE_B': '4436',
   'BIOGRID_ID_A': 107140,
   'BIOGRID_ID_B': 110573,
   'SYSTEMATIC_NAME_A': '-',
   'SYSTEMATIC_NAME_B': '-',
   'OFFICIAL_SYMBOL_A'

## Brenda

In [1]:
from src.brenda import BrendaInstance

In [2]:
instance = BrendaInstance(
    email="diego.fernandez@umag.cl",
    password="Scanfpassword123",
)

In [3]:
instance.fetch_single(
    query=
        {
            "ecNumber": "1.1.1.1", 
            "organism": "Homo sapiens",
        }, 
    operation="getTemperatureOptimum",
)

[{'literature': [655197],
  'temperatureOptimumMaximum': None,
  'temperatureOptimum': '25',
  'commentary': 'assay at',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.1'},
 {'literature': [656058],
  'temperatureOptimumMaximum': None,
  'temperatureOptimum': '25',
  'commentary': 'assay at',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.1'},
 {'literature': [654370],
  'temperatureOptimumMaximum': None,
  'temperatureOptimum': '25',
  'commentary': 'assay at',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.1'},
 {'literature': [655479],
  'temperatureOptimumMaximum': None,
  'temperatureOptimum': '25',
  'commentary': 'assay at',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.1'},
 {'literature': [722073],
  'temperatureOptimumMaximum': None,
  'temperatureOptimum': '25',
  'commentary': 'assay at',
  'organism': 'Homo sapiens',
  'ecNumber': '1.1.1.1'},
 {'literature': [655206],
  'temperatureOptimumMaximum': '37',
  'temperatureOptimum': '30',
  'commentary': 'assay

In [4]:
print(instance.query_usage())

Usage: To fetch data from BRENDA, use the following parameters.
        Example:
            - fetch(query={}, operations=["getKmValue", "getIc50Value"])
        Available operations: getKmValue, getIc50Value, getKcatKmValue, getKiValue, getPhRange, getPhOptimum, getPhStability, getCofactor, getTemperatureOptimum, getTemperatureStability, getTemperatureRange

For more information about each operation, please refer to the BRENDA documentation.
Or use `show_operation({operation_name})` to see the parameters required for each operation.


In [4]:
print(instance.show_operation("getTemperatureOptimum"))

Parameters for getTemperatureOptimum: ecNumber, temperatureOptimum, temperatureOptimumMaximum, commentary, organism, literature


In [4]:
instance.get_dummy()

{'getKmValue': {'literature': 'list(int)',
  'substrate': 'str',
  'kmValue': 'str',
  'kmValueMaximum': 'NoneType',
  'commentary': 'NoneType',
  'organism': 'str',
  'ecNumber': 'str',
  'ligandStructureId': 'int'},
 'getIc50Value': {},
 'getKcatKmValue': {},
 'getKiValue': {'kiValueMaximum': 'NoneType',
  'literature': 'list(int)',
  'kiValue': 'str',
  'commentary': 'NoneType',
  'inhibitor': 'str',
  'organism': 'str',
  'ecNumber': 'str',
  'ligandStructureId': 'int'},
 'getPhRange': {},
 'getPhOptimum': {},
 'getPhStability': {},
 'getCofactor': {'literature': 'list(int)',
  'commentary': 'str',
  'ecNumber': 'str',
  'organism': 'str',
  'cofactor': 'str',
  'ligandStructureId': 'int'},
 'getTemperatureOptimum': {},
 'getTemperatureStability': {},
 'getTemperatureRange': {}}

## Gen Ontology

In [1]:
from src.genontology import GenOntologyInterface

In [2]:
instance = GenOntologyInterface(
    #fields_to_extract = ["goid", "label"]
)

In [3]:
instance.get_dummy()

{'ontology-term': {'goid': 'str',
  'label': 'str',
  'definition': 'str',
  'comment': 'str',
  'creation_date': 'str',
  'synonyms': 'list(str)',
  'relatedSynonyms': 'list(str)',
  'alternativeIds': 'list(str)',
  'xrefs': 'list(str)',
  'subsets': 'list(str)'},
 'go': {'goid': 'str',
  'label': 'str',
  'definition': 'str',
  'comment': 'str',
  'creation_date': 'str',
  'synonyms': 'list(str)',
  'relatedSynonyms': 'list(str)',
  'alternativeIds': 'list(str)',
  'xrefs': 'list(str)',
  'subsets': 'list(str)'}}

In [6]:
instance.fetch_batch(
    method="ontology-term",
    queries=["GO:0008150", "GO:0006915"],
    option=None,
    parse=True
)

[{'goid': 'GO:0008150', 'label': 'biological_process'},
 {'goid': 'GO:0006915', 'label': 'apoptotic process'}]

In [None]:
instance.fetch_batch(
    method="ontology-term",
    queries=["GO:0008150", "GO:0006915"],
    option=None,
    parse=True
)

In [None]:
instance.fetch_to_dataframe(
    method= "ontology-term",
    query= "GO:0008150", 
    option=None,
    look_for_relationships=True
)

Fetching data from https://api.geneontology.org/api/ontology/term/GO%3A0008150
Fetching data from https://api.geneontology.org/api/ontology/term/GO%3A0008150/graph


Unnamed: 0,goid,label,relationships
0,GO:0008150,biological_process,"[GO:0044848, GO:0044419, GO:0008150, GO:006500..."


## Interpro

In [1]:
from src.interpro import InterproInstance

In [5]:
instance = InterproInstance(
    #fields_to_extract={
    #    "1": "accession",
    #    "2": "name"
    #}
)

query = {
    "db" : "InterPro",
    "modifiers" : {},
}

In [6]:
instance.fetch_single(query=query, method="entry", pages_to_fetch=1, parse=True)

[[{'accession': 'IPR000001',
   'name': 'Kringle',
   'source_database': 'interpro',
   'type': 'domain',
   'integrated': None,
   'member_databases': {'cdd': {'cd00108': 'KR'},
    'profile': {'PS50070': 'Kringle domain profile'},
    'pfam': {'PF00051': 'Kringle domain'},
    'smart': {'SM00130': 'Kringle domain'}},
   'go_terms': None}],
 [{'accession': 'IPR000003',
   'name': 'Retinoid X receptor/HNF4',
   'source_database': 'interpro',
   'type': 'family',
   'integrated': None,
   'member_databases': {'prints': {'PR00545': 'Retinoid X receptor'}},
   'go_terms': [{'identifier': 'GO:0003677',
     'name': 'DNA binding',
     'category': {'code': 'F', 'name': 'molecular_function'}},
    {'identifier': 'GO:0003707',
     'name': 'nuclear steroid receptor activity',
     'category': {'code': 'F', 'name': 'molecular_function'}},
    {'identifier': 'GO:0008270',
     'name': 'zinc ion binding',
     'category': {'code': 'F', 'name': 'molecular_function'}},
    {'identifier': 'GO:00063

In [None]:
instance.fetch_to_dataframe(
    [
        {
            "type": "entry",
            "db": "InterPro",
            "entry_integration": "",
            "modifiers": {
                "go_term": "GO:0004867"
            },
            "filter_type": "protein",
            "filter_db": "UniProt",
            "filter_value": "P05067"
        }
    ]
)

17 records found


Unnamed: 0,metadata,proteins
0,"{'accession': 'IPR002223', 'name': 'Pancreatic...","[{'accession': 'p05067', 'protein_length': 770..."
1,"{'accession': 'IPR008154', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
2,"{'accession': 'IPR008155', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
3,"{'accession': 'IPR011178', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
4,"{'accession': 'IPR011993', 'name': 'PH-like do...","[{'accession': 'p05067', 'protein_length': 770..."
5,"{'accession': 'IPR013803', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
6,"{'accession': 'IPR015849', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
7,"{'accession': 'IPR019543', 'name': 'Beta-amylo...","[{'accession': 'p05067', 'protein_length': 770..."
8,"{'accession': 'IPR019744', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."
9,"{'accession': 'IPR019745', 'name': 'Amyloidoge...","[{'accession': 'p05067', 'protein_length': 770..."


## PDB

In [6]:
from src.proteindatabank import PDBInterface

In [7]:
fields = {
    "rcsb_id": "rcsb_id",
    "2": "rcsb_comp_model_provenance",
    "3": "rcsb_entry_info.branched_molecular_weight_minimum",
    "4": "rcsb_entry_info.resolution_combined",
    "5": "rcsb_entry_info.experimental_method",
    "6": "rcsb_entry_info.diffrn_resolution_high.value",
}
fields = ["rcsb_id", "rcsb_entry_info.experimental_method"]

In [None]:
instance = PDBInterface(
    max_workers=2,
    fields_to_extract=fields, 
    download_structures=True,
    return_data_list=["rcsb_id", "rcsb_comp_model_provenance", "rcsb_entry_info"],
    output_dir="results"
)

In [None]:
instance.fetch_batch(
    queries=["4HHB", "1A8I", "1A8J", "1A8K", "1A8L", "1A8M"],
    parse=True
)

Info: Downloading 4HHB in pdb format...
Info: Downloading 1A8I in pdb format...
Info: Downloading 1A8J in pdb format...
Info: Downloading 1A8K in pdb format...
Info: Downloading 1A8L in pdb format...
Info: Downloading 1A8M in pdb format...


[{'rcsb_id': '4HHB', 'rcsb_entry_info.experimental_method': 'X-ray'},
 {'rcsb_id': '1A8I', 'rcsb_entry_info.experimental_method': 'X-ray'},
 {'rcsb_id': '1A8J', 'rcsb_entry_info.experimental_method': 'X-ray'},
 {'rcsb_id': '1A8K', 'rcsb_entry_info.experimental_method': 'X-ray'},
 {'rcsb_id': '1A8L', 'rcsb_entry_info.experimental_method': 'X-ray'},
 {'rcsb_id': '1A8M', 'rcsb_entry_info.experimental_method': 'X-ray'}]

In [None]:
instance.get_dummy()

Info: Downloading 4HHB in pdb format...


{'audit_author.name': 'str',
 'audit_author.pdbx_ordinal': 'int',
 'cell.angle_alpha': 'dict(float)',
 'cell.angle_beta': 'dict(float)',
 'cell.angle_gamma': 'dict(float)',
 'cell.length_a': 'dict(float)',
 'cell.length_b': 'dict(float)',
 'cell.length_c': 'dict(float)',
 'cell.zpdb': 'dict(int)',
 'cell': 'dict',
 'citation.country': 'str',
 'citation.id': 'str',
 'citation.journal_abbrev': 'str',
 'citation.journal_id_astm': 'str',
 'citation.journal_id_csd': 'str',
 'citation.journal_id_issn': 'str',
 'citation.journal_volume': 'str',
 'citation.page_first': 'str',
 'citation.page_last': 'str',
 'citation.pdbx_database_id_doi': 'str',
 'citation.pdbx_database_id_pub_med': 'int',
 'citation.rcsb_authors': 'list(str)',
 'citation.rcsb_is_primary': 'str',
 'citation.rcsb_journal_abbrev': 'str',
 'citation.title': 'str',
 'citation.year': 'int',
 'database2.database_code': 'str',
 'database2.database_id': 'str',
 'database2.pdbx_doi': 'str',
 'database2.pdbx_database_accession': 'str',


In [None]:
instance.fetch_single(
    "4HHB",
    parse=True
)

Info: Downloading 4HHB in pdb format...


{'rcsb_id': '4HHB', 'rcsb_entry_info.experimental_method': 'X-ray'}

## Refseq

In [None]:
from src.refseq import RefSeqInterface

In [None]:
features = {
    "1": "GBSeq_locus",
    "2": "GBSeq_length",
    "3": "GBSeq_keywords",
    "4": "GBSeq_feature-table.GBFeature_intervals",
    "5": "GBSeq_feature-table.GBFeature_intervals.GBInterval_from"
}

In [None]:
features = [
    "GBSeq_locus",
    "GBSeq_length",
    "GBSeq_keywords",
    "GBSeq_feature-table.GBFeature_intervals",
    "GBSeq_feature-table.GBFeature_intervals.GBInterval_from"
]

In [None]:
instance = RefSeqInterface(
    max_workers=2,
    #fields_to_extract=features,
)

In [None]:
instance.download_from_refseq_ids(
    ["XP_010804480.1", "XP_010804481.1", "XP_010804482.1"]
).to_csv("results/refseq.csv", index=False)

## Reactome

In [None]:
from src.reactome import ReactomeInstance
from src.utils import get_feature_keys
import pandas as pd

instance = ReactomeInstance(
    max_workers=2
)
get_feature_keys(instance.fetch("R-DME-1834941")[0])

{'dbId': 'int',
 'displayName': 'str',
 'stId': 'str',
 'stIdVersion': 'str',
 'isInDisease': 'bool',
 'isInferred': 'bool',
 'name': 'list(str)',
 'releaseDate': 'str',
 'speciesName': 'str',
 'category': 'str',
 'className': 'str',
 'schemaClass': 'str'}

In [None]:
instance = ReactomeInstance()
response = instance.fetch("R-DME-1834941")
pd.DataFrame(instance.parse(response))

Unnamed: 0,dbId,displayName,stId,stIdVersion,isInDisease,isInferred,name,releaseDate,speciesName,category,className,schemaClass,hasDiagram,hasEHLD
0,10790592,STING binds cyclic GMP-AMP,R-DME-3244643,R-DME-3244643.1,False,True,[STING binds cyclic GMP-AMP],2025-03-26,Drosophila melanogaster,omitted,Reaction,BlackBoxEvent,,
1,10819344,"DDX41 binds bacterial c-di-AMP, c-di-GMP",R-DME-9013869,R-DME-9013869.1,False,True,"[DDX41 binds bacterial c-di-AMP, c-di-GMP ]",2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,
2,10790177,STING dimerization,R-DME-3134800,R-DME-3134800.1,False,True,[STING dimerization],2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,
3,10788129,STING binds c-di-GMP,R-DME-2396009,R-DME-2396009.1,False,True,[STING binds c-di-GMP],2025-03-26,Drosophila melanogaster,omitted,Reaction,BlackBoxEvent,,
4,10829117,STAT6-mediated induction of chemokines,R-DME-3249367,R-DME-3249367.1,False,True,[STAT6-mediated induction of chemokines],2025-03-26,Drosophila melanogaster,,Pathway,Pathway,False,False
5,10790821,STING recruits TBK1 and STAT6,R-DME-3249378,R-DME-3249378.1,False,True,[STING recruits TBK1 and STAT6],2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,
6,10790823,p-S407-STAT6 is phosphorylated at Tyr641,R-DME-3249379,R-DME-3249379.1,False,True,[p-S407-STAT6 is phosphorylated at Tyr641],2025-03-26,Drosophila melanogaster,transition,Reaction,Reaction,,
7,10790813,TBK1 phosphorylates STAT6 at Ser407,R-DME-3249371,R-DME-3249371.1,False,True,[TBK1 phosphorylates STAT6 at Ser407],2025-03-26,Drosophila melanogaster,transition,Reaction,Reaction,,
8,10790798,"p-S407,Y641-STAT6 dimer migrates to the nucleus",R-DME-3249370,R-DME-3249370.1,False,True,"[p-S407,Y641-STAT6 dimer migrates to the nucleus]",2025-03-26,Drosophila melanogaster,transition,Reaction,Reaction,,
9,10790815,"p-S407,Y641-STAT6 is dimerized",R-DME-3249372,R-DME-3249372.1,False,True,"[p-S407,Y641-STAT6 is dimerized]",2025-03-26,Drosophila melanogaster,binding,Reaction,Reaction,,


## STRING

In [None]:
from src.stringdb import StringInterface

In [None]:
instance = StringInterface()
test = instance.fetch(
    outfmt="json",
    method="interaction_partners",
    params={
        "identifiers": ["p53"],
    }
)

In [None]:
instance.parse(test, fmt="json")

[{'stringId_A': '7227.FBpp0083753',
  'stringId_B': '7227.FBpp0110174',
  'preferredName_A': 'p53',
  'preferredName_B': 'tefu',
  'ncbiTaxonId': 7227,
  'score': 0.985,
  'nscore': 0,
  'fscore': 0,
  'pscore': 0,
  'ascore': 0,
  'escore': 0.6,
  'dscore': 0.9,
  'tscore': 0.659},
 {'stringId_A': '7227.FBpp0083753',
  'stringId_B': '7227.FBpp0074047',
  'preferredName_A': 'p53',
  'preferredName_B': 'mei-41',
  'ncbiTaxonId': 7227,
  'score': 0.962,
  'nscore': 0,
  'fscore': 0,
  'pscore': 0,
  'ascore': 0.071,
  'escore': 0,
  'dscore': 0.9,
  'tscore': 0.627},
 {'stringId_A': '7227.FBpp0083753',
  'stringId_B': '7227.FBpp0304253',
  'preferredName_A': 'p53',
  'preferredName_B': 'hpo',
  'ncbiTaxonId': 7227,
  'score': 0.94,
  'nscore': 0,
  'fscore': 0,
  'pscore': 0,
  'ascore': 0.062,
  'escore': 0.225,
  'dscore': 0.9,
  'tscore': 0.28},
 {'stringId_A': '7227.FBpp0083753',
  'stringId_B': '7227.FBpp0080860',
  'preferredName_A': 'p53',
  'preferredName_B': 'lok',
  'ncbiTaxonI

In [None]:
instance.fetch_to_dataframe(
    outfmt="json",
    method="interaction_partners",
    params={
        "identifiers": ["p53", "cdk2"],
        "species": 9606,
    }
)

Unnamed: 0,stringId_A,stringId_B,preferredName_A,preferredName_B,ncbiTaxonId,score,nscore,fscore,pscore,ascore,escore,dscore,tscore
0,9606.ENSP00000266970,9606.ENSP00000481380,CDK2,CCNA2,9606,0.999,0,0.003,0.0,0.453,0.999,0.9,0.999
1,9606.ENSP00000266970,9606.ENSP00000413720,CDK2,CDKN1C,9606,0.999,0,0.000,0.0,0.085,0.859,0.9,0.970
2,9606.ENSP00000266970,9606.ENSP00000228872,CDK2,CDKN1B,9606,0.999,0,0.000,0.0,0.085,0.999,0.9,0.999
3,9606.ENSP00000266970,9606.ENSP00000429089,CDK2,CCNE2,9606,0.999,0,0.000,0.0,0.200,0.995,0.9,0.996
4,9606.ENSP00000266970,9606.ENSP00000255465,CDK2,CCNA1,9606,0.999,0,0.000,0.0,0.292,0.942,0.9,0.999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,9606.ENSP00000269305,9606.ENSP00000310928,TP53,PPARD,9606,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
2957,9606.ENSP00000269305,9606.ENSP00000419945,TP53,ERVW-1,9606,0.400,0,0.000,0.0,0.000,0.000,0.0,0.400
2958,9606.ENSP00000269305,9606.ENSP00000462980,TP53,TAF4B,9606,0.400,0,0.000,0.0,0.000,0.000,0.4,0.000
2959,9606.ENSP00000269305,9606.ENSP00000431885,TP53,TYK2,9606,0.400,0,0.000,0.0,0.083,0.000,0.0,0.372


## KEGG

In [None]:
from src.kegg import KEGGInterface

instance = KEGGInterface(
    fields_to_extract=["ENTRY", "NAME", "PATHWAY"]
)
instance.fetch(
    method="get",
    query=["hsa:10458", "ece:Z5100"]
)

<Response [200]>

In [None]:
instance.fetch_to_dataframe(
    method="get",
    query=["hsa:10458", "ece:Z5100"]
    
)

Unnamed: 0,ENTRY,NAME,PATHWAY
0,10458 CDS T01001,(RefSeq) BAR/IMD domain containing adaptor pro...,hsa04520 Adherens junction hsa04810 Regulati...
1,Z5100 CDS T00044,(GenBank) espF,ece05130 Pathogenic Escherichia coli infection


### Show graph test

In [None]:
import json

# Convierte a formato Cytoscape.js
def convert_to_cytoscape_format(graph_json):
    elements = []

    for node in graph_json['nodes']:
        elements.append({
            "data": {
                "id": node["id"],
                "label": node["lbl"]
            }
        })

    for edge in graph_json['edges']:
        elements.append({
            "data": {
                "source": edge["sub"],
                "target": edge["obj"],
                "label": edge["pred"]
            }
        })

    return elements

# Genera el HTML
def create_cytoscape_html(graph_json):
    elements = convert_to_cytoscape_format(graph_json)

    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Grafo GO Interactivo</title>
        <script src="https://unpkg.com/cytoscape@3.19.0/dist/cytoscape.min.js"></script>
        <script src="https://unpkg.com/dagre@0.8.5/dist/dagre.min.js"></script>
        <script src="https://unpkg.com/cytoscape-dagre@2.3.2/cytoscape-dagre.js"></script>
        <style>
            html, body {{
                margin: 0;
                padding: 0;
                height: 100%;
                width: 100%;
                font-family: Arial, sans-serif;
            }}
            #cy {{
                height: 100%;
                width: 100%;
                display: block;
            }}
        </style>
    </head>
    <body>
        <div id="cy"></div>
        <script>
            cytoscape.use(cytoscapeDagre);

            var cy = cytoscape({{
                container: document.getElementById('cy'),
                elements: {json.dumps(elements)},
                style: [
                    {{
                        selector: 'node',
                        style: {{
                            'shape': 'roundrectangle',
                            'label': 'data(label)',
                            'text-valign': 'center',
                            'text-halign': 'center',
                            'background-color': '#AED6F1',
                            'color': '#1B2631',
                            'font-size': '8px',
                            'width': 'label',
                            'height': 'label',
                            'padding': '6px',
                            'border-width': 1,
                            'border-color': '#2980B9'
                        }}
                    }},
                    {{
                        selector: 'edge',
                        style: {{
                            'width': 2,
                            'label': 'data(label)',
                            'line-color': '#B2BABB',
                            'target-arrow-color': '#B2BABB',
                            'target-arrow-shape': 'triangle',
                            'curve-style': 'bezier',
                            'font-size': '7px',
                            'color': '#5D6D7E',
                            'text-background-opacity': 1,
                            'text-background-color': '#fff',
                            'text-background-shape': 'roundrectangle',
                            'text-background-padding': 2
                        }}
                    }}
                ],
                layout: {{
                    name: 'dagre',
                    rankDir: 'TB',
                    nodeSep: 30,
                    edgeSep: 10,
                    rankSep: 50
                }},
                zoomingEnabled: true,
                userZoomingEnabled: true,
                boxSelectionEnabled: false
            }});
        </script>
    </body>
    </html>
    """
    return html

# Guardar archivo HTML
def save_graph_to_html(graph_json, output_file='cytoscape_graph.html'):
    html = create_cytoscape_html(graph_json)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f"✅ Grafo guardado en: {output_file}")

# Ejecutar
save_graph_to_html(instance.parse(response)["topology_graph_json"], 'mi_grafo.html')

✅ Grafo guardado en: mi_grafo.html
