In [None]:
import requests, math
from collections import defaultdict
import pandas as pd

INTERPRO_URL = "https://www.ebi.ac.uk/interpro/api"
CATH_URL = f"https://www.cathdb.info/version/v4_3_0/api/rest/uniprot_to_funfam"

In [None]:
import requests, math
from collections import defaultdict
import pandas as pd

INTERPRO_URL = "https://www.ebi.ac.uk/interpro/api"
CATH_URL = f"https://www.cathdb.info/version/v4_3_0/api/rest/uniprot_to_funfam"


KNOWN_DATABASES = ["InterPro", "cathgene3d", "cdd", "HAMAP", "panther", "Pfam", "PIRSF", "PRINTS", "ssf", "antifam",
                        "PROSITE" "Patterns", "PROSITE", "profile", "smart", "SFLD", "SUPERFAMILY", "ncbifam"]
# ================================
# FINAL CONSOLIDATED FUNCTIONS
# ================================

def _get_protein_length(uniprot_acc):
    """
    Fetch protein length from UniProt API.
    Returns integer length or None if not found.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_acc}.json"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        seq = data.get("sequence", {}).get("value")
        if seq:
            return len(seq)
    except Exception:
        pass
    return -1


# Helper function to extract domains from a specific source
def _extract_domains_from_source(uniprot_acc, source, representative_only=False):
    """
    Extract domains from a specific InterPro database source with pagination support.

    Parameters:
    -----------
    uniprot_acc : str
        UniProt accession
    source : str
        Database source ('pfam', 'smart', etc.)
    representative_only : bool
        If True, only return representative domain hits
    """
    url = f"{INTERPRO_URL}/entry/{source}/protein/UniProt/{uniprot_acc}?page_size=200"
    domains = []

    while url:
        try:
            response = requests.get(url, timeout=30)
            if response.status_code == 404:
                break
            response.raise_for_status()
            data = response.json()

            results = data.get("results", []) if isinstance(data, dict) else data

            for entry in results:
                # Get entry metadata
                meta = entry.get("metadata", {})
                entry_acc = meta.get("accession")
                entry_name = meta.get("name", "")
                entry_type = meta.get("type")
                source_db = meta.get("source_database")

                # Find target protein and extract coordinates
                for protein in entry.get("proteins", []):
                    protein_acc = protein.get("accession")
                    if protein_acc and protein_acc.upper() == uniprot_acc.upper():
                        # Check for matches (entry_protein_locations)
                        for location in protein.get("entry_protein_locations", []):
                            # KEY: representative flag is at the LOCATION level, not fragment level
                            is_representative = bool(location.get("representative"))

                            # KEY FILTER: Skip non-representative locations if filtering
                            if representative_only and not is_representative:
                                continue

                            for fragment in location.get("fragments", []):
                                # Skip domains with null/None/empty names
                                if entry_name and entry_name != '':
                                    domains.append({
                                        "accession": entry_acc,
                                        "name": entry_name,
                                        "type": entry_type,
                                        "source": source_db,
                                        "start": fragment.get("start"),
                                        "end": fragment.get("end"),
                                        "score": location.get("score"),
                                        "representative": is_representative
                                    })

            # Follow pagination
            url = data.get("next") if isinstance(data, dict) else None

        except requests.exceptions.RequestException:
            break

    return domains

def get_interpro_domains(uniprot_acc, source=None, representative_only=False):
    """
    Get protein domain coordinates from InterPro API.

    Parameters:
    -----------
    uniprot_acc : str
        UniProt accession (e.g., 'P28482')
    source : str, list, or None
        - str: Query a specific database ('pfam', 'smart', 'prosite', etc.)
        - list: Query multiple specific databases (['pfam', 'smart'])
        - None: Query all available databases (default: representative hits only)
    representative_only : bool or None
        - True: Only return representative domain hits (reduces redundancy)
        - False: Return all hits
        - None: Auto-decide (True if source=None, False if source is specified)

    Returns:
    --------
    list of dict
        Each domain contains: accession, name, type, source, start, end, score, representative
        Sorted by start coordinate, then end coordinate
    """

    # Known InterPro member databases
    #KNOWN_DATABASES = ["pfam", "smart", "prosite", "prints", "panther", "cdd", "superfamily",
    #                  "tigrfams", "gene3d", "hamap", "pirsf", "profile", "cath3d"]

    if source:
        # Handle list of databases
        if isinstance(source, list):
            all_domains = []
            available_dbs = []

            print(f"Querying specified databases for {uniprot_acc}: {', '.join(source)}")
            if representative_only:
                print("(filtering for representative hits only)")

            for db in source:
                domains = _extract_domains_from_source(uniprot_acc, db, representative_only)
                if domains:
                    available_dbs.append(f"{db}({len(domains)})")
                    all_domains.extend(domains)

            if available_dbs:
                print(f"Found data in: {', '.join(available_dbs)}")
            else:
                print(f"No data found in specified databases")

            # Sort by start, then end coordinate
            return sorted(all_domains, key=lambda x: (x["start"] if x["start"] is not None else float('inf'),
                                                       x["end"] if x["end"] is not None else float('inf')))

        # Handle single database (string)
        else:
            domains = _extract_domains_from_source(uniprot_acc, source, representative_only)
            # Sort by start, then end coordinate
            return sorted(domains, key=lambda x: (x["start"] if x["start"] is not None else float('inf'),
                                                   x["end"] if x["end"] is not None else float('inf')))

    else:
        # Query all known databases (default: representative only)
        all_domains = []
        available_dbs = []

        print(f"Scanning all databases for {uniprot_acc}...")
        if representative_only:
            print("(filtering for representative hits only)")

        for db in KNOWN_DATABASES:
            domains = _extract_domains_from_source(uniprot_acc, db, representative_only)
            if domains:
                available_dbs.append(f"{db}({len(domains)})")
                all_domains.extend(domains)

        print(f"Found data in: {', '.join(available_dbs)}")
        # Sort by start, then end coordinate
        return sorted(all_domains, key=lambda x: (x["start"] if x["start"] is not None else float('inf'),
                                                   x["end"] if x["end"] is not None else float('inf')))

def summarize_protein_domains_dict(uniprot_acc, source=None, representative_only=True):
    """
    Return a dict with protein name, domains (with coords and scores), and protein length.

    Parameters:
    -----------
    uniprot_acc : str
        UniProt accession
    source : str, list, or None
        - str: Query a specific database
        - list: Query multiple specific databases
        - None: Query all available databases (default: representative only)
    representative_only : bool or None
        - True: Only return representative domain hits
        - False: Return all hits

    Returns:
    --------
    dict with keys: uniprot_acc, protein_length, domains (sorted by start coordinate), available_databases
    """
    # Get protein length
    length = _get_protein_length(uniprot_acc)
    if length <= 0:
        return None

    # Get domains (this will scan databases if source=None)
    # Domains are already sorted by get_interpro_domains
    domains = get_interpro_domains(uniprot_acc, source=source, representative_only=representative_only)

    # Build available databases dict from domains already retrieved
    if source:
        if isinstance(source, list):
            databases = {}
            for domain in domains:
                db = domain.get('source')
                if db:
                    databases[db] = databases.get(db, 0) + 1
        else:
            databases = {source: len(domains)}
    else:
        # Extract database info from the domains we already have
        databases = {}
        for domain in domains:
            db = domain.get('source')
            if db:
                databases[db] = databases.get(db, 0) + 1

    # Build output dict
    result = {
        "uniprot_acc": uniprot_acc,
        "protein_length": length,
        "domains": domains,
        "available_databases": databases
    }
    return result


In [22]:
def compute_linker_regions(domains, protein_length=None):
    """
    Compute linker regions between domains.

    Parameters:
    -----------
    domains : list of dict
        List of domain dictionaries with 'start' and 'end' keys
        (e.g., result['domains'] from summarize_protein_domains_dict)
    protein_length : int, optional
        Total protein length. If provided, includes N-terminal and C-terminal regions.

    Returns:
    --------
    list of dict
        Each linker region contains: start, end
        Sorted by start coordinate
    """
    if not domains:
        if protein_length:
            return [{"start": 1, "end": protein_length}]
        return []

    # Sort domains by start position
    sorted_domains = sorted(domains, key=lambda x: (x["start"] if x["start"] is not None else float('inf')))

    linkers = []

    # Check for N-terminal linker (before first domain)
    first_domain = sorted_domains[0]
    if first_domain["start"] > 1:
        linker_start = 1
        linker_end = first_domain["start"] - 1
        # Only add if start < end (length > 0)
        if linker_start < linker_end:
            linkers.append({
                "start": linker_start,
                "end": linker_end
            })

    # Find gaps between consecutive domains
    for i in range(len(sorted_domains) - 1):
        current_domain = sorted_domains[i]
        next_domain = sorted_domains[i + 1]

        # Check if there's a gap between current domain end and next domain start
        gap_start = current_domain["end"] + 1
        gap_end = next_domain["start"] - 1

        # Only add if start < end (length > 0)
        if gap_start < gap_end:
            linkers.append({
                "start": gap_start,
                "end": gap_end
            })

    # Check for C-terminal linker (after last domain)
    if protein_length:
        last_domain = sorted_domains[-1]
        if last_domain["end"] < protein_length:
            linker_start = last_domain["end"] + 1
            linker_end = protein_length
            # Only add if start < end (length > 0)
            if linker_start < linker_end:
                linkers.append({
                    "start": linker_start,
                    "end": linker_end
                })

    return linkers

def add_linkers_to_result(result):
    """
    Add linker regions to a result dict from summarize_protein_domains_dict.

    Parameters:
    -----------
    result : dict
        Result dictionary from summarize_protein_domains_dict with keys:
        'domains', 'protein_length', etc.

    Returns:
    --------
    dict
        Same result dict with added 'linkers' key containing linker regions
    """
    linkers = compute_linker_regions(result['domains'], result.get('protein_length'))
    result['linkers'] = linkers
    return result

In [None]:
workdir = "~/Desktop/work/protein_linkers"
protein_file = f"{workdir}/proteins.tsv"
protein_df = pd.read_csv(protein_file, sep="\t")

protein_list = protein_df["Accession"].to_list()

domains = []

for protein in protein_list:
    res = summarize_protein_domains_dict(protein, source=None, representative_only=True)
    if res:
        add_linkers_to_result(res)
        domains.append(res)

print(domains)

In [24]:
for accession in domains:
    add_linkers_to_result(accession)
    if len(accession["linkers"]) > 0:
        print(accession["linkers"])

[{'start': 638, 'end': 667}, {'start': 986, 'end': 1027}]
[{'start': 1, 'end': 2}, {'start': 31, 'end': 34}, {'start': 60, 'end': 94}, {'start': 295, 'end': 318}, {'start': 361, 'end': 393}]
[{'start': 1, 'end': 18}, {'start': 316, 'end': 360}]
[{'start': 167, 'end': 189}]
[{'start': 89, 'end': 102}, {'start': 198, 'end': 205}, {'start': 619, 'end': 622}]
[{'start': 1, 'end': 17}, {'start': 138, 'end': 145}, {'start': 929, 'end': 944}]
[{'start': 1, 'end': 10}, {'start': 233, 'end': 295}, {'start': 427, 'end': 498}, {'start': 536, 'end': 548}, {'start': 852, 'end': 855}]
[{'start': 1, 'end': 7}, {'start': 586, 'end': 589}, {'start': 714, 'end': 716}]
[{'start': 1, 'end': 2}]
[{'start': 1, 'end': 3}, {'start': 232, 'end': 243}]
[{'start': 1, 'end': 9}, {'start': 1376, 'end': 1407}]
[{'start': 608, 'end': 638}]
[{'start': 180, 'end': 234}, {'start': 468, 'end': 488}]
[{'start': 1, 'end': 42}, {'start': 112, 'end': 127}, {'start': 254, 'end': 256}, {'start': 347, 'end': 388}]
[{'start': 1

In [34]:
def format_protein_structure(result):
    """
    Format protein domain and linker data into simplified structure.

    Parameters:
    -----------
    result : dict
        Result from summarize_protein_domains_dict with linkers added

    Returns:
    --------
    dict with keys:
        - domains: list of tuples (domain_name, start, end)
        - linkers: list of tuples (type, length)
            where type is 'outer' for N/C-terminal or 'inner' for inter-domain
    """
    if not result:
        return None

    protein_length = result.get('protein_length')
    domains_list = result.get('domains', [])
    linkers_list = result.get('linkers', [])

    # Format domains as (name, start, end), excluding domains with null/None names
    formatted_domains = [
        (domain['name'], domain['start'], domain['end'])
        for domain in domains_list
        if domain.get('name') is not None and domain.get('name') != ''
    ]

    # Format linkers as (type, length)
    formatted_linkers = []
    for linker in linkers_list:
        start = linker['start']
        end = linker['end']
        length = end - start + 1

        # Determine if outer (N-terminal or C-terminal) or inner (between domains)
        if start == 1 or end == protein_length:
            linker_type = 'outer'
        else:
            linker_type = 'inner'

        formatted_linkers.append((linker_type, length))

    return {
        'uniprot_acc': result.get('uniprot_acc'),
        'domains': formatted_domains,
        'linkers': formatted_linkers
    }

# Apply to all proteins in the domains list
formatted_proteins = []
for protein_data in domains:
    formatted = format_protein_structure(protein_data)
    if formatted:
        formatted_proteins.append(formatted)

# Display the results
for protein in formatted_proteins:
    print(f"\nProtein: {protein['uniprot_acc']}")
    print(f"Domains: {protein['domains']}")
    print(f"Linkers: {protein['linkers']}")


Protein: P69905
Domains: [('Hemoglobin and related oxygen transporters', 1, 142), ('Globin-like', 1, 142)]
Linkers: []

Protein: P68871
Domains: [('Hemoglobin and related oxygen transporters', 1, 147), ('Globins', 2, 147)]
Linkers: []

Protein: P01308
Domains: [('INSULIN/INSULIN GROWTH FACTOR', 1, 110), ('Insulin-like', 10, 109)]
Linkers: []

Protein: P00533
Domains: [('Tyrosine-protein kinase, EGF receptor type', 1, 1048), ('Receptor L-domain', 25, 213), ('Furin-like cysteine rich region', 185, 338), ('Receptor L-domain', 333, 530), ('Growth factor receptor domain IV', 505, 637), ('Phosphorylase Kinase; domain 1', 668, 792), ('Transferase(Phosphotransferase) domain 1', 793, 985), ('Tyrosine-protein kinase, EGF receptor type', 1028, 1209)]
Linkers: [('inner', 30), ('inner', 42)]

Protein: P04637
Domains: [('CELLULAR TUMOR ANTIGEN P53', 3, 369), ('P53 transactivation motif', 6, 30), ('Transactivation domain 2', 35, 59), ('p53-like tetramerisation domain', 319, 360)]
Linkers: [('outer',

In [35]:
import json
import os

# Save formatted_proteins to a JSON file
output_file = os.path.join(os.path.expanduser(workdir), "formatted_proteins.json")

# Convert to a dict with uniprot_acc as keys for easy lookup
formatted_proteins_dict = {
    protein['uniprot_acc']: {
        'domains': protein['domains'],
        'linkers': protein['linkers']
    }
    for protein in formatted_proteins
}

# Save to file
with open(output_file, 'w') as f:
    json.dump(formatted_proteins_dict, f, indent=2)

print(f"\nSaved {len(formatted_proteins_dict)} proteins to {output_file}")
print(f"File size: {os.path.getsize(output_file)} bytes")


Saved 49 proteins to /home/pospim/Desktop/work/protein_linkers/formatted_proteins.json
File size: 50441 bytes


In [None]:
def cath_gene3d(uniprot_acc):
    """
    Pull CATH FunFams for a UniProt and infer domain spans if present.
    (Some responses include mapped regions to the sequence.)
    """
    url = f"{CATH_URL}/{uniprot_acc}?content-type=application/json"
    r = requests.get(url, timeout=30)
    if r.status_code == 404:
        return []
    r.raise_for_status()
    data = r.json()
    hits = []
    for ff in data.get("data", []) if isinstance(data, dict) else data:
        sfam = ff.get("superfamily_id")
        funfam = ff.get("funfam_number")
        for reg in ff.get("regions", []):
            hits.append({
                "db": "CATH-Gene3D",
                "accession": f"{sfam}/FF{funfam}",
                "start": reg.fet("aln_start") or reg.get("start"),
                "end": reg.get("aln_end") or reg.get("end"),
                "score": reg.get("evalue") or reg.get("bitscore")
            })
    return sorted([h for h in hits if h["start"] and h["end"]], key=lambda h: (h["start"], h["end"]))
