In [1]:
import pandas as pd
from functools import partial

In [2]:
# Names assigned to the 15 columns of the header-less annotation TSV,
# in file order.
columns = [
    # protein the match was found on
    "protein_accession", "sequence_md5_digest", "sequence_length",
    # signature that matched, and where on the sequence
    "analysis", "signature_accession", "signature_description",
    "start_location", "stop_location",
    # match quality / run bookkeeping
    "score", "status", "date",
    # cross-references attached to the match
    "interpro_annotations_accession", "interpro_annotations_description",
    "go_annotation", "pathway_annotation",
]

In [3]:
# Tab-separated annotation table; it has no header row, so the column
# names are supplied explicitly from `columns`.
file = "/home/olivier/interproscan/output/Ahypochondriacus_315_v1.0.protein.fa.tsv"
df = pd.read_csv(
    file,
    sep="\t",
    header=None,
    names=columns,
)

In [4]:
# Preview the parsed table (the rendered output shows only the first and last rows).
df

Unnamed: 0,protein_accession,sequence_md5_digest,sequence_length,analysis,signature_accession,signature_description,start_location,stop_location,score,status,date,interpro_annotations_accession,interpro_annotations_description,go_annotation,pathway_annotation
0,AHYPO_003120-RA,c013898bc4a4a5da9d17c19ce37e5b66,286,PANTHER,PTHR31161,PROTEIN GRAVITROPIC IN THE LIGHT 1,25,281,6.3E-10,T,25-01-2025,IPR040225,Protein gravitropic in the light 1-like,-,-
1,AHYPO_003120-RA,c013898bc4a4a5da9d17c19ce37e5b66,286,Coils,Coil,Coil,18,52,-,T,25-01-2025,-,-,-,-
2,AHYPO_003120-RA,c013898bc4a4a5da9d17c19ce37e5b66,286,Coils,Coil,Coil,214,245,-,T,25-01-2025,-,-,-,-
3,AHYPO_003120-RA,c013898bc4a4a5da9d17c19ce37e5b66,286,Coils,Coil,Coil,167,206,-,T,25-01-2025,-,-,-,-
4,AHYPO_016636-RA,7a2a7f42ad6ed4eb00e4bb2dd93a595a,415,SMART,SM00356,c3hfinal6,3,28,2.3E-7,T,25-01-2025,IPR000571,"Zinc finger, CCCH-type",-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216085,AHYPO_016236-RA,3d60c71eee44d88ca5d505a44fb4faad,717,SUPERFAMILY,SSF56112,Protein kinase-like (PK-like),151,433,1.78E-61,T,25-01-2025,IPR011009,Protein kinase-like domain superfamily,-,-
216086,AHYPO_016236-RA,3d60c71eee44d88ca5d505a44fb4faad,717,Gene3D,G3DSA:1.10.510.10,Transferase(Phosphotransferase) domain 1,122,466,4.3E-81,T,25-01-2025,-,-,-,-
216087,AHYPO_016236-RA,3d60c71eee44d88ca5d505a44fb4faad,717,Pfam,PF24289,Domain of unknown function (DUF7477),450,717,9.2E-157,T,25-01-2025,IPR055900,Domain of unknown function DUF7477,-,-
216088,AHYPO_011658-RA,d2839fdf340d2432de0b37e21557fc2f,258,Pfam,PF02485,Core-2/I-Branching enzyme,87,247,5.2E-43,T,25-01-2025,IPR003406,"Glycosyl transferase, family 14",-,-


In [5]:
import sys, json, ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep
from tqdm.auto import tqdm


def get_nb_entries(database: str):
    """Return the number of entries the InterPro API reports for a database.

    Sends one GET request to the ``entry/all/<database>/`` endpoint and reads
    the ``count`` field of the JSON payload.

    Args:
        database: Database name as it appears in the API path (e.g. ``"pfam"``).

    Returns:
        The value stored under ``"count"`` in the response payload.

    Raises:
        urllib.error.HTTPError: If the server answers with an error status.
    """
    url = f"https://www.ebi.ac.uk:443/interpro/api/entry/all/{database}/"
    req = request.Request(url, headers={"Accept": "application/json"})
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding the body fails.
    with request.urlopen(req) as res:
        payload = json.loads(res.read().decode())
    return payload["count"]


def get_entries(database: str, page_size: int = 200):
    """Download every entry of one database from the InterPro API.

    Follows the ``next`` link of each JSON page until it is empty and
    concatenates the ``results`` lists. A progress bar sized with
    ``get_nb_entries(database)`` tracks the number of entries received.

    Args:
        database: Database name as it appears in the API path (e.g. ``"pfam"``).
        page_size: Number of entries requested per page.

    Returns:
        A list with the items of every page's ``"results"`` field, in the
        order they were received.

    Raises:
        urllib.error.HTTPError: For an HTTP error other than 408 that is
            still failing after three retries on the same page.
    """
    # NOTE(review): certificate verification is disabled here (through a
    # private `ssl` helper) to avoid local configuration issues; responses
    # are therefore not authenticated.
    context = ssl._create_unverified_context()

    next_url = (
        f"https://www.ebi.ac.uk:443/interpro/api/entry/all/{database}/"
        f"?page_size={page_size}"
    )
    entries = []
    attempts = 0

    pbar = tqdm(total=get_nb_entries(database))
    try:
        while next_url:
            try:
                req = request.Request(next_url, headers={"Accept": "application/json"})
                with request.urlopen(req, context=context) as res:
                    if res.status == 204:
                        # No data, so leave the loop.
                        break
                    payload = json.loads(res.read().decode())
            except HTTPError as e:
                if e.code == 408:
                    # The API timed out on a long-running query: wait just
                    # over a minute, then retry the same URL.
                    sleep(61)
                    continue
                # Any other HTTP error is retried 3 times before failing.
                if attempts < 3:
                    attempts += 1
                    sleep(61)
                    continue
                sys.stderr.write("LAST URL: " + next_url + "\n")
                raise

            # The page was fetched successfully: reset the retry budget.
            attempts = 0
            results = payload["results"]
            entries += results
            # tqdm's update() takes an increment, not a running total.
            pbar.update(len(results))

            next_url = payload["next"]
            # Don't overload the server, give it time before asking for more.
            if next_url:
                sleep(1)
    finally:
        # Always release the progress bar, including when an error propagates.
        pbar.close()

    return entries

In [67]:
# One full download per member database; each call pages through the API,
# so this cell is slow.
pfam_entries = get_entries("pfam")
panther_entries = get_entries("panther")
ncbifam_entries = get_entries("ncbifam")
cdd_entries = get_entries("cdd")
superfamily_entries = get_entries("ssf")  # API path name for SUPERFAMILY
smart_entries = get_entries("smart")
prosite_profile_entries = get_entries("profile")  # API path name for PROSITE profiles
prosite_pattern_entries = get_entries("prosite")  # API path name for PROSITE patterns


  0%|          | 0/23801 [00:00<?, ?it/s]

In [121]:
# Distinct values of the `analysis` column.
df["analysis"].unique()

array(['PANTHER', 'Coils', 'SMART', 'MobiDBLite', 'Gene3D',
       'ProSiteProfiles', 'Pfam', 'SUPERFAMILY', 'FunFam', 'PRINTS',
       'CDD', 'NCBIfam', 'ProSitePatterns', 'Hamap', 'PIRSF', 'SFLD',
       'AntiFam'], dtype=object)

In [86]:
def get_specific_entries(entries, keyword: str):
    """Return the accessions of the entries whose name contains ``keyword``.

    The comparison is a case-insensitive substring test against
    ``entry["metadata"]["name"]``; the accession returned is
    ``entry["metadata"]["accession"]``. Input order is preserved.
    """
    all_metadata = (entry["metadata"] for entry in entries)
    return [
        meta["accession"]
        for meta in all_metadata
        if keyword.lower() in meta["name"].lower()
    ]

In [34]:
# Search terms for unwanted annotations. Matching is a case-insensitive
# substring test, so truncated stems also hit longer words.
keywords = [
    "integrase",
    "reverse transcriptase",
    "RNase H",
    "transpos",  # stem
    "ribosom",  # stem
]

In [17]:
# Collect the accession of every Pfam entry whose name matches one of the
# keywords. The Pfam list is used because the result is only compared
# against Pfam rows (`df.analysis == "Pfam"`) when the table is filtered.
matched_accessions = set()
for keyword in keywords:
    matched_accessions.update(get_specific_entries(pfam_entries, keyword))
# Sorted so the list is identical from run to run (set order is arbitrary).
specific_entries = sorted(matched_accessions)

NameError: name 'get_specific_entries' is not defined

In [18]:
# Keep every AntiFam row, plus the Pfam rows whose signature accession is
# one of the keyword-matched entries.
mask_antifam = df["analysis"].eq("AntiFam")
mask_pfam = df["analysis"].eq("Pfam") & df["signature_accession"].isin(specific_entries)

filtered_df = df.loc[mask_antifam | mask_pfam]

In [12]:
def contains_keyword(string: str, keywords: list):
    """Tell whether ``string`` contains any of ``keywords``.

    Each keyword is tested as a case-insensitive substring; the result is
    ``True`` on the first hit and ``False`` when nothing matches (including
    when ``keywords`` is empty).
    """
    return any(candidate.lower() in string.lower() for candidate in keywords)

In [44]:
# Select the rows whose signature description mentions any keyword, then
# export the distinct protein accessions, one per line.
func = partial(contains_keyword, keywords=keywords)
keyword_hits = df["signature_description"].apply(func)
fdf = df[keyword_hits]

unwanted_accessions = fdf["protein_accession"].unique().tolist()
with open("unwanted_proteins.txt", "w") as fout:
    fout.write("\n".join(unwanted_accessions))

# Number of distinct proteins written to the file.
len(unwanted_accessions)

661