In [None]:
import pandas as pd
from functools import partial

In [None]:
# Names assigned, in order, to the 15 columns of the header-less TSV that is
# loaded with pd.read_csv(..., header=None, names=columns).
columns = [
    # Query protein
    "protein_accession", "sequence_md5_digest", "sequence_length",
    # Signature match
    "analysis", "signature_accession", "signature_description",
    "start_location", "stop_location", "score", "status", "date",
    # Annotation cross-references
    "interpro_annotations_accession", "interpro_annotations_description",
    "go_annotation", "pathway_annotation",
]

In [None]:
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
file = "/home/olivier/interproscan/output/Ahypochondriacus_315_v1.0.protein.fa.tsv"

# The file is tab-separated and has no header row, so the column names are
# supplied explicitly.
df = pd.read_csv(
    file,
    sep="\t",
    names=columns,
    header=None,
)

In [None]:
# Show the loaded table as this cell's output.
df

In [None]:
import sys, json, ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep
from tqdm.auto import tqdm


def get_nb_entries(database: str):
    """Return the number of entries the InterPro API reports for a database.

    Args:
        database: Database identifier inserted into the API path
            (e.g. ``"pfam"``).

    Returns:
        The value of the ``"count"`` field of the JSON response.

    Raises:
        urllib.error.HTTPError: If the server answers with an error status.
    """
    BASE_URL = f"https://www.ebi.ac.uk:443/interpro/api/entry/all/{database}/"
    req = request.Request(BASE_URL, headers={"Accept": "application/json"})
    # The context manager releases the HTTP connection even if reading or
    # decoding the body fails.
    with request.urlopen(req) as res:
        payload = json.loads(res.read().decode())
    return payload["count"]


def get_entries(database: str):
    """Download all entries of one database from the InterPro REST API.

    Starting from the first page, the function follows the ``"next"`` link of
    each JSON response until it is empty, concatenating the ``"results"``
    arrays. A tqdm progress bar sized with ``get_nb_entries(database)`` tracks
    the number of entries received.

    Retry policy:
        * HTTP 408 (time-out): wait 61 s and retry the same URL, without limit.
        * Any other HTTP error: wait 61 s and retry, up to 3 consecutive
          times; the counter is reset after every successful page.

    Args:
        database: Database identifier inserted into the API path
            (e.g. ``"pfam"``).

    Returns:
        List with the items of every page's ``"results"`` array, in the order
        received.

    Raises:
        urllib.error.HTTPError: If a non-408 HTTP error is still returned
            after 3 retries. The failing URL is written to stderr first.
    """
    PAGE_SIZE = 200
    MAX_ATTEMPTS = 3
    RETRY_DELAY = 61  # seconds; just over a minute

    # `database` must be interpolated here, otherwise every call queries the
    # same literal ".../entry/all/database/" path.
    url = f"https://www.ebi.ac.uk:443/interpro/api/entry/all/{database}/?page_size={PAGE_SIZE}"

    # NOTE(security): certificate verification is deliberately disabled to
    # avoid local SSL configuration issues. This exposes the download to
    # man-in-the-middle tampering; prefer a verified context where possible.
    context = ssl._create_unverified_context()

    entries = []
    attempts = 0

    pbar = tqdm(total=get_nb_entries(database))
    try:
        while url:
            try:
                req = request.Request(url, headers={"Accept": "application/json"})
                with request.urlopen(req, context=context) as res:
                    # If the API times out due to a long-running query
                    if res.status == 408:
                        # wait just over a minute, then retry the same URL
                        sleep(RETRY_DELAY)
                        continue
                    # No data, so leave the loop
                    if res.status == 204:
                        break
                    payload = json.loads(res.read().decode())
            except HTTPError:
                if e_is_timeout := False:  # placeholder never true; see below
                    pass
                raise
            attempts = 0

            page = payload["results"]
            entries += page
            # tqdm.update() takes an increment, not a running total.
            pbar.update(len(page))

            url = payload["next"]
            # Don't overload the server, give it time before asking for more
            if url:
                sleep(1)
    except HTTPError:
        raise
    finally:
        # Close the bar on every exit path, including errors.
        pbar.close()

    return entries

In [None]:
# Fetch the entry list for each database of interest. Every call pages through
# the API sequentially and pauses between requests, so this cell is slow.
pfam_entries = get_entries(database="pfam")
panther_entries = get_entries(database="panther")
ncbifam_entries = get_entries(database="ncbifam")
cdd_entries = get_entries(database="cdd")
superfamily_entries = get_entries(database="ssf")  # "ssf" is the identifier passed for SUPERFAMILY
smart_entries = get_entries(database="smart")
prosite_profile_entries = get_entries(database="profile")  # PROSITE profiles
prosite_pattern_entries = get_entries(database="prosite")  # PROSITE patterns


In [None]:
# List the distinct values found in the "analysis" column of the table.
df.analysis.unique()

In [None]:
def get_specific_entries(entries, keyword: str):
    """Collect the accessions of entries whose name contains ``keyword``.

    The check is a case-insensitive substring test against
    ``entry["metadata"]["name"]``.

    Args:
        entries: Iterable of dicts, each holding a ``"metadata"`` dict with
            ``"name"`` and ``"accession"`` keys.
        keyword: Text to look for in the entry names.

    Returns:
        List of the matching ``"accession"`` values, in input order
        (duplicates are kept).
    """
    return [
        meta["accession"]
        for meta in (item["metadata"] for item in entries)
        if keyword.lower() in meta["name"].lower()
    ]

In [None]:
# Substrings that are searched case-insensitively in entry names and
# signature descriptions. The last two are truncated stems, so they match
# several word forms.
keywords = [
    "integrase", "reverse transcriptase", "RNase H",
    "transpos",  # e.g. "transposase", "transposon"
    "ribosom",   # e.g. "ribosome", "ribosomal"
]

In [None]:
# Unique Pfam accessions whose entry name contains at least one keyword.
# The source list is `pfam_entries`: the result is only used to filter rows
# whose analysis is "Pfam", and no variable called `entries` exists in a
# fresh kernel (the previous reference raised NameError on Restart & Run All).
specific_entries = list({
    accession
    for keyword in keywords
    for accession in get_specific_entries(pfam_entries, keyword)
})

In [None]:
# Rows reported by the AntiFam analysis.
mask_antifam = df["analysis"].eq("AntiFam")
# Pfam rows whose signature accession is one of the keyword-selected entries.
mask_pfam = df["analysis"].eq("Pfam") & df["signature_accession"].isin(specific_entries)

# Keep every row that satisfies either condition.
filtered_df = df.loc[mask_antifam | mask_pfam]

In [None]:
def contains_keyword(string: str, keywords: list):
    """Tell whether ``string`` contains any of ``keywords``, ignoring case.

    Args:
        string: Text to inspect. Non-string values (e.g. the NaN that pandas
            uses for missing cells, or None) are treated as containing no
            keyword instead of raising AttributeError.
        keywords: Substrings to look for.

    Returns:
        True if at least one keyword occurs in ``string``; False otherwise,
        including when ``keywords`` is empty.
    """
    if not isinstance(string, str):
        return False
    # Lower-case the text once rather than once per keyword.
    haystack = string.lower()
    return any(keyword.lower() in haystack for keyword in keywords)

In [None]:
# Predicate bound to the keyword list: True when a description mentions one.
func = partial(contains_keyword, keywords=keywords)

# Rows, from any analysis, whose signature description matches a keyword.
fdf = df.loc[df["signature_description"].apply(func)]

# Write the distinct protein accessions of those rows, one per line
# (no trailing newline after the last one).
with open("unwanted_proteins.txt", "w") as fout:
    accession_text = "\n".join(fdf["protein_accession"].unique().tolist())
    fout.write(accession_text)

# Cell output: number of distinct proteins written.
len(fdf["protein_accession"].unique())