In [6]:
import time
import requests

class RateLimitedSession:
    """Wrapper around ``requests.Session`` that throttles and retries requests.

    Consecutive requests are spaced at least ``60 / max_requests_per_minute``
    seconds apart. A response whose status code is not 200 is retried until
    ``max_retries`` attempts have been made in total.

    The object can be used as a context manager so the underlying session is
    closed when the block exits.
    """

    def __init__(self, max_requests_per_minute, max_retries=3, retry_delay=1):
        """Configure the throttle and retry policy and open the HTTP session.

        Args:
            max_requests_per_minute: Upper bound on the request rate; must be > 0.
            max_retries: Total number of attempts made for each request.
            retry_delay: Seconds to pause between a failed attempt and the next one.

        Raises:
            ValueError: If ``max_requests_per_minute`` is not positive.
        """
        # Validate first: zero would divide by zero below, and a negative value
        # would yield a negative interval that silently disables throttling.
        if max_requests_per_minute <= 0:
            raise ValueError("max_requests_per_minute must be a positive number")
        self.max_requests = max_requests_per_minute
        self.interval = 60.0 / max_requests_per_minute
        # None means "no request has been sent yet", so the first call never waits.
        self.last_request_time = None
        self.session = requests.Session()
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def __enter__(self):
        """Return this object for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying session; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying ``requests.Session``."""
        self.session.close()

    def _wait(self):
        """Sleep as needed so requests stay at least ``self.interval`` seconds apart."""
        if self.last_request_time is not None:
            # Monotonic clock: unaffected by system wall-clock adjustments.
            elapsed = time.monotonic() - self.last_request_time
            wait_time = self.interval - elapsed
            if wait_time > 0:
                time.sleep(wait_time)
        self.last_request_time = time.monotonic()

    def _request_with_retries(self, method, url, **kwargs):
        """Send a throttled request, retrying while the status code is not 200.

        Args:
            method: HTTP method name forwarded to ``Session.request``.
            url: Target URL.
            **kwargs: Extra keyword arguments forwarded to ``Session.request``.

        Returns:
            The first response whose status code is 200.

        Raises:
            Exception: If none of the ``max_retries`` attempts returned 200.
        """
        for attempt in range(self.max_retries):
            self._wait()
            response = self.session.request(method, url, **kwargs)

            if response.status_code == 200:
                return response

            print(f"[Attempt {attempt + 1}] Non-200 response: {response.status_code}")
            # Only pause when another attempt follows; sleeping after the last
            # failure would just delay the exception below.
            if attempt + 1 < self.max_retries:
                time.sleep(self.retry_delay)

        raise Exception(f"Failed to get a 200 response after {self.max_retries} attempts")

    def get(self, url, **kwargs):
        """Issue a throttled, retried GET request."""
        return self._request_with_retries('GET', url, **kwargs)

    def post(self, url, **kwargs):
        """Issue a throttled, retried POST request."""
        return self._request_with_retries('POST', url, **kwargs)
        
# Shared client used by the cells below; 200*60 requests per minute is a
# budget of 200 requests per second.
api = RateLimitedSession(max_requests_per_minute=200*60)

In [17]:
# 1. Get proteins of family PF01370
family_accession = "PF01370"
url = f"https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/{family_accession}/"
fields = ["count", "next"]
params = dict(fields=",".join(fields), page_size=1)
# NOTE(review): `params` is built but never handed to api.get(), so the request
# below goes out without these query parameters — confirm whether that is intended.
response = api.get(url)

In [18]:
# Inspect the decoded JSON body of the response fetched in the previous cell.
response.json()

{'count': 435074,
 'next': 'https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/PF01370/?cursor=source%3As%3Aa0a009pk95',
 'previous': None,
 'results': [{'metadata': {'accession': 'A0A003',
    'name': 'NAD-dependent epimerase/dehydratase domain-containing protein',
    'source_database': 'unreviewed',
    'length': 340,
    'source_organism': {'taxId': '67581',
     'scientificName': 'Streptomyces viridosporus',
     'fullName': 'Streptomyces viridosporus'},
    'gene': 'moeE5',
    'in_alphafold': True},
   'entries': [{'accession': 'PF01370',
     'entry_protein_locations': [{'fragments': [{'start': 15,
         'end': 249,
         'dc-status': 'CONTINUOUS'}],
       'representative': False,
       'model': 'PF01370',
       'score': 2.6e-52}],
     'protein_length': 340,
     'source_database': 'pfam',
     'entry_type': 'family',
     'entry_integrated': 'ipr001509'}]},
  {'metadata': {'accession': 'A0A009GZV8',
    'name': 'NAD-dependent epimerase/dehydratase domain-c

In [19]:
# Walk the paginated listing for the family, following each page's "next" link.
family_accession = "PF01370"
url = f"https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/{family_accession}/"

results = []

# At most 10000 // 20 = 500 page requests are made.
for _ in range(10000 // 20):
    if not url:
        # No further page was advertised; nothing left to fetch.
        break
    response = api.get(url)
    data = response.json()
    results += data["results"]
    url = data["next"]

In [20]:
# Display the accumulated list of result records.
results

[{'metadata': {'accession': 'A0A003',
   'name': 'NAD-dependent epimerase/dehydratase domain-containing protein',
   'source_database': 'unreviewed',
   'length': 340,
   'source_organism': {'taxId': '67581',
    'scientificName': 'Streptomyces viridosporus',
    'fullName': 'Streptomyces viridosporus'},
   'gene': 'moeE5',
   'in_alphafold': True},
  'entries': [{'accession': 'PF01370',
    'entry_protein_locations': [{'fragments': [{'start': 15,
        'end': 249,
        'dc-status': 'CONTINUOUS'}],
      'representative': False,
      'model': 'PF01370',
      'score': 2.6e-52}],
    'protein_length': 340,
    'source_database': 'pfam',
    'entry_type': 'family',
    'entry_integrated': 'ipr001509'}]},
 {'metadata': {'accession': 'A0A009GZV8',
   'name': 'NAD-dependent epimerase/dehydratase domain-containing protein',
   'source_database': 'unreviewed',
   'length': 323,
   'source_organism': {'taxId': '1310608',
    'scientificName': 'Acinetobacter sp. 1295259',
    'fullName': 

In [21]:
# Number of result records collected by the pagination loop.
len(results)

10000

In [23]:
import csv
import json
from pathlib import Path

# Write into ../data/, which is where the cell that reloads this file reads it
# from; create the directory first so a fresh checkout can run end to end.
output_path = Path("../data/interpro_results_ext.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["accession", "length", "source_database", "fragments"])  # headers

    for item in results:
        metadata = item['metadata']

        # Collect the start/end of every fragment across all entries and
        # locations; `or []` tolerates keys that are present but set to None.
        fragments = [
            {"start": fragment["start"], "end": fragment["end"]}
            for entry in item['entries']
            for loc in (entry.get('entry_protein_locations') or [])
            for fragment in (loc.get('fragments') or [])
        ]

        # One row per protein, with the fragments JSON-encoded into a single cell.
        writer.writerow([
            metadata['accession'],
            metadata['length'],
            metadata['source_database'],
            json.dumps(fragments),
        ])

In [35]:
import json

import pandas as pd

# Read the exported table back from disk.
df = pd.read_csv("../data/interpro_results_ext.csv")

# Each 'fragments' cell holds JSON text; decode it into a Python list of dicts.
df = df.assign(fragments=df["fragments"].map(json.loads))

print(df.head())


    accession  length source_database                    fragments
0      A0A003     340      unreviewed  [{'start': 15, 'end': 249}]
1  A0A009GZV8     323      unreviewed   [{'start': 3, 'end': 208}]
2  A0A009H3J1     335      unreviewed   [{'start': 2, 'end': 260}]
3  A0A009H7U9     338      unreviewed   [{'start': 4, 'end': 263}]
4  A0A009HJQ2     301      unreviewed   [{'start': 5, 'end': 220}]


In [36]:
def fetch_sequence(protein_acc):
    """Fetch the sequence of one protein from the InterPro API.

    Args:
        protein_acc: Accession inserted into the InterPro protein URL.

    Returns:
        The value found at ``["metadata"]["sequence"]`` in the JSON payload,
        or ``None`` when the request or JSON decoding fails, or when the
        payload does not contain that field.
    """
    url = f"https://www.ebi.ac.uk/interpro/api/protein/uniprot/{protein_acc}/"
    try:
        response = api.get(url)
        protein_info = response.json()
    except Exception as e:
        print(f"Error fetching data for {protein_acc}: {e}")
        return None
    # An unexpected payload shape is handled like a fetch failure, so that one
    # bad record does not abort a bulk run over many accessions.
    try:
        return protein_info["metadata"]["sequence"]
    except (KeyError, TypeError):
        print(f"No sequence found in response for {protein_acc}")
        return None

# Try the helper on a single accession before applying it to every row.
fetch_sequence("A0A009H3J1")

'MILVTGGLGFIGSHIALSLMAQGQEVVIVDNLANSTLQTLERLEFISGMYVPFVKLDVRNTPALNKVFEQYSIDAVIHTAGFKSIEESNLKPLEYYNDNVSCIMSLLRAMQRTGVRHFIHLSSLAAYGKSGLQLSETEEFNYAYPNPYIKSQQMIEEIIRDTYKIDHEWKIAILRLSNIVGAFEHGVLGEYVAQLPKNIVPLAMQVAAMQRDLIELQDQAETSDHTTERSFLHVLDLCEAVIASLHWLREQTHCCEAFNIAHDQVHSIRQLLDEISQVTKAEVPTQSAIYKHVELDQVGANIEKAKTLLQWTPKRPLKQMIEDEWRFYQNTLNGR'

In [38]:
# Look up the sequence for each accession (one API call per row) and attach
# the results as a new 'sequence' column.
df = df.assign(
    sequence=lambda frame: frame['accession'].apply(fetch_sequence)
)

In [39]:
# Display the frame, which now includes the 'sequence' column.
df

Unnamed: 0,accession,length,source_database,fragments,sequence
0,A0A003,340,unreviewed,"[{'start': 15, 'end': 249}]",MSSDTHGTDLADGDVLVTGAAGFIGSHLVTELRNSGRNVVAVDRRP...
1,A0A009GZV8,323,unreviewed,"[{'start': 3, 'end': 208}]",MNVLITGGTGFIGKQIAKEILKAGSLTLDDNKPQSIDKIILFDAFA...
2,A0A009H3J1,335,unreviewed,"[{'start': 2, 'end': 260}]",MILVTGGLGFIGSHIALSLMAQGQEVVIVDNLANSTLQTLERLEFI...
3,A0A009H7U9,338,unreviewed,"[{'start': 4, 'end': 263}]",MAKILVTGGAGYIGSHTCVELLNAGHEVIVFDNLSNSSEESLKRVQ...
4,A0A009HJQ2,301,unreviewed,"[{'start': 5, 'end': 220}]",MNKNVLITGASGFIGTHLIKFLLQKNYNVIAVTRQAGKASDHPALQ...
...,...,...,...,...,...
9995,A0A0D7E5F6,300,unreviewed,"[{'start': 3, 'end': 222}]",MNILLTGGTGLIGRALCRRWLADGHRLWVWSRTPQRVAMLCGAEVQ...
9996,A0A0D7E685,352,unreviewed,"[{'start': 5, 'end': 222}]",MTNQALVVGASGIVGSALSRLLADEGWNVAGLARRPNTDAGVTPIS...
9997,A0A0D7E6N8,325,unreviewed,"[{'start': 9, 'end': 231}]",MARYLNQTIFVAGHRGMVGSAIVRRLRALGYGNILTAERDELNLLD...
9998,A0A0D7E8A6,356,unreviewed,"[{'start': 11, 'end': 268}]",MTQSSQQDTKVLVTGGAGYIGSHTCVELIRAGYGVVIYDNFSNSHR...


In [41]:
# Persist the table including sequences; index=False keeps the row index out of the file.
df.to_csv("../data/results_with_sequence.csv", index=False)