In [1]:
import os
import time
import certifi
import requests
import pandas as pd
import xml.etree.ElementTree as ET


In [2]:
base_url = catalog.load('params:oai_extract_options.base_url')
context = catalog.load('params:oai_extract_options.context')

env = 'dev'

print("base_url: ", base_url)
print("context: ", context)

base_url:  https://ri.conicet.gov.ar/oai/
context:  request


In [3]:
def get_oai_response(base_url, verify=None, max_retries=3, backoff_factor=1.0):

    # Usa el bundle de certifi para evitar errores de certificado en requests
    os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
    os.environ.setdefault("SSL_CERT_FILE", certifi.where())
    VERIFY_SSL = os.getenv("OAI_VERIFY_SSL", "false").lower() == "true"
    CA_BUNDLE = os.getenv("OAI_CA_BUNDLE") or certifi.where()

    verify_param = CA_BUNDLE if VERIFY_SSL else False
    if verify is not None:
        verify_param = verify

    for attempt in range(1, max_retries + 1):
        start_time = time.time()
        response = None
        try:
            response = requests.get(base_url, verify=verify_param)
            elapsed_time = time.time() - start_time
        except requests.RequestException as exc:
            elapsed_time = time.time() - start_time
            print(f"Error en request (intento {attempt}/{max_retries}): {exc}")
        sleep_time = max(elapsed_time, 0.1)
        print(f"Sleeping for {sleep_time:.2f} seconds")
        time.sleep(sleep_time)

        if response and response.status_code == 200:
            return response

        status = response.status_code if response else "sin respuesta"
        print(f"Error: {status} (intento {attempt}/{max_retries})")

        if attempt < max_retries:
            backoff = backoff_factor * attempt
            print(f"Reintentando en {backoff:.2f} segundos...")
            time.sleep(backoff)
    return None


In [4]:
params = f'/{context}?verb=ListSets&resumptionToken=////0'
url = base_url + params

print(f"Consultando: {url}")

response = get_oai_response(url)

xml_content = response.text
root = ET.fromstring(xml_content)
ns = {'oai': 'http://www.openarchives.org/OAI/2.0/'}

root

Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListSets&resumptionToken=////0


Sleeping for 0.33 seconds


[1m<[0m[1;95mElement[0m[39m [0m[32m'[0m[32m{[0m[32mhttp://www.openarchives.org/OAI/2.0/[0m[32m}[0m[32mOAI-PMH'[0m[39m at [0m[1;36m0x700450ac7d30[0m[1m>[0m

In [5]:
root.find('.//oai:resumptionToken', ns).text

[32m'////100'[0m

In [6]:
token_elem = root.find('.//oai:resumptionToken', ns)
if token_elem is not None:
    complete_list_size = token_elem.get('completeListSize')
    cursor = token_elem.get('cursor')
    resumptionToken = token_elem.text

In [7]:
complete_list_size

[32m'2444'[0m

In [8]:
sets_data = []
for set_elem in root.findall('.//oai:set', ns):
    set_spec = set_elem.find('oai:setSpec', ns).text if set_elem.find('oai:setSpec', ns) is not None else None
    set_name = set_elem.find('oai:setName', ns).text if set_elem.find('oai:setName', ns) is not None else None
    sets_data.append({'setSpec': set_spec, 'setName': set_name})
sets_data


[1m[[0m
    [1m{[0m[32m'setSpec'[0m: [32m'snrd'[0m, [32m'setName'[0m: [32m'Sistema Nacional de Repositorios Digitales'[0m[1m}[0m,
    [1m{[0m
        [32m'setSpec'[0m: [32m'com_11336_73'[0m,
        [32m'setName'[0m: [32m'AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGICO CONICET - BAHIA BLANCA'[0m
    [1m}[0m,
    [1m{[0m
        [32m'setSpec'[0m: [32m'com_11336_116'[0m,
        [32m'setName'[0m: [32m'AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGICO CONICET - CORDOBA'[0m
    [1m}[0m,
    [1m{[0m
        [32m'setSpec'[0m: [32m'com_11336_169'[0m,
        [32m'setName'[0m: [32m'AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGICO CONICET - LA PLATA'[0m
    [1m}[0m,
    [1m{[0m
        [32m'setSpec'[0m: [32m'com_11336_184'[0m,
        [32m'setName'[0m: [32m'AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGICO CONICET - MAR DEL PLATA'[0m
    [1m}[0m,
    [1m{[0m
        [32m'setSpec'[0m: [32m'com_11336_199'[0m,
        [32m'setNam