In [1]:
import os
import time
import certifi
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from requests.packages.urllib3.exceptions import InsecureRequestWarning


In [2]:
base_url = catalog.load('params:oai_extract_options.base_url')
context = catalog.load('params:oai_extract_options.context')

env = 'dev'

print("base_url: ", base_url)
print("context: ", context)

base_url:  https://ri.conicet.gov.ar/oai/
context:  request


In [3]:
def get_oai_response(base_url, verify=None, max_retries=3, backoff_factor=1.0, min_interval=0.0):

    # Usa el bundle de certifi para evitar errores de certificado en requests
    os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
    os.environ.setdefault("SSL_CERT_FILE", certifi.where())
    VERIFY_SSL = os.getenv("OAI_VERIFY_SSL", "false").lower() == "true"
    CA_BUNDLE = os.getenv("OAI_CA_BUNDLE") or certifi.where()
    requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    verify_param = CA_BUNDLE if VERIFY_SSL else False
    if verify is not None:
        verify_param = verify

    for attempt in range(1, max_retries + 1):
        start_time = time.time()
        response = None
        error = None
        try:
            response = requests.get(base_url, verify=verify_param)
        except requests.RequestException as exc:
            error = exc
        elapsed_time = time.time() - start_time

        if min_interval > 0:
            wait_time = max(min_interval - elapsed_time, 0)
            if wait_time > 0:
                print(f"Pausando {wait_time:.2f} segundos para no saturar el servidor")
                time.sleep(wait_time)

        if error:
            print(f"Error en request (intento {attempt}/{max_retries}): {error}")

        if response and response.status_code == 200:
            return response

        status = response.status_code if response else "sin respuesta"
        print(f"Error: {status} (intento {attempt}/{max_retries})")

        if attempt < max_retries:
            backoff = backoff_factor * attempt
            print(f"Reintentando en {backoff:.2f} segundos...")
            time.sleep(backoff)
    return None

def log_oai_progress(token_elem, total_processed: int):
    """Muestra el avance usando completeListSize y los registros acumulados."""
    if token_elem is None:
        return
    total = token_elem.get('completeListSize')
    try:
        total_int = int(total) if total is not None else None
        if total_int is not None and total_processed is not None:
            remaining = total_int - total_processed
            print(f"Progreso OAI: {total_processed}/{total_int} (faltan ~{remaining})")
    except ValueError:
        # Si el servidor devuelve valores no numéricos, ignora el progreso.
        pass


## Extract identifiers 

In [4]:
def oai_extract_records(base_url: str, context: str, env: str, verify=None) -> pd.DataFrame:
    records = []
    
    iteration_limit = 2 if env == "dev" else None
    resumption_token = None
    iteration_count = 0

    total_processed = 0

    while True:
        if iteration_limit is not None and iteration_count >= iteration_limit:
            break

        if resumption_token:
            params = f'/{context}?verb=ListRecords&resumptionToken={resumption_token}'
        else:
            params = f'/{context}?verb=ListRecords&metadataPrefix=oai_dc'

        url = base_url + params

        print(f"Consultando: {url}")

        response = get_oai_response(url, verify=verify)

        iteration_count += 1

        if not response or not response.ok:
            print(f"Error al consultar: {url}")
            break

        xml_content = response.text
        root = ET.fromstring(xml_content)
        ns = {
            'oai': 'http://www.openarchives.org/OAI/2.0/',
            'dc': 'http://purl.org/dc/elements/1.1/'
        }

        record_nodes = root.findall('.//oai:record', ns)

        if not record_nodes:
            print("No se encontraron más registros.")
            break

        for record in record_nodes:
            header = record.find('.//oai:header', ns)
            identifier_node = header.find('.//oai:identifier', ns) if header is not None else None
            datestamp_node = header.find('.//oai:datestamp', ns) if header is not None else None
            setspec = [e.text for e in header.findall('.//oai:setSpec', ns)] if header is not None else []

            metadata = record.find('.//oai:metadata', ns)

            if metadata is None:
                continue

            # Valores simples
            title = metadata.find('.//dc:title', ns)
            date_issued = metadata.find('.//dc:date', ns)

            # Multivaluados
            creators = [e.text for e in metadata.findall('.//dc:creator', ns)]
            types = [e.text for e in metadata.findall('.//dc:type', ns)]
            identifiers = [e.text for e in metadata.findall('.//dc:identifier', ns)]
            languages = [e.text for e in metadata.findall('.//dc:language', ns)]
            publishers = [e.text for e in metadata.findall('.//dc:publisher', ns)]
            subjects = [e.text for e in metadata.findall('.//dc:subject', ns)]
            relations = [e.text for e in metadata.findall('.//dc:relation', ns)]
            rights = [e.text for e in metadata.findall('.//dc:rights', ns)]

            records.append({
                'record_id': identifier_node.text if identifier_node is not None else None,
                'datestamp': datestamp_node.text if datestamp_node is not None else None,
                'set_id': setspec,
                'col_id': setspec[0] if setspec else None,
                'title': title.text if title is not None else None,
                'date_issued': date_issued.text if date_issued is not None else None,
                'creators': creators,
                'types': types,
                'identifiers': identifiers,
                'languages': languages,
                'subjects': subjects,
                'publishers': publishers,
                'relations': relations,
                'rights': rights
            })

        total_processed += len(record_nodes)

        token_elem = root.find('.//oai:resumptionToken', ns)
        resumption_token = token_elem.text if token_elem is not None else None
        log_oai_progress(token_elem, total_processed)

        if not resumption_token:
            break

    df = pd.DataFrame(records)

    timestamp = pd.Timestamp.now(tz="UTC").normalize()
    df['extract_datetime'] = timestamp

    return df, df.head(100)

In [5]:
df_records, df_dev = oai_extract_records(base_url, context, env)

Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListRecords&metadataPrefix=oai_dc
Progreso OAI: 100/257187 (faltan ~257087)
Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListRecords&resumptionToken=oai_dc////100
Progreso OAI: 200/257187 (faltan ~256987)


In [6]:
df_records

Unnamed: 0,record_id,datestamp,set_id,col_id,title,date_issued,creators,types,identifiers,languages,subjects,publishers,relations,rights,extract_datetime
0,oai:ri.conicet.gov.ar:11336/179477,2024-01-12T04:51:20Z,"[com_11336_35, com_11336_14, col_11336_36, snrd]",com_11336_35,Paisajes mesetarios en Patagonia: Tecnología d...,2016-12,"[Cassiodoro, Gisela Eva]","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/179477, Cassiodor...",[spa],"[MESETAS, TECNOLOGÍA, USO DEL ESPACIO, LOGÍSTI...",[Instituto Nacional de Antropología y Pensamie...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-12-05 00:00:00+00:00
1,oai:ri.conicet.gov.ar:11336/116972,2020-10-27T20:15:28Z,"[com_11336_219, com_11336_218, col_11336_89919...",com_11336_219,Distancias entre la ecología y la praxis ambie...,2011,"[Nuñez, Paula Gabriela]","[info:eu-repo/semantics/publishedVersion, info...","[http://hdl.handle.net/11336/116972, Nuñez, Pa...",[spa],"[ECOLOGÍA, PRAXIS AMBIENTAL, ECOFEMINISMO, TEO...",[Universidad Nacional de La Plata],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-12-05 00:00:00+00:00
2,oai:ri.conicet.gov.ar:11336/23263,2024-04-25T19:52:43Z,"[com_11336_106, com_11336_75, col_11336_107, s...",com_11336_106,Plant community resilience in the face of fire...,2016-02,"[Lipoma, Maria Lucrecia, Gurvich, Diego Ezequi...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/23263, Lipoma, Ma...",[eng],"[Firefire, Functional Redundancy, Plan Functio...","[Wiley Blackwell Publishing, Inc]",[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-12-05 00:00:00+00:00
3,oai:ri.conicet.gov.ar:11336/215533,2023-10-20T13:49:47Z,"[com_11336_184, com_11336_171, col_11336_185, ...",com_11336_184,La puesta en acto de las políticas de formació...,2022-11,"[Marchetti, Braian, Bazán, Sonia]","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/215533, Marchetti...",[spa],"[POLÍTICAS PÚBLICAS EDUCATIVAS, FORMACIÓN DOCE...",[Universidad de Buenos Aires. Facultad de Filo...,[info:eu-repo/semantics/altIdentifier/doi/10.3...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-12-05 00:00:00+00:00
4,oai:ri.conicet.gov.ar:11336/109802,2020-10-08T23:12:10Z,"[com_11336_153, com_11336_118, col_11336_90213...",com_11336_153,Introducción a la Genética Forense No-Humana,2015,"[Giovambattista, Guillermo, Barrientos, Laura ...","[info:eu-repo/semantics/publishedVersion, info...","[http://hdl.handle.net/11336/109802, Giovambat...",[spa],"[GENÉTICA FORENSE, IDENTIFICACIÓN GENÉTICA, ID...",[Universidad Nacional de La Plata. Facultad de...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-12-05 00:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,oai:ri.conicet.gov.ar:11336/182926,2023-01-02T17:03:17Z,"[com_11336_339, com_11336_332, col_11336_340, ...",com_11336_339,"Larval morphology of Megaporus Brinck, 1943 (C...",2020-04,"[Alarie, Yves, Michat, Mariano Cruz, Watts, Ch...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/182926, Alarie, Y...",[eng],"[AUSTRALASIA, HYDROPORINAE, LARVAE, PHYLOGENY,...",[Coleopterists Society],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/restrictedAccess, http...",2025-12-05 00:00:00+00:00
194,oai:ri.conicet.gov.ar:11336/261962,2025-05-19T12:55:53Z,"[com_11336_460, com_11336_443, col_11336_461, ...",com_11336_460,“Los propósitos de Caseros no se han cumplido ...,2024-07,"[Herrero, Fabian]","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/261962, Herrero, ...",[spa],"[PRENSA, JOSÉ HERNANDEZ, JUSTO JOSE DE URQUIZA...","[Pontificia Universidad Católica Argentina ""Sa...",[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-12-05 00:00:00+00:00
195,oai:ri.conicet.gov.ar:11336/265488,2025-07-08T10:16:35Z,"[com_11336_35, com_11336_14, col_11336_36, snrd]",com_11336_35,The two synthetic cannabinoid compounds 4′‐ F‐...,2023-11,"[dos Santos Pereira, Maurício, Maitan Santos, ...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/265488, dos Santo...",[eng],"[ASTROCYTES, MICROGLIA, NEUROINFLAMMATION, OXI...","[Wiley-liss, div John Wiley & Sons Inc.]",[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-12-05 00:00:00+00:00
196,oai:ri.conicet.gov.ar:11336/144165,2021-10-18T20:30:01Z,"[com_11336_238, com_11336_231, col_11336_239, ...",com_11336_238,Simultaneous Determination of Human Erythrocyt...,2020-10,"[Londero, Carolina María, Riquelme, Bibiana Do...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/144165, Londero, ...",[eng],"[HEMORHEOLOGY, MICROFLUIDIC CHAMBER, RED BLOOD...",[Humana Press],[info:eu-repo/semantics/altIdentifier/doi/10.1...,"[info:eu-repo/semantics/restrictedAccess, http...",2025-12-05 00:00:00+00:00
