In [1]:
import os
import time
import certifi
import requests
import pandas as pd
import xml.etree.ElementTree as ET


In [2]:
# Read the OAI endpoint settings from the `oai_extract_options` parameter group.
# NOTE(review): `catalog` is not defined anywhere in this notebook; presumably it is
# injected into the kernel by the project tooling (e.g. Kedro) — confirm.
base_url = catalog.load('params:oai_extract_options.base_url')
context = catalog.load('params:oai_extract_options.context')

# Environment flag forwarded to the extraction function; the value 'dev' makes it
# cap the number of sets and pages it processes.
env = 'dev'

# Echo the loaded settings so the cell output records which endpoint was used.
print("base_url: ", base_url)
print("context: ", context)

base_url:  https://ri.conicet.gov.ar/oai/
context:  request


In [3]:
def get_oai_response(base_url, verify=None, max_retries=3, backoff_factor=1.0, timeout=60.0):
    """Fetch a URL with ``requests.get``, retrying when the request fails.

    Args:
        base_url: Full URL to request; it is passed unchanged to ``requests.get``.
        verify: Optional override for the ``verify`` argument of ``requests.get``.
            When ``None`` the value is derived from the environment: if
            ``OAI_VERIFY_SSL`` equals ``"true"`` (case-insensitive) the bundle in
            ``OAI_CA_BUNDLE`` (or certifi's bundle) is used, otherwise
            certificate verification is disabled.
        max_retries: Maximum number of attempts.
        backoff_factor: Seconds multiplied by the attempt number to wait before
            the next attempt.
        timeout: Value passed as ``timeout`` to ``requests.get`` so that a
            stalled server cannot block the call indefinitely.

    Returns:
        The response of the first attempt that answered with status 200, or
        ``None`` when every attempt failed.
    """
    # Use certifi's bundle to avoid certificate errors in requests
    # (only applied when the variables are not already set).
    os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
    os.environ.setdefault("SSL_CERT_FILE", certifi.where())

    # NOTE(review): TLS verification is OFF unless OAI_VERIFY_SSL=true or the
    # caller passes `verify`; consider making verification the default.
    verify_ssl = os.getenv("OAI_VERIFY_SSL", "false").lower() == "true"
    ca_bundle = os.getenv("OAI_CA_BUNDLE") or certifi.where()

    verify_param = ca_bundle if verify_ssl else False
    if verify is not None:
        verify_param = verify

    for attempt in range(1, max_retries + 1):
        start_time = time.time()
        response = None
        try:
            response = requests.get(base_url, verify=verify_param, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error en request (intento {attempt}/{max_retries}): {exc}")
        elapsed_time = time.time() - start_time

        # Throttle: pause for as long as the server took to answer (min 0.1 s).
        sleep_time = max(elapsed_time, 0.1)
        print(f"Sleeping for {sleep_time:.2f} seconds")
        time.sleep(sleep_time)

        # Compare against None explicitly: a requests.Response is falsy for
        # 4xx/5xx statuses, which would hide the real status code below.
        if response is not None and response.status_code == 200:
            return response

        status = response.status_code if response is not None else "sin respuesta"
        print(f"Error: {status} (intento {attempt}/{max_retries})")

        if attempt < max_retries:
            backoff = backoff_factor * attempt
            print(f"Reintentando en {backoff:.2f} segundos...")
            time.sleep(backoff)
    return None


In [4]:
# Load the table of OAI sets to harvest; the extraction function defined later
# in this notebook reads its `setSpec` column.
df_cols = catalog.load("intermediate/oai/cols")
df_cols


Unnamed: 0,setSpec,setName,col_set,com_set
0,col_11336_373,Articulos(BIOMED),True,False
1,col_11336_466,Articulos(CADIC),True,False


## Extract identifiers 

In [None]:
def _parse_list_identifiers_page(xml_content: str):
    """Parse one OAI-PMH ``ListIdentifiers`` page.

    Returns:
        A ``(records, next_token, complete_list_size)`` tuple where ``records``
        is a list of dicts (``record_id``, ``datestamp``, ``set_id``),
        ``next_token`` is the resumption token for the next page or ``None``
        when the page carries none (or an empty one), and
        ``complete_list_size`` is the integer ``completeListSize`` attribute of
        the token element or ``None`` when it is absent or not numeric.
    """
    ns = {'oai': 'http://www.openarchives.org/OAI/2.0/'}
    root = ET.fromstring(xml_content)

    records = []
    for header in root.findall('.//oai:header', ns):
        # Single-valued fields
        record_id = header.find('.//oai:identifier', ns)
        record_datestamp = header.find('.//oai:datestamp', ns)

        records.append({
            'record_id': record_id.text if record_id is not None else None,
            'datestamp': record_datestamp.text if record_datestamp is not None else None,
            # Multi-valued field: every setSpec the record belongs to
            'set_id': [e.text for e in header.findall('.//oai:setSpec', ns)],
        })

    next_token = None
    complete_list_size = None
    token_elem = root.find('.//oai:resumptionToken', ns)
    if token_elem is not None:
        # An empty <resumptionToken/> marks the last page of the list.
        next_token = (token_elem.text or '').strip() or None
        raw_size = token_elem.get('completeListSize')
        if raw_size is not None and raw_size.strip().isdigit():
            complete_list_size = int(raw_size)

    return records, next_token, complete_list_size


def oai_extract_identifiers_by_sets(base_url: str, context: str, env: str, df_set: pd.DataFrame, verify=None) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Harvest record identifiers for every set listed in ``df_set``.

    For each ``setSpec`` the function pages through ``verb=ListIdentifiers``
    using resumption tokens until the server stops returning a token, a
    request fails, or a page has no headers.

    Args:
        base_url: Base URL of the OAI endpoint (with or without trailing slash).
        context: Path segment appended to ``base_url``.
        env: When equal to ``"dev"`` only the first 2 sets are processed and
            at most 2 pages are fetched per set.
        df_set: Frame with a ``setSpec`` column. It is not modified; a copy
            with the extra ``completeListSize`` column is returned instead.
        verify: Forwarded to ``get_oai_response``.

    Returns:
        A tuple ``(identifiers, sets, sample)``: the identifiers frame
        (``record_id``, ``datestamp``, ``set_id``, ``extract_datetime``), the
        copy of ``df_set`` with ``completeListSize`` filled for the sets whose
        size the server reported, and the first 100 identifier rows.
    """
    records = []
    iteration_limit = 2 if env == "dev" else None

    # Work on a copy so the caller's frame is not mutated as a side effect.
    df_set = df_set.copy()
    sets_to_process = df_set if iteration_limit is None else df_set.head(iteration_limit)
    col_ids = sets_to_process.loc[:, "setSpec"].tolist()

    # Join without producing a double slash when base_url ends with '/'.
    endpoint = f"{base_url.rstrip('/')}/{context.strip('/')}"

    for set_id in col_ids:
        iteration_count = 0
        resumption_token = f'oai_dc///{set_id}/0'

        # Stop as soon as the server no longer hands out a token; otherwise the
        # same page would be requested again (or an invalid token sent).
        while resumption_token:
            if iteration_limit is not None and iteration_count >= iteration_limit:
                break

            url = f'{endpoint}?verb=ListIdentifiers&resumptionToken={resumption_token}'

            print(f"Consultando: {url}")

            response = get_oai_response(url, verify=verify)
            if response is None or not response.ok:
                print(f"Error al consultar: {url}")
                break

            iteration_count += 1

            page_records, resumption_token, page_size = _parse_list_identifiers_page(response.text)

            if not page_records:
                print("No se encontraron más registros.")
                break

            records.extend(page_records)

            # Store the list size in the sets frame, only when the server reported it.
            if page_size is not None:
                df_set.loc[df_set["setSpec"] == set_id, "completeListSize"] = page_size

    # Explicit columns keep the schema stable even when nothing was harvested.
    df = pd.DataFrame(records, columns=['record_id', 'datestamp', 'set_id'])

    timestamp = pd.Timestamp.now(tz="UTC").normalize()
    df['extract_datetime'] = timestamp

    return df, df_set, df.head(100)


In [None]:
# Run the identifier extraction. The function returns three frames: the
# identifiers, the sets table, and the first 100 identifier rows.
df_identifiers, df_col_dev, df_dev = oai_extract_identifiers_by_sets(base_url, context, env, df_cols)

Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListIdentifiers&resumptionToken=oai_dc///col_11336_373/0


Sleeping for 6.39 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListIdentifiers&resumptionToken=oai_dc///col_11336_373/100


Sleeping for 0.18 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListIdentifiers&resumptionToken=oai_dc///col_11336_466/0


Sleeping for 0.40 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListIdentifiers&resumptionToken=oai_dc///col_11336_466/100


Sleeping for 0.17 seconds


In [7]:
# Inspect the sets table returned by the extraction (it carries `completeListSize`).
df_col_dev

Unnamed: 0,setSpec,setName,col_set,com_set,completeListSize
0,col_11336_373,Articulos(BIOMED),True,False,377.0
1,col_11336_466,Articulos(CADIC),True,False,1615.0


In [8]:
# Inspect the extracted identifiers.
df_identifiers

Unnamed: 0,record_id,datestamp,set_id,extract_datetime
0,oai:ri.conicet.gov.ar:11336/15260,2023-03-09T02:44:18Z,"[com_11336_390, com_11336_371, com_11336_372, ...",2025-11-28 00:00:00+00:00
1,oai:ri.conicet.gov.ar:11336/266352,2025-07-17T11:08:26Z,"[com_11336_372, com_11336_371, com_11336_394, ...",2025-11-28 00:00:00+00:00
2,oai:ri.conicet.gov.ar:11336/15225,2024-09-20T17:52:07Z,"[com_11336_372, com_11336_371, com_11336_390, ...",2025-11-28 00:00:00+00:00
3,oai:ri.conicet.gov.ar:11336/133626,2021-12-17T12:37:25Z,"[com_11336_372, com_11336_371, col_11336_373, ...",2025-11-28 00:00:00+00:00
4,oai:ri.conicet.gov.ar:11336/15215,2023-03-09T02:44:18Z,"[com_11336_390, com_11336_371, com_11336_100, ...",2025-11-28 00:00:00+00:00
...,...,...,...,...
395,oai:ri.conicet.gov.ar:11336/231502,2024-03-25T15:48:37Z,"[com_11336_465, com_11336_464, com_11336_14309...",2025-11-28 00:00:00+00:00
396,oai:ri.conicet.gov.ar:11336/63660,2020-12-09T16:51:03Z,"[com_11336_465, com_11336_464, col_11336_466, ...",2025-11-28 00:00:00+00:00
397,oai:ri.conicet.gov.ar:11336/247075,2024-11-01T13:17:20Z,"[com_11336_465, com_11336_464, col_11336_466, ...",2025-11-28 00:00:00+00:00
398,oai:ri.conicet.gov.ar:11336/241723,2024-10-08T15:31:13Z,"[com_11336_465, com_11336_464, com_11336_169, ...",2025-11-28 00:00:00+00:00
