In [34]:
import os
import time
import certifi
import requests
import pandas as pd
import xml.etree.ElementTree as ET


In [35]:
base_url = catalog.load('params:oai_extract_options.base_url')
context = catalog.load('params:oai_extract_options.context')

env = 'dev'

print("base_url: ", base_url)
print("context: ", context)

base_url:  https://ri.conicet.gov.ar/oai/
context:  request


In [36]:
def get_oai_response(base_url, verify=None, max_retries=3, backoff_factor=1.0):

    # Usa el bundle de certifi para evitar errores de certificado en requests
    os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
    os.environ.setdefault("SSL_CERT_FILE", certifi.where())
    VERIFY_SSL = os.getenv("OAI_VERIFY_SSL", "false").lower() == "true"
    CA_BUNDLE = os.getenv("OAI_CA_BUNDLE") or certifi.where()

    verify_param = CA_BUNDLE if VERIFY_SSL else False
    if verify is not None:
        verify_param = verify

    for attempt in range(1, max_retries + 1):
        start_time = time.time()
        response = None
        try:
            response = requests.get(base_url, verify=verify_param)
            elapsed_time = time.time() - start_time
        except requests.RequestException as exc:
            elapsed_time = time.time() - start_time
            print(f"Error en request (intento {attempt}/{max_retries}): {exc}")
        sleep_time = max(elapsed_time, 0.1)
        print(f"Sleeping for {sleep_time:.2f} seconds")
        time.sleep(sleep_time)

        if response and response.status_code == 200:
            return response

        status = response.status_code if response else "sin respuesta"
        print(f"Error: {status} (intento {attempt}/{max_retries})")

        if attempt < max_retries:
            backoff = backoff_factor * attempt
            print(f"Reintentando en {backoff:.2f} segundos...")
            time.sleep(backoff)
    return None


In [37]:
def oai_extract_sets(base_url, context, env, verify=None, iteration_limit=None):

    if iteration_limit is None and env == "dev":
        iteration_limit = 2

    resumption_token = 0
    all_sets = []

    iteration_count = 0

    while True:

        if iteration_limit is not None and iteration_count >= iteration_limit:
            break

        params = f'/{context}?verb=ListSets&resumptionToken=////{resumption_token}'
        url = base_url + params

        print(f"Consultando: {url}")

        response = get_oai_response(url, verify=verify)
        if not response:
            break

        xml_content = response.text
        root = ET.fromstring(xml_content)
        ns = {'oai': 'http://www.openarchives.org/OAI/2.0/'}

        sets_data = []
        for set_elem in root.findall('.//oai:set', ns):
            set_spec = set_elem.find('oai:setSpec', ns).text if set_elem.find('oai:setSpec', ns) is not None else None
            set_name = set_elem.find('oai:setName', ns).text if set_elem.find('oai:setName', ns) is not None else None
            sets_data.append({'setSpec': set_spec, 'setName': set_name})

        if not sets_data:
            print("No se encontraron más sets.")
            break

        all_sets.extend(sets_data)
        resumption_token += 100  # avanzar manualmente
        iteration_count += 1

    df_sets = pd.DataFrame(all_sets)

    timestamp = pd.Timestamp.now(tz="UTC").normalize()
    df_sets['extract_datetime'] = timestamp

    return df_sets


In [38]:
oai_extract_sets(base_url, context, env)

Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListSets&resumptionToken=////0


Sleeping for 0.17 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListSets&resumptionToken=////100


Sleeping for 0.34 seconds


Unnamed: 0,setSpec,setName,extract_datetime
0,snrd,Sistema Nacional de Repositorios Digitales,2025-11-27 00:00:00+00:00
1,com_11336_73,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
2,com_11336_116,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
3,com_11336_169,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
4,com_11336_184,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
...,...,...,...
195,com_11336_174,IIB - INSTITUTO DE INVESTIGACIONES BIOLÓGICAS,2025-11-27 00:00:00+00:00
196,com_11336_167,IIB-INTECH - INSTITUTO DE INVESTIGACIONES BIOT...,2025-11-27 00:00:00+00:00
197,com_11336_108933,IIBIO - INSTITUTO DE INVESTIGACIONES BIOTECNOL...,2025-11-27 00:00:00+00:00
198,com_11336_104,IIBYT - INSTITUTO DE INVESTIGACIONES BIOLÓGICA...,2025-11-27 00:00:00+00:00


In [15]:
if catalog.exists("raw/oai/sets"):
    df_sets = catalog.load("raw/oai/sets")
else:
    df_sets =  oai_extract_sets(base_url, context, env)
df_sets


Unnamed: 0,setSpec,setName,extract_datetime
0,snrd,Sistema Nacional de Repositorios Digitales,2025-11-27 00:00:00+00:00
1,com_11336_73,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
2,com_11336_116,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
3,com_11336_169,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
4,com_11336_184,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00
...,...,...,...
2439,col_11336_73920,Tesis(UNIHDO),2025-11-27 00:00:00+00:00
2440,col_11336_73921,Tesis(UNITEFA),2025-11-27 00:00:00+00:00
2441,col_11336_48286,undefined,2025-11-27 00:00:00+00:00
2442,col_11336_48181,undefined,2025-11-27 00:00:00+00:00


In [None]:
def oai_intermediate_sets(df_sets):
    
    df_sets["is_col_set"] = df_sets["setSpec"].str.startswith("col_")
    df_sets["is_com_set"] = df_sets["setSpec"].str.startswith("com_")

    return df_sets


## Recupero los sets si ya estan descargados

In [None]:
df_sets =  oai_intermediate_sets(df_sets)
df_sets


Unnamed: 0,setSpec,setName,extract_datetime,col_set,com_set
0,snrd,Sistema Nacional de Repositorios Digitales,2025-11-27 00:00:00+00:00,False,False
1,com_11336_73,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00,False,True
2,com_11336_116,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00,False,True
3,com_11336_169,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00,False,True
4,com_11336_184,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-27 00:00:00+00:00,False,True
...,...,...,...,...,...
2439,col_11336_73920,Tesis(UNIHDO),2025-11-27 00:00:00+00:00,True,False
2440,col_11336_73921,Tesis(UNITEFA),2025-11-27 00:00:00+00:00,True,False
2441,col_11336_48286,undefined,2025-11-27 00:00:00+00:00,True,False
2442,col_11336_48181,undefined,2025-11-27 00:00:00+00:00,True,False


## Filtro colecciones de sets

In [None]:
def oai_filter_col(df_sets, env):
    
    col_filter = df_sets["is_col_set"] == True
    df_col = df_sets[col_filter]#.loc[:, "setSpec"]

    if env == "dev":
        df_col = df_col.head(2)
    
    return df_col

In [18]:
df_col = oai_filter_col(df_sets, env)
df_col

Unnamed: 0,setSpec,setName,extract_datetime,col_set,com_set
371,col_11336_373,Articulos(BIOMED),2025-11-27 00:00:00+00:00,True,False
372,col_11336_466,Articulos(CADIC),2025-11-27 00:00:00+00:00,True,False
