In [1]:
import os
import time
import certifi
import requests
import pandas as pd
import xml.etree.ElementTree as ET


In [2]:
base_url = catalog.load('params:oai_extract_options.base_url')
context = catalog.load('params:oai_extract_options.context')

env = 'dev'

print("base_url: ", base_url)
print("context: ", context)

base_url:  https://ri.conicet.gov.ar/oai/
context:  request


In [3]:
def get_oai_response(base_url, verify=None, max_retries=3, backoff_factor=1.0):

    # Usa el bundle de certifi para evitar errores de certificado en requests
    os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
    os.environ.setdefault("SSL_CERT_FILE", certifi.where())
    VERIFY_SSL = os.getenv("OAI_VERIFY_SSL", "false").lower() == "true"
    CA_BUNDLE = os.getenv("OAI_CA_BUNDLE") or certifi.where()

    verify_param = CA_BUNDLE if VERIFY_SSL else False
    if verify is not None:
        verify_param = verify

    for attempt in range(1, max_retries + 1):
        start_time = time.time()
        response = None
        try:
            response = requests.get(base_url, verify=verify_param)
            elapsed_time = time.time() - start_time
        except requests.RequestException as exc:
            elapsed_time = time.time() - start_time
            print(f"Error en request (intento {attempt}/{max_retries}): {exc}")
        sleep_time = max(elapsed_time, 0.1)
        print(f"Sleeping for {sleep_time:.2f} seconds")
        time.sleep(sleep_time)

        if response and response.status_code == 200:
            return response

        status = response.status_code if response else "sin respuesta"
        print(f"Error: {status} (intento {attempt}/{max_retries})")

        if attempt < max_retries:
            backoff = backoff_factor * attempt
            print(f"Reintentando en {backoff:.2f} segundos...")
            time.sleep(backoff)
    return None



In [4]:
def oai_extract_sets(base_url, context, env, verify=None, iteration_limit=None):

    if iteration_limit is None and env == "dev":
        iteration_limit = 2

    resumption_token = 0
    all_sets = []

    while True:

        if env == 'dev' and iteration_count >= iteration_limit:
            break

        params = f'/{context}?verb=ListSets&resumptionToken=////{resumption_token}'
        url = base_url + params

        print(f"Consultando: {url}")

        response = get_oai_response(url, verify=verify)
        if not response:
            break

        xml_content = response.text
        root = ET.fromstring(xml_content)
        ns = {'oai': 'http://www.openarchives.org/OAI/2.0/'}

        sets_data = []
        for set_elem in root.findall('.//oai:set', ns):
            set_spec = set_elem.find('oai:setSpec', ns).text if set_elem.find('oai:setSpec', ns) is not None else None
            set_name = set_elem.find('oai:setName', ns).text if set_elem.find('oai:setName', ns) is not None else None
            sets_data.append({'setSpec': set_spec, 'setName': set_name})

        if not sets_data:
            print("No se encontraron más sets.")
            break

        all_sets.extend(sets_data)
        resumption_token += 100  # avanzar manualmente
        iteration_count += 1

    df_sets = pd.DataFrame(all_sets)
    return df_sets


In [5]:
if catalog.exists("raw/oai/sets#csv"):
    df_sets = catalog.load("raw/oai/sets#csv")
else:
    df_sets =  oai_extract_sets(base_url, context, env)
df_sets


Unnamed: 0,setSpec,setName,extract_datetime
0,snrd,Sistema Nacional de Repositorios Digitales,2025-11-26 00:00:00+00:00
1,com_11336_73,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00
2,com_11336_116,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00
3,com_11336_169,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00
4,com_11336_184,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00
...,...,...,...
2439,col_11336_73920,Tesis(UNIHDO),2025-11-26 00:00:00+00:00
2440,col_11336_73921,Tesis(UNITEFA),2025-11-26 00:00:00+00:00
2441,col_11336_48286,undefined,2025-11-26 00:00:00+00:00
2442,col_11336_48181,undefined,2025-11-26 00:00:00+00:00


In [6]:
def oai_intermediate_sets(df_sets):
    
    df_sets["col_set"] = df_sets["setSpec"].str.startswith("col_")
    df_sets["com_set"] = df_sets["setSpec"].str.startswith("com_")

    return df_sets


In [7]:
if catalog.exists("intermediate/oai/sets#csv"):
    df_sets = catalog.load("intermediate/oai/sets#csv")
else:
    df_sets =  oai_intermediate_sets(df_sets)

df_sets


Unnamed: 0,setSpec,setName,extract_datetime,col_set,com_set
0,snrd,Sistema Nacional de Repositorios Digitales,2025-11-26 00:00:00+00:00,False,False
1,com_11336_73,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00,False,True
2,com_11336_116,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00,False,True
3,com_11336_169,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00,False,True
4,com_11336_184,AREA DE INFLUENCIA CENTRO CIENTÍFICO TECNOLÓGI...,2025-11-26 00:00:00+00:00,False,True
...,...,...,...,...,...
2439,col_11336_73920,Tesis(UNIHDO),2025-11-26 00:00:00+00:00,True,False
2440,col_11336_73921,Tesis(UNITEFA),2025-11-26 00:00:00+00:00,True,False
2441,col_11336_48286,undefined,2025-11-26 00:00:00+00:00,True,False
2442,col_11336_48181,undefined,2025-11-26 00:00:00+00:00,True,False


In [8]:
def oai_filter_col(df_sets):
    
    col_filter = df_sets['col_set'] == True

    df_col = df_sets[col_filter]#.loc[:, "setSpec"]
    
    return df_col

In [9]:
df_col = oai_filter_col(df_sets)
df_col

Unnamed: 0,setSpec,setName,extract_datetime,col_set,com_set
371,col_11336_373,Articulos(BIOMED),2025-11-26 00:00:00+00:00,True,False
372,col_11336_466,Articulos(CADIC),2025-11-26 00:00:00+00:00,True,False
373,col_11336_445,Articulos(CAICYT),2025-11-26 00:00:00+00:00,True,False
374,col_11336_11,Articulos(CASLEO),2025-11-26 00:00:00+00:00,True,False
375,col_11336_138773,Articulos (CCONFINES),2025-11-26 00:00:00+00:00,True,False
...,...,...,...,...,...
2439,col_11336_73920,Tesis(UNIHDO),2025-11-26 00:00:00+00:00,True,False
2440,col_11336_73921,Tesis(UNITEFA),2025-11-26 00:00:00+00:00,True,False
2441,col_11336_48286,undefined,2025-11-26 00:00:00+00:00,True,False
2442,col_11336_48181,undefined,2025-11-26 00:00:00+00:00,True,False


In [10]:
def oai_list_identifiers(base_url: str, context: str, env: str, df_set: pd.DataFrame, verify=None) -> pd.DataFrame:
    records = []
    iteration_limit = 1

    col_ids = df_set.head(iteration_limit).iloc[:, 0].tolist()

    if not col_ids:
        print("No se encontraron colecciones pendientes con processed=False.")

    for set_id in col_ids:
        resumption_token = 0
        iteration_count = 0


        while True:
            if env == 'dev' and iteration_count >= iteration_limit:
                break

            params = f'/{context}?verb=ListIdentifiers&resumptionToken=oai_dc///{set_id}/{resumption_token}'
            url = base_url + params

            print(f"Consultando: {url}")

            response = get_oai_response(url, verify=verify)

            resumption_token += 100
            iteration_count += 1

            if not response or not response.ok:
                print(f"Error al consultar: {url}")
                break

            xml_content = response.text

            root = ET.fromstring(xml_content)
            ns = {
                'oai': 'http://www.openarchives.org/OAI/2.0/',
                'dc': 'http://purl.org/dc/elements/1.1/'
            }

        #    return root, ns
        
            record_nodes = root.findall('.//oai:header', ns)


            if not record_nodes:
                print("No se encontraron más registros.")
                break

#            return record_nodes
            for record in record_nodes:
                
                # Valores simples
                record_id = record.find('.//oai:identifier', ns)
                record_datestamp = record.find('.//oai:datestamp', ns)
                
                # Multivaluados
                setspec = [e.text for e in record.findall('.//oai:setSpec', ns)]


                records.append({
                    'record_id': record_id.text if record_id is not None else None,
                    'datestamp': record_datestamp.text if record_datestamp is not None else None,
                    'set_id': setspec,
                })

    df = pd.DataFrame(records)

    return df, df.head(100)


In [11]:
df_identifiers, df_dev = oai_list_identifiers(base_url, context, env, df_col)

Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListIdentifiers&resumptionToken=oai_dc///col_11336_373/0


Sleeping for 0.21 seconds


In [12]:
df_identifiers

Unnamed: 0,record_id,datestamp,set_id
0,oai:ri.conicet.gov.ar:11336/15260,2023-03-09T02:44:18Z,"[com_11336_390, com_11336_371, com_11336_372, ..."
1,oai:ri.conicet.gov.ar:11336/266352,2025-07-17T11:08:26Z,"[com_11336_372, com_11336_371, com_11336_394, ..."
2,oai:ri.conicet.gov.ar:11336/15225,2024-09-20T17:52:07Z,"[com_11336_372, com_11336_371, com_11336_390, ..."
3,oai:ri.conicet.gov.ar:11336/133626,2021-12-17T12:37:25Z,"[com_11336_372, com_11336_371, col_11336_373, ..."
4,oai:ri.conicet.gov.ar:11336/15215,2023-03-09T02:44:18Z,"[com_11336_390, com_11336_371, com_11336_100, ..."
...,...,...,...
95,oai:ri.conicet.gov.ar:11336/15191,2017-04-11T21:07:06Z,"[com_11336_414, com_11336_371, com_11336_372, ..."
96,oai:ri.conicet.gov.ar:11336/48762,2018-06-15T14:40:23Z,"[com_11336_372, com_11336_371, col_11336_373, ..."
97,oai:ri.conicet.gov.ar:11336/227745,2024-02-21T13:13:13Z,"[com_11336_372, com_11336_371, col_11336_373, ..."
98,oai:ri.conicet.gov.ar:11336/141716,2021-09-28T14:49:51Z,"[com_11336_372, com_11336_371, col_11336_373, ..."


In [None]:

def oai_list_records(base_url: str, context: str, env: str, df_set: pd.DataFrame, verify=None) -> pd.DataFrame:
    records = []
    iteration_limit = 1

    col_ids = df_set.head(iteration_limit).iloc[:, 0].tolist()

    if not col_ids:
        print("No se encontraron colecciones pendientes con processed=False.")

    for set_id in col_ids:
        resumption_token = 0
        iteration_count = 0

        while True:
            if env == 'dev' and iteration_count >= iteration_limit:
                break

            params = f'/{context}?verb=ListRecords&resumptionToken=oai_dc///{set_id}/{resumption_token}'
            url = base_url + params

            print(f"Consultando: {url}")

            response = get_oai_response(url, verify=verify)

            resumption_token += 100
            iteration_count += 1

            if not response or not response.ok:
                print(f"Error al consultar: {url}")
                break

            xml_content = response.text
            root = ET.fromstring(xml_content)
            ns = {
                'oai': 'http://www.openarchives.org/OAI/2.0/',
                'dc': 'http://purl.org/dc/elements/1.1/'
            }

            record_nodes = root.findall('.//oai:record', ns)

            if not record_nodes:
                print("No se encontraron más registros.")
                break

            for record in record_nodes:
                identifier = record.find('.//oai:identifier', ns)
                item_id = identifier.text if identifier is not None else None
                metadata = record.find('.//oai:metadata', ns)

                if metadata is None:
                    continue

                # Valores simples
                title = metadata.find('.//dc:title', ns)
                date_issued = metadata.find('.//dc:date', ns)

                # Multivaluados
                creators = [e.text for e in metadata.findall('.//dc:creator', ns)]
                types = [e.text for e in metadata.findall('.//dc:type', ns)]
                identifiers = [e.text for e in metadata.findall('.//dc:identifier', ns)]
                languages = [e.text for e in metadata.findall('.//dc:language', ns)]
                publishers = [e.text for e in metadata.findall('.//dc:publisher', ns)]
                subjects = [e.text for e in metadata.findall('.//dc:subject', ns)]
                relations = [e.text for e in metadata.findall('.//dc:relation', ns)]
                rights = [e.text for e in metadata.findall('.//dc:rights', ns)]

                records.append({
                    'item_id': item_id,
                    'col_id': set_id,
                    'title': title.text if title is not None else None,
                    'date_issued': date_issued.text if date_issued is not None else None,
                    'creators': creators,
                    'types': types,
                    'identifiers': identifiers,
                    'languages': languages,
                    'subjects': subjects,
                    'publishers': publishers,
                    'relations': relations,
                    'rights': rights
                })

    df = pd.DataFrame(records)

    timestamp = pd.Timestamp.now(tz="UTC").normalize()
    df['extract_datetime'] = timestamp
    df['load_datetime'] = timestamp

    return df, df.head(100)


In [11]:
df = oai_list_records(base_url, context, env, df_col)

Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListRecords&resumptionToken=oai_dc///col_11336_373/0


Sleeping for 12.47 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListRecords&resumptionToken=oai_dc///col_11336_373/100


Sleeping for 10.25 seconds
Error: sin respuesta (intento 1/3)
Reintentando en 1.00 segundos...


Sleeping for 10.96 seconds
Error: sin respuesta (intento 2/3)
Reintentando en 2.00 segundos...


In [12]:
df

Unnamed: 0,identifier,title,creator,subject,description,date,type,language,relation,rights,format,publisher,contributor,coverage,source
0,"[http://hdl.handle.net/11336/179477, Cassiodor...",[Paisajes mesetarios en Patagonia: Tecnología ...,"[Cassiodoro, Gisela Eva]","[MESETAS, TECNOLOGÍA, USO DEL ESPACIO, LOGÍSTI...",[La disponibilidad de recursos hídricos en Pat...,[2016-12],"[info:eu-repo/semantics/article, info:ar-repo/...",[spa],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf]",[Instituto Nacional de Antropología y Pensamie...,,,
1,"[http://hdl.handle.net/11336/116972, Nuñez, Pa...",[Distancias entre la ecología y la praxis ambi...,"[Nuñez, Paula Gabriela]","[ECOLOGÍA, PRAXIS AMBIENTAL, ECOFEMINISMO, TEO...",[Este libro recorre un problema que cotidianam...,[2011],"[info:eu-repo/semantics/publishedVersion, info...",[spa],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf, application...",[Universidad Nacional de La Plata],,,
2,"[http://hdl.handle.net/11336/23263, Lipoma, Ma...",[Plant community resilience in the face of fir...,"[Lipoma, Maria Lucrecia, Gurvich, Diego Ezequi...","[Firefire, Functional Redundancy, Plan Functio...",[The ability of communities or ecosystems to r...,[2016-02],"[info:eu-repo/semantics/article, info:ar-repo/...",[eng],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf, application...","[Wiley Blackwell Publishing, Inc]",,,
3,"[http://hdl.handle.net/11336/215533, Marchetti...",[La puesta en acto de las políticas de formaci...,"[Marchetti, Braian, Bazán, Sonia]","[POLÍTICAS PÚBLICAS EDUCATIVAS, FORMACIÓN DOCE...",[El presente artículo recupera las conclusione...,[2022-11],"[info:eu-repo/semantics/article, info:ar-repo/...",[spa],[info:eu-repo/semantics/altIdentifier/doi/10.3...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf]",[Universidad de Buenos Aires. Facultad de Filo...,,,
4,"[http://hdl.handle.net/11336/109802, Giovambat...",[Introducción a la Genética Forense No-Humana],"[Giovambattista, Guillermo, Barrientos, Laura ...","[GENÉTICA FORENSE, IDENTIFICACIÓN GENÉTICA, ID...",[El presente libro nace como producto de una c...,[2015],"[info:eu-repo/semantics/publishedVersion, info...",[spa],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf, application...",[Universidad Nacional de La Plata. Facultad de...,"[Peral Garcia, Pilar, Giovambattista, Guillerm...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,"[http://hdl.handle.net/11336/88817, Martin, An...",[Prospective multicentre evaluation of the dir...,"[Martin, Anandi, Imperiale, Belén Rocío, Ravol...","[MULTIDRUG RESISTANCE, MYCOBACTERIUM TUBERCULO...",[Objectives: To perform a multicentre study ev...,[2014-02],"[info:eu-repo/semantics/article, info:ar-repo/...",[eng],[info:eu-repo/semantics/altIdentifier/doi/10.1...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf]",[Oxford University Press],,,
291,"[http://hdl.handle.net/11336/11072, Espinosa, ...",[Una iglesia primitiva e internacional: el per...,"[Espinosa, Mariana Esther]","[HERMANOS LIBRES, MISIONES, INTERNACIONALISMO,...",[Este artículo busca aportar conocimiento sobr...,[2014-01],"[info:eu-repo/semantics/article, info:ar-repo/...",[spa],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf]",[Asociación de Cientistas Sociales de la Relig...,,,
292,"[http://hdl.handle.net/11336/173007, Schaller,...",[Reorganización agraria y expansión territoria...,"[Schaller, Enrique Cesar, Almiron, Adrian Alej...","[Tierras fiscales, Tenencia de la tierra, Chac...",[En el trabajo se analiza la política de tierr...,[2021],"[info:eu-repo/semantics/publishedVersion, info...",[spa],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/pdf]",[Imprenta Corintios],"[Carini, Gabriel Fernando, Poggetti, Rocío Sol...",,
293,"[http://hdl.handle.net/11336/134034, Ames, Mar...",[Los límites de la tolerancia religiosa en la ...,"[Ames, Maria Cecilia]","[RELIGIÓN ROMANA, CULTURA CLÁSICA, CONTROL SOC...",[En el año 186 a.C. fueron duramente reprimido...,[2008-12],"[info:eu-repo/semantics/article, info:ar-repo/...",[spa],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...","[application/pdf, application/msword, applicat...",[Trotta],,,
