In [1]:
import os
import time
import certifi
import requests
import pandas as pd
import xml.etree.ElementTree as ET

# Use certifi's CA bundle to avoid certificate errors in requests.
# setdefault() only writes a variable that is not already present in the
# environment, so a bundle configured outside the notebook is left untouched.
os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
os.environ.setdefault("SSL_CERT_FILE", certifi.where())


[32m'/home/pablo/.local/lib/python3.10/site-packages/certifi/cacert.pem'[0m

In [2]:
# Certificate verification is disabled by default: VERIFY_SSL is True only when
# the OAI_VERIFY_SSL environment variable equals "true" (case-insensitive).
VERIFY_SSL = os.getenv("OAI_VERIFY_SSL", "false").lower() == "true"
# CA bundle used when verification is on: OAI_CA_BUNDLE if set and non-empty,
# otherwise the path returned by certifi.where().
CA_BUNDLE = os.getenv("OAI_CA_BUNDLE") or certifi.where()

# To validate the server certificate against CA_BUNDLE, export OAI_VERIFY_SSL=true
# before starting the kernel (the value is read once, when this cell runs).


In [3]:
# NOTE(review): `catalog` is not defined anywhere in this notebook — presumably it
# is injected into the kernel by the project tooling; confirm before a fresh run.
base_url = catalog.load('params:oai_extract_options.base_url')
context = catalog.load('params:oai_extract_options.context')
# With env == 'dev', oai_extract_item_by_col stops after a fixed number of pages.
env = 'dev'

In [4]:
# Load the table of OAI sets; oai_extract_item_by_col reads its first cell
# (row 0, column 0) as the set to harvest.
df_set = catalog.load('dv_oai/dim_oai_set')
df_set

Unnamed: 0,set_id,name
0,col_11336_279,Articulos(CCT - SAN LUIS)
1,col_11336_134184,Datos de Investigación(CEUR)
2,col_11336_43,Articulos(IGEHCS)
3,col_11336_90048,Libros(OCA HOUSSAY)
4,col_11336_90026,Libros(IQUIBA-NEA)
...,...,...
2439,col_11336_134379,Datos de Investigación (INDES)
2440,col_11336_432,Articulos(IIBBA)
2441,col_11336_90059,Libros(UMYMFOR)
2442,col_11336_90303,Capítulos de libros(INIGEM)


In [5]:
# Peek at the first cell of the set table (row 0, column 0) — the same value
# oai_extract_item_by_col picks internally as the set to harvest.
set_id = df_set.iloc[0,0]
set_id

[32m'col_11336_279'[0m

In [6]:
def get_oai_records(base_url, verify=None, timeout=60):
    """Fetch one URL with ``requests.get`` and throttle the caller afterwards.

    After every completed HTTP round trip the function sleeps for as long as
    the request itself took, so slow responses make the caller back off more.

    Args:
        base_url: Full URL to request (including any query string).
        verify: Passed to ``requests.get(verify=...)`` when not ``None``.
            When ``None``, the notebook-level setting is used: ``CA_BUNDLE``
            if ``VERIFY_SSL`` is true, otherwise ``False`` (no verification).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None`` (non-200 status, timeout, or any other transport error).
    """
    # An explicit `verify` argument wins over the notebook-level default.
    if verify is not None:
        verify_param = verify
    else:
        verify_param = CA_BUNDLE if VERIFY_SSL else False

    # perf_counter is monotonic, so the measured duration cannot go negative
    # if the system clock is adjusted mid-request.
    start_time = time.perf_counter()
    try:
        response = requests.get(base_url, verify=verify_param, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors: log and return None.
        print(f"Error: {exc}")
        return None
    elapsed_time = time.perf_counter() - start_time

    print(f"Sleeping for {elapsed_time:.2f} seconds")
    time.sleep(elapsed_time)

    if response.status_code == 200:
        return response

    print(f"Error: {response.status_code}")
    return None


In [None]:
# XML namespaces used to address OAI-PMH envelope and Dublin Core elements.
_OAI_NAMESPACES = {
    'oai': 'http://www.openarchives.org/OAI/2.0/',
    'dc': 'http://purl.org/dc/elements/1.1/'
}

# Column order of the extracted frame; also used to give an empty result a schema.
_OAI_ITEM_COLUMNS = [
    'item_id', 'col_id', 'title', 'date_issued', 'creators', 'types',
    'identifiers', 'languages', 'subjects', 'publishers', 'relations', 'rights',
]


def _dc_texts(metadata, tag):
    """Return the text of every ``dc:<tag>`` element found under ``metadata``."""
    return [e.text for e in metadata.findall(f'.//dc:{tag}', _OAI_NAMESPACES)]


def _parse_oai_record(record, set_id):
    """Flatten one ``<record>`` element into a row dict; ``None`` if it has no ``<metadata>``."""
    metadata = record.find('.//oai:metadata', _OAI_NAMESPACES)
    if metadata is None:
        return None

    identifier = record.find('.//oai:identifier', _OAI_NAMESPACES)
    title = metadata.find('.//dc:title', _OAI_NAMESPACES)
    date_issued = metadata.find('.//dc:date', _OAI_NAMESPACES)

    return {
        'item_id': identifier.text if identifier is not None else None,
        'col_id': set_id,
        # Single-valued fields: only the first matching element is kept.
        'title': title.text if title is not None else None,
        'date_issued': date_issued.text if date_issued is not None else None,
        # Multi-valued fields: every occurrence is kept, in document order.
        'creators': _dc_texts(metadata, 'creator'),
        'types': _dc_texts(metadata, 'type'),
        'identifiers': _dc_texts(metadata, 'identifier'),
        'languages': _dc_texts(metadata, 'language'),
        'subjects': _dc_texts(metadata, 'subject'),
        'publishers': _dc_texts(metadata, 'publisher'),
        'relations': _dc_texts(metadata, 'relation'),
        'rights': _dc_texts(metadata, 'rights'),
    }


def oai_extract_item_by_col(base_url: str, context: str, df_set: pd.DataFrame, env: str, verify=None,
                            iteration_limit: int = 2, page_size: int = 100) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Harvest the Dublin Core records of one OAI set, page by page.

    Only the first cell of ``df_set`` (row 0, column 0) is used as the set id.
    Pages are requested through ``get_oai_records`` until a request fails, a
    response cannot be parsed, a page contains no ``<record>`` elements, or —
    when ``env == 'dev'`` — ``iteration_limit`` pages have been requested.
    Records without a ``<metadata>`` element are skipped.

    Args:
        base_url: Repository base URL; a trailing slash is tolerated.
        context: OAI context path segment appended to ``base_url``.
        df_set: Frame whose first cell holds the set id to harvest.
        env: ``'dev'`` caps the number of pages at ``iteration_limit``.
        verify: Forwarded unchanged to ``get_oai_records``.
        iteration_limit: Maximum pages fetched when ``env == 'dev'``.
        page_size: Offset step between consecutive pages.

    Returns:
        ``(df, df_dev)``: the full extract and its first 100 rows. Both carry
        an ``extract_datetime`` column (current UTC date at midnight). When
        nothing was harvested the frames are empty but keep the same columns.
    """
    records = []
    resumption_token = 0
    iteration_count = 0

    set_id = df_set.iloc[0, 0]

    # Join base and context with exactly one slash, so a base_url that already
    # ends in "/" no longer produces ".../oai//request".
    endpoint = f"{base_url.rstrip('/')}/{context.strip('/')}"

    while not (env == 'dev' and iteration_count >= iteration_limit):
        # NOTE(review): the resumptionToken is synthesized as
        # "oai_dc///<set>/<offset>" instead of being read from the server's
        # response — presumably this matches the target repository's token
        # format; confirm before pointing this at another server.
        url = f'{endpoint}?verb=ListRecords&resumptionToken=oai_dc///{set_id}/{resumption_token}'

        print(f"Consultando: {url}")

        response = get_oai_records(url, verify=verify)

        resumption_token += page_size
        iteration_count += 1

        if response is None or not response.ok:
            print(f"Error al consultar: {url}")
            break

        try:
            root = ET.fromstring(response.text)
        except ET.ParseError as exc:
            # Keep what was already harvested instead of losing it to one bad page.
            print(f"Error al parsear la respuesta de {url}: {exc}")
            break

        record_nodes = root.findall('.//oai:record', _OAI_NAMESPACES)

        if not record_nodes:
            print("No se encontraron más registros.")
            break

        for record in record_nodes:
            row = _parse_oai_record(record, set_id)
            if row is not None:
                records.append(row)

    # Passing the column list keeps the schema stable even when `records` is empty.
    df = pd.DataFrame(records, columns=_OAI_ITEM_COLUMNS)

    df['extract_datetime'] = pd.Timestamp.now(tz="UTC").normalize()

    return df, df.head(100)


In [8]:
# Run the extraction with the parameters loaded above; the function returns the
# full frame plus its first 100 rows.
df, df_dev = oai_extract_item_by_col(base_url, context, df_set, env)

Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListRecords&resumptionToken=oai_dc///col_11336_279/0


Sleeping for 0.27 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=ListRecords&resumptionToken=oai_dc///col_11336_279/100


Sleeping for 0.30 seconds


In [9]:
# Inspect the full extract.
# NOTE(review): the saved output of this cell lists a `load_datetime` column that
# the oai_extract_item_by_col definition in this notebook never creates — the
# output looks stale; restart the kernel and re-run to confirm.
df

Unnamed: 0,item_id,col_id,title,date_issued,creators,types,identifiers,languages,subjects,publishers,relations,rights,extract_datetime,load_datetime
0,oai:ri.conicet.gov.ar:11336/215406,col_11336_279,Estrategias de fertilización nitrogenada para ...,2019-07,"[Dillchneider Loza, Alexandra, Frasier, Ileana...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/215406, Dillchnei...",[spa],"[TEXTURA DEL SUELO, EFICIENCIA DE USO DEL NITR...",[Universidad Nacional de La Pampa. Facultad de...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
1,oai:ri.conicet.gov.ar:11336/140529,col_11336_279,3D litho-constrained inversion model of southe...,2019-04-05,"[Christiansen, Rodolfo Omar, Morosini, Augusto...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/140529, Christian...",[eng],"[DOUBLY-VERGENT STRUCTURE, FAMATINIAN COLLISIO...",[Elsevier Science],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
2,oai:ri.conicet.gov.ar:11336/140442,col_11336_279,Efecto de la disponibilidad de agua sobre una ...,2019-04,"[Rauber, Ruth Bibiana, Demaría, Manuel, Steina...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/140442, Rauber, R...",[spa],"[Invasión, Fenología, Precipitaciones, https:/...",[Asociación Argentina de Ecología],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
3,oai:ri.conicet.gov.ar:11336/261114,col_11336_279,"3D model of the El Hornito pluton, Sierras Pam...",2024-08,"[Muñoz, Brian Lucas, Enriquez, Eliel, Christia...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/261114, Muñoz, Br...",[eng],"[GRANITE, EMPLACEMENT, 3D MODEL, GRAVITY DATA,...",[Pergamon-Elsevier Science Ltd],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/restrictedAccess, http...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
4,oai:ri.conicet.gov.ar:11336/94498,col_11336_279,La convergencia asistencia/seguridad: trama de...,2018-12,"[Seveso Zanin, Emilio José, Peano, Alejandra d...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/94498, Seveso Zan...",[spa],"[POLÍTICA SOCIAL, Seguridad, Tterritorio, Pobr...",[Universidad Nacional de Cuyo. Facultad de Fil...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,oai:ri.conicet.gov.ar:11336/151244,col_11336_279,Políticas de juventud en la emergencia: más al...,2020-08,"[Becher, Yussef]","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/151244, Becher, Y...",[spa],"[Políticas sociales, Juventudes, Pandemia, Arg...",[Asociación de Universidades Grupo Montevideo....,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
193,oai:ri.conicet.gov.ar:11336/60466,col_11336_279,Reducing hardware hit by queries in web search...,2016-11,"[Mendoza, Marcelo, Marin, Mauricio, Gil Costa,...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/60466, Mendoza, M...",[eng],"[Distributed Information Retrieval, Incrementa...",[Pergamon-Elsevier Science Ltd],[info:eu-repo/semantics/altIdentifier/doi/10.1...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
194,oai:ri.conicet.gov.ar:11336/217756,col_11336_279,Composición florística y funcional del pastiza...,2022-12,"[Rauber, Ruth Bibiana, Cendoya, Maria Alicia, ...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/217756, Rauber, R...",[spa],"[COMUNIDAD VEGETAL, DISTURBIOS, GANADERÍA, htt...",[Asociación Argentina de Ecología],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
195,oai:ri.conicet.gov.ar:11336/235556,col_11336_279,Cultural Liminality and the Construction of a ...,2021-05,"[Puchmüller, Andrea Bibiana]","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/235556, Puchmülle...",[eng],"[THE COURTER, RUSHDIE, HYBRIDITY, DIFFERENCE, ...",[Acharya Nagarjuna Unversity],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00


In [10]:
# Inspect the development sample (the first 100 rows of df).
df_dev

Unnamed: 0,item_id,col_id,title,date_issued,creators,types,identifiers,languages,subjects,publishers,relations,rights,extract_datetime,load_datetime
0,oai:ri.conicet.gov.ar:11336/215406,col_11336_279,Estrategias de fertilización nitrogenada para ...,2019-07,"[Dillchneider Loza, Alexandra, Frasier, Ileana...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/215406, Dillchnei...",[spa],"[TEXTURA DEL SUELO, EFICIENCIA DE USO DEL NITR...",[Universidad Nacional de La Pampa. Facultad de...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
1,oai:ri.conicet.gov.ar:11336/140529,col_11336_279,3D litho-constrained inversion model of southe...,2019-04-05,"[Christiansen, Rodolfo Omar, Morosini, Augusto...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/140529, Christian...",[eng],"[DOUBLY-VERGENT STRUCTURE, FAMATINIAN COLLISIO...",[Elsevier Science],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
2,oai:ri.conicet.gov.ar:11336/140442,col_11336_279,Efecto de la disponibilidad de agua sobre una ...,2019-04,"[Rauber, Ruth Bibiana, Demaría, Manuel, Steina...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/140442, Rauber, R...",[spa],"[Invasión, Fenología, Precipitaciones, https:/...",[Asociación Argentina de Ecología],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
3,oai:ri.conicet.gov.ar:11336/261114,col_11336_279,"3D model of the El Hornito pluton, Sierras Pam...",2024-08,"[Muñoz, Brian Lucas, Enriquez, Eliel, Christia...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/261114, Muñoz, Br...",[eng],"[GRANITE, EMPLACEMENT, 3D MODEL, GRAVITY DATA,...",[Pergamon-Elsevier Science Ltd],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/restrictedAccess, http...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
4,oai:ri.conicet.gov.ar:11336/94498,col_11336_279,La convergencia asistencia/seguridad: trama de...,2018-12,"[Seveso Zanin, Emilio José, Peano, Alejandra d...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/94498, Seveso Zan...",[spa],"[POLÍTICA SOCIAL, Seguridad, Tterritorio, Pobr...",[Universidad Nacional de Cuyo. Facultad de Fil...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,oai:ri.conicet.gov.ar:11336/60450,col_11336_279,The aftermath of the Fukushima nuclear acciden...,2016-03,"[Gallardo, Adrian Hugo, Marui, Atsunao]","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/60450, Gallardo, ...",[eng],"[Control Measures, Fukushima, Groundwater Cont...",[Elsevier Science],[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
96,oai:ri.conicet.gov.ar:11336/158662,col_11336_279,La Psicología Positiva en la revista Psicodeba...,2010-12,"[Mariñelarena-Dondena, Luciana, Klappenbach, H...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/158662, Mariñelar...",[spa],"[PSICOLOGÍA POSITIVA, HISTORIA DE LA PSICOLOGÍ...",[Universidad de Palermo. Facultad de Ciencias ...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
97,oai:ri.conicet.gov.ar:11336/7137,col_11336_279,Androgen receptor expression in pituitary of m...,2015-03,"[Filippa, Veronica Palmira, Rosales, Gabriela ...","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/7137, Filippa, Ve...",[eng],"[LAGOSTOMUS, PITUITARY, ANDROGEN, RECEPTOR, ht...",[Hindawi Publishing Corporation],"[info:eu-repo/semantics/altIdentifier/doi/, in...","[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00
98,oai:ri.conicet.gov.ar:11336/156404,col_11336_279,El psicoanálisis en los debates sobre el rol d...,2000-12,"[Klappenbach, Hugo Alberto Arturo]","[info:eu-repo/semantics/article, info:ar-repo/...","[http://hdl.handle.net/11336/156404, Klappenba...",[spa],"[Historia de la Psicologia, Rol profesional, P...",[Universidad de Buenos Aires. Facultad de Psic...,[info:eu-repo/semantics/altIdentifier/url/http...,"[info:eu-repo/semantics/openAccess, https://cr...",2025-11-25 00:00:00+00:00,2025-11-25 00:00:00+00:00


In [11]:
# Summarise column dtypes and non-null counts of the extract.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   item_id           197 non-null    object             
 1   col_id            197 non-null    object             
 2   title             197 non-null    object             
 3   date_issued       197 non-null    object             
 4   creators          197 non-null    object             
 5   types             197 non-null    object             
 6   identifiers       197 non-null    object             
 7   languages         197 non-null    object             
 8   subjects          197 non-null    object             
 9   publishers        197 non-null    object             
 10  relations         197 non-null    object             
 11  rights            197 non-null    object             
 12  extract_datetime  197 non-null    datetime64[us, UTC]
 13  load_