In [1]:
import os
import time
import certifi
import requests
import pandas as pd
import xml.etree.ElementTree as ET


In [2]:
base_url = catalog.load('params:oai_extract_options.base_url')
context = catalog.load('params:oai_extract_options.context')

env = 'dev'

print("base_url: ", base_url)
print("context: ", context)

base_url:  https://ri.conicet.gov.ar/oai/
context:  request


In [3]:
def get_oai_response(base_url, verify=None, max_retries=3, backoff_factor=1.0):

    # Usa el bundle de certifi para evitar errores de certificado en requests
    os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
    os.environ.setdefault("SSL_CERT_FILE", certifi.where())
    VERIFY_SSL = os.getenv("OAI_VERIFY_SSL", "false").lower() == "true"
    CA_BUNDLE = os.getenv("OAI_CA_BUNDLE") or certifi.where()

    verify_param = CA_BUNDLE if VERIFY_SSL else False
    if verify is not None:
        verify_param = verify

    for attempt in range(1, max_retries + 1):
        start_time = time.time()
        response = None
        try:
            response = requests.get(base_url, verify=verify_param)
            elapsed_time = time.time() - start_time
        except requests.RequestException as exc:
            elapsed_time = time.time() - start_time
            print(f"Error en request (intento {attempt}/{max_retries}): {exc}")
        sleep_time = max(elapsed_time, 0.1)
        print(f"Sleeping for {sleep_time:.2f} seconds")
        time.sleep(sleep_time)

        if response and response.status_code == 200:
            return response

        status = response.status_code if response else "sin respuesta"
        print(f"Error: {status} (intento {attempt}/{max_retries})")

        if attempt < max_retries:
            backoff = backoff_factor * attempt
            print(f"Reintentando en {backoff:.2f} segundos...")
            time.sleep(backoff)
    return None


In [4]:
df_ids = catalog.load("intermediate/oai/identifiers")
df_ids


Unnamed: 0,id
0,oai:ri.conicet.gov.ar:11336/77856
1,oai:ri.conicet.gov.ar:11336/101455
2,oai:ri.conicet.gov.ar:11336/102687
3,oai:ri.conicet.gov.ar:11336/103545
4,oai:ri.conicet.gov.ar:11336/103781
...,...
995,oai:ri.conicet.gov.ar:11336/188111
996,oai:ri.conicet.gov.ar:11336/188163
997,oai:ri.conicet.gov.ar:11336/112304
998,oai:ri.conicet.gov.ar:11336/145629


## Extract identifiers 

In [5]:
def oai_extract_records_by_identifiers(base_url: str, context: str, env: str, df_ids: pd.DataFrame, verify=None) -> pd.DataFrame:
    records = []
    iteration_limit = 2 if env == "dev" else None

    ids = df_ids.head(iteration_limit).loc[:, "id"].tolist()

    for record_id in ids:
        iteration_count = 0

        while True:
            if env == 'dev' and iteration_limit is not None and iteration_count >= iteration_limit:
                break

            params = f'/{context}?verb=GetRecord&metadataPrefix=oai_dc&identifier={record_id}'
            url = base_url + params

            print(f"Consultando: {url}")

            response = get_oai_response(url, verify=verify)
            if not response or not response.ok:
                print(f"Error al consultar: {url}")
                break
            
            iteration_count += 1

            xml_content = response.text

            root = ET.fromstring(xml_content)
            ns = { 'oai': 'http://www.openarchives.org/OAI/2.0/' ,
                   'dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/'
                }
        
            record_nodes = root.findall('.//oai:header', ns)

            if not record_nodes:
                print("No se encontraron más registros.")
                break

            for record in record_nodes:
                
                # Valores simples
                record_id = record.find('.//oai:identifier', ns)
                
                record_datestamp = record.find('.//oai:datestamp', ns)
                
                # Multivaluados
                setspec = [e.text for e in record.findall('.//oai:setSpec', ns)]

                record_date = record.find('.//dc:date', ns)
                record_title = record.find('.//dc:title', ns)

                record_creator = [e.text for e in record.findall('.//dc:creator', ns)]
                record_subject = [e.text for e in record.findall('.//dc:subject', ns)]
                record_description = [e.text for e in record.findall('.//dc:description', ns)]
                record_type = [e.text for e in record.findall('.//dc:type', ns)]
                record_identifier = [e.text for e in record.findall('.//dc:identifier', ns)]
                record_language = [e.text for e in record.findall('.//dc:language', ns)]
                record_relation = [e.text for e in record.findall('.//dc:relation', ns)]
                record_rights = [e.text for e in record.findall('.//dc:rights', ns)]
                record_format = [e.text for e in record.findall('.//dc:format', ns)]
                record_publisher = [e.text for e in record.findall('.//dc:publisher', ns)]
                
                records.append({
                    'record_id': record_id.text if record_id is not None else None,
                    'record_date': record_date.text if record_date is not None else None,
                    'record_title': record_title.text if record_title is not None else None,
                    'datestamp': record_datestamp.text if record_datestamp is not None else None,

                    'set_id': setspec,
                    'record_creator': record_creator,
                    'record_subject': record_subject,
                    'record_description': record_description,
                    'record_type': record_type,
                    'record_identifier': record_identifier,
                    'record_language': record_language,
                    'record_relation': record_relation,
                    'record_rights': record_rights,
                    'record_format': record_format,
                    'record_publisher': record_publisher
                })

          
    df = pd.DataFrame(records)

    timestamp = pd.Timestamp.now(tz="UTC").normalize()
    df['extract_datetime'] = timestamp

    # convierte cada lista en columnas (set_0, set_1, ...)
    sets_df = df['set_id'].apply(pd.Series)
    sets_df = sets_df.rename(columns=lambda i: f'set_{i}')

    # junta con record_id y (opcional) elimina la columna original
    df_sets = pd.concat([df[['record_id']], sets_df], axis=1)

    df_sets

    return df, df_sets, df.head(100)


In [6]:
df_records, df_sets, df_dev = oai_extract_records_by_identifiers(base_url, context, env, df_ids)

Consultando: https://ri.conicet.gov.ar/oai//request?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:ri.conicet.gov.ar:11336/77856


Sleeping for 0.46 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=GetRecord&metadataPrefix=oai_dc&identifier=<Element '{http://www.openarchives.org/OAI/2.0/}identifier' at 0x7b33a4f41c10>


Sleeping for 0.20 seconds
No se encontraron más registros.
Consultando: https://ri.conicet.gov.ar/oai//request?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:ri.conicet.gov.ar:11336/101455


Sleeping for 0.19 seconds
Consultando: https://ri.conicet.gov.ar/oai//request?verb=GetRecord&metadataPrefix=oai_dc&identifier=<Element '{http://www.openarchives.org/OAI/2.0/}identifier' at 0x7b33a4f41940>


Sleeping for 0.18 seconds
No se encontraron más registros.


In [8]:
df_sets

Unnamed: 0,record_id,set_0,set_1,set_2,set_3,set_4,set_5,set_6
0,oai:ri.conicet.gov.ar:11336/77856,com_11336_35,com_11336_14,col_11336_36,snrd,,,
1,oai:ri.conicet.gov.ar:11336/101455,com_11336_149,com_11336_118,com_11336_437,com_11336_416,col_11336_150,col_11336_438,snrd
