In [1]:
import requests
import pandas as pd
import xmltodict
import math

In [2]:
hub_openaire_graph_originalid = catalog.load('stg_openaire_graph/hub_openaire_graph_originalid')
r_token = catalog.load('params:openaire_fetch_options.refresh_token')
env = 'dev'
pd.set_option("display.max_columns", None)
id_column = 'original_id'
id_param = 'originalId'


In [3]:
hub_openaire_graph_originalid

Unnamed: 0,original_hk,original_id,load_datetime,source
0,b'\x00\x00\xaf@Eu\x82\xdd\xc3\xe0\xa9\xad>>\x1...,50|od______9441::bf90785b53fcaf6520cdd255553795ac,2025-01-28,OPENAIRE
1,b'\x00\x05\x1d\x05\x05\xfb$[\xb9\x7fmGr\xc7\x8...,50|od______9441::f343a5445f9f391d9ec6de2f50fc17f9,2025-01-28,OPENAIRE
2,b'\x00\rMJ\xcb\x19k\x80\xa8\x9dO4r\xb2`\xd8',oai:digital.cic.gba.gob.ar:11746/482,2025-01-28,OPENAIRE
3,b'\x00\x10\xbf\x10K\xe8W\xf6\xe5\xb9\xaf\t\xce...,oai:digital.cic.gba.gob.ar:11746/8696,2025-01-28,OPENAIRE
4,b'\x00\x13g.;\x9b\x9eO4\x13\xc7H\x1e=\x00\x03',oai:digital.cic.gba.gob.ar:11746/3894,2025-01-28,OPENAIRE
...,...,...,...,...
19371,b'\xff\xf0\xd9u\xa8\x96sE\xcf\xc5/\xbe\xec\x1aT4',50|od______9441::004eccb05a755201fbf8803363af5b66,2025-01-28,OPENAIRE
19372,b'\xff\xf3\xf0u\x7f\x1a*t\x87\xb6\xb9\xb0\xc0\...,oai:digital.cic.gba.gob.ar:11746/622,2025-01-28,OPENAIRE
19373,b'\xff\xf4F\xe5\xa6\x1e\xa6B\xaf\x80\x1b;oi\xc...,oai:digital.cic.gba.gob.ar:11746/8737,2025-01-28,OPENAIRE
19374,b'\xff\xfa\xf0f\x14\x8d\xf1k\xa4\x164\x08\x19b...,oai:digital.cic.gba.gob.ar:11746/6871,2025-01-28,OPENAIRE


### Recupero datos de OpenAire Graph

Hago una pequeña manipulación de datos, que debería evitarse. Para eso en vez de el hub doi, hay que usar una dim_doi con el valor del pid ya manipulado en mart.



In [4]:
def fetch_researchproduct_openaire(df_input: pd.DataFrame, r_token: str, env: str) -> tuple[pd.DataFrame, list]:
    """
    Fetch research product data from OpenAIRE API.

    Args:
        df_input (pd.DataFrame): Input DataFrame containing identifiers.
        r_token (str): Authorization token for OpenAIRE API.
        env (str): Environment ('dev' or 'prod').
        id_column (str): Column name in DataFrame containing identifiers (e.g., 'doi' or 'original_id').
        id_param (str): Query parameter name for the API ('doi' or 'originalId').

    Returns:
        tuple[pd.DataFrame, list]: A tuple containing the resulting DataFrame and a list of skipped IDs.
    """
    base_url = "https://api.openaire.eu/search/researchProducts"
    df_list = []
    id_column = 'original_id'
    id_param = 'originalId'
    id_limit = 9999 if env == 'prod' else 9
    skipped_list = []

    # Filter rows where the ID column is not empty
    id_list = df_input[id_column].dropna().iloc[:id_limit].to_list()

    # Define the number of batches based on batch size and ID count
    batch_size = 10
    num_batches = math.ceil(len(id_list) / batch_size)

    for batch_index in range(num_batches):

        batch = id_list[batch_index * batch_size : (batch_index + 1) * batch_size]
        id_comma_separated = ",".join(batch)

        graph_url = f"{base_url}?{id_param}={id_comma_separated}"
        headers = {'Authorization': f'Bearer {r_token}'}

        api_response = requests.get(graph_url, headers=headers)
        print(f'GET "{graph_url}" {api_response.status_code}')

        if api_response.status_code == 200:
            data_dict = xmltodict.parse(api_response.content)
            results = data_dict.get('response', {}).get('results', {}).get('result', [])

            for result in results:
                publication_header = result.get('header', {})
                publication_metadata = result.get('metadata', {}).get('oaf:entity', {}).get('oaf:result', {})

                publication = publication_header | publication_metadata
                if publication:
                    df_normalized = pd.json_normalize(publication, max_level=0)
                    df_list.append(df_normalized)
                else:
                    print("No publication data found in result")
        else:
            print(f'Error: Received status code {api_response.status_code}')
            skipped_list.extend(batch)
            break

    print(f'{len(df_list)} batches processed')
    print(f'{len(skipped_list)} IDs skipped')

    if df_list:
        df = pd.concat(df_list, ignore_index=True)
    else:
        df = pd.DataFrame()

    return df, skipped_list

In [5]:
df, skiped_list = fetch_researchproduct_openaire(hub_openaire_graph_originalid, r_token, env, id_column, id_param)


GET "https://api.openaire.eu/search/researchProducts?originalId=50|od______9441::bf90785b53fcaf6520cdd255553795ac,50|od______9441::f343a5445f9f391d9ec6de2f50fc17f9,oai:digital.cic.gba.gob.ar:11746/482,oai:digital.cic.gba.gob.ar:11746/8696,oai:digital.cic.gba.gob.ar:11746/3894,50|od______9441::bf322dde89550792d6386dbb63a24216,50|od______9441::d86511f3c2bd967c73b06c2aa68c8f7b,50|od______9441::a5da66ac9a7e39207c32ecd5c50a2bea,50|od______9441::b2fc67091b462d2f40e3e155c873e4fd" 200
9 batches processed
0 IDs skipped


In [6]:
df

Unnamed: 0,@xmlns:xsi,dri:objIdentifier,dri:dateOfCollection,dri:dateOfTransformation,collectedfrom,originalId,measure,title,bestaccessright,creator,contributor,dateofacceptance,description,subject,language,format,resulttype,resourcetype,datainfo,rels,children,country,publisher
0,http://www.w3.org/2001/XMLSchema-instance,od______9441::f343a5445f9f391d9ec6de2f50fc17f9,2024-12-18T15:57:41.964,2024-03-14T12:56:07.024Z,"{'@name': 'CIC-Digital', '@id': 'opendoar____:...",[50|od______9441::f343a5445f9f391d9ec6de2f50fc...,"[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...",{'@classid': 'http://creativecommons.org/licen...,"{'@rank': '1', '@name': 'Victor Fabricio', '@s...","Podestá, Julio César",2014-01-01,"Durante el período de la beca, 01/04/2013 a la...","[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'Español', '@classname': 'Español...","[application/pdf, 5 p.]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'false', 'deletedbyinference': 'f...",,{'instance': {'accessright': {'@classid': 'htt...,,
1,http://www.w3.org/2001/XMLSchema-instance,od______9441::bf90785b53fcaf6520cdd255553795ac,2024-12-18T15:57:35.103,2024-03-14T12:56:07.281Z,"{'@name': 'CIC-Digital', '@id': 'opendoar____:...","[oai:digital.cic.gba.gob.ar:11746/5555, 50|od_...","[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...","{'@classid': 'OPEN', '@classname': 'Open Acces...","{'@rank': '1', '@name': 'César Gustavo', '@su...","[Balatti, Pedro Alberto, Saparrat, Mario]",2016-01-01,Cladosporium fulvum es un hongo biotrófico no ...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'Español', '@classname': 'Español...","[application/pdf, 18 p.]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'false', 'deletedbyinference': 'f...",,{'instance': {'accessright': {'@classid': 'OPE...,,
2,http://www.w3.org/2001/XMLSchema-instance,dedup_wf_002::94e76520064587276002f582f761fa34,2024-12-18T15:57:43.601,,[{'@name': 'Servicio de Difusión de la Creació...,[50|od______1329::c7bd52e7a84c83098d0572988a17...,"[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...","{'@classid': 'OPEN', '@classname': 'Open Acces...","[{'@rank': '1', '@name': 'Natalia', '@surname'...",,2019-01-01,[Red de Universidades con Carreras en Informát...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'esl/spa', '@classname': 'Spanish...","[application/pdf, 76-83]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'true', 'deletedbyinference': 'fa...",,{'instance': [{'accessright': {'@classid': 'ht...,"{'@classid': 'AR', '@classname': 'Argentina', ...",
3,http://www.w3.org/2001/XMLSchema-instance,od______9441::a5da66ac9a7e39207c32ecd5c50a2bea,2024-12-18T15:57:36.103,2024-03-14T12:56:20.853Z,"{'@name': 'CIC-Digital', '@id': 'opendoar____:...",[50|od______9441::a5da66ac9a7e39207c32ecd5c50a...,"[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...",{'@classid': 'http://creativecommons.org/licen...,"[{'@rank': '1', '@name': 'D. H.', '@surname': ...",,2019-01-01,This paper presents an experimental study on c...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'Inglés', '@classname': 'Inglés',...","[application/pdf, 649–658]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'false', 'deletedbyinference': 'f...",,{'instance': {'accessright': {'@classid': 'htt...,,Elsevier BV
4,http://www.w3.org/2001/XMLSchema-instance,od______9441::76663a48bff44d8a7cd1b2b0eedb303f,2024-12-18T15:57:39.157,2024-03-14T12:56:08.392Z,"{'@name': 'CIC-Digital', '@id': 'opendoar____:...","[oai:digital.cic.gba.gob.ar:11746/482, 50|od__...","[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...","{'@classid': 'OPEN', '@classname': 'Open Acces...","[{'@rank': '1', '@name': 'Alfredo', '@surname'...",,1969-01-01,[En la primera parte del trabajo se estudia el...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'Español', '@classname': 'Español...","[application/pdf, 28 p.]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'false', 'deletedbyinference': 'f...","{'rel': {'@inferred': 'true', '@trust': '0.9',...",{'instance': {'accessright': {'@classid': 'OPE...,,Laboratorio de Entrenamiento Multidisciplinari...
5,http://www.w3.org/2001/XMLSchema-instance,od______9441::b2fc67091b462d2f40e3e155c873e4fd,2024-12-18T15:57:30.722,2024-03-14T12:56:09.104Z,"{'@name': 'CIC-Digital', '@id': 'opendoar____:...",[50|od______9441::b2fc67091b462d2f40e3e155c873...,"[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...","{'@classid': 'OPEN', '@classname': 'Open Acces...","{'@rank': '1', '@name': 'Alejandro Arturo', '@...","Freyre, Lauce Rubén",2003-01-01,Proyectos de investigación en los cuales colab...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'Español', '@classname': 'Español...","[application/pdf, 14 p.]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'false', 'deletedbyinference': 'f...",,{'instance': {'accessright': {'@classid': 'OPE...,,
6,http://www.w3.org/2001/XMLSchema-instance,dedup_wf_002::9edef6cd1f909c8505a423d72a2ab20c,2024-12-18T15:57:24.075,,[{'@name': 'Servicio de Difusión de la Creació...,"[oai:sedici.unlp.edu.ar:10915/21955, 50|od____...","[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...","{'@classid': 'OPEN', '@classname': 'Open Acces...","[{'@rank': '1', '@name': 'Liliana Inés', '@su...",,2008-10-01,[La Arquitectura Model Driven (MDA) promueve e...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'esl/spa', '@classname': 'Spanish...","[application/pdf, 12 p.]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'true', 'deletedbyinference': 'fa...",,{'instance': [{'accessright': {'@classid': 'ht...,"{'@classid': 'AR', '@classname': 'Argentina', ...",
7,http://www.w3.org/2001/XMLSchema-instance,dedup_wf_002::db9655c70e201da1b00342f0908c9d2b,2024-12-18T15:57:31.281,,[{'@name': 'Servicio de Difusión de la Creació...,"[oai:sedici.unlp.edu.ar:10915/100119, 50|od___...","[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...",{'@classid': 'http://creativecommons.org/licen...,"{'@rank': '1', '@name': 'Roberto', '@surname':...",,2018-01-01,[Centro de Investigación y Desarrollo en Tecno...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'esl/spa', '@classname': 'Spanish...","[application/pdf, 4 p.]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'true', 'deletedbyinference': 'fa...",,{'instance': [{'accessright': {'@classid': 'ht...,"{'@classid': 'AR', '@classname': 'Argentina', ...",
8,http://www.w3.org/2001/XMLSchema-instance,od______9441::62b3d9600838c5d11a616d494016877d,2024-12-18T15:57:19.207,2024-03-14T12:56:03.886Z,"{'@name': 'CIC-Digital', '@id': 'opendoar____:...","[oai:digital.cic.gba.gob.ar:11746/3894, 50|od_...","[{'@id': 'influence', '@score': '2.841867E-9',...","{'@classid': 'main title', '@classname': 'main...","{'@classid': 'UNKNOWN', '@classname': 'not ava...","[{'@rank': '1', '@name': 'Miguel Atilio', '@su...",,2007-01-01,[Statistical analysis is fundamental to the fi...,"[{'@classid': 'keyword', '@classname': 'keywor...","{'@classid': 'und', '@classname': 'Undetermine...","[application/pdf, p. 22-24]","{'@classid': 'other', '@classname': 'other', '...","{'@classid': 'UNKNOWN', '@classname': 'UNKNOWN...","{'inferred': 'false', 'deletedbyinference': 'f...",,{'instance': {'accessright': {'@classid': 'UNK...,,
