In [None]:
from datetime import date
import pandas as pd
import requests
import time

env = 'dev'

organizations_filter = catalog.load('params:openaire_extract_options.organizations_filter')
researchproducts_filter = catalog.load('params:openaire_extract_options.researchproducts_filter')
ror_filter_value = catalog.load('params:openaire_extract_options.ror_filter_value')
access_token = catalog.load('params:openaire_extract_options.access_token')
refresh_token = catalog.load('params:openaire_extract_options.refresh_token')

print(f'organizations_filter: {organizations_filter}')
print(f'researchproducts_filter: {researchproducts_filter}')
print(f'ror_filter_value: {ror_filter_value}')
print(f'env: {env}')


organizations_filter: pid
researchproducts_filter: relOrganizationId
ror_filter_value: https://ror.org/01tjs6929
env: dev


In [12]:
def refresh_access_token(refresh_token):
    """Obtiene un nuevo access_token usando el refresh_token."""
    refresh_url = f"https://services.openaire.eu/uoa-user-management/api/users/getAccessToken?refreshToken={refresh_token}"
    response = requests.get(refresh_url)
    if response.status_code == 200:
        return response.json().get("access_token")
    else:
        raise Exception(f"Failed to refresh token: {response.status_code}")

In [None]:
def openaire_extract_researchproduct(filter_param, ror_filter_value, access_token, refresh_token, env):
    cursor = '*'
    organizations_base_url = 'https://api.openaire.eu/graph/v1/organizations'
    research_base_url = 'https://api.openaire.eu/graph/v2/researchProducts'
    iteration_limit = 5
    iteration_count = 0
    page_size = 50         # Ajustar según sea necesario
    max_retries = 5        # Máximo número de reintentos en caso de error 429
    retry_wait = 5         # Tiempo inicial de espera entre reintentos (segundos)
    max_refresh_attempts = 3  # Máximo número de intentos para refrescar el token
    refresh_attempts = 0

    request_headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {access_token}"
    }

    org_query_params = {
        "pid": ror_filter_value,
    }

    # Resolver el id de OpenAIRE a partir del ROR
    while True:
        response = requests.get(organizations_base_url, headers=request_headers, params=org_query_params)

        if response.status_code == 403:
            if refresh_attempts >= max_refresh_attempts:
                raise Exception("Máximo de intentos para refrescar el token alcanzado. Abortando.")
            print("Access token expired or invalid while fetching organization. Refreshing token...")
            new_token = refresh_access_token(refresh_token)
            if not new_token:
                raise Exception("No se pudo refrescar el access token para organizations.")
            access_token = new_token
            request_headers["Authorization"] = f"Bearer {access_token}"
            refresh_attempts += 1
            continue

        if response.status_code != 200:
            raise Exception(f"Failed to fetch organization for ROR {ror_filter_value}: {response.status_code}")

        data = response.json()
        results = data.get("results", [])
        if not results:
            raise Exception(f"No organization found for ROR {ror_filter_value}")

        organization_id = results[0].get('id')
        if not organization_id:
            raise Exception("No se encontró id de organización en la respuesta.")

        print(f"OpenAIRE organization id resolved from ROR {ror_filter_value}: {organization_id}")
        break

    # Reiniciamos contador de refrescos antes de ir al endpoint de research products
    refresh_attempts = 0

    query_params = {
        filter_param: organization_id,
        "pageSize": page_size,
        "cursor": cursor
    }

    while True:
        response = requests.get(research_base_url, headers=request_headers, params=query_params)

        # Si el token es inválido o expiró, intentar renovarlo
        if response.status_code == 403:
            if refresh_attempts >= max_refresh_attempts:
                raise Exception("Máximo de intentos para refrescar el token alcanzado. Abortando.")
            print("Access token expired or invalid. Refreshing token...")
            new_token = refresh_access_token(refresh_token)
            if not new_token:
                raise Exception("No se pudo refrescar el access token.")
            access_token = new_token
            request_headers["Authorization"] = f"Bearer {access_token}"
            refresh_attempts += 1
            continue  # Reintenta la solicitud con el nuevo token

        if response.status_code != 200:
            raise Exception(f"Failed to retrieve data: {response.status_code}")

        # Restablecemos el contador de refrescos al tener una respuesta exitosa
        refresh_attempts = 0

        api_response = response.json()
        print(f"Iteration count: {iteration_count}")
        print(f"GET {response.url}")

        # Crear DataFrame con el primer bloque de resultados
        df = pd.DataFrame.from_dict(api_response["results"])

        # Actualizar cursor
        cursor = api_response["header"].get("nextCursor", None)
        query_params["cursor"] = cursor

        # Bucle para iterar con el cursor
        while cursor:
            if env == "dev" and iteration_count >= iteration_limit:
                break

            iteration_count += 1
            print(f"Iteration count: {iteration_count}")
            print(f"GET {response.url}")
            time.sleep(2)

            # Reintentos en caso de error 429
            retries = 0
            while retries < max_retries:
                response = requests.get(research_base_url, headers=request_headers, params=query_params)

                if response.status_code == 403:
                    if refresh_attempts >= max_refresh_attempts:
                        raise Exception("Máximo de intentos para refrescar el token alcanzado durante la ejecución. Abortando.")
                    print("Access token expired during execution. Refreshing token...")
                    new_token = refresh_access_token(refresh_token)
                    if not new_token:
                        raise Exception("No se pudo refrescar el access token durante la ejecución.")
                    access_token = new_token
                    request_headers["Authorization"] = f"Bearer {access_token}"
                    refresh_attempts += 1
                    continue  # Reintenta con el nuevo token

                if response.status_code == 429:
                    retries += 1
                    print(f"Rate limit hit. Retry {retries}/{max_retries}. Waiting {retry_wait} seconds...")
                    time.sleep(retry_wait)
                    retry_wait *= 2  # Incremento exponencial del tiempo de espera
                else:
                    break

            if response.status_code != 200:
                print(f"Failed to retrieve data at iteration {iteration_count}: {response.status_code}")
                break

            # Restablecer contador de refrescos tras respuesta exitosa
            refresh_attempts = 0

            api_response = response.json()

            if not api_response.get("results"):
                print("No more results. Stopping iteration.")
                break

            df_tmp = pd.DataFrame.from_dict(api_response["results"])
            df = pd.concat([df, df_tmp], ignore_index=True)

            cursor = api_response["header"].get("nextCursor", None)
            query_params["cursor"] = cursor

        df[filter_param] = organization_id

        df['load_datetime'] = date.today()

        return df, df.head(1000)


In [17]:
df, df_dev = openaire_extract_researchproduct(
    researchproducts_filter,
    ror_filter_value,
    access_token,
    refresh_token,
    env,
)


Access token expired or invalid while fetching organization. Refreshing token...
OpenAIRE organization id resolved from ROR https://ror.org/01tjs6929: openorgs____::40b9f835648a3e0d057d6917dd7e54d5
Iteration count: 0
GET https://api.openaire.eu/graph/v2/researchProducts?relOrganizationId=openorgs____%3A%3A40b9f835648a3e0d057d6917dd7e54d5&pageSize=50&cursor=%2A
Iteration count: 1
GET https://api.openaire.eu/graph/v2/researchProducts?relOrganizationId=openorgs____%3A%3A40b9f835648a3e0d057d6917dd7e54d5&pageSize=50&cursor=%2A
Iteration count: 2
GET https://api.openaire.eu/graph/v2/researchProducts?relOrganizationId=openorgs____%3A%3A40b9f835648a3e0d057d6917dd7e54d5&pageSize=50&cursor=AoI%2FD2E5YWM1MGY1NzZhYTo6NmFmNjA2NTlhMzU3NGNjZTg2MTg1M2VmN2YyOTlhODYIP4AAAA%3D%3D
Iteration count: 3
GET https://api.openaire.eu/graph/v2/researchProducts?relOrganizationId=openorgs____%3A%3A40b9f835648a3e0d057d6917dd7e54d5&pageSize=50&cursor=AoI%2FD2E5YWM1MGY1NzZhYTo6ZjM2YzUzNDZiNzMxODFhZTE2YzQ3NmU3MGUzYzg1Z

In [None]:
df

Unnamed: 0,authors,openAccessColor,publiclyFunded,type,language,countries,subjects,mainTitle,subTitle,descriptions,...,indicators,projects,organizations,communities,collectedFrom,instances,isGreen,isInDiamondJournal,relOrganizationId,load_datetime
0,"[{'fullName': 'Susevich, María Laura', 'name':...",gold,False,publication,"{'code': 'spa', 'label': 'Spanish; Castilian'}",,"[{'subject': {'scheme': 'keyword', 'value': 'p...",Dicistrovirus from the pollinator community fo...,,[Los Dicistrovirus son una familia de virus qu...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'Universidad Nacional de La Pam...,"[{'code': 'knowmad', 'label': 'Knowmad Institu...",[{'key': 'openaire____::0b74b6a356bbf23c245f9a...,"[{'alternateIdentifiers': [{'scheme': 'doi', '...",False,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
1,"[{'fullName': 'Steffen, Kevin Denis', 'name': ...",gold,False,publication,"{'code': 'spa', 'label': 'Spanish; Castilian'}",,"[{'subject': {'scheme': 'keyword', 'value': 'g...",Variability in the growth rates of Saanen kids...,,[This study was carried out to determine the v...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'National Scientific and Techni...,,[{'key': 'openaire____::0b74b6a356bbf23c245f9a...,"[{'alternateIdentifiers': [{'scheme': 'doi', '...",False,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
2,"[{'fullName': 'Paredes A., Manuel', 'name': 'M...",gold,False,publication,"{'code': 'spa', 'label': 'Spanish; Castilian'}",,"[{'subject': {'scheme': 'keyword', 'value': 'l...",Effects of dietary inclusion of alfalfa meal o...,,[Se utilizaron 240 pollos de engorde hembras H...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'National University of La Plat...,,[{'key': 'openaire____::0b74b6a356bbf23c245f9a...,"[{'alternateIdentifiers': [{'scheme': 'doi', '...",False,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
3,"[{'fullName': 'Evangelista, Carolina', 'name':...",,False,publication,"{'code': 'und', 'label': 'Undetermined'}",,"[{'subject': {'scheme': 'keyword', 'value': 'C...",Estudio de la viabilidad del algoritmo super-t...,,[Presentado a la XV Reunión de Trabajo en Proc...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",[{'id': 'corda_______::d91859a755964e867fc7aae...,[{'legalName': 'National University of La Plat...,,[{'key': 'openaire____::4cb2a3eb94033446c37331...,"[{'accessRight': {'code': 'c_abf2', 'label': '...",True,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
4,"[{'fullName': 'Galettini, Azucena', 'name': 'A...",,False,publication,"{'code': 'und', 'label': 'Undetermined'}",,"[{'subject': {'scheme': 'keyword', 'value': 'C...",Escritura topográfica y artealización del pais...,,[El poemario Chronicles of the Hostile Sun (19...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'National University of La Pamp...,,[{'key': 'openaire____::4cb2a3eb94033446c37331...,"[{'license': 'CC BY', 'accessRight': {'code': ...",True,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,"[{'fullName': 'Fasano, Cecilia', 'name': 'Ceci...",,False,publication,"{'code': 'esl/spa', 'label': 'Spanish'}",,"[{'subject': {'scheme': 'keyword', 'value': 'p...",Toxicomanías en la década del 20 : Pequeños su...,,[This commentary is aimed at showing the close...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'National University of La Plat...,,[{'key': 'openaire____::df45502607927471ecf8a6...,"[{'accessRight': {'code': 'c_abf2', 'label': '...",True,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
296,"[{'fullName': 'Baldo, Juan Diego', 'name': 'Ju...",,False,publication,"{'code': 'esl/spa', 'label': 'Spanish'}",,"[{'subject': {'scheme': 'keyword', 'value': 'C...",<i>Melanophryniscus devincenzii</i> Kapplenbac...,,"[Argentina, Misiones, Departamento Capital, Pa...",...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'National University of La Plat...,,[{'key': 'openaire____::df45502607927471ecf8a6...,"[{'accessRight': {'code': 'c_abf2', 'label': '...",True,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
297,"[{'fullName': 'Cousinet, Roger', 'name': 'Roge...",,False,publication,"{'code': 'esl/spa', 'label': 'Spanish'}",,"[{'subject': {'scheme': 'keyword', 'value': 'E...",La educación estética en la escuela primaria,,[La idea de incluir la educación estética en l...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'National University of La Plat...,,[{'key': 'openaire____::df45502607927471ecf8a6...,"[{'type': 'Article', 'urls': ['http://sedici.u...",True,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
298,"[{'fullName': 'Baldini, Lidia Nélida', 'name':...",,False,publication,"{'code': 'esl/spa', 'label': 'Spanish'}",,"[{'subject': {'scheme': 'keyword', 'value': 'c...","Cancha de Paleta, un cementerio del periodo fo...",,[Se presentan los resultados de un trabajo de ...,...,"{'citationImpact': {'citationCount': 0.0, 'inf...",,[{'legalName': 'National University of La Plat...,,[{'key': 'opendoar____::dc6e224a8d74ce03bf3011...,"[{'type': 'Article', 'urls': ['http://sedici.u...",True,False,openorgs____::40b9f835648a3e0d057d6917dd7e54d5,2025-11-20
