In [1]:
import time
from datetime import date
from urllib.parse import urlencode

import pandas as pd
import requests
# Remove the column-count display limit so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)


In [2]:
# NOTE(review): `catalog` is not defined anywhere in this notebook; presumably it is
# injected by the Kedro Jupyter/IPython extension — confirm before running on a plain kernel.
institution_ror = catalog.load('params:openalex_fetch_options.institution_ror')
# The environment is hard-coded to 'dev' below; the parameter lookup is left disabled.
#env = catalog.load('params:fetch_options.env')
env = 'dev'

In [3]:
# Display the ROR identifier that was loaded from the parameters.
institution_ror

'https://ror.org/02s7sax82'

In [None]:
def openalex_clean_institution(df):
    """Return a new DataFrame without the abstract inverted-index columns.

    Only the columns actually present in ``df`` are dropped, so a frame that
    lacks them passes through with the same columns. The input frame itself is
    not modified.
    """
    unwanted = ("abstract_inverted_index", "abstract_inverted_index_v3")
    present = [name for name in unwanted if name in df.columns]
    return df.drop(columns=present)

def openalex_fetch_institution(institution_ror, env):
    """Fetch the OpenAlex institution record(s) matching a ROR identifier.

    Pages through ``https://api.openalex.org/institutions`` using cursor
    pagination (200 results per page), passes every page through
    ``openalex_clean_institution`` and concatenates the pages into one frame.
    A ``load_datetime`` column holding today's date is added to the result.

    Args:
        institution_ror: Value placed after ``ror:`` in the ``filter`` query
            parameter (e.g. ``'https://ror.org/02s7sax82'``).
        env: When equal to ``'dev'`` the loop stops after 5 fetched pages.

    Returns:
        A tuple ``(df, df.head(1000))``: the full frame and its first 1000 rows.
        The frame has no rows when nothing could be fetched.

    Notes:
        Request and JSON-decoding failures are printed and end the pagination
        instead of raising; whatever pages were collected so far are returned.
    """
    endpoint = 'https://api.openalex.org/institutions'
    cursor = '*'
    iteration_limit = 5
    iteration_count = 0
    all_dataframes = []  # Per-page frames, concatenated once after the loop

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        while True:
            # Percent-encode the values: cursors are base64 text and may contain
            # '+' or '=', which must not be sent raw inside a query string.
            # ':', '/' and '*' are legal in a query and are left readable.
            query = urlencode(
                {
                    'filter': f'ror:{institution_ror}',
                    'cursor': cursor,
                    'per-page': 200,
                },
                safe=':/*',
            )
            url = f'{endpoint}?{query}'
            print(f'Iteration count: {iteration_count}')
            print(f'GET {url}')

            try:
                response = session.get(url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"Error en la solicitud: {e}")
                break

            # Decoded in its own try block: the JSON error raised by requests is
            # also a RequestException, so a shared try would report it with the
            # request-error message and never reach this handler.
            try:
                api_response = response.json()
            except ValueError:
                print("Error al decodificar JSON.")
                break

            # An absent or empty 'results' list ends the pagination.
            if 'results' not in api_response or not api_response['results']:
                print("No hay más datos disponibles.")
                break

            df_tmp = pd.DataFrame.from_dict(api_response['results'])
            df_tmp = openalex_clean_institution(df_tmp)
            all_dataframes.append(df_tmp)

            # Advance the cursor; a missing/empty cursor means there is no next page.
            cursor = api_response.get('meta', {}).get('next_cursor')
            if not cursor:
                break

            # Cap the number of pages when running in the 'dev' environment.
            iteration_count += 1
            if env == 'dev' and iteration_count >= iteration_limit:
                break

            time.sleep(1)  # Pause between requests to stay within API rate limits

    # Single concat at the end instead of growing a frame inside the loop.
    df = pd.concat(all_dataframes, ignore_index=True) if all_dataframes else pd.DataFrame()

    df['load_datetime'] = date.today()

    return df, df.head(1000)


In [5]:
# Run the fetch: `df` is the full result, `df_dev` its first 1000 rows.
df, df_dev = openalex_fetch_institution(institution_ror, env)

Iteration count: 0
GET https://api.openalex.org/institutions?filter=ror:https://ror.org/02s7sax82&cursor=*&per-page=200
Iteration count: 1
GET https://api.openalex.org/institutions?filter=ror:https://ror.org/02s7sax82&cursor=IlszNTEwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvSTQyMTAxMjI5NzAnXSI=&per-page=200
No hay más datos disponibles.


In [6]:
# Display the fetched institution frame.
df

Unnamed: 0,id,ror,display_name,country_code,type,type_id,lineage,homepage_url,image_url,image_thumbnail_url,display_name_acronyms,display_name_alternatives,repositories,works_count,cited_by_count,summary_stats,ids,geo,international,associated_institutions,counts_by_year,roles,topics,topic_share,x_concepts,is_super_system,works_api_url,updated_date,created_date,load_datetime
0,https://openalex.org/I4210122970,https://ror.org/02s7sax82,Comisión de Investigaciones Científicas,AR,government,https://openalex.org/institution-types/government,[https://openalex.org/I4210122970],https://www.cic.gba.gob.ar,,,[CIC],[],[],3510,56910,"{'2yr_mean_citedness': 1.4695652173913043, 'h_...",{'openalex': 'https://openalex.org/I4210122970...,"{'city': 'La Plata', 'geonames_city_id': '3432...",{'display_name': {'en': 'Comisión de Investiga...,"[{'id': 'https://openalex.org/I4411590623', 'r...","[{'year': 2025, 'works_count': 78, 'oa_works_c...","[{'role': 'funder', 'id': 'https://openalex.or...","[{'id': 'https://openalex.org/T10825', 'displa...","[{'id': 'https://openalex.org/T10650', 'displa...","[{'id': 'https://openalex.org/C86803240', 'wik...",False,https://api.openalex.org/works?filter=institut...,2025-09-25T10:26:50.738232,2022-02-02,2025-09-28
