In [1]:
from datetime import date
import pandas as pd

# Show every column when a DataFrame is rendered (no horizontal truncation).
pd.options.display.max_columns = None

# `catalog` is not defined in this notebook; it is expected to be provided by
# the notebook environment before this cell runs.
# The two parameters name the filter used for the fetch and its value; the
# last entry is the raw research-product table.
filter_param = catalog.load('params:openaire_fetch_options.filter_param')
filter_value = catalog.load('params:openaire_fetch_options.filter_value')
df = catalog.load('raw/openaire/researchproduct#parquet')

In [2]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185373 entries, 0 to 185372
Data columns (total 29 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   author              185305 non-null  object
 1   openAccessColor     53613 non-null   object
 2   publiclyFunded      121560 non-null  object
 3   type                185373 non-null  object
 4   language            185373 non-null  object
 5   subjects            183336 non-null  object
 6   mainTitle           185373 non-null  object
 7   description         183327 non-null  object
 8   publicationDate     185361 non-null  object
 9   publisher           76578 non-null   object
 10  source              68904 non-null   object
 11  format              170917 non-null  object
 12  bestAccessRight     99741 non-null   object
 13  container           48356 non-null   object
 14  id                  185373 non-null  object
 15  originalId          185373 non-null  object
 16  in

In [3]:
# List every column name available in the raw frame.
df.columns


Index(['author', 'openAccessColor', 'publiclyFunded', 'type', 'language',
       'subjects', 'mainTitle', 'description', 'publicationDate', 'publisher',
       'source', 'format', 'bestAccessRight', 'container', 'id', 'originalId',
       'indicators', 'instance', 'isGreen', 'isInDiamondJournal',
       'contributor', 'pid', 'country', 'contactPerson', 'embargoEndDate',
       'coverage', 'version', 'size', 'filter_applied'],
      dtype='object')

## Paso 1: Convertir tipos y seleccionar las columnas con cardinalidad 1 respecto a cada research product
Más información: https://graph.openaire.eu/docs/data-model/entities/research-product

In [4]:
def land_openaire_researchproduct(filter_param, filter_value, df: pd.DataFrame) -> pd.DataFrame:
    """Build the flat, one-row-per-research-product landing table.

    The function keeps the scalar attributes of each research product,
    expands the ``language`` and ``bestAccessRight`` structs into
    ``*_code`` / ``*_label`` / ``*_scheme`` columns, flattens ``indicators``
    with ``pd.json_normalize`` and stamps every row with the load date and
    the filter that produced the extract.

    Args:
        filter_param: Name of the column that records which filter was used.
        filter_value: Value written into that column for every row.
        df: Raw research-product frame. It is NOT modified; columns missing
            from it are treated as entirely null.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index.
    """

    def _struct_field(value, key):
        """Return ``value[key]`` for a dict struct, ``None`` for anything else.

        Missing structs may show up as ``None``, ``NaN`` or ``pd.NA``; testing
        the type (rather than truthiness) avoids the "boolean value of NA is
        ambiguous" error that ``pd.NA`` raises.
        """
        if isinstance(value, dict):
            return value.get(key)
        return None

    # Columns read from the raw frame (each listed once).
    expected_columns = [
        'id',
        'openAccessColor',
        'publiclyFunded',
        'type',
        'language',
        'country',
        'mainTitle',
        'description',
        'publicationDate',
        'format',
        'bestAccessRight',
        'indicators',
        'isGreen',
        'isInDiamondJournal',
        'publisher',
        'source',
        'container',
        'contributor',
        'contactPerson',
        'coverage',
        'embargoEndDate',
    ]

    # Normalise the index BEFORE anything is derived from the frame so that
    # every intermediate object aligns row by row. reset_index also returns a
    # new frame, so the columns added below never leak into the caller's df.
    df = df.reset_index(drop=True)

    # Add the columns that are absent from this extract, filled with NA.
    for col in expected_columns:
        if col not in df.columns:
            df[col] = pd.NA

    df = df.convert_dtypes()

    df_researchproduct = df[expected_columns].copy()

    # language
    language = df_researchproduct['language']
    df_researchproduct['language_code'] = language.apply(lambda x: _struct_field(x, 'code'))
    df_researchproduct['language_label'] = language.apply(lambda x: _struct_field(x, 'label'))

    # bestAccessRight
    access_right = df_researchproduct['bestAccessRight']
    df_researchproduct['bestAccessRight_label'] = access_right.apply(lambda x: _struct_field(x, 'label'))
    df_researchproduct['bestAccessRight_scheme'] = access_right.apply(lambda x: _struct_field(x, 'scheme'))

    # indicators: one flattened column per nested key ("parent.child").
    # NOTE(review): the displayed output also contains a bare 'usageCounts'
    # column; presumably some records carry usageCounts = null - confirm
    # whether that column should be kept downstream.
    df_indicators = pd.json_normalize(df['indicators']).reset_index(drop=True)

    indicators_expected_columns = [
        "citationImpact.citationClass",
        "citationImpact.citationCount",
        "citationImpact.impulse",
        "citationImpact.impulseClass",
        "citationImpact.influence",
        "citationImpact.influenceClass",
        "citationImpact.popularity",
        "citationImpact.popularityClass",
        "usageCounts.downloads",
        "usageCounts.views",
    ]

    # Guarantee a stable schema even when no record carries a given indicator.
    for col in indicators_expected_columns:
        if col not in df_indicators.columns:
            df_indicators[col] = pd.NA

    df_researchproduct = pd.concat(
        [df_researchproduct.drop(columns=['indicators']), df_indicators],
        axis=1,
    )

    # TODO country
    # TODO description
    # TODO format
    # TODO instance
    # TODO source
    # TODO container
    # TODO contributor
    # TODO contactPerson
    # TODO coverage

    # Drop the columns that were expanded above or are handled in other frames.
    df_researchproduct = df_researchproduct.drop(columns=[
        'country', 'bestAccessRight',
        'language', 'format',
        'container', 'source', 'description',
        'contributor', 'contactPerson', 'coverage',
    ])

    # NOTE: despite the column name, this stores a date (no time component).
    df_researchproduct['load_datetime'] = date.today()

    df_researchproduct[filter_param] = filter_value

    return df_researchproduct


In [5]:
# Build the landing table from the raw frame and the fetch filter loaded in the first cell.
df_researchproduct = land_openaire_researchproduct(filter_param, filter_value, df)

In [6]:
# Display the landed frame (pandas shows only the first and last rows).
df_researchproduct

Unnamed: 0,id,openAccessColor,publiclyFunded,type,mainTitle,publicationDate,isGreen,isInDiamondJournal,publisher,embargoEndDate,language_code,language_label,bestAccessRight_label,bestAccessRight_scheme,usageCounts,citationImpact.citationClass,citationImpact.citationCount,citationImpact.impulse,citationImpact.impulseClass,citationImpact.influence,citationImpact.influenceClass,citationImpact.popularity,citationImpact.popularityClass,usageCounts.downloads,usageCounts.views,load_datetime,relOrganizationId
0,4dc99724cf04::95ea5df70a451a0487e051faa6c0a646,gold,False,publication,Variability in the growth rates of Saanen kids...,2023-12-18,False,False,"Universidad Nacional Mayor de San Marcos, Facu...",,spa,Spanish; Castilian,OPEN,http://vocabularies.coar-repositories.org/docu...,,C5,0.0,0.0,C5,2.841867e-09,C5,2.885067e-09,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
1,RECOLECTA___::24cb4438d1afe299e63cfdea4a31911f,,False,publication,Estudio de la viabilidad del algoritmo super-t...,2013-01-01,True,False,,,Español,Español,OPEN,http://vocabularies.coar-repositories.org/docu...,,C5,0.0,0.0,C5,2.841867e-09,C5,7.596338e-10,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
2,core_ac_uk__::a391293fe8bd6c9c6c203ee15e8c2e8f,,False,publication,The Three Hundred project: The gas disruption ...,2021-03-01,True,False,Oxford University Press (OUP),,eng,English,OPEN,http://vocabularies.coar-repositories.org/docu...,,C5,0.0,0.0,C5,2.841867e-09,C5,2.155773e-09,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
3,dedup_wf_002::00004d08998bcb6f35ce5d53811f12e9,,False,publication,Evolución del sistema de gestión de incidentes...,2023-06-23,True,False,,,esl/spa,Spanish,OPEN,http://vocabularies.coar-repositories.org/docu...,,C5,0.0,0.0,C5,2.841867e-09,C5,2.885067e-09,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
4,dedup_wf_002::0000eb33b6be05d67799615dd117e5b2,gold,False,publication,Responsabilidad Social Corporativa en Empresas...,2013-12-01,False,False,Universidad Nacional de La Plata,,eng,English,OPEN,http://vocabularies.coar-repositories.org/docu...,,C5,0.0,0.0,C5,2.841867e-09,C5,7.596338e-10,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185368,portalunbbra::1a507fabf255873c7efa9a67b3d081a5,bronze,False,publication,"PATRONES MUSICALES, ESQUEMAS, Y METÁFORAS DE S...",2014-12-13,False,False,Programa de Pós-Graduação em Música da Univers...,,spa,Spanish; Castilian,,,,C5,0.0,0.0,C5,2.841867e-09,C5,8.529256e-10,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
185369,portalunbbra::d22e22c3f5f77ddd0f037c0a1da23a9b,bronze,False,publication,South Korea´s expanding regional and global ro...,2013-12-17,False,False,Revista do CEAM,,por,Portuguese,,,,C5,0.0,0.0,C5,2.841867e-09,C5,7.596338e-10,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
185370,portalunbbra::fe22079ac12e4b48e955751af4ce027b,gold,False,publication,Interview with Javier Gorrais,2019-04-30,False,False,Programa de Pós-Graduação em Estudos da Traduç...,,spa,Spanish; Castilian,OPEN,http://vocabularies.coar-repositories.org/docu...,,C5,0.0,0.0,C5,2.841867e-09,C5,1.626197e-09,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
185371,revistasumft::106b3b4f6115356de22e7aaf07960b34,gold,False,publication,Deconstructing the classic categories of an et...,2023-12-31,False,False,PPGAS/UFMT,,por,Portuguese,OPEN,http://vocabularies.coar-repositories.org/docu...,,C5,0.0,0.0,C5,2.841867e-09,C5,2.885067e-09,C5,,,2025-02-28,openorgs____::40b9f835648a3e0d057d6917dd7e54d5
