In [1]:
from datetime import date
import pandas as pd

# Display settings for this notebook: never hide columns, and keep long frames
# collapsed to a short preview.
pd.options.display.max_columns = None
pd.options.display.max_rows = 5

# Raw OpenAIRE research-product dataset, read through the data catalog.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the project's Jupyter/IPython integration — confirm.
df = catalog.load('raw/openaire/researchproduct_dev#parquet')

In [2]:
def _pick_load_dt(df: pd.DataFrame):
    # Si hay una sola fecha en el batch, usala; si hay varias, quedate con la más reciente;
    # si no hay, hoy.
    if 'load_datetime' not in df.columns or df['load_datetime'].isna().all():
        return date.today()
    vals = df['load_datetime'].dropna()
    if vals.nunique() == 1:
        return vals.iloc[0]
    return pd.to_datetime(vals).max().date()

In [3]:
# Keep only the product id and its subjects; rows missing either value are
# dropped. Chained (non in-place) so the intermediate slice is never mutated.
df_research_subjects = df.loc[:, ['id', 'subjects']].dropna()

In [4]:
# Preview the (id, subjects) frame: one row per research product at this point.
df_research_subjects

Unnamed: 0,id,subjects
0,4dc99724cf04::319dc88111c9b2d6021228590e79130a,"[{'provenance': None, 'subject': {'scheme': 'k..."
1,4dc99724cf04::95ea5df70a451a0487e051faa6c0a646,"[{'provenance': None, 'subject': {'scheme': 'k..."
2,4dc99724cf04::fd4ac9e4a4b67e70441e5d033c144e1e,"[{'provenance': None, 'subject': {'scheme': 'k..."
3,RECOLECTA___::24cb4438d1afe299e63cfdea4a31911f,"[{'provenance': None, 'subject': {'scheme': 'k..."
4,RECOLECTA___::c5f9bd7b9739f4c5f064595b286d2e01,"[{'provenance': None, 'subject': {'scheme': 'k..."
...,...,...
995,dedup_wf_002::037a11b75e1662b0beb84c3b97f47ce1,"[{'provenance': None, 'subject': {'scheme': 'k..."
996,dedup_wf_002::037a8383ee10ffcebea2e3102e3fa6ce,"[{'provenance': None, 'subject': {'scheme': 'k..."
997,dedup_wf_002::037ba1ff4a8a10f7948e078bb3d6efb6,"[{'provenance': None, 'subject': {'scheme': 'k..."
998,dedup_wf_002::037c3a3ae3212005fc63e98d8378595c,"[{'provenance': None, 'subject': {'scheme': 'k..."


In [5]:
# One row per element of each product's `subjects`, with a fresh 0..n-1 index.
df_research_subjects = df_research_subjects.explode('subjects', ignore_index=True)

In [6]:
# Preview after exploding: the same id now repeats once per subject entry.
df_research_subjects

Unnamed: 0,id,subjects
0,4dc99724cf04::319dc88111c9b2d6021228590e79130a,"{'provenance': None, 'subject': {'scheme': 'ke..."
1,4dc99724cf04::319dc88111c9b2d6021228590e79130a,"{'provenance': None, 'subject': {'scheme': 'ke..."
2,4dc99724cf04::319dc88111c9b2d6021228590e79130a,"{'provenance': None, 'subject': {'scheme': 'ke..."
3,4dc99724cf04::319dc88111c9b2d6021228590e79130a,"{'provenance': None, 'subject': {'scheme': 'ke..."
4,4dc99724cf04::319dc88111c9b2d6021228590e79130a,"{'provenance': None, 'subject': {'scheme': 'ke..."
...,...,...
7522,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,"{'provenance': None, 'subject': {'scheme': 'ke..."
7523,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,"{'provenance': None, 'subject': {'scheme': 'ke..."
7524,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,"{'provenance': None, 'subject': {'scheme': 'ke..."
7525,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,"{'provenance': None, 'subject': {'scheme': 'ke..."


In [7]:
# Flatten every subject dict into dotted columns (e.g. `subject.scheme`), then
# place the product id next to them; both pieces share the same row positions.
df_subjects = pd.json_normalize(df_research_subjects['subjects'])
df_research_subjects = pd.concat(
    [df_research_subjects['id'], df_subjects],
    axis='columns',
)

In [8]:
# Preview the flattened result: id plus the columns produced from each subject dict.
df_research_subjects

Unnamed: 0,id,provenance,subject.scheme,subject.value
0,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,pigeons
1,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,polinizadores
2,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,dicistrovirus
3,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,pollinators
4,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,palomas
...,...,...,...,...
7522,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,International relations
7523,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,Political science
7524,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,Relaciones Internacionales
7525,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,JZ2-6530


## Paso 1: Desanido los subjects (cardinalidad N con respecto a cada research product) y aplano sus campos en columnas
Más info en https://graph.openaire.eu/docs/data-model/entities/research-product

In [9]:
def openaire_land_researchproduct_subjects(df: pd.DataFrame)-> pd.DataFrame:
    """Build the research-product/subject landing table.

    Takes the ``id`` and ``subjects`` columns of the raw research-product
    frame, emits one row per entry of ``subjects``, and flattens each entry
    into columns with ``pd.json_normalize``. Every output row is stamped with
    the load date chosen by ``_pick_load_dt`` for the batch.

    The input frame is not modified.

    Args:
        df: Raw research products. Must contain ``id`` and ``subjects``;
            ``load_datetime`` is optional and only used to pick the load date.

    Returns:
        A new frame with ``id``, the flattened subject columns and
        ``load_datetime``, indexed 0..n-1. Input rows where ``id`` or
        ``subjects`` is null do not contribute any output row.

    Raises:
        KeyError: if ``id`` or ``subjects`` is missing from ``df``.
    """
    load_dt = _pick_load_dt(df)

    # Non in-place chain: select, drop incomplete rows, then one row per subject
    # with a clean positional index so the later column-wise concat lines up.
    exploded = (
        df.loc[:, ['id', 'subjects']]
        .dropna()
        .explode('subjects', ignore_index=True)
    )

    subject_fields = pd.json_normalize(exploded['subjects'])
    landed = pd.concat([exploded['id'], subject_fields], axis=1)

    landed['load_datetime'] = load_dt

    return landed


In [10]:
# Run the packaged transformation on the raw frame; this rebinds
# df_research_subjects to the function's result.
df_research_subjects = openaire_land_researchproduct_subjects(df)

In [11]:
# Preview the function output: same columns as the step-by-step result plus load_datetime.
df_research_subjects

Unnamed: 0,id,provenance,subject.scheme,subject.value,load_datetime
0,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,pigeons,2025-09-05
1,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,polinizadores,2025-09-05
2,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,dicistrovirus,2025-09-05
3,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,pollinators,2025-09-05
4,4dc99724cf04::319dc88111c9b2d6021228590e79130a,,keyword,palomas,2025-09-05
...,...,...,...,...,...
7522,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,International relations,2025-09-05
7523,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,Political science,2025-09-05
7524,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,Relaciones Internacionales,2025-09-05
7525,dedup_wf_002::037c9fce98a79decd202e1afba75b99e,,keyword,JZ2-6530,2025-09-05
