In [1]:
import ast
import pandas as pd

# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.options.display.max_columns = None

# NOTE(review): `catalog` is not defined anywhere in this notebook; it is
# presumably injected by the project's data-catalog tooling — confirm.
df = catalog.load('raw/openaire/researchproduct')

In [2]:
# Let pandas re-infer the best dtype for every column of the loaded frame.
df = df.convert_dtypes()

## Paso 1: Seleccionar columnas con identificador y pid

In [3]:
# Keep only the product identifier and its raw 'pid' payload.
columns_to_keep = ['dri:objIdentifier', 'pid']
df_researchproduct = df.loc[:, columns_to_keep]

In [4]:
# Preview the first five rows (the default for head) of the selected columns.
df_researchproduct.head()

Unnamed: 0,dri:objIdentifier,pid
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'doi', '@classname': 'Digital Obj..."
1,doi_________::01bdfeebe25730711be433245a4426e7,"{'@classid': 'doi', '@classname': 'Digital Obj..."
2,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,"[{'@classid': 'doi', '@classname': 'Digital Ob..."
3,doi_dedup___::000d1effb7dd214508cfa2db791e3814,"[{'@classid': 'doi', '@classname': 'Digital Ob..."
4,doi_dedup___::387324dfb1abac93013eccfd545780c1,"[{'@classid': 'doi', '@classname': 'Digital Ob..."


## Paso 2: Asegurarse de que 'pid' sea siempre una lista de diccionarios

In [5]:
def _as_pid_list(raw_pid):
    """Return one cell of the 'pid' column as a list.

    Strings are first parsed as Python literals; whatever value results is
    returned unchanged when it is already a list and wrapped in a
    single-element list otherwise.
    """
    parsed = ast.literal_eval(raw_pid) if isinstance(raw_pid, str) else raw_pid
    return parsed if isinstance(parsed, list) else [parsed]


# Normalize every cell in one pass so the column holds lists only.
df_researchproduct['pid'] = df_researchproduct['pid'].apply(_as_pid_list)

In [6]:
# Preview the first three rows to check that 'pid' now holds lists.
df_researchproduct.head(3)

Unnamed: 0,dri:objIdentifier,pid
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"[{'@classid': 'doi', '@classname': 'Digital Ob..."
1,doi_________::01bdfeebe25730711be433245a4426e7,"[{'@classid': 'doi', '@classname': 'Digital Ob..."
2,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,"[{'@classid': 'doi', '@classname': 'Digital Ob..."


## Paso 3: Expandir (explode) la columna 'pid' y reiniciar el índice

In [7]:
# Turn each element of a 'pid' list into its own row, then renumber the rows
# 0..n-1 so that later positional/label alignment is unambiguous.
# NOTE(review): the result is assigned back to the same name, so re-running this
# cell on its own operates on the already-exploded frame — confirm that is harmless.
df_researchproduct = (
    df_researchproduct
    .explode('pid')
    .reset_index(drop=True)
)

## Paso 4: Normalizar la columna 'pid' en nuevas columnas

In [8]:
# Flatten each 'pid' record into its own set of columns (one row per record).
# The index of df_researchproduct was reset in the explode step, which is what
# lets these rows be lined up with it again in the later concat.
pid_expanded = pd.json_normalize(df_researchproduct["pid"])

## Paso 5: Eliminar columnas no deseadas de 'pid'

In [9]:
# Attributes of each pid record that are not needed downstream; the columns that
# remain afterwards (e.g. '@classid' and '#text') are the ones kept.
unwanted_pid_columns = [
    '@classname',
    '@schemeid',
    '@schemename',
    '@inferred',
    '@provenanceaction',
    '@trust',
]

# Rebind instead of mutating in place, and ignore labels that are not present:
# the cell can then be re-run without a KeyError and also tolerates data in
# which some of these optional attributes never appear.
pid_expanded = pid_expanded.drop(columns=unwanted_pid_columns, errors='ignore')

## Paso 6: Concatenar asegurando que los índices estén alineados

In [10]:
# Put the identifier columns side by side with the flattened pid fields (rows
# are matched by index label), then discard the now-redundant raw 'pid' column.
df_researchproduct2pid = pd.concat(
    [df_researchproduct, pid_expanded],
    axis=1,
).drop(columns='pid')

In [11]:
# Display the resulting table: one row per (product identifier, pid entry) pair.
df_researchproduct2pid

Unnamed: 0,dri:objIdentifier,@classid,#text
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,doi,10.3390/quantum4010009
1,doi_________::01bdfeebe25730711be433245a4426e7,doi,10.1002/cctc.202301719
2,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,doi,10.1016/j.asr.2020.07.008
3,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,mag_id,3042553916
4,doi_dedup___::000d1effb7dd214508cfa2db791e3814,doi,10.24267/22564004.544
...,...,...,...
10767,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,doi,10.15560/13.2.2096
10768,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,doi,10.60692/mqntz-0n998
10769,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,doi,10.60692/4rqe5-dsr28
10770,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,mag_id,2607177089
