In [1]:
from datetime import date
import ast
import pandas as pd

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)
# NOTE(review): `catalog` is not defined anywhere in this notebook; presumably it is injected
# by the runtime (e.g. a Kedro session) — confirm before running on a fresh kernel.
df = catalog.load('raw/openaire/researchproduct')

## Paso 0: Seleccionar columnas con identificador y 'relevantdate'

In [2]:
# Keep only the product identifier and the nested 'relevantdate' payload, then let pandas
# pick the best available nullable dtype for each of the two columns.
df_researchproduct = df[['dri:objIdentifier', 'relevantdate']].convert_dtypes()

In [3]:
# Quick look at the first five rows of the two selected columns.
df_researchproduct.head(5)

Unnamed: 0,dri:objIdentifier,relevantdate
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"[{'@classid': 'created', '@classname': 'create..."
1,doi_________::01bdfeebe25730711be433245a4426e7,"[{'@classid': 'created', '@classname': 'create..."
2,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,"[{'@classid': 'created', '@classname': 'create..."
3,doi_dedup___::000d1effb7dd214508cfa2db791e3814,"[{'@classid': 'created', '@classname': 'create..."
4,doi_dedup___::387324dfb1abac93013eccfd545780c1,"[{'@classid': 'created', '@classname': 'create..."


## Paso 1: Asegurarse de que 'relevantdate' sea una lista (de diccionarios)

In [4]:
def _relevantdate_as_list(value):
    """Parse a stringified Python literal if needed and return the result as a list.

    Strings are evaluated with ast.literal_eval; any other value is used as-is.
    A value that is not already a list is wrapped in a one-element list.
    """
    parsed = ast.literal_eval(value) if isinstance(value, str) else value
    return parsed if isinstance(parsed, list) else [parsed]


# Single pass over the column: string cells are parsed, and anything that is not already a
# list ends up as a one-element list so that every row can be exploded afterwards.
df_researchproduct['relevantdate'] = df_researchproduct['relevantdate'].apply(_relevantdate_as_list)

In [5]:
# Show the frame after parsing; every 'relevantdate' cell is now a list.
df_researchproduct

Unnamed: 0,dri:objIdentifier,relevantdate
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"[{'@classid': 'created', '@classname': 'create..."
1,doi_________::01bdfeebe25730711be433245a4426e7,"[{'@classid': 'created', '@classname': 'create..."
2,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,"[{'@classid': 'created', '@classname': 'create..."
3,doi_dedup___::000d1effb7dd214508cfa2db791e3814,"[{'@classid': 'created', '@classname': 'create..."
4,doi_dedup___::387324dfb1abac93013eccfd545780c1,"[{'@classid': 'created', '@classname': 'create..."
...,...,...
3870,doi_dedup___::a901f8fcb7a8e4fd764da3fdbf42c064,"[{'@classid': 'created', '@classname': 'create..."
3871,doi_dedup___::7d4824281cdafde3676552da1141877a,"[{'@classid': 'created', '@classname': 'create..."
3872,doi_dedup___::acf6451bd9b42e6b944f84d81fe47232,"[{'@classid': 'created', '@classname': 'create..."
3873,doi_dedup___::42f7f1f2a1e586ca7bb4942987b54b71,"[{'@classid': 'created', '@classname': 'create..."


## Paso 2: Aplicar explode a la columna 'relevantdate' y reiniciar el índice

In [6]:
# One output row per entry of each 'relevantdate' list; ignore_index=True renumbers the
# resulting rows 0..n-1 instead of repeating the original row labels.
df_researchproduct = df_researchproduct.explode('relevantdate', ignore_index=True)

In [7]:
# Show the exploded frame: each row now carries a single 'relevantdate' entry.
df_researchproduct

Unnamed: 0,dri:objIdentifier,relevantdate
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'created', '@classname': 'created..."
1,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'published-online', '@classname':..."
2,doi_________::01bdfeebe25730711be433245a4426e7,"{'@classid': 'created', '@classname': 'created..."
3,doi_________::01bdfeebe25730711be433245a4426e7,"{'@classid': 'published-online', '@classname':..."
4,doi_________::01bdfeebe25730711be433245a4426e7,"{'@classid': 'published-print', '@classname': ..."
...,...,...
11362,doi_dedup___::42f7f1f2a1e586ca7bb4942987b54b71,"{'@classid': 'published-online', '@classname':..."
11363,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,"{'@classid': 'created', '@classname': 'created..."
11364,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,"{'@classid': 'published-online', '@classname':..."
11365,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,"{'@classid': 'issued', '@classname': 'issued',..."


## Paso 3: Normalizar la columna 'relevantdate' en nuevas columnas

In [8]:
# Flatten the per-row 'relevantdate' dicts into a table with one column per key.
relevantdate_expanded = pd.json_normalize(data=df_researchproduct.loc[:, "relevantdate"])

In [9]:
# Inspect the flattened columns produced by json_normalize.
relevantdate_expanded

Unnamed: 0,@classid,@classname,@schemeid,@schemename,#text,@inferred,@provenanceaction,@trust
0,created,created,dnet:dataCite_date,dnet:dataCite_date,2022-02-23,,,
1,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2022-02-22,,,
2,created,created,dnet:dataCite_date,dnet:dataCite_date,2024-02-02,,,
3,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2024-02-16,,,
4,published-print,published-print,dnet:dataCite_date,dnet:dataCite_date,2024-05-08,,,
...,...,...,...,...,...,...,...,...
11362,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2014-03-01,,,
11363,created,created,dnet:dataCite_date,dnet:dataCite_date,2017-04-25,,,
11364,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2017-04-17,,,
11365,issued,issued,dnet:dataCite_date,dnet:dataCite_date,2017-04-17,,,


## Paso 4: Concatenar df_researchproduct con relevantdate_expanded, asegurando que los índices estén alineados

In [10]:
# A column-wise concat aligns rows by index label, so mismatched indexes would silently pad
# rows with NaN instead of failing. Verify the alignment up front so that stale or
# out-of-order cell execution is caught here rather than in downstream data.
if not df_researchproduct.index.equals(relevantdate_expanded.index):
    raise ValueError(
        "df_researchproduct and relevantdate_expanded have different indexes; "
        "re-run the explode and json_normalize cells before concatenating."
    )

# Join identifier and flattened fields side by side, then drop the nested column that has
# just been flattened. No in-place mutation, so the cell is safe to re-run.
df_researchproduct2relevantdate = pd.concat(
    [df_researchproduct, relevantdate_expanded],
    axis=1,
).drop(columns='relevantdate')

In [11]:
# Inspect the combined frame: identifier column plus the flattened 'relevantdate' fields.
df_researchproduct2relevantdate

Unnamed: 0,dri:objIdentifier,@classid,@classname,@schemeid,@schemename,#text,@inferred,@provenanceaction,@trust
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,created,created,dnet:dataCite_date,dnet:dataCite_date,2022-02-23,,,
1,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2022-02-22,,,
2,doi_________::01bdfeebe25730711be433245a4426e7,created,created,dnet:dataCite_date,dnet:dataCite_date,2024-02-02,,,
3,doi_________::01bdfeebe25730711be433245a4426e7,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2024-02-16,,,
4,doi_________::01bdfeebe25730711be433245a4426e7,published-print,published-print,dnet:dataCite_date,dnet:dataCite_date,2024-05-08,,,
...,...,...,...,...,...,...,...,...,...
11362,doi_dedup___::42f7f1f2a1e586ca7bb4942987b54b71,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2014-03-01,,,
11363,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,created,created,dnet:dataCite_date,dnet:dataCite_date,2017-04-25,,,
11364,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2017-04-17,,,
11365,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,issued,issued,dnet:dataCite_date,dnet:dataCite_date,2017-04-17,,,


## Paso 5: Agregar la columna 'load_datetime' con la fecha de carga

In [12]:
# Stamp every row with the day on which this cell runs.
# NOTE(review): despite the column name, date.today() is a calendar date with no time
# component — confirm that downstream consumers do not expect a full timestamp.
df_researchproduct2relevantdate = df_researchproduct2relevantdate.assign(
    load_datetime=date.today()
)

In [13]:
# Final check of the frame, now including the load_datetime column.
df_researchproduct2relevantdate

Unnamed: 0,dri:objIdentifier,@classid,@classname,@schemeid,@schemename,#text,@inferred,@provenanceaction,@trust,load_datetime
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,created,created,dnet:dataCite_date,dnet:dataCite_date,2022-02-23,,,,2024-11-28
1,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2022-02-22,,,,2024-11-28
2,doi_________::01bdfeebe25730711be433245a4426e7,created,created,dnet:dataCite_date,dnet:dataCite_date,2024-02-02,,,,2024-11-28
3,doi_________::01bdfeebe25730711be433245a4426e7,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2024-02-16,,,,2024-11-28
4,doi_________::01bdfeebe25730711be433245a4426e7,published-print,published-print,dnet:dataCite_date,dnet:dataCite_date,2024-05-08,,,,2024-11-28
...,...,...,...,...,...,...,...,...,...,...
11362,doi_dedup___::42f7f1f2a1e586ca7bb4942987b54b71,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2014-03-01,,,,2024-11-28
11363,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,created,created,dnet:dataCite_date,dnet:dataCite_date,2017-04-25,,,,2024-11-28
11364,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,published-online,published-online,dnet:dataCite_date,dnet:dataCite_date,2017-04-17,,,,2024-11-28
11365,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,issued,issued,dnet:dataCite_date,dnet:dataCite_date,2017-04-17,,,,2024-11-28
