In [1]:
from datetime import date
import ast
import pandas as pd

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)
# NOTE(review): `catalog` is not defined anywhere in this notebook — presumably
# it is injected by the project's data-catalog tooling before the first cell
# runs; confirm, otherwise this fails on a fresh kernel.
df = catalog.load('raw/openaire/researchproduct')

## Paso 0: Seleccionar las columnas con el identificador y 'subject'

In [2]:
# Keep only the product identifier and its raw 'subject' payload, then let
# pandas pick the best NA-aware dtype for each of the two columns.
selected_columns = ['dri:objIdentifier', 'subject']
df_researchproduct = df[selected_columns].convert_dtypes()

In [3]:
# Preview the first five rows of the identifier/subject selection.
df_researchproduct.head(5)

Unnamed: 0,dri:objIdentifier,subject
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"[{'@classid': 'keyword', '@classname': 'keywor..."
1,doi_________::01bdfeebe25730711be433245a4426e7,
2,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,"[{'@classid': 'FOS', '@classname': 'Fields of ..."
3,doi_dedup___::000d1effb7dd214508cfa2db791e3814,"[{'@classid': 'keyword', '@classname': 'keywor..."
4,doi_dedup___::387324dfb1abac93013eccfd545780c1,"[{'@classid': 'FOS', '@classname': 'Fields of ..."


## Paso 1: Asegurarse de que 'subject' sea siempre una lista (de diccionarios)

In [4]:
def _as_subject_list(raw):
    """Return a 'subject' cell value as a list.

    String values are first parsed as Python literals. Whatever results (or
    the untouched value, when it was not a string) is returned as-is if it is
    already a list, and wrapped in a one-element list otherwise — this
    includes missing values.
    """
    parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return parsed if isinstance(parsed, list) else [parsed]


# Normalise every cell in a single pass so the column can be exploded later.
df_researchproduct['subject'] = df_researchproduct['subject'].apply(_as_subject_list)

In [5]:
# Inspect the parsed column: every cell should now hold a list (rows without a
# subject show a list wrapping the missing value).
df_researchproduct

Unnamed: 0,dri:objIdentifier,subject
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"[{'@classid': 'keyword', '@classname': 'keywor..."
1,doi_________::01bdfeebe25730711be433245a4426e7,[<NA>]
2,doi_dedup___::fccba469c4dbc14c5cec34e69ab21625,"[{'@classid': 'FOS', '@classname': 'Fields of ..."
3,doi_dedup___::000d1effb7dd214508cfa2db791e3814,"[{'@classid': 'keyword', '@classname': 'keywor..."
4,doi_dedup___::387324dfb1abac93013eccfd545780c1,"[{'@classid': 'FOS', '@classname': 'Fields of ..."
...,...,...
3870,doi_dedup___::a901f8fcb7a8e4fd764da3fdbf42c064,"[{'@classid': 'keyword', '@classname': 'keywor..."
3871,doi_dedup___::7d4824281cdafde3676552da1141877a,"[{'@classid': 'keyword', '@classname': 'keywor..."
3872,doi_dedup___::acf6451bd9b42e6b944f84d81fe47232,"[{'@classid': 'keyword', '@classname': 'keywor..."
3873,doi_dedup___::42f7f1f2a1e586ca7bb4942987b54b71,"[{'@classid': 'keyword', '@classname': 'keywor..."


## Paso 2: Expandir (explode) la columna 'subject' y reiniciar el índice

In [6]:
# Turn each element of the 'subject' lists into its own row (the identifier is
# repeated), then replace the duplicated index labels with a fresh 0..n-1 range.
# NOTE(review): the frame is rebound under the same name, so re-running this
# cell feeds already-exploded rows back into explode() — confirm that is
# harmless, or restart the kernel before re-running.
subject_rows = df_researchproduct.explode('subject')
df_researchproduct = subject_rows.reset_index(drop=True)

In [7]:
# Inspect the exploded frame: one row per subject entry, with the identifier
# repeated for each entry of the same product.
df_researchproduct

Unnamed: 0,dri:objIdentifier,subject
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'keyword', '@classname': 'keyword..."
1,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'keyword', '@classname': 'keyword..."
2,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'keyword', '@classname': 'keyword..."
3,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'keyword', '@classname': 'keyword..."
4,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,"{'@classid': 'keyword', '@classname': 'keyword..."
...,...,...
52137,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,"{'@classid': 'keyword', '@classname': 'keyword..."
52138,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,"{'@classid': 'keyword', '@classname': 'keyword..."
52139,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,"{'@classid': 'keyword', '@classname': 'keyword..."
52140,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,"{'@classid': 'keyword', '@classname': 'keyword..."


## Paso 3: Normalizar la columna 'subject' en nuevas columnas

In [8]:
# Flatten each subject entry into columns, one column per key found in the
# entries.
subject_entries = df_researchproduct["subject"]
subject_expanded = pd.json_normalize(subject_entries)

In [9]:
# Inspect the flattened subject attributes (one column per key of the entries).
subject_expanded

Unnamed: 0,@classid,@classname,@schemeid,@schemename,@inferred,@provenanceaction,@trust,#text,@inferenceprovenance
0,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,quantum phase transitions,
1,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,exactly solvable models,
2,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Physics,
3,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,QC1-999,
4,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Física,
...,...,...,...,...,...,...,...,...,...
52137,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,,,,Physical Sciences,
52138,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Subterranean habits,
52139,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,,,,Arts and Humanities,
52140,keyword,keyword,dnet:result_subject,dnet:result_subject,false,sysimport:crosswalk:repository,0.9,CEMETERY,


## Paso 4: Concatenar df_researchproduct con subject_expanded, asegurando que los índices estén alineados

In [10]:
# pd.concat(axis=1) aligns rows on index labels, so a mismatch between the two
# frames would silently produce NaN-padded rows instead of an error; fail fast.
assert df_researchproduct.index.equals(subject_expanded.index), (
    "df_researchproduct and subject_expanded must share the same index before concatenation"
)

# Put the identifiers side by side with the flattened subject attributes, then
# drop the raw 'subject' payload that those columns replace. The chained drop
# builds the result in one expression instead of mutating it in place.
df_researchproduct2subject = (
    pd.concat([df_researchproduct, subject_expanded], axis=1)
    .drop(columns='subject')
)

In [11]:
# Inspect the combined frame: identifier followed by the flattened subject
# columns.
df_researchproduct2subject

Unnamed: 0,dri:objIdentifier,@classid,@classname,@schemeid,@schemename,@inferred,@provenanceaction,@trust,#text,@inferenceprovenance
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,quantum phase transitions,
1,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,exactly solvable models,
2,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Physics,
3,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,QC1-999,
4,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Física,
...,...,...,...,...,...,...,...,...,...,...
52137,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,,,,Physical Sciences,
52138,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Subterranean habits,
52139,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,,,,Arts and Humanities,
52140,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:result_subject,dnet:result_subject,false,sysimport:crosswalk:repository,0.9,CEMETERY,


## Paso 5: Agregar la columna 'load_datetime' con la fecha de carga

In [12]:
# Stamp every row with the day the load ran. date.today() yields a calendar
# date with no time-of-day component, despite the column's name.
load_date = date.today()
df_researchproduct2subject['load_datetime'] = load_date

In [13]:
# Inspect the final frame, now including the load_datetime column.
df_researchproduct2subject

Unnamed: 0,dri:objIdentifier,@classid,@classname,@schemeid,@schemename,@inferred,@provenanceaction,@trust,#text,@inferenceprovenance,load_datetime
0,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,quantum phase transitions,,2024-11-28
1,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,exactly solvable models,,2024-11-28
2,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Physics,,2024-11-28
3,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,QC1-999,,2024-11-28
4,doi_dedup___::2577d684a839dc78be3e1307914cdfd5,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Física,,2024-11-28
...,...,...,...,...,...,...,...,...,...,...,...
52137,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,,,,Physical Sciences,,2024-11-28
52138,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,false,sysimport:crosswalk:repository,0.9,Subterranean habits,,2024-11-28
52139,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:subject_classification_typologies,dnet:subject_classification_typologies,,,,Arts and Humanities,,2024-11-28
52140,doi_dedup___::10fb361856b54e85f1dd2a296b90d1ad,keyword,keyword,dnet:result_subject,dnet:result_subject,false,sysimport:crosswalk:repository,0.9,CEMETERY,,2024-11-28
