In [1]:
import pandas as pd
from datetime import datetime
from rich import print
from rich.pretty import Pretty

# Never truncate columns when rendering DataFrames in this notebook.
pd.options.display.max_columns = None

In [2]:
# Load the raw OpenAlex works dataset through the data catalog.
# NOTE(review): `catalog` is not defined anywhere in this notebook; presumably it is
# injected by the session that launches it (e.g. a Kedro Jupyter kernel). Confirm,
# otherwise this cell fails on a fresh kernel.
df_work_raw = catalog.load('raw/openalex/work_dev#parquet')

**Estructura de Authorships**

In [3]:
from rich import print, pretty

# Take the first authorship record of the first work and pretty-print it
# to inspect the nested structure.
first_authorship = df_work_raw.loc[:, 'authorships'].iat[0][0]
print(pretty.Pretty(first_authorship, expand_all=False))

# Transformaciones

In [4]:
# Keep only the work identifier and its nested authorships, let pandas pick
# the best available dtypes, and expose the identifier as `work_id`.
df_work2authorships = (
    df_work_raw[['id', 'authorships']]
    .convert_dtypes()
    .rename(columns={"id": "work_id"})
)

In [5]:
# Preview: one row per work, with the authorships still nested as a list.
df_work2authorships.head(3)

Unnamed: 0,work_id,authorships
0,https://openalex.org/W4211046286,[{'affiliations': [{'institution_ids': array([...
1,https://openalex.org/W4292403095,[{'affiliations': [{'institution_ids': array([...
2,https://openalex.org/W2501864044,[{'affiliations': [{'institution_ids': array([...


In [6]:
# Expand the authorships list: one row per (work, authorship) pair.
# ignore_index=True resets the index to 0..n-1, so the index-based join with the
# normalized authorship frame further down lines up row by row.
df_work2authorships_exploded = df_work2authorships.explode('authorships', ignore_index=True)

In [7]:
# Preview: the same work_id now repeats once per authorship.
df_work2authorships_exploded.head(3)

Unnamed: 0,work_id,authorships
0,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...
1,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...
2,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...


In [8]:
# Flatten every authorship dict into columns (nested keys become dotted
# column names) and expose the author identifier as `author_id`.
df_authorships_norm = (
    pd.json_normalize(df_work2authorships_exploded['authorships'])
    .rename(columns={"author.id": "author_id"})
)

In [9]:
# Preview the flattened authorship fields.
df_authorships_norm.head(3)

Unnamed: 0,affiliations,author_position,countries,institutions,is_corresponding,raw_affiliation_strings,raw_author_name,author.display_name,author_id,author.orcid
0,[{'institution_ids': ['https://openalex.org/I3...,first,[US],"[{'country_code': 'US', 'display_name': 'Unive...",False,"[University at Albany, 1400 Washington Ave, Al...",The ATLAS Collaboration,the Atlas Collaboration,https://openalex.org/A5107845465,
1,[{'institution_ids': ['https://openalex.org/I2...,middle,[FR],"[{'country_code': 'FR', 'display_name': 'Aix-M...",False,"[CPPM, Aix-Marseille Université, CNRS/IN2P3, M...",G Aad,G. Aad,https://openalex.org/A5100769897,https://orcid.org/0000-0002-6665-4934
2,[{'institution_ids': ['https://openalex.org/I4...,middle,[TR],"[{'country_code': 'TR', 'display_name': 'Boğaz...",False,"[Faculty of Sciences, Department of Physics, B...",E Abat,E. Abat,https://openalex.org/A5086185112,


In [10]:
# Attach work_id to the flattened authorship fields (index-aligned join).
# NOTE(review): this rebinds `df_work2authorships`, which earlier held the
# two-column (work_id, authorships) frame. After this cell the name no longer has an
# `authorships` column, so re-running the explode cell above would fail. Consider a
# distinct name.
df_work2authorships = df_work2authorships_exploded[['work_id']].join(df_authorships_norm)

In [11]:
# Preview: work_id alongside the flattened authorship fields.
df_work2authorships.head(3)

Unnamed: 0,work_id,affiliations,author_position,countries,institutions,is_corresponding,raw_affiliation_strings,raw_author_name,author.display_name,author_id,author.orcid
0,https://openalex.org/W4211046286,[{'institution_ids': ['https://openalex.org/I3...,first,[US],"[{'country_code': 'US', 'display_name': 'Unive...",False,"[University at Albany, 1400 Washington Ave, Al...",The ATLAS Collaboration,the Atlas Collaboration,https://openalex.org/A5107845465,
1,https://openalex.org/W4211046286,[{'institution_ids': ['https://openalex.org/I2...,middle,[FR],"[{'country_code': 'FR', 'display_name': 'Aix-M...",False,"[CPPM, Aix-Marseille Université, CNRS/IN2P3, M...",G Aad,G. Aad,https://openalex.org/A5100769897,https://orcid.org/0000-0002-6665-4934
2,https://openalex.org/W4211046286,[{'institution_ids': ['https://openalex.org/I4...,middle,[TR],"[{'country_code': 'TR', 'display_name': 'Boğaz...",False,"[Faculty of Sciences, Department of Physics, B...",E Abat,E. Abat,https://openalex.org/A5086185112,


In [None]:
# Extract the work-author relation, keeping the author's position within the work.
df_work2author = df_work2authorships[['work_id', 'author_id', 'author_position']]

In [None]:
# Preview the work-author relation.
df_work2author.head(3)

In [None]:
# Expand the list of institutions attached to each authorship:
# one row per (work, author, institution). ignore_index=True resets the index to
# 0..n-1 so the joins with the normalized institution frame below align row by row.
df_work2institution_exploded = df_work2authorships.explode('institutions', ignore_index=True)

In [None]:
# Preview the frame after exploding the institutions list.
df_work2institution_exploded.head(3)

In [None]:
# Flatten every institution dict into columns and discard `lineage` when it
# exists (errors='ignore' turns the drop into a no-op if the column is absent).
df_institution_norm = (
    pd.json_normalize(df_work2institution_exploded['institutions'])
    .drop(columns=['lineage'], errors='ignore')
)

In [None]:
# Preview the flattened institution fields.
df_institution_norm.head(3)

In [None]:
# Pair each author_id with the flattened institution fields (index-aligned join).
# NOTE(review): rows come from the per-work explosion, so the same author-institution
# pair may repeat across works. Confirm whether downstream expects duplicates.
df_author2institution = df_work2institution_exploded[['author_id']].join(df_institution_norm)

In [None]:
# Preview the author-institution relation.
df_author2institution.head(3)

In [None]:
# Pair each work_id with the flattened institution fields (index-aligned join).
# NOTE(review): a work lists an institution once per affiliated author, so the same
# work-institution pair may repeat. Confirm whether downstream expects duplicates.
df_work2institution = df_work2institution_exploded[['work_id']].join(df_institution_norm)

In [None]:
# Preview the work-institution relation.
df_work2institution.head(3)

# Nodo

In [None]:
def land_openalex_work_authorships(df_work_raw):
    """Flatten the nested ``authorships`` of OpenAlex works into three relation tables.

    Every work carries a list of authorship records. Each record is flattened
    with ``pd.json_normalize`` (so the nested author identifier arrives as
    ``author.id`` and is renamed to ``author_id``), and its ``institutions``
    list is exploded and flattened the same way.

    Parameters
    ----------
    df_work_raw : pandas.DataFrame
        Raw works. Must contain the columns ``id`` and ``authorships``;
        ``authorships`` holds a list-like of dicts per work. The input frame
        is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_work2author, df_work2institution, df_author2institution)``:

        * ``df_work2author``: ``work_id``, ``author_id``, ``author_position``.
        * ``df_work2institution``: ``work_id`` plus the flattened institution
          fields (``lineage`` removed when present).
        * ``df_author2institution``: ``author_id`` plus the same institution
          fields.

        All three frames carry a ``load_datetime`` column holding one shared
        timestamp taken once per call.

    Raises
    ------
    KeyError
        If ``id`` or ``authorships`` is missing from the input, or if the
        flattened authorships lack ``author_id``, ``author_position`` or
        ``institutions``.
    """
    # One timestamp for the whole batch: the three outputs belong to the same
    # load, and separate datetime.today() calls would stamp them differently.
    load_datetime = datetime.today()

    # Keep the work identifier and its authorships; expose the id as `work_id`.
    works = (
        df_work_raw[['id', 'authorships']]
        .convert_dtypes()
        .rename(columns={"id": "work_id"})
    )

    # One row per (work, authorship). The index is reset to 0..n-1 so that the
    # index-based join with the normalized frame below aligns row by row.
    works_exploded = works.explode('authorships', ignore_index=True)

    # Flatten each authorship dict into columns (nested keys become dotted names).
    authorship_fields = pd.json_normalize(works_exploded['authorships']).rename(
        columns={"author.id": "author_id"}
    )
    df_work2authorships = works_exploded[['work_id']].join(authorship_fields)

    # assign() builds a new frame instead of writing into a column subset of
    # df_work2authorships, which is what raised SettingWithCopyWarning before.
    df_work2author = df_work2authorships[
        ['work_id', 'author_id', 'author_position']
    ].assign(load_datetime=load_datetime)

    # One row per (work, author, institution), again with a fresh 0..n-1 index.
    institutions_exploded = df_work2authorships.explode('institutions', ignore_index=True)

    # Flatten each institution dict; `lineage` is discarded when it exists.
    institution_fields = pd.json_normalize(institutions_exploded['institutions']).drop(
        columns=['lineage'], errors='ignore'
    )

    df_work2institution = (
        institutions_exploded[['work_id']]
        .join(institution_fields)
        .assign(load_datetime=load_datetime)
    )
    df_author2institution = (
        institutions_exploded[['author_id']]
        .join(institution_fields)
        .assign(load_datetime=load_datetime)
    )

    return df_work2author, df_work2institution, df_author2institution


## Ejecuto Nodo

In [None]:
# Run the node on the raw works. This rebinds df_work2author, df_work2institution and
# df_author2institution (built step by step above) to the node's outputs.
df_work2author, df_work2institution, df_author2institution = land_openalex_work_authorships(df_work_raw)

# Resultados

In [None]:
# Show the work-author relation returned by the node.
df_work2author

In [None]:
# Show the work-institution relation returned by the node.
df_work2institution

In [None]:
# Show the author-institution relation returned by the node.
df_author2institution