In [1]:
from datetime import date
from pandas import json_normalize
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw OpenAlex works dataset through `catalog`.
# NOTE(review): `catalog` is not defined anywhere in this notebook — presumably a
# Kedro DataCatalog injected into the kernel; confirm before running on a bare kernel.
df_work_raw = catalog.load('raw/openalex/work#parquet')

# Pruebas

In [3]:
# Let pandas pick its best-fit nullable dtypes, then keep only the work
# identifier and the nested authorship records.
df_work = df_work_raw.convert_dtypes().loc[:, ['id', 'authorships']]

In [4]:
# Check dtypes and non-null counts of the two retained columns.
df_work.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 24
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           150 non-null    string
 1   authorships  150 non-null    object
dtypes: object(1), string(1)
memory usage: 3.5+ KB


In [5]:
# Preview the frame; `authorships` still holds one nested list per work.
df_work

Unnamed: 0,id,authorships
0,https://openalex.org/W4211046286,[{'affiliations': [{'institution_ids': array([...
1,https://openalex.org/W4292403095,[{'affiliations': [{'institution_ids': array([...
2,https://openalex.org/W2098406858,[{'affiliations': [{'institution_ids': array([...
3,https://openalex.org/W2501864044,[{'affiliations': [{'institution_ids': array([...
4,https://openalex.org/W2012331775,"[{'affiliations': [], 'author': {'display_name..."
...,...,...
20,https://openalex.org/W2972599699,[{'affiliations': [{'institution_ids': array([...
21,https://openalex.org/W3024722698,[{'affiliations': [{'institution_ids': array([...
22,https://openalex.org/W2965771657,[{'affiliations': [{'institution_ids': array([...
23,https://openalex.org/W4200079077,[{'affiliations': [{'institution_ids': array([...


In [6]:
df_work_authorship = (
    df_work
    .explode('authorships')     # one row per entry of each work's authorship list
    .reset_index(drop=True)     # exploded rows repeat the source index label
)

In [7]:
# Preview the exploded frame: the work `id` now repeats once per authorship.
df_work_authorship

Unnamed: 0,id,authorships
0,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...
1,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...
2,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...
3,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...
4,https://openalex.org/W4211046286,{'affiliations': [{'institution_ids': ['https:...
...,...,...
6060,https://openalex.org/W2285488823,{'affiliations': [{'institution_ids': ['https:...
6061,https://openalex.org/W2285488823,{'affiliations': [{'institution_ids': ['https:...
6062,https://openalex.org/W2285488823,{'affiliations': [{'institution_ids': ['https:...
6063,https://openalex.org/W2285488823,{'affiliations': [{'institution_ids': ['https:...


In [8]:
def land_work2authorship_openalex(df_work_raw):
    """Flatten OpenAlex works into one row per work / author / institution.

    Each work's ``authorships`` list is exploded to one row per authorship and
    the authorship dicts are flattened with ``json_normalize``. Each
    authorship's ``institutions`` list is then exploded and flattened the same
    way, with its fields prefixed ``institution_``. An authorship whose
    ``institutions`` list is empty is kept as a single row with missing
    ``institution_*`` values.

    Parameters
    ----------
    df_work_raw : pandas.DataFrame
        Must contain an ``id`` column and an ``authorships`` column holding
        list-likes of authorship dicts. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: ``work_id``, ``author_id``,
        ``author_display_name``, ``author_orcid``, ``author_position``,
        ``is_corresponding``, ``institution_id``, ``institution_display_name``,
        ``institution_ror``, ``institution_type``, ``institution_country_code``,
        ``load_datetime``. A field that no record in the batch carries is
        returned as an all-missing column instead of raising ``KeyError``.
        ``load_datetime`` is ``date.today()``: a calendar date with no
        time-of-day component, despite the column name.
    """
    output_columns = [
        'id',
        'author_id',
        'author_display_name',
        'author_orcid',
        'author_position',
        'is_corresponding',
        'institution_id',
        'institution_display_name',
        'institution_ror',
        'institution_type',
        'institution_country_code',
    ]

    df_work = df_work_raw.convert_dtypes()
    df_work = df_work.loc[:, ['id', 'authorships']]

    # explode() repeats the source index label on every produced row; reset to
    # a unique positional index so the column-wise concat below lines up.
    df_work_authorship = df_work.explode('authorships').reset_index(drop=True)

    # Flatten the authorship dicts while keeping them next to their work 'id'.
    df_work_authorship = pd.concat(
        [df_work_authorship['id'], json_normalize(df_work_authorship['authorships'])],
        axis=1,
    )

    # remove affiliations.
    # https://docs.openalex.org/api-entities/works/work-object/authorship-object#affiliations
    #   "This information will be redundant with institutions below, but is useful if you need to know about what we used to match institutions."
    # errors='ignore': the key may be absent from every record in the batch.
    df_work_authorship = df_work_authorship.drop(columns=['affiliations'], errors='ignore')

    # Guarantee the column exists so the explode below cannot raise KeyError
    # when no authorship in the batch carried an 'institutions' key.
    if 'institutions' not in df_work_authorship.columns:
        df_work_authorship = df_work_authorship.assign(institutions=None)

    df_work_authorship = df_work_authorship.explode('institutions').reset_index(drop=True)
    df_institution = json_normalize(df_work_authorship['institutions']).add_prefix('institution_')
    df_work_authorship = pd.concat(
        [df_work_authorship.drop(columns=['institutions']), df_institution],
        axis=1,
    )

    # json_normalize joins nested keys with '.'; regex=False makes the dot a
    # literal on every pandas version (as a regex it would match any character).
    df_work_authorship.columns = df_work_authorship.columns.str.replace('.', '_', regex=False)

    # reindex/rename/assign each return a new frame, so nothing is written
    # into a column-sliced copy (the SettingWithCopyWarning pattern).
    return (
        df_work_authorship
        .reindex(columns=output_columns)
        .rename(columns={'id': 'work_id'})
        .assign(load_datetime=date.today())
    )

In [9]:
# Run the full landing transform on the raw works and display the result.
stage_work_authorship = land_work2authorship_openalex(df_work_raw)
stage_work_authorship

Unnamed: 0,work_id,author_id,author_display_name,author_orcid,author_position,is_corresponding,institution_id,institution_display_name,institution_ror,institution_type,institution_country_code,load_datetime
0,https://openalex.org/W4211046286,https://openalex.org/A5107845465,the Atlas Collaboration,,first,False,https://openalex.org/I392282,"University at Albany, State University of New ...",https://ror.org/012zs8222,education,US,2024-10-15
1,https://openalex.org/W4211046286,https://openalex.org/A5100769897,G. Aad,https://orcid.org/0000-0002-6665-4934,middle,False,https://openalex.org/I21491767,Aix-Marseille Université,https://ror.org/035xkbk20,education,FR,2024-10-15
2,https://openalex.org/W4211046286,https://openalex.org/A5100769897,G. Aad,https://orcid.org/0000-0002-6665-4934,middle,False,https://openalex.org/I1294671590,Centre National de la Recherche Scientifique,https://ror.org/02feahw73,government,FR,2024-10-15
3,https://openalex.org/W4211046286,https://openalex.org/A5086185112,E. Abat,,middle,False,https://openalex.org/I4405392,Boğaziçi University,https://ror.org/03z9tma90,education,TR,2024-10-15
4,https://openalex.org/W4211046286,https://openalex.org/A5107827585,J. Abdallah,,middle,False,https://openalex.org/I4210101901,Instituto de Física Corpuscular,https://ror.org/017xch102,facility,ES,2024-10-15
...,...,...,...,...,...,...,...,...,...,...,...,...
8291,https://openalex.org/W2285488823,https://openalex.org/A5090678076,Agapi Dima,https://orcid.org/0000-0001-6086-5725,middle,False,https://openalex.org/I174878644,University of Patras,https://ror.org/017wvtq80,education,GR,2024-10-15
8292,https://openalex.org/W2285488823,https://openalex.org/A5090972193,Αντωνία Τέρπου,https://orcid.org/0000-0002-9956-5727,middle,False,https://openalex.org/I174878644,University of Patras,https://ror.org/017wvtq80,education,GR,2024-10-15
8293,https://openalex.org/W2285488823,https://openalex.org/A5001029451,Αθανάσιος Α. Κουτίνας,https://orcid.org/0000-0001-7825-130X,middle,False,https://openalex.org/I174878644,University of Patras,https://ror.org/017wvtq80,education,GR,2024-10-15
8294,https://openalex.org/W2285488823,https://openalex.org/A5067522738,Guillermo R. Castro,https://orcid.org/0000-0002-6187-7805,last,True,https://openalex.org/I151201029,Consejo Nacional de Investigaciones Científica...,https://ror.org/03cqe8w59,government,AR,2024-10-15
