In [27]:
import pandas as pd

In [28]:
df = pd.read_excel("base_maestra_final.xlsx", sheet_name='maestra')

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3143 entries, 0 to 3142
Data columns (total 67 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              3143 non-null   int64  
 1   FOLIO                           3143 non-null   int64  
 2   NOMBRE_COMPLETO                 3143 non-null   object 
 3   portal_del_investigador         3143 non-null   int64  
 4   enlace_pdi                      3143 non-null   object 
 5   ETNIA                           3143 non-null   object 
 6   ESTADO BECA                     3143 non-null   object 
 7   ANO_INICIO_BECA                 3143 non-null   int64  
 8   TIPO_BECA                       3143 non-null   object 
 9   AREA_OCDE                       3143 non-null   object 
 10  PROGRAMA                        3143 non-null   object 
 11  PAIS_POSGRADO                   3143 non-null   object 
 12  RBD_ESTABLECIMIENTO             31

In [30]:
import pandas as pd, numpy as np, hashlib

# copia de trabajo
dfx = df.copy()

def norm(x):
    if pd.isna(x): 
        return ""
    s = str(x).strip().lower()
    return " ".join(s.split())

def make_id(label, parts_tuple):
    key = label + "|" + "|".join([norm(p) for p in parts_tuple])
    return hashlib.md5(key.encode("utf-8")).hexdigest()


In [31]:
def clean_author_id(v):
    if pd.isna(v): 
        return ""
    try:
        return str(int(float(v)))
    except Exception:
        s = str(v)
        return s[:-2] if s.endswith(".0") else s

dfx["author_id_clean"] = dfx["author_id"].apply(clean_author_id)


In [32]:
becario_cols = [
    "ID","FOLIO","NOMBRE_COMPLETO","ETNIA","ESTADO BECA","ANO_INICIO_BECA",
    "TIPO_BECA","ES_MUJER","SEXO_REGISTRADO","AREA_OCDE","PROGRAMA",
    "portal_del_investigador","enlace_pdi","tiene_scopus"
]

becario_df = (
    dfx[becario_cols]
    .drop_duplicates(subset=["ID"])
    .rename(columns={"ID":"becario_id"})
    .reset_index(drop=True)
)

becario_df.head(3)


Unnamed: 0,becario_id,FOLIO,NOMBRE_COMPLETO,ETNIA,ESTADO BECA,ANO_INICIO_BECA,TIPO_BECA,ES_MUJER,SEXO_REGISTRADO,AREA_OCDE,PROGRAMA,portal_del_investigador,enlace_pdi,tiene_scopus
0,1,72160230,ABELINO ENRIQUE JIMENEZ GAJARDO,NO,NO VIGENTE,2015,BECAS CHILE,0,MASCULINO,Ingeniería y Tecnología,PHD IN ELECTRICAL AND COMPUTER ENGINEERING,0,sin_informacion,1
1,2,21150571,ABRAHAM BENJAMIN NOVOA LAGOS,NO,NO VIGENTE,2015,CONICYT,0,MASCULINO,HUMANIDADES,DOCTORADO EN LINGUISTICA,0,sin_informacion,1
2,3,72170568,ACCEL NICOLAS ABARCA PROUZA,NO,NO VIGENTE,2016,BECAS CHILE,0,MASCULINO,INGENIERÍA Y TECNOLOGÍA,PH.D. IN ELECTRICAL ENGINEERING,0,sin_informacion,1


In [33]:
sec_key_cols = ["RBD_ESTABLECIMIENTO","NOMBRE_ESTABLECIMIENTO"]
sec_props = sec_key_cols + ["COMUNA_ESTABLECIMIENTO","REGION_ESTABLECIMIENTO",
                            "GSE_ESTABLECIMIENTO","TIPO_ESTABLECIMIENTO","ES_EMBLEMATICO"]

# nodo
tmp = dfx[sec_props].copy()
tmp["__key__"] = tmp[sec_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
tmp = tmp[tmp["__key__"].apply(lambda t: any(t))]
tmp["secundaria_id"] = tmp["__key__"].apply(lambda t: make_id("Secundaria", t))
secundaria_df = tmp.drop(columns="__key__").drop_duplicates(subset=["secundaria_id"]).reset_index(drop=True)

# relación becario—secundaria
rel = dfx[["ID"] + sec_key_cols].copy()
rel["__key__"] = rel[sec_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
rel = rel[rel["__key__"].apply(lambda t: any(t))]
rel["secundaria_id"] = rel["__key__"].apply(lambda t: make_id("Secundaria", t))
bec_sec_df = rel[["ID","secundaria_id"]].drop_duplicates().rename(columns={"ID":"becario_id"}).reset_index(drop=True)

secundaria_df.head(3), bec_sec_df.head(3)


(  RBD_ESTABLECIMIENTO               NOMBRE_ESTABLECIMIENTO  \
 0                8748  COLEGIO INSTITUTO ALONSO DE ERCILLA   
 1                4824             LICEO INDUSTRIAL DE TOME   
 2               11962  COLEGIO PEDRO DE VALDIVIA-AGUSTINAS   
 
   COMUNA_ESTABLECIMIENTO REGION_ESTABLECIMIENTO GSE_ESTABLECIMIENTO  \
 0               Santiago   Región Metropolitana                Alto   
 1                   Tomé      Región del Biobío                Bajo   
 2               Santiago   Región Metropolitana                Alto   
 
                                 TIPO_ESTABLECIMIENTO ES_EMBLEMATICO  \
 0  Particular Subvencionado con Financiamiento Co...              0   
 1                                          Municipal              0   
 2  Particular Subvencionado con Financiamiento Co...              0   
 
                       secundaria_id  
 0  a305dcf71ce9592c70063a2a54246a87  
 1  e314ba167db25c26e48b284d58dd6d19  
 2  75ebeecc68df66c9b959b4301666b901  ,
    becar

In [34]:
pre_key_cols = ["UNIVERSIDAD_PREGRADO","REGION_UNIVERSIDAD_PREGRADO",
                "ACREDITACION_PREGRADO","ANIOS_ACREDITACION_PREGRADO","PREGRADO_ELITE"]
pre_props = pre_key_cols

# nodo
tmp = dfx[pre_props].copy()
tmp["__key__"] = tmp[pre_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
tmp = tmp[tmp["__key__"].apply(lambda t: any(t))]
tmp["pregrado_id"] = tmp["__key__"].apply(lambda t: make_id("Pregrado", t))
pregrado_df = tmp.drop(columns="__key__").drop_duplicates(subset=["pregrado_id"]).reset_index(drop=True)

# relación
rel = dfx[["ID"] + pre_key_cols].copy()
rel["__key__"] = rel[pre_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
rel = rel[rel["__key__"].apply(lambda t: any(t))]
rel["pregrado_id"] = rel["__key__"].apply(lambda t: make_id("Pregrado", t))
bec_pre_df = rel[["ID","pregrado_id"]].drop_duplicates().rename(columns={"ID":"becario_id"}).reset_index(drop=True)

pregrado_df.head(3), bec_pre_df.head(3)


(           UNIVERSIDAD_PREGRADO REGION_UNIVERSIDAD_PREGRADO  \
 0          Universidad De Chile        Región Metropolitana   
 1     Universidad De Concepcion           Región del Biobío   
 2  Universidad Austral De Chile          Región de Los Ríos   
 
   ACREDITACION_PREGRADO ANIOS_ACREDITACION_PREGRADO PREGRADO_ELITE  \
 0        Acreditacion 5                           5              1   
 1         Investigación                           7              0   
 2            Excelencia                           6              0   
 
                         pregrado_id  
 0  a76a87e7bed2bcb9fadfca27b6d9737c  
 1  b632e55875a8ef772c2bc9d6d8e32546  
 2  75e89aaf5a2368c8d22a8d840067f9cc  ,
    becario_id                       pregrado_id
 0           1  a76a87e7bed2bcb9fadfca27b6d9737c
 1           2  b632e55875a8ef772c2bc9d6d8e32546
 2           3  a76a87e7bed2bcb9fadfca27b6d9737c)

In [35]:
pos_key_cols = ["UNIVERSIDAD_POSGRADO","PAIS_POSGRADO","AREA_OCDE","PROGRAMA"]
pos_props = pos_key_cols

# nodo
tmp = dfx[pos_props].copy()
tmp["__key__"] = tmp[pos_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
tmp = tmp[tmp["__key__"].apply(lambda t: any(t))]
tmp["posgrado_id"] = tmp["__key__"].apply(lambda t: make_id("Posgrado", t))
posgrado_df = tmp.drop(columns="__key__").drop_duplicates(subset=["posgrado_id"]).reset_index(drop=True)

# relación
rel = dfx[["ID"] + pos_key_cols].copy()
rel["__key__"] = rel[pos_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
rel = rel[rel["__key__"].apply(lambda t: any(t))]
rel["posgrado_id"] = rel["__key__"].apply(lambda t: make_id("Posgrado", t))
bec_pos_df = rel[["ID","posgrado_id"]].drop_duplicates().rename(columns={"ID":"becario_id"}).reset_index(drop=True)

posgrado_df.head(3), bec_pos_df.head(3)


(            UNIVERSIDAD_POSGRADO   PAIS_POSGRADO                AREA_OCDE  \
 0     CARNEGIE MELLON UNIVERSITY  Estados Unidos  Ingeniería y Tecnología   
 1      UNIVERSIDAD DE CONCEPCION           CHILE              HUMANIDADES   
 2  TECHNISCHE UNIVERSITEIT DELFT    PAÍSES BAJOS  INGENIERÍA Y TECNOLOGÍA   
 
                                      PROGRAMA  \
 0  PHD IN ELECTRICAL AND COMPUTER ENGINEERING   
 1                    DOCTORADO EN LINGUISTICA   
 2             PH.D. IN ELECTRICAL ENGINEERING   
 
                         posgrado_id  
 0  ed204a027ddc2a740ccb50ed4255ae57  
 1  30d29f013f1040a9fc7b5d44bf7151e3  
 2  4dbe25076e9aae6b5056c62d9c93db39  ,
    becario_id                       posgrado_id
 0           1  ed204a027ddc2a740ccb50ed4255ae57
 1           2  30d29f013f1040a9fc7b5d44bf7151e3
 2           3  4dbe25076e9aae6b5056c62d9c93db39)

In [36]:
sh_key_cols = ["UNIVERSIDAD_RANKING_SHANGHAI","PAIS_RANKING_SHANGHAI",
               "RANK_SHANGHAI","RANK_NACIONAL_SHANGHAI","TOP_100_SHANGHAI"]
sh_props = sh_key_cols

# nodo
tmp = dfx[sh_props].copy()
tmp["__key__"] = tmp[sh_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
tmp = tmp[tmp["__key__"].apply(lambda t: any(t))]
tmp["shanghai_rank_id"] = tmp["__key__"].apply(lambda t: make_id("ShanghaiRank", t))
shanghai_rank_df = tmp.drop(columns="__key__").drop_duplicates(subset=["shanghai_rank_id"]).reset_index(drop=True)

# link posgrado—shanghai (por fila; se deduplica)
pos_tmp = dfx[pos_key_cols].copy()
pos_tmp["pos_key"] = pos_tmp[pos_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
pos_tmp["posgrado_id"] = pos_tmp["pos_key"].apply(lambda t: make_id("Posgrado", t))

sh_tmp = dfx[sh_key_cols].copy()
sh_tmp["sh_key"] = sh_tmp[sh_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
sh_tmp["shanghai_rank_id"] = sh_tmp["sh_key"].apply(lambda t: make_id("ShanghaiRank", t) if any(t) else "")

pos_sh_df = pd.concat([pos_tmp["posgrado_id"], sh_tmp["shanghai_rank_id"]], axis=1)
pos_sh_df = pos_sh_df[pos_sh_df["shanghai_rank_id"]!=""].drop_duplicates().reset_index(drop=True)

shanghai_rank_df.head(3), pos_sh_df.head(3)


(     UNIVERSIDAD_RANKING_SHANGHAI PAIS_RANKING_SHANGHAI    RANK_SHANGHAI  \
 0      Carnegie Mellon University                    us          101-150   
 1       Universidad De Concepcion       sin_informacion  sin_informacion   
 2  Delft University of Technology                    nl          151-200   
 
   RANK_NACIONAL_SHANGHAI TOP_100_SHANGHAI                  shanghai_rank_id  
 0                  39-50                0  6bde7a32b02a72a66948e74d761bb13c  
 1        sin_informacion  sin_informacion  e7b26eeed9f50118898efec585f0d493  
 2                    7-9                0  0e64000816cc3fa0b24010447c18f3b7  ,
                         posgrado_id                  shanghai_rank_id
 0  ed204a027ddc2a740ccb50ed4255ae57  6bde7a32b02a72a66948e74d761bb13c
 1  30d29f013f1040a9fc7b5d44bf7151e3  e7b26eeed9f50118898efec585f0d493
 2  4dbe25076e9aae6b5056c62d9c93db39  0e64000816cc3fa0b24010447c18f3b7)

In [37]:
qs_key_cols = ["UNIVERSIDAD_RANKING_QS","PAIS_RANKING_QS",
               "RANK_QS_2024","RANK_QS_2025","RANK_GLOBAL_QS","SCORE_OVERALL_QS"]
qs_props = qs_key_cols

# nodo
tmp = dfx[qs_props].copy()
tmp["__key__"] = tmp[qs_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
tmp = tmp[tmp["__key__"].apply(lambda t: any(t))]
tmp["qs_rank_id"] = tmp["__key__"].apply(lambda t: make_id("QSRank", t))
qs_rank_df = tmp.drop(columns="__key__").drop_duplicates(subset=["qs_rank_id"]).reset_index(drop=True)

# link posgrado—qs
qs_tmp = dfx[qs_key_cols].copy()
qs_tmp["qs_key"] = qs_tmp[qs_key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
qs_tmp["qs_rank_id"] = qs_tmp["qs_key"].apply(lambda t: make_id("QSRank", t) if any(t) else "")

pos_qs_df = pd.concat([pos_tmp["posgrado_id"], qs_tmp["qs_rank_id"]], axis=1)
pos_qs_df = pos_qs_df[pos_qs_df["qs_rank_id"]!=""].drop_duplicates().reset_index(drop=True)

qs_rank_df.head(3), pos_qs_df.head(3)


(           UNIVERSIDAD_RANKING_QS PAIS_RANKING_QS RANK_QS_2024 RANK_QS_2025  \
 0      Carnegie Mellon University   United States           52           58   
 1       Universidad de Concepción           Chile      601-610      631-640   
 2  Delft University of Technology     Netherlands           47           49   
 
   RANK_GLOBAL_QS SCORE_OVERALL_QS                        qs_rank_id  
 0           492=               72  72cfec2a05829f6afab366a378d7073a  
 1           336=                -  c49abcf62660e4fe3c9006dbc7f3bf44  
 2             14               77  cacf227a84de748674078e90557aa36a  ,
                         posgrado_id                        qs_rank_id
 0  ed204a027ddc2a740ccb50ed4255ae57  72cfec2a05829f6afab366a378d7073a
 1  30d29f013f1040a9fc7b5d44bf7151e3  c49abcf62660e4fe3c9006dbc7f3bf44
 2  4dbe25076e9aae6b5056c62d9c93db39  cacf227a84de748674078e90557aa36a)

In [38]:
bec_aut_df = (
    dfx.loc[dfx["author_id_clean"]!="", ["ID","author_id_clean"]]
    .drop_duplicates()
    .rename(columns={"ID":"becario_id","author_id_clean":"author_id"})
    .reset_index(drop=True)
)

bec_aut_df.head(3)


Unnamed: 0,becario_id,author_id
0,1,43461349700
1,2,57217866202
2,3,55546553200


In [39]:
summary = {
    "n_becario": len(becario_df),
    "n_secundaria": len(secundaria_df),
    "n_rel_bec_sec": len(bec_sec_df),
    "n_pregrado": len(pregrado_df),
    "n_rel_bec_pre": len(bec_pre_df),
    "n_posgrado": len(posgrado_df),
    "n_rel_bec_pos": len(bec_pos_df),
    "n_shanghai": len(shanghai_rank_df),
    "n_rel_pos_sh": len(pos_sh_df),
    "n_qs": len(qs_rank_df),
    "n_rel_pos_qs": len(pos_qs_df),
    "n_rel_bec_author": len(bec_aut_df),
}
summary


{'n_becario': 3143,
 'n_secundaria': 863,
 'n_rel_bec_sec': 3143,
 'n_pregrado': 252,
 'n_rel_bec_pre': 3143,
 'n_posgrado': 1345,
 'n_rel_bec_pos': 3143,
 'n_shanghai': 301,
 'n_rel_pos_sh': 1345,
 'n_qs': 265,
 'n_rel_pos_qs': 1345,
 'n_rel_bec_author': 2704}

In [40]:
secundaria_df.duplicated(subset=["secundaria_id"]).sum(), secundaria_df.isna().mean()
pregrado_df.duplicated(subset=["pregrado_id"]).sum(), pregrado_df.isna().mean()
posgrado_df.duplicated(subset=["posgrado_id"]).sum(), posgrado_df.isna().mean()


(0,
 UNIVERSIDAD_POSGRADO    0.0
 PAIS_POSGRADO           0.0
 AREA_OCDE               0.0
 PROGRAMA                0.0
 posgrado_id             0.0
 dtype: float64)

In [41]:
print("Shanghai con TOP_100=Sí:", (shanghai_rank_df["TOP_100_SHANGHAI"]=="Sí").sum())
print("QS con TOP_100_QS=1:", (df["TOP_100_QS"]==1).sum())


Shanghai con TOP_100=Sí: 0
QS con TOP_100_QS=1: 997


In [42]:
df["TOP_100_SHANGHAI"].value_counts(dropna=False).head(20)


TOP_100_SHANGHAI
0                  1665
sin_informacion    1033
1                   445
Name: count, dtype: int64

In [44]:
def to_bin_top100(x):
    if pd.isna(x): 
        return 0
    s = str(x).strip().lower()
    # normaliza tildes y variantes usuales
    s = (s.replace("sí", "si")
           .replace("verdadero", "true")
           .replace("verdad", "true"))
    yes = {"si", "sí", "true", "t", "y", "yes", "1", "top100", "top_100", "x"}
    no  = {"no", "false", "f", "n", "0", "sin_informacion"}
    if s in yes: 
        return 1
    if s in no:  
        return 0
    # intenta parsear números
    try:
        return 1 if float(s) == 1.0 else 0
    except Exception:
        # por defecto, considera no-top (mejor conservador)
        return 0

df["TOP_100_SHANGHAI_BIN"] = df["TOP_100_SHANGHAI"].apply(to_bin_top100)
df["TOP_100_SHANGHAI_BIN"].value_counts()


TOP_100_SHANGHAI_BIN
0    2698
1     445
Name: count, dtype: int64

In [45]:
print("Shanghai TOP_100 (bin):", int(df["TOP_100_SHANGHAI_BIN"].sum()))
print("QS TOP_100 (int):", int((df["TOP_100_QS"]==1).sum()))


Shanghai TOP_100 (bin): 445
QS TOP_100 (int): 997


In [46]:
# une por clave canónica reconstruida (misma que usamos para crear IDs)
tmp = df[[
    "UNIVERSIDAD_RANKING_SHANGHAI","PAIS_RANKING_SHANGHAI",
    "RANK_SHANGHAI","RANK_NACIONAL_SHANGHAI","TOP_100_SHANGHAI_BIN"
]].copy()

# reconstruye la clave canónica y el id (igual que antes)
def norm(x):
    if pd.isna(x): return ""
    s = str(x).strip().lower()
    return " ".join(s.split())

import hashlib
def make_id(label, parts):
    key = label + "|" + "|".join([norm(p) for p in parts])
    return hashlib.md5(key.encode("utf-8")).hexdigest()

key_cols = ["UNIVERSIDAD_RANKING_SHANGHAI","PAIS_RANKING_SHANGHAI","RANK_SHANGHAI","RANK_NACIONAL_SHANGHAI","TOP_100_SHANGHAI"]
tmp["__key__"] = df[key_cols].apply(lambda r: tuple(norm(x) for x in r), axis=1)
tmp["shanghai_rank_id"] = tmp["__key__"].apply(lambda t: make_id("ShanghaiRank", t))

# agregamos el binario por id
top100_bin_by_id = tmp.groupby("shanghai_rank_id", as_index=False)["TOP_100_SHANGHAI_BIN"].max()

# añade/actualiza columna en el nodo
shanghai_rank_df = shanghai_rank_df.merge(top100_bin_by_id, on="shanghai_rank_id", how="left").fillna({"TOP_100_SHANGHAI_BIN":0}).astype({"TOP_100_SHANGHAI_BIN":"int64"})
shanghai_rank_df.head(3)


Unnamed: 0,UNIVERSIDAD_RANKING_SHANGHAI,PAIS_RANKING_SHANGHAI,RANK_SHANGHAI,RANK_NACIONAL_SHANGHAI,TOP_100_SHANGHAI,shanghai_rank_id,TOP_100_SHANGHAI_BIN
0,Carnegie Mellon University,us,101-150,39-50,0,6bde7a32b02a72a66948e74d761bb13c,0
1,Universidad De Concepcion,sin_informacion,sin_informacion,sin_informacion,sin_informacion,e7b26eeed9f50118898efec585f0d493,0
2,Delft University of Technology,nl,151-200,7-9,0,0e64000816cc3fa0b24010447c18f3b7,0


In [47]:
df["TOP_100_SHANGHAI_BY_RANK"] = pd.to_numeric(df["RANK_SHANGHAI"], errors="coerce").fillna(1e9).astype(int) <= 100
df["TOP_100_SHANGHAI_BY_RANK"].sum()


445

In [49]:
becario_df.to_csv("exports/becario.csv", index=False)
secundaria_df.to_csv("exports/secundaria.csv", index=False)
pregrado_df.to_csv("exports/pregrado.csv", index=False)
posgrado_df.to_csv("exports/posgrado.csv", index=False)
shanghai_rank_df.to_csv("exports/shanghai_rank.csv", index=False)
qs_rank_df.to_csv("exports/qs_rank.csv", index=False)
bec_sec_df.to_csv("exports/id_becario__id_secundaria.csv", index=False)
bec_pre_df.to_csv("exports/id_becario__id_pregrado.csv", index=False)
bec_pos_df.to_csv("exports/id_becario__id_posgrado.csv", index=False)
pos_sh_df.to_csv("exports/id_posgrado__id_shanghai_rank.csv", index=False)
pos_qs_df.to_csv("exports/id_posgrado__id_qs_rank.csv", index=False)
bec_aut_df.to_csv("exports/id_becario__author_id.csv", index=False)

In [55]:
df_abstract = pd.read_csv(r"C:\Users\Rodrigo\Desktop\tesis_magister\bases_de_datos\scopus\new\abstracts.csv")
df_authors = pd.read_csv(r"C:\Users\Rodrigo\Desktop\tesis_magister\bases_de_datos\scopus\new\authors.csv")

In [57]:
df_abstract.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28953 entries, 0 to 28952
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   author_id          28953 non-null  int64  
 1   scopus_id          28904 non-null  float64
 2   title              28438 non-null  object 
 3   abstract           27568 non-null  object 
 4   keywords           22165 non-null  object 
 5   year               28438 non-null  float64
 6   month              28438 non-null  float64
 7   day                28438 non-null  float64
 8   grant_year         26195 non-null  float64
 9   diff_year          25761 non-null  float64
 10  period             25761 non-null  object 
 11  publicationname    28437 non-null  object 
 12  aggtype            28435 non-null  object 
 13  subtype            28438 non-null  object 
 14  publicationname.1  28437 non-null  object 
 15  citedbycount       28438 non-null  float64
 16  doi                271

In [58]:
df_authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2714 entries, 0 to 2713
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   author_id                 2714 non-null   int64  
 1   indexed_name              2714 non-null   object 
 2   grant_year                2706 non-null   float64
 3   first_publication_year    2714 non-null   int64  
 4   last_publication_year     2714 non-null   int64  
 5   document_count            2714 non-null   int64  
 6   cited_by_count            2714 non-null   int64  
 7   coauthor_count            2557 non-null   float64
 8   h-index                   2707 non-null   float64
 9   subject_areas             2713 non-null   object 
 10  affiliation_display_name  2256 non-null   object 
 11  affiliation_parent_id     2257 non-null   float64
 12  affiliation_parent_name   2256 non-null   object 
 13  affiliation_id            1250 non-null   float64
 14  affiliat