In [1]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import aquarel as aq
from ineqpy.inequality import gini
import scipy.stats as st
import pyalex as alex
alex.config.email = "noah0roussel01980@gmail.com"
works = pl.read_csv("../data/works/works_q1.csv")
works = works.filter(pl.col("year") != 2025)

year_begin = 1920
year_end = 2024
works = works.with_columns(
    age=2025 - pl.col("year")
)


In [None]:
cbc_per_year = (
    works
    .select(
        [pl.col("year"), pl.col("cited_by_count"), pl.col("title")]
    )
    .group_by(
        pl.col("year"),
        maintain_order=True
    )
    .agg(
        pl.col("cited_by_count").mean().name.prefix("mean_")
    )
) 

works = works.join(
    cbc_per_year,
    on="year",
    how="left"
) 

works = (
    works
    .with_columns(
        mncs = pl.col("cited_by_count") / pl.col("mean_cited_by_count")
    )
    .drop("mean_cited_by_count")
)
cols_authors = [col for col in works.columns if col.startswith("author_")]

works = works.with_columns(
    authors_count = sum(
        [pl.col(col).is_not_null().cast(pl.Int8) for col in cols_authors]
    )

)

works = works.with_columns(
    title = pl.col("title").fill_null("").str.to_lowercase(),
    abstract = pl.col("abstract").fill_null("").str.to_lowercase()
)

works = works.with_columns( 
    review = (
        pl.col("title").str.contains("review") |
        pl.col("abstract").str.contains("review")
    ),
    meta_analysis = (
        pl.col("title").str.contains("meta[\u00AD-]?analysis") |
        pl.col("abstract").str.contains("meta[\u00AD-]?analysis")
    )
)

works = works.with_columns([
    pl.when(pl.col("countries_distinct_count").is_null() | pl.col("countries_distinct_count").is_nan() | (pl.col("countries_distinct_count") == 0))
      .then(1)
      .otherwise(pl.col("countries_distinct_count"))
      .alias("countries_distinct_count")
])
works = works.with_columns([
    pl.when(pl.col("institutions_distinct_count").is_null() | pl.col("institutions_distinct_count").is_nan() | (pl.col("institutions_distinct_count") == 0))
      .then(1)
      .otherwise(pl.col("institutions_distinct_count"))
      .alias("institutions_distinct_count")
])

works

title,year,cited_by_count,countries_distinct_count,institutions_distinct_count,citation_normalized_percentile,primary_topic,keywords,concepts,referenced_works_count,referenced_works,abstract,abstract_inverted_index,journal,author_1,author_2,author_3,institution_1,institution_2,institution_3,country_1,country_2,country_3,cited_by_count_2025,cited_by_count_2024,cited_by_count_2023,cited_by_count_2022,cited_by_count_2021,cited_by_count_2020,cited_by_count_2019,cited_by_count_2018,cited_by_count_2017,cited_by_count_2016,cited_by_count_2015,cited_by_count_2014,cited_by_count_2013,cited_by_count_2012,…,country_239,country_240,country_241,country_242,country_243,country_244,country_245,country_246,country_247,country_248,country_249,country_250,country_251,country_252,country_253,country_254,country_255,country_256,country_257,country_258,country_259,country_260,country_261,country_262,country_263,country_264,country_265,country_266,country_267,country_268,country_269,country_270,age,mncs,authors_count,review,meta_analysis
str,i64,i64,i64,i64,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,i8,bool,bool
"""ros stress in cancer cells and…",2004,1881,1,1,"""{'value': 0.816836, 'is_in_top…","""Redox biology and oxidative st…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",158,"""['https://openalex.org/W116540…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50102407…","""https://openalex.org/A50076048…","""https://openalex.org/A51006444…","""['The University of Texas MD A…","""['The University of Texas MD A…","""['The University of Texas MD A…","""US""","""US""","""US""",40.0,71.0,85.0,108.0,133.0,133.0,97.0,91.0,127.0,111.0,119.0,130.0,108.0,117.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21,35.103559,3,false,false
"""aminoglycoside modifying enzym…",2010,1294,1,1,"""{'value': 0.995475, 'is_in_top…","""Bacteriophages and microbial i…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",336,"""['https://openalex.org/W141030…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50909767…","""https://openalex.org/A50055229…",,"""['California State University,…","""['California State University,…",,"""US""","""US""",,74.0,108.0,123.0,144.0,136.0,162.0,108.0,92.0,72.0,70.0,57.0,52.0,45.0,26.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15,24.857588,2,false,false
"""overcoming the blood–brain tum…",2015,898,2,6,"""{'value': 0.998282, 'is_in_top…","""Glioma Diagnosis and Treatment""","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",159,"""['https://openalex.org/W148541…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50341518…","""https://openalex.org/A50913295…","""https://openalex.org/A50395930…","""['The Netherlands Cancer Insti…","""['Amsterdam UMC Location Vrije…","""['The Netherlands Cancer Insti…","""NL""","""NL""","""NL""",80.0,87.0,97.0,110.0,116.0,119.0,91.0,80.0,59.0,43.0,15.0,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,23.127161,6,false,false
"""targeting the pi3k/akt/mtor pa…",2008,786,1,2,"""{'value': 0.999673, 'is_in_top…","""PI3K/AKT/mTOR signaling in can…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",236,"""['https://openalex.org/W150966…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50898992…","""https://openalex.org/A50296122…","""https://openalex.org/A51080787…","""['National Cancer Institute', …","""['Center for Cancer Research',…","""['National Cancer Institute', …","""US""","""US""","""US""",15.0,21.0,25.0,39.0,38.0,39.0,44.0,50.0,52.0,47.0,51.0,53.0,58.0,57.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,17,14.902469,4,false,false
"""if not apoptosis, then what? t…",2001,722,1,1,"""{'value': 0.890095, 'is_in_top…","""Cancer-related Molecular Pathw…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",106,"""['https://openalex.org/W114421…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50062088…","""https://openalex.org/A50498675…","""https://openalex.org/A50918311…","""['University of Illinois Chica…","""['University of Illinois Chica…","""['University of Illinois Chica…","""US""","""US""","""US""",8.0,19.0,20.0,19.0,20.0,22.0,17.0,14.0,32.0,36.0,20.0,32.0,38.0,37.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24,12.263251,3,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""contributors""",1992,0,1,1,,,"""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Vaccines""",,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33,0.0,0,false,false
"""copyright""",1992,0,1,1,,,"""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Vaccines""",,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33,0.0,0,false,false
"""preface""",1992,0,1,1,,,"""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Vaccines""","""https://openalex.org/A51022314…",,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33,0.0,1,false,false
"""dendritic cells in the inducti…",1991,0,1,1,"""{'value': 0.0, 'is_in_top_1_pe…","""Immunotherapy and Immune Respo…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",10,"""['https://openalex.org/W148626…","""""","""{'Bone': [0], 'marrow-derived'…","""Vaccines""","""https://openalex.org/A50641254…",,,"""['MRC Clinical Trials Unit at …",,,"""GB""",,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,34,0.0,1,false,false


In [3]:
import polars as pl
from tqdm import tqdm

cols_authors = [f"author_{i}" for i in range(1, 21)]
institutions_col = [f"institution_{i}" for i in range(1, 193)]

works = works.sort("year").with_row_index("index_work")

years = works["year"].unique().to_list()
results = []

for y in tqdm(years, desc="Processing years"):
    batch = works.filter(pl.col("year") == y)

    authors_batch = (
        batch
        .select(["index_work", "title", "year", "cited_by_count", "mncs", *cols_authors])
        .unpivot(
            on=cols_authors,
            index=["title", "year", "index_work", "cited_by_count", "mncs"],
            variable_name="author_pos",
            value_name="author_name"
        )
        .filter(pl.col("author_name").is_not_null())
        .with_columns(count=pl.lit(1))
        .sort("index_work")
        .with_columns([
            (pl.col("count").cum_sum().over("author_name") - pl.col("count")).alias("cumulative_count"),
            (pl.col("cited_by_count").cum_sum().over("author_name") - pl.col("cited_by_count")).alias("cumulative_citations"),
            (pl.col("mncs").cum_sum().over("author_name") - pl.col("mncs")).alias("cumulative_mncs"),
        ])
        .with_columns(
            pl.when(pl.col("cumulative_count") == 0)
            .then(pl.lit(1))
            .otherwise(pl.col("cumulative_mncs") / pl.col("cumulative_count"))
            .alias("mean_past_mncs_authors")
        )
        .group_by("index_work")
        .agg([
            pl.mean("cumulative_count").alias("mean_past_contributions_authors"),
            pl.mean("mean_past_mncs_authors").alias("mean_past_mncs_authors")
        ])
    )

    institutions_batch = (
        batch
        .select(["index_work", "year", "cited_by_count", "mncs", *institutions_col])
        .unpivot(
            on=institutions_col,
            index=["year", "index_work", "cited_by_count", "mncs"],
            variable_name="institution_pos",
            value_name="institution_name"
        )
        .filter(pl.col("institution_name").is_not_null())
        .with_columns(
            count=pl.lit(1),
            institution_simple=pl.col("institution_name")
                .str.split("',", inclusive=False)
                .list.first()
                .str.slice(2)
        )
        .sort("index_work")
        .with_columns([
            (pl.col("count").cum_sum().over("institution_simple") - pl.col("count")).alias("cumulative_count"),
            (pl.col("cited_by_count").cum_sum().over("institution_simple") - pl.col("cited_by_count")).alias("cumulative_citations"),
            (pl.col("mncs").cum_sum().over("institution_simple") - pl.col("mncs")).alias("cumulative_mncs"),
        ])
        .with_columns(
            pl.when(pl.col("cumulative_count") == 0)
            .then(pl.lit(1))
            .otherwise(pl.col("cumulative_mncs") / pl.col("cumulative_count"))
            .alias("mean_past_mncs_institutions")
        )
        .group_by("index_work")
        .agg([
            pl.mean("cumulative_count").alias("mean_past_contributions_institutions"),
            pl.mean("mean_past_mncs_institutions").alias("mean_past_mncs_institutions")
        ])
    )

    batch_final = batch.join(authors_batch, on="index_work", how="left") \
                       .join(institutions_batch, on="index_work", how="left")

    results.append(batch_final)

works_final = pl.concat(results)

works_final = works_final.with_columns([
    pl.col("mean_past_mncs_authors").fill_null(1),
    pl.col("mean_past_contributions_authors").fill_null(0),
    pl.col("mean_past_mncs_institutions").fill_null(1),
    pl.col("mean_past_contributions_institutions").fill_null(0)
])

works = works_final


Processing years: 100%|██████████| 116/116 [00:25<00:00,  4.60it/s]


In [4]:
works = works.with_columns(
    primary_topic = pl.col("primary_topic").str.replace("'","", literal=True, n=2).str.replace("(","", literal=True).str.replace(")","", literal=True).str.replace(",","", literal=True),
    primary_subfield = pl.col("primary_subfield").str.replace("'","", literal=True, n=2).str.replace("(","", literal=True).str.replace(")","", literal=True).str.replace(",","", literal=True),
    primary_field = pl.col("primary_field").str.replace("'","", literal=True, n=2).str.replace("(","", literal=True).str.replace(")","", literal=True).str.replace(",","", literal=True), 
    primary_domain = pl.col("primary_domain").str.replace("'","", literal=True, n=2).str.replace("(","", literal=True).str.replace(")","", literal=True).str.replace(",","", literal=True)
)

In [5]:
works

index_work,title,year,cited_by_count,countries_distinct_count,institutions_distinct_count,citation_normalized_percentile,primary_topic,keywords,concepts,referenced_works_count,referenced_works,abstract,abstract_inverted_index,journal,author_1,author_2,author_3,institution_1,institution_2,institution_3,country_1,country_2,country_3,cited_by_count_2025,cited_by_count_2024,cited_by_count_2023,cited_by_count_2022,cited_by_count_2021,cited_by_count_2020,cited_by_count_2019,cited_by_count_2018,cited_by_count_2017,cited_by_count_2016,cited_by_count_2015,cited_by_count_2014,cited_by_count_2013,…,country_243,country_244,country_245,country_246,country_247,country_248,country_249,country_250,country_251,country_252,country_253,country_254,country_255,country_256,country_257,country_258,country_259,country_260,country_261,country_262,country_263,country_264,country_265,country_266,country_267,country_268,country_269,country_270,age,mncs,authors_count,review,meta_analysis,mean_past_contributions_authors,mean_past_mncs_authors,mean_past_contributions_institutions,mean_past_mncs_institutions
u32,str,i64,i64,i64,i64,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,i8,bool,bool,f64,f64,f64,f64
0,"""on the pharmacological action …",1909,47,1,1,"""{'value': 0.99265, 'is_in_top_…","""Chemistry and Stereochemistry …","""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Journal of Pharmacology and Ex…","""https://openalex.org/A51102708…","""https://openalex.org/A50329477…",,,,,,,,,1.0,,,,1.0,,,,,,,1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,116,10.143885,2,false,false,0.0,1.0,0.0,1.0
1,"""the antagonism of the adrenal …",1909,15,1,1,"""{'value': 0.951659, 'is_in_top…","""Hormonal Regulation and Hypert…","""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Journal of Pharmacology and Ex…","""https://openalex.org/A51116416…",,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,116,3.23741,1,false,false,0.0,1.0,0.0,1.0
2,"""quantitative experiments with …",1909,14,1,1,"""{'value': 0.978142, 'is_in_top…","""Tuberculosis Research and Epid…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Journal of Pharmacology and Ex…","""https://openalex.org/A50125106…",,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,116,3.021583,1,false,false,0.0,1.0,0.0,1.0
3,"""the comparative toxicity of th…",1909,12,1,1,"""{'value': 0.970958, 'is_in_top…","""Pharmacological Effects and To…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",0,"""[]""","""""","""{'The': [0, 122], 'physiologic…","""Journal of Pharmacology and Ex…","""https://openalex.org/A51121624…","""https://openalex.org/A50802778…",,,,,,,,,,,,,,,,,,,,1.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,116,2.589928,2,false,false,0.0,1.0,0.0,1.0
4,"""on the relation between the to…",1909,11,1,1,"""{'value': 0.911704, 'is_in_top…","""Chemical Thermodynamics and Mo…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Journal of Pharmacology and Ex…","""https://openalex.org/A51136090…","""https://openalex.org/A50358658…",,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,116,2.374101,2,false,false,0.0,1.0,0.0,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
689356,"""the addis declaration on immun…",2024,0,1,1,"""{'value': 0.0, 'is_in_top_1_pe…","""HIV/AIDS Impact and Responses""","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",13,"""['https://openalex.org/W207878…","""""","""{'Background/Objectives:': [0]…","""Vaccines""","""https://openalex.org/A50057967…","""https://openalex.org/A50190474…","""https://openalex.org/A50194418…","""['World Health Organization Re…","""['World Health Organization Re…","""['World Health Organization Re…","""CG""","""CG""","""CG""",,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0.0,13,false,false,1.538462,1.603876,47.0,1.454523
689357,"""mucosal immunization with an i…",2024,0,1,2,"""{'value': 0.0, 'is_in_top_1_pe…","""Influenza Virus Research Studi…","""[]""","""[{'id': 'https://openalex.org/…",43,"""['https://openalex.org/W209152…","""""","""{'Background/Objectives:': [0]…","""Vaccines""","""https://openalex.org/A50467370…","""https://openalex.org/A50467218…","""https://openalex.org/A50408736…","""['Research Institute of Influe…","""['Ministry of Health of the Ru…","""['Research Institute of Influe…","""RU""","""RU""","""RU""",,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0.0,10,false,false,0.9,0.6,66.0,0.259329
689358,"""favorable nonclinical safety p…",2024,0,1,2,"""{'value': 0.0, 'is_in_top_1_pe…","""Respiratory viral infections r…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",36,"""['https://openalex.org/W103182…","""""","""{'Background:': [0], 'Respirat…","""Vaccines""","""https://openalex.org/A50609585…","""https://openalex.org/A50516929…","""https://openalex.org/A51157172…","""['Pfizer (United States)', 'ht…","""['Pfizer (United States)', 'ht…","""['Pfizer (United States)', 'ht…","""US""","""US""","""US""",,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0.0,7,false,false,0.142857,0.994786,380.625,1.458431
689359,"""anti-hbs positivity related to…",2024,0,1,3,"""{'value': 0.0, 'is_in_top_1_pe…","""Hepatitis B Virus Studies""","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",35,"""['https://openalex.org/W124722…","""""","""{'Background:': [0], 'In': [1]…","""Vaccines""","""https://openalex.org/A51065645…","""https://openalex.org/A50792643…","""https://openalex.org/A50066103…","""['Medical University of Silesi…","""['Gdańsk Medical University', …","""['Medical University of Silesi…","""PL""","""PL""","""PL""",,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0.0,7,false,false,0.0,1.0,49.571429,0.964199


In [6]:
works.lazy().collect().write_parquet(
    "../data/modelling/works_pre_topics.parquet",
    compression="zstd",
    row_group_size=2_000
)