In [1]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import aquarel as aq
from ineqpy.inequality import gini
import scipy.stats as st
import pyalex as alex
alex.config.email = "noah0roussel01980@gmail.com"
works = pl.read_csv("../data/2_extracted_works/works_q1.csv")
works = works.filter(pl.col("year") != 2025)

year_begin = 1920
year_end = 2024
works = works.with_columns(
    age=2025 - pl.col("year")
)


In [2]:
works_bjp = pl.read_csv("../data/2_extracted_works/works_bjp.csv")
works_bjp = works_bjp.filter(pl.col("year") != 2025)

In [3]:
cbc_per_year = (
    works
    .select(
        [pl.col("year"), pl.col("cited_by_count"), pl.col("title")]
    )
    .group_by(
        pl.col("year"),
        maintain_order=True
    )
    .agg(
        pl.col("cited_by_count").mean().name.prefix("mean_")
    )
) 

works = works.join(
    cbc_per_year,
    on="year",
    how="left"
) 

works = (
    works
    .with_columns(
        mncs = pl.col("cited_by_count") / pl.col("mean_cited_by_count")
    )
    .drop("mean_cited_by_count")
)
cols_authors = [col for col in works.columns if col.startswith("author_")]

works = works.with_columns(
    authors_count = sum(
        [pl.col(col).is_not_null().cast(pl.Int8) for col in cols_authors]
    )

)

works = works.with_columns(
    title = pl.col("title").fill_null("").str.to_lowercase(),
    abstract = pl.col("abstract").fill_null("").str.to_lowercase()
)

works = works.with_columns( #remove abstract, add words : survey, overview, state of?
    review = (
        pl.col("title").str.contains("review") |
        pl.col("abstract").str.contains("review")
    ),
    meta_analysis = (
        pl.col("title").str.contains("meta[\u00AD-]?analysis") |
        pl.col("abstract").str.contains("meta[\u00AD-]?analysis")
    )
)

works = works.with_columns([
    pl.when(pl.col("countries_distinct_count").is_null() | pl.col("countries_distinct_count").is_nan() | (pl.col("countries_distinct_count") == 0))
      .then(1)
      .otherwise(pl.col("countries_distinct_count"))
      .alias("countries_distinct_count")
])
works = works.with_columns([
    pl.when(pl.col("institutions_distinct_count").is_null() | pl.col("institutions_distinct_count").is_nan() | (pl.col("institutions_distinct_count") == 0))
      .then(1)
      .otherwise(pl.col("institutions_distinct_count"))
      .alias("institutions_distinct_count")
])

works

title,year,cited_by_count,countries_distinct_count,institutions_distinct_count,citation_normalized_percentile,primary_topic,keywords,concepts,referenced_works_count,referenced_works,abstract,abstract_inverted_index,journal,author_1,author_2,author_3,institution_1,institution_2,institution_3,country_1,country_2,country_3,cited_by_count_2025,cited_by_count_2024,cited_by_count_2023,cited_by_count_2022,cited_by_count_2021,cited_by_count_2020,cited_by_count_2019,cited_by_count_2018,cited_by_count_2017,cited_by_count_2016,cited_by_count_2015,cited_by_count_2014,cited_by_count_2013,cited_by_count_2012,…,country_239,country_240,country_241,country_242,country_243,country_244,country_245,country_246,country_247,country_248,country_249,country_250,country_251,country_252,country_253,country_254,country_255,country_256,country_257,country_258,country_259,country_260,country_261,country_262,country_263,country_264,country_265,country_266,country_267,country_268,country_269,country_270,age,mncs,authors_count,review,meta_analysis
str,i64,i64,i64,i64,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,i8,bool,bool
"""ros stress in cancer cells and…",2004,1881,1,1,"""{'value': 0.816836, 'is_in_top…","""Redox biology and oxidative st…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",158,"""['https://openalex.org/W116540…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50102407…","""https://openalex.org/A50076048…","""https://openalex.org/A51006444…","""['The University of Texas MD A…","""['The University of Texas MD A…","""['The University of Texas MD A…","""US""","""US""","""US""",40.0,71.0,85.0,108.0,133.0,133.0,97.0,91.0,127.0,111.0,119.0,130.0,108.0,117.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21,35.103559,3,false,false
"""aminoglycoside modifying enzym…",2010,1294,1,1,"""{'value': 0.995475, 'is_in_top…","""Bacteriophages and microbial i…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",336,"""['https://openalex.org/W141030…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50909767…","""https://openalex.org/A50055229…",,"""['California State University,…","""['California State University,…",,"""US""","""US""",,74.0,108.0,123.0,144.0,136.0,162.0,108.0,92.0,72.0,70.0,57.0,52.0,45.0,26.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15,24.857588,2,false,false
"""overcoming the blood–brain tum…",2015,898,2,6,"""{'value': 0.998282, 'is_in_top…","""Glioma Diagnosis and Treatment""","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",159,"""['https://openalex.org/W148541…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50341518…","""https://openalex.org/A50913295…","""https://openalex.org/A50395930…","""['The Netherlands Cancer Insti…","""['Amsterdam UMC Location Vrije…","""['The Netherlands Cancer Insti…","""NL""","""NL""","""NL""",80.0,87.0,97.0,110.0,116.0,119.0,91.0,80.0,59.0,43.0,15.0,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,23.127161,6,false,false
"""targeting the pi3k/akt/mtor pa…",2008,786,1,2,"""{'value': 0.999673, 'is_in_top…","""PI3K/AKT/mTOR signaling in can…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",236,"""['https://openalex.org/W150966…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50898992…","""https://openalex.org/A50296122…","""https://openalex.org/A51080787…","""['National Cancer Institute', …","""['Center for Cancer Research',…","""['National Cancer Institute', …","""US""","""US""","""US""",15.0,21.0,25.0,39.0,38.0,39.0,44.0,50.0,52.0,47.0,51.0,53.0,58.0,57.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,17,14.902469,4,false,false
"""if not apoptosis, then what? t…",2001,722,1,1,"""{'value': 0.890095, 'is_in_top…","""Cancer-related Molecular Pathw…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",106,"""['https://openalex.org/W114421…","""""",,"""Drug Resistance Updates""","""https://openalex.org/A50062088…","""https://openalex.org/A50498675…","""https://openalex.org/A50918311…","""['University of Illinois Chica…","""['University of Illinois Chica…","""['University of Illinois Chica…","""US""","""US""","""US""",8.0,19.0,20.0,19.0,20.0,22.0,17.0,14.0,32.0,36.0,20.0,32.0,38.0,37.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24,12.263251,3,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""contributors""",1992,0,1,1,,,"""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Vaccines""",,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33,0.0,0,false,false
"""copyright""",1992,0,1,1,,,"""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Vaccines""",,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33,0.0,0,false,false
"""preface""",1992,0,1,1,,,"""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""Vaccines""","""https://openalex.org/A51022314…",,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33,0.0,1,false,false
"""dendritic cells in the inducti…",1991,0,1,1,"""{'value': 0.0, 'is_in_top_1_pe…","""Immunotherapy and Immune Respo…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",10,"""['https://openalex.org/W148626…","""""","""{'Bone': [0], 'marrow-derived'…","""Vaccines""","""https://openalex.org/A50641254…",,,"""['MRC Clinical Trials Unit at …",,,"""GB""",,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,34,0.0,1,false,false


In [4]:
year_begin = 1920
year_end = 2024
works_bjp = works_bjp.with_columns(
    age=2025 - pl.col("year")
)

cbc_per_year = (
    works
    .group_by("year")
    .agg(
        pl.col("cited_by_count").mean().alias("mean_cited_by_count")
    )
)


works_bjp = works_bjp.join(
    cbc_per_year,
    on="year",
    how="left"
)


works_bjp = (
    works_bjp
    .with_columns(
        mncs = pl.col("cited_by_count") / pl.col("mean_cited_by_count")
    )
    .drop("mean_cited_by_count")
)

bjp_cols_authors = [col for col in works_bjp.columns if col.startswith("author_")]

works_bjp = works_bjp.with_columns(
    authors_count = sum(
        [pl.col(col).is_not_null().cast(pl.Int8) for col in bjp_cols_authors]
    )

)

works_bjp = works_bjp.with_columns(
    title = pl.col("title").fill_null("").str.to_lowercase(),
    abstract = pl.col("abstract").fill_null("").str.to_lowercase()
)

works_bjp = works_bjp.with_columns( #remove abstract, add words : survey, overview, state of?
    review = (
        pl.col("title").str.contains("review") |
        pl.col("abstract").str.contains("review")
    ),
    meta_analysis = (
        pl.col("title").str.contains("meta[\u00AD-]?analysis") |
        pl.col("abstract").str.contains("meta[\u00AD-]?analysis")
    )
)

works_bjp = works_bjp.with_columns([
    pl.when(pl.col("countries_distinct_count").is_null() | pl.col("countries_distinct_count").is_nan() | (pl.col("countries_distinct_count") == 0))
      .then(1)
      .otherwise(pl.col("countries_distinct_count"))
      .alias("countries_distinct_count")
])
works_bjp = works_bjp.with_columns([
    pl.when(pl.col("institutions_distinct_count").is_null() | pl.col("institutions_distinct_count").is_nan() | (pl.col("institutions_distinct_count") == 0))
      .then(1)
      .otherwise(pl.col("institutions_distinct_count"))
      .alias("institutions_distinct_count")
])


bjp_group_references_mean = (
    works_bjp.lazy()  
    .filter(~pl.col("review") & ~pl.col("meta_analysis"))
    .group_by("referenced_works_count")
    .agg([
        pl.col("mncs").mean().alias("mncs"),
        pl.len().alias("count")
    ])
    .sort("referenced_works_count")
    .collect()  
)

works_bjp

title,year,cited_by_count,countries_distinct_count,institutions_distinct_count,citation_normalized_percentile,primary_topic,keywords,concepts,referenced_works_count,referenced_works,abstract,abstract_inverted_index,journal,author_1,author_2,author_3,author_4,author_5,institution_1,institution_2,institution_3,institution_4,institution_5,country_1,country_2,country_3,country_4,country_5,cited_by_count_2025,cited_by_count_2024,cited_by_count_2023,cited_by_count_2022,cited_by_count_2021,cited_by_count_2020,cited_by_count_2019,cited_by_count_2018,…,country_108,country_109,country_110,country_111,country_112,country_113,country_114,country_115,country_116,country_117,country_118,country_119,country_120,institution_108,institution_109,institution_110,institution_111,institution_112,institution_113,institution_114,institution_115,institution_116,institution_117,institution_118,institution_119,institution_120,institution_121,institution_122,institution_123,institution_124,institution_125,institution_126,age,mncs,authors_count,review,meta_analysis
str,i64,i64,i64,i64,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,i8,bool,bool
"""animal research: reporting <i>…",2010,3465,1,4,"""{'value': 0.999709, 'is_in_top…","""('Animal testing and alternati…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",6,"""['https://openalex.org/W167468…","""the nc3rs gratefully acknowled…","""{'The': [0, 222], 'NC3Rs': [1,…","""British Journal of Pharmacolog…","""https://openalex.org/A50535847…","""https://openalex.org/A50876564…","""https://openalex.org/A50047534…","""https://openalex.org/A50192845…","""https://openalex.org/A50429626…","""['National Centre for the Repl…","""['University of Bristol', 'htt…","""['University of Bristol', 'htt…","""['Imperial College London', 'h…","""['University of Oxford', 'http…","""GB""","""GB""","""GB""","""GB""","""GB""",104.0,169.0,209.0,239.0,287.0,378.0,396.0,279.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15,66.562243,5,false,false
"""principles of early drug disco…",2010,2474,1,2,"""{'value': 0.763975, 'is_in_top…","""('Computational Drug Discovery…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",25,"""['https://openalex.org/W150104…","""developing a new drug from ori…","""{'Developing': [0], 'a': [1, 1…","""British Journal of Pharmacolog…","""https://openalex.org/A51099337…","""https://openalex.org/A51141727…","""https://openalex.org/A50660771…","""https://openalex.org/A51099819…",,"""['GlaxoSmithKline (United King…","""[""King's College London"", 'htt…","""[""King's College London"", 'htt…",,,"""GB""","""GB""","""GB""","""GB""",,197.0,363.0,363.0,308.0,292.0,234.0,150.0,160.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15,47.52525,4,true,false
"""measuring reactive species and…",2004,2200,1,1,"""{'value': 0.990741, 'is_in_top…","""('Antioxidant Activity and Oxi…","""[]""","""[{'id': 'https://openalex.org/…",386,"""['https://openalex.org/W116935…","""free radicals and other reacti…","""{'Free': [0], 'radicals': [1],…","""British Journal of Pharmacolog…","""https://openalex.org/A50537269…","""https://openalex.org/A50691363…",,,,"""['National University of Singa…","""['National University of Singa…",,,,"""SG""","""SG""",,,,31.0,76.0,81.0,91.0,101.0,127.0,107.0,107.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21,41.056794,2,true,false
"""guide to receptors and channel…",2011,2073,1,3,"""{'value': 0.999918, 'is_in_top…","""('Inflammatory mediators and N…","""[]""","""[{'id': 'https://openalex.org/…",2,"""['https://openalex.org/W227908…","""abstract the fifth edition of …","""{'Abstract': [0], 'The': [1], …","""British Journal of Pharmacolog…","""https://openalex.org/A50904197…","""https://openalex.org/A50146921…","""https://openalex.org/A51052506…",,,"""['University of Nottingham', '…","""['University of Kent', 'https:…","""['University of Greenwich', 'h…",,,"""GB""","""GB""",,,,7.0,11.0,14.0,22.0,17.0,20.0,34.0,25.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14,43.369663,3,false,false
"""characterization of three inhi…",1990,1876,1,1,"""{'value': 0.999876, 'is_in_top…","""('Nitric Oxide and Endothelin …","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",37,"""['https://openalex.org/W196575…","""1. three analogues of l-argini…","""{'1.': [0], 'Three': [1], 'ana…","""British Journal of Pharmacolog…","""https://openalex.org/A51121514…","""https://openalex.org/A50561992…","""https://openalex.org/A50266963…","""https://openalex.org/A50397957…","""https://openalex.org/A50053578…","""['Wellcome Trust', 'https://op…","""['Wellcome Trust', 'https://op…","""['Wellcome Trust', 'https://op…","""['Wellcome Trust', 'https://op…","""['Wellcome Trust', 'https://op…","""GB""","""GB""","""GB""","""GB""","""GB""",7.0,12.0,12.0,16.0,19.0,20.0,18.0,13.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,35,74.914326,5,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""""",1986,0,1,1,,,"""[]""","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""British Journal of Pharmacolog…",,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,39,0.0,0,false,false
"""transdermal iontophoresis - ro…",1994,0,1,1,"""{'value': 0.0, 'is_in_top_1_pe…","""('Chemotherapy-related skin to…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""British Journal of Pharmacolog…","""https://openalex.org/A51043508…","""https://openalex.org/A51134356…",,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,31,0.0,2,false,false
"""oral communications""",1991,0,1,1,"""{'value': 0.0, 'is_in_top_1_pe…","""('Attention Deficit Hyperactiv…","""[]""","""[{'id': 'https://openalex.org/…",266,"""['https://openalex.org/W140901…","""""",,"""British Journal of Pharmacolog…","""https://openalex.org/A51101816…","""https://openalex.org/A51114079…",,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,34,0.0,2,false,false
"""interleukin 1 and lipopolysacc…",1989,0,1,1,"""{'value': 0.0, 'is_in_top_1_pe…","""('Inflammatory mediators and N…","""[{'id': 'https://openalex.org/…","""[{'id': 'https://openalex.org/…",0,"""[]""","""""",,"""British Journal of Pharmacolog…","""https://openalex.org/A50457849…","""https://openalex.org/A50374877…",,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,36,0.0,2,false,false


In [None]:
#to fo