In [None]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import aquarel as aq
from ineqpy.inequality import gini
import scipy.stats as st
import pyalex as alex
# Email stored in the pyalex configuration used for the author look-ups below.
alex.config.email = "noah0roussel01980@gmail.com"

# Year bounds kept as notebook-level settings.
year_begin = 1920
year_end = 2024

# Load the Q1 works extract, drop rows dated 2025 and add each work's age
# measured from 2025.
works = (
    pl.read_csv("../data/2_extracted_works/works_q1.csv")
    .filter(pl.col("year") != 2025)
    .with_columns(age=2025 - pl.col("year"))
)


In [None]:
# Load the BJP works extract and drop rows dated 2025.
works_bjp = (
    pl.read_csv("../data/2_extracted_works/works_bjp.csv")
    .filter(pl.col("year") != 2025)
)

In [None]:
# Mean citation count of the works published in each year.
cbc_per_year = (
    works
    .select("year", "cited_by_count", "title")
    .group_by("year", maintain_order=True)
    .agg(pl.col("cited_by_count").mean().alias("mean_cited_by_count"))
)

# mncs: a work's citation count divided by the mean of its publication year.
works = (
    works
    .join(cbc_per_year, on="year", how="left")
    .with_columns(mncs=pl.col("cited_by_count") / pl.col("mean_cited_by_count"))
    .drop("mean_cited_by_count")
)

# Every column whose name starts with "author_" is one author slot;
# authors_count is the number of non-null slots on the row.
cols_authors = [name for name in works.columns if name.startswith("author_")]
works = works.with_columns(
    authors_count=sum(
        [pl.col(name).is_not_null().cast(pl.Int8) for name in cols_authors]
    )
)

# Null text becomes "" and everything is lower-cased before keyword matching.
works = works.with_columns(
    pl.col("title").fill_null("").str.to_lowercase(),
    pl.col("abstract").fill_null("").str.to_lowercase(),
)

# Flag reviews and meta-analyses from keywords in the title or abstract.
works = works.with_columns(  # remove abstract, add words : survey, overview, state of?
    review=(
        pl.col("title").str.contains("review")
        | pl.col("abstract").str.contains("review")
    ),
    meta_analysis=(
        pl.col("title").str.contains("meta[\u00AD-]?analysis")
        | pl.col("abstract").str.contains("meta[\u00AD-]?analysis")
    ),
)

# A null, NaN or zero distinct count is replaced by 1.
for count_col in ("countries_distinct_count", "institutions_distinct_count"):
    is_missing = (
        pl.col(count_col).is_null()
        | pl.col(count_col).is_nan()
        | (pl.col(count_col) == 0)
    )
    works = works.with_columns(
        pl.when(is_missing).then(1).otherwise(pl.col(count_col)).alias(count_col)
    )

works

In [None]:
year_begin = 1920
year_end = 2024

# Age of each BJP work measured from 2025.
works_bjp = works_bjp.with_columns(age=2025 - pl.col("year"))

# Yearly mean citation count used as the mncs denominator.
# NOTE(review): this baseline is computed from `works`, not `works_bjp` —
# confirm that normalising BJP works against it is intended.
cbc_per_year = (
    works
    .group_by("year")
    .agg(pl.col("cited_by_count").mean().alias("mean_cited_by_count"))
)

# mncs: a work's citation count divided by the yearly mean joined on "year".
works_bjp = (
    works_bjp
    .join(cbc_per_year, on="year", how="left")
    .with_columns(mncs=pl.col("cited_by_count") / pl.col("mean_cited_by_count"))
    .drop("mean_cited_by_count")
)

# Every column whose name starts with "author_" is one author slot;
# authors_count is the number of non-null slots on the row.
bjp_cols_authors = [name for name in works_bjp.columns if name.startswith("author_")]
works_bjp = works_bjp.with_columns(
    authors_count=sum(
        [pl.col(name).is_not_null().cast(pl.Int8) for name in bjp_cols_authors]
    )
)

# Null text becomes "" and everything is lower-cased before keyword matching.
works_bjp = works_bjp.with_columns(
    pl.col("title").fill_null("").str.to_lowercase(),
    pl.col("abstract").fill_null("").str.to_lowercase(),
)

# Flag reviews and meta-analyses from keywords in the title or abstract.
works_bjp = works_bjp.with_columns(  # remove abstract, add words : survey, overview, state of?
    review=(
        pl.col("title").str.contains("review")
        | pl.col("abstract").str.contains("review")
    ),
    meta_analysis=(
        pl.col("title").str.contains("meta[\u00AD-]?analysis")
        | pl.col("abstract").str.contains("meta[\u00AD-]?analysis")
    ),
)

# A null, NaN or zero distinct count is replaced by 1.
for count_col in ("countries_distinct_count", "institutions_distinct_count"):
    is_missing = (
        pl.col(count_col).is_null()
        | pl.col(count_col).is_nan()
        | (pl.col(count_col) == 0)
    )
    works_bjp = works_bjp.with_columns(
        pl.when(is_missing).then(1).otherwise(pl.col(count_col)).alias(count_col)
    )

# Mean mncs and number of works per reference count, leaving out works
# flagged as review or meta-analysis.
bjp_group_references_mean = (
    works_bjp.lazy()
    .filter(~(pl.col("review") | pl.col("meta_analysis")))
    .group_by("referenced_works_count")
    .agg(
        pl.col("mncs").mean().alias("mncs"),
        pl.len().alias("count"),
    )
    .sort("referenced_works_count")
    .collect()
)

works_bjp

In [None]:

# Long-format authorship table for the Q1 works: one row per (work, author
# slot) with a filled slot. index_work numbers the works after sorting by
# year, and count is a constant 1 so that summing it counts publications.
authors = (
    works
    .select("title", "year", "cited_by_count", "mncs", *cols_authors)
    .sort("year")
    .with_row_index("index_work")
    .unpivot(
        index=["title", "year", "index_work", "cited_by_count", "mncs"],
        on=cols_authors,
        variable_name="author_pos",
        value_name="author_name",
    )
    .drop_nulls("author_name")
    .with_columns(pl.lit(1, dtype=pl.Int8).alias("count"))
)

authors.drop("title")



In [None]:

# Long-format authorship table for the BJP works: one row per (work, author
# slot) with a filled slot. index_work numbers the works after sorting by
# year, and count is a constant 1 so that summing it counts publications.
#
# The author slots come from bjp_cols_authors (the "author_*" columns of
# works_bjp itself), not from cols_authors, which was derived from the Q1
# frame and need not match the BJP columns.
authors_bjp = (
    works_bjp
    .select(["title", "year", "cited_by_count", "mncs", *bjp_cols_authors])
    .sort("year")
    .with_row_index("index_work")
    .unpivot(
        on=bjp_cols_authors,
        index=["title", "year", "index_work", "cited_by_count", "mncs"],
        variable_name="author_pos",
        value_name="author_name"
    )
    .filter(pl.col("author_name").is_not_null())
    .with_columns(count=pl.lit(1, dtype=pl.Int8))
)

authors_bjp.drop("title")



In [None]:
# Per-author totals over the Q1 authorship table.
#
# The grouping key is passed positionally so it stays named "author_name"
# (`group_by(by=...)` would create a key column called "by"), and it is kept
# in the result because later cells look authors up by that column.
# Aggregations are spelled out so that only meaningful columns are produced:
#   cited_by_count - summed citations of the author's works
#   mncs           - summed mncs of the author's works
#   mncs_mean      - mean mncs of the author's works (read by the scatter cell)
#   count          - number of authorship rows, i.e. publications
author_totals = (
    authors
    .group_by("author_name")
    .agg(
        pl.col("cited_by_count").sum(),
        pl.col("mncs").sum(),
        pl.col("mncs").mean().alias("mncs_mean"),
        pl.col("count").sum(),
    )
)

# Same totals ranked two ways: by summed citations and by publication count.
top_authors = author_totals.sort(by="cited_by_count", descending=True)
active_authors = author_totals.sort(by="count", descending=True)

top_authors

In [None]:
# Per-author totals over the BJP authorship table.
#
# The grouping key is passed positionally so it stays named "author_name"
# (`group_by(by=...)` would create a key column called "by"), and it is kept
# in the result because later cells look authors up by that column.
# Aggregations are spelled out so that only meaningful columns are produced:
#   cited_by_count - summed citations of the author's works
#   mncs           - summed mncs of the author's works
#   mncs_mean      - mean mncs of the author's works (read by the scatter cell)
#   count          - number of authorship rows, i.e. publications
author_totals_bjp = (
    authors_bjp
    .group_by("author_name")
    .agg(
        pl.col("cited_by_count").sum(),
        pl.col("mncs").sum(),
        pl.col("mncs").mean().alias("mncs_mean"),
        pl.col("count").sum(),
    )
)

# Same totals ranked two ways: by summed citations and by publication count.
top_authors_bjp = author_totals_bjp.sort(by="cited_by_count", descending=True)
active_authors_bjp = author_totals_bjp.sort(by="count", descending=True)

top_authors_bjp

In [None]:
from collections import Counter

# Enrich the 20 most-cited Q1 authors with their display name and the
# frequency of the country codes found in their listed affiliations.
top_20_authors = top_authors.head(20)

# Only string identifiers are sent to the API.
author_ids = [aid for aid in top_20_authors["author_name"].to_list() if isinstance(aid, str)]

authors_info = list(alex.Authors()[author_ids])

id_to_info = {}
for a in authors_info:
    # The batch response may contain empty entries; skip them instead of failing.
    if a is None:
        continue
    countries = []  # freq of affiliated countries - not citations
    for aff in a.get("affiliations") or []:
        inst = aff.get("institution")
        if inst and inst.get("country_code"):
            countries.append(inst["country_code"])
    id_to_info[a["id"]] = {
        "name": a.get("display_name"),
        "country_freq": dict(Counter(countries)),
    }

# Authors missing from the response keep their identifier as label and get an
# empty frequency dict, so the plotting cells never receive None.
top_20_authors = top_20_authors.with_columns([
    pl.col("author_name").map_elements(
        lambda aid: id_to_info.get(aid, {}).get("name") or aid,
        return_dtype=pl.Utf8,
    ).alias("name"),
    pl.col("author_name").map_elements(
        lambda aid: id_to_info.get(aid, {}).get("country_freq", {}),
        return_dtype=pl.Object,
    ).alias("country_freq"),
])

top_20_authors


In [None]:
# Enrich the 20 most-cited BJP authors with their display name and the
# frequency of the country codes found in their listed affiliations.
top_20_authors_bjp = top_authors_bjp.head(20)

# Only string identifiers are sent to the API.
author_ids = [aid for aid in top_20_authors_bjp["author_name"].to_list() if isinstance(aid, str)]

authors_info = list(alex.Authors()[author_ids])

id_to_info = {}
for a in authors_info:
    # The batch response may contain empty entries; skip them instead of failing.
    if a is None:
        continue
    countries = []  # freq of affiliated countries - not citations
    for aff in a.get("affiliations") or []:
        inst = aff.get("institution")
        if inst and inst.get("country_code"):
            countries.append(inst["country_code"])
    id_to_info[a["id"]] = {
        "name": a.get("display_name"),
        "country_freq": dict(Counter(countries)),
    }

# Authors missing from the response keep their identifier as label and get an
# empty frequency dict, so the plotting cells never receive None.
top_20_authors_bjp = top_20_authors_bjp.with_columns([
    pl.col("author_name").map_elements(
        lambda aid: id_to_info.get(aid, {}).get("name") or aid,
        return_dtype=pl.Utf8,
    ).alias("name"),
    pl.col("author_name").map_elements(
        lambda aid: id_to_info.get(aid, {}).get("country_freq", {}),
        return_dtype=pl.Object,
    ).alias("country_freq"),
])

top_20_authors_bjp


In [None]:
from matplotlib.lines import Line2D
import matplotlib.cm as cm

# Stacked horizontal bars: each author's summed citations, split according to
# the share of each country code among the author's listed affiliations.

# A missing frequency dict is treated as "no affiliation data".
country_freqs = [cf or {} for cf in top_20_authors["country_freq"].to_list()]

# Rank countries by the number of authors that list them; keep at most 20.
all_countries_flat = [c for d in country_freqs for c in d.keys()]
country_counter = Counter(all_countries_flat)
top_countries = [c for c, _ in country_counter.most_common(20)]

# pyplot.get_cmap is used because matplotlib.cm.get_cmap no longer exists in
# recent Matplotlib releases.
cmap = plt.get_cmap("tab20", len(top_countries))
country_colors = {country: cmap(i) for i, country in enumerate(top_countries)}
country_colors["Other"] = "lightgrey"

# One row per author: share of each top country, plus a final "Other" share.
bar_values = []
for cf in country_freqs:
    total = sum(cf.values()) if cf else 1  # avoid dividing by zero
    proportions = [cf.get(c, 0) / total for c in top_countries]
    other_prop = sum(v for k, v in cf.items() if k not in top_countries) / total
    proportions.append(other_prop)
    bar_values.append(proportions)

# Reverse so the first (most cited) author is drawn at the top of the chart.
bar_values = np.array(bar_values)
names = top_20_authors["name"].to_list()[::-1]
bar_values = bar_values[::-1]
citations = np.array(top_20_authors["cited_by_count"].to_list()[::-1])
plot_countries = top_countries + ["Other"]

fig, ax = plt.subplots(figsize=(10, 12))
bottom = np.zeros(len(names))

for i, country in enumerate(plot_countries):
    ax.barh(names, bar_values[:, i]*citations, left=bottom,
            color=country_colors[country], edgecolor='white', height=0.8)
    bottom += bar_values[:, i]*citations

ax.set_xlabel("Number of citations", fontsize=14)
ax.set_ylabel("Author", fontsize=14)
ax.set_title("Top 20 Q1 Authors by Citations - Country Distribution", fontsize=16)

# Thick line handles act as colour swatches in the legend.
handles = [Line2D([0], [0], color=country_colors[c], lw=12) for c in plot_countries]
ax.legend(handles, plot_countries, title="Country",
          loc='lower right', fontsize=10, title_fontsize=12, frameon=True, framealpha=0.9)

plt.tight_layout()
plt.show()


In [None]:
from matplotlib.lines import Line2D

# Same stacked chart for the BJP authors. top_countries and country_colors
# are the ones built by the Q1 chart cell; any other country goes to "Other".
country_freqs = top_20_authors_bjp["country_freq"].to_list()

# One row per author: share of each top country, then the "Other" share.
bar_values = []
for cf in country_freqs:
    total = sum(cf.values()) if cf else 1
    shares = [cf.get(c, 0) / total for c in top_countries]
    shares.append(sum(v for k, v in cf.items() if k not in top_countries) / total)
    bar_values.append(shares)

# Reverse so the first (most cited) author is drawn at the top of the chart.
bar_values = np.array(bar_values)[::-1]
names = top_20_authors_bjp["name"].to_list()[::-1]
citations = np.array(top_20_authors_bjp["cited_by_count"].to_list()[::-1])

plot_countries = top_countries + ["Other"]

fig, ax = plt.subplots(figsize=(10, 12))
bottom = np.zeros(len(names))

for i, country in enumerate(plot_countries):
    segment = bar_values[:, i] * citations
    ax.barh(
        names,
        segment,
        left=bottom,
        color=country_colors[country],
        edgecolor='white',
        height=0.8,
    )
    bottom += segment

ax.set_xlabel("Number of citations", fontsize=14)
ax.set_ylabel("Author", fontsize=14)
ax.set_title("Top 20 BJP Authors by Citations - Country Distribution", fontsize=16)

# Thick line handles act as colour swatches in the legend.
handles = []
for c in plot_countries:
    handles.append(Line2D([0], [0], color=country_colors[c], lw=12))
ax.legend(
    handles,
    plot_countries,
    title="Country",
    loc='lower right',
    fontsize=10,
    title_fontsize=12,
    frameon=True,
    framealpha=0.9,
)

plt.tight_layout()
plt.show()


In [None]:
from adjustText import adjust_text

# Quadrant scatter of the top-20 Q1 authors: publications (x) against mean
# MNCS (y), both log-transformed and rescaled to [-1, 1].

# The aggregated frame holds the summed `mncs` and the publication `count`
# per author; the mean is derived here so the cell does not rely on a
# `mncs_mean` column that the aggregation may not provide.
top_20_authors = top_20_authors.with_columns(
    (pl.col("mncs") / pl.col("count")).alias("mncs_mean")
)

top_20_authors = top_20_authors.with_columns(
    pl.col("count").log().alias("publications_log"),
    pl.col("mncs_mean").log().alias("mncs_log"),
)

pub_log = top_20_authors["publications_log"].to_numpy()
x_min = np.min(pub_log)
x_max = np.max(pub_log)

mncs_log = top_20_authors["mncs_log"].to_numpy()
y_min = np.min(mncs_log)
y_max = np.max(mncs_log)

# A zero range (all authors share one value) would divide by zero; fall back
# to a span of 1 so those points land on -1 instead of becoming NaN.
x_span = (x_max - x_min) or 1.0
y_span = (y_max - y_min) or 1.0

top_20_authors = top_20_authors.with_columns([
    ((pl.col("publications_log") - x_min) / x_span * 2 - 1).alias("x_viz"),
    ((pl.col("mncs_log") - y_min) / y_span * 2 - 1).alias("y_viz"),
])

plt.figure(figsize=(10, 10))

plt.scatter(
    top_20_authors["x_viz"],
    top_20_authors["y_viz"],
    s=100,
    color='skyblue',
    edgecolor='k',
    zorder=2
)

# Label colour encodes the quadrant of the rescaled plane.
texts = []
for name, x, y in zip(
    top_20_authors["name"],
    top_20_authors["x_viz"],
    top_20_authors["y_viz"]
):
    if y >= 0 and x >= 0:
        color = "green"
    elif y >= 0 and x < 0:
        color = "orange"
    elif y < 0 and x < 0:
        color = "red"
    else:
        color = "purple"

    texts.append(plt.text(x, y, name, fontsize=10, fontweight="bold", color=color))

# Move overlapping labels apart and connect them to their points.
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='gray', alpha=1))

plt.axvline(0, color="black", linewidth=1)
plt.axhline(0, color="black", linewidth=1)

plt.grid(True, linestyle='--', alpha=0.5)

plt.xlim(-1.1, 1.1)
plt.ylim(-1.1, 1.1)
plt.xlabel("Total Publications (log, scaled)")
plt.ylabel("Mean MNCS (log, scaled)")

plt.title("Top 20 Q1 Authors : Publications vs Mean MNCS")

plt.tight_layout()
plt.show()

In [None]:

# Quadrant scatter of the top-20 BJP authors: publications (x) against mean
# MNCS (y), both log-transformed and rescaled to [-1, 1].

# The aggregated frame holds the summed `mncs` and the publication `count`
# per author; the mean is derived here so the cell does not rely on a
# `mncs_mean` column that the aggregation may not provide.
top_20_authors_bjp = top_20_authors_bjp.with_columns(
    (pl.col("mncs") / pl.col("count")).alias("mncs_mean")
)

top_20_authors_bjp = top_20_authors_bjp.with_columns(
    pl.col("count").log().alias("publications_log"),
    pl.col("mncs_mean").log().alias("mncs_log"),
)

pub_log = top_20_authors_bjp["publications_log"].to_numpy()
x_min = np.min(pub_log)
x_max = np.max(pub_log)

mncs_log = top_20_authors_bjp["mncs_log"].to_numpy()
y_min = np.min(mncs_log)
y_max = np.max(mncs_log)

# A zero range (all authors share one value) would divide by zero; fall back
# to a span of 1 so those points land on -1 instead of becoming NaN.
x_span = (x_max - x_min) or 1.0
y_span = (y_max - y_min) or 1.0

top_20_authors_bjp = top_20_authors_bjp.with_columns([
    ((pl.col("publications_log") - x_min) / x_span * 2 - 1).alias("x_viz"),
    ((pl.col("mncs_log") - y_min) / y_span * 2 - 1).alias("y_viz"),
])

plt.figure(figsize=(10, 10))

plt.scatter(
    top_20_authors_bjp["x_viz"],
    top_20_authors_bjp["y_viz"],
    s=100,
    color='skyblue',
    edgecolor='k',
    zorder=2
)

# Label colour encodes the quadrant of the rescaled plane.
texts = []
for name, x, y in zip(
    top_20_authors_bjp["name"],
    top_20_authors_bjp["x_viz"],
    top_20_authors_bjp["y_viz"]
):
    if y >= 0 and x >= 0:
        color = "green"
    elif y >= 0 and x < 0:
        color = "orange"
    elif y < 0 and x < 0:
        color = "red"
    else:
        color = "purple"

    texts.append(plt.text(x, y, name, fontsize=10, fontweight="bold", color=color))

# Move overlapping labels apart and connect them to their points.
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='gray', alpha=1))

plt.axvline(0, color="black", linewidth=1)
plt.axhline(0, color="black", linewidth=1)

plt.grid(True, linestyle='--', alpha=0.5)

plt.xlim(-1.1, 1.1)
plt.ylim(-1.1, 1.1)
plt.xlabel("Total Publications (log, scaled)")
plt.ylabel("Mean MNCS (log, scaled)")

plt.title("Top 20 BJP Authors : Publications vs Mean MNCS")

plt.tight_layout()
plt.show()

In [None]:
import time


def fetch_authors(ids, batch_size=90, max_retries=3):
    """Fetch author records from OpenAlex in batches, retrying failed batches.

    Args:
        ids: Sequence of author identifiers to look up.
        batch_size: Number of identifiers sent per request.
        max_retries: Number of attempts made for each batch.

    Returns:
        List of the non-None records returned for the batches that succeeded.
        A batch that fails on every attempt is reported on stdout and left
        out of the result; no exception is raised.
    """
    results = []
    total = len(ids)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        batch = ids[start:stop]
        succeeded = False
        attempt = 0
        while attempt < max_retries and not succeeded:
            try:
                fetched = list(alex.Authors()[batch])
                # Empty entries in the response are dropped.
                results.extend(a for a in fetched if a is not None)
                print(f"Got{min(stop, total)}/{total} authors")
                succeeded = True
            except Exception as e:
                print(f" Error (attempt {attempt+1}) : {e}")
                time.sleep(2)  # pause before the next attempt
            attempt += 1
        if not succeeded:
            print(f" Failure after {max_retries} retries for batch {start}:{stop}")
    return results

# For every publication year of the Q1 works: rank authors by the summed
# citations of their works that year, keep the first 100 and attach the
# name and overall citation count returned by OpenAlex.
all_results = []

for year in sorted(works["year"].unique().to_list()):
    print(f"\nYear {year}...")

    # One row per (work, filled author slot) for this year.
    df_y = (
        works
        .filter(pl.col("year") == year)
        .select("year", "cited_by_count", *cols_authors)
        .unpivot(
            index=["year", "cited_by_count"],
            on=cols_authors,
            variable_name="author_pos",
            value_name="author_id",
        )
        .drop_nulls("author_id")
    )

    top_authors_y = (
        df_y
        .group_by("author_id", maintain_order=True)
        .agg(total_citations=pl.col("cited_by_count").sum())
        .sort("total_citations", descending=True)
        .head(100)
    )

    author_ids = top_authors_y["author_id"].to_list()
    authors_info = fetch_authors(author_ids, batch_size=90)

    # Index the fetched records by id; non-subscriptable entries are ignored.
    id_to_info = {}
    for a in authors_info:
        try:
            id_to_info[a["id"]] = {
                "name": a.get("display_name", "Unknown"),
                "citations": a.get("cited_by_count", 0),
            }
        except TypeError:
            pass

    # Authors absent from the response get "Unknown" and 0.
    top_authors_y = top_authors_y.with_columns(
        name=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("name", "Unknown"),
            return_dtype=pl.Utf8,
        ),
        author_total_citations=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("citations", 0),
            return_dtype=pl.Int64,
        ),
        year=pl.lit(year),
    )

    all_results.append(top_authors_y)

top_authors_per_year = pl.concat(all_results)



In [None]:
# For every publication year of the BJP works: rank authors by the summed
# citations of their works that year, keep the first 100 and attach the
# name and overall citation count returned by OpenAlex.
all_results = []

for year in sorted(works_bjp["year"].unique().to_list()):
    print(f"\nYear {year}...")

    # One row per (work, filled author slot) for this year.
    df_y = (
        works_bjp
        .filter(pl.col("year") == year)
        .select("year", "cited_by_count", *bjp_cols_authors)
        .unpivot(
            index=["year", "cited_by_count"],
            on=bjp_cols_authors,
            variable_name="author_pos",
            value_name="author_id",
        )
        .drop_nulls("author_id")
    )

    top_authors_y = (
        df_y
        .group_by("author_id", maintain_order=True)
        .agg(total_citations=pl.col("cited_by_count").sum())
        .sort("total_citations", descending=True)
        .head(100)
    )

    author_ids = top_authors_y["author_id"].to_list()
    authors_info = fetch_authors(author_ids, batch_size=90)

    # Index the fetched records by id; non-subscriptable entries are ignored.
    id_to_info = {}
    for a in authors_info:
        try:
            id_to_info[a["id"]] = {
                "name": a.get("display_name", "Unknown"),
                "citations": a.get("cited_by_count", 0),
            }
        except TypeError:
            pass

    # Authors absent from the response get "Unknown" and 0.
    top_authors_y = top_authors_y.with_columns(
        name=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("name", "Unknown"),
            return_dtype=pl.Utf8,
        ),
        author_total_citations=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("citations", 0),
            return_dtype=pl.Int64,
        ),
        year=pl.lit(year),
    )

    all_results.append(top_authors_y)

top_authors_per_year_bjp = pl.concat(all_results)


In [None]:
import gender_guesser.detector as gender

# Module-level gender_guesser detector; build_gender_df_per_year calls
# d.get_gender() on each extracted first name.
d = gender.Detector()

def normalize_gender(g):
    """Collapse a detector label into "male", "female" or "unknown".

    "male"/"mostly_male" map to "male", "female"/"mostly_female" map to
    "female", and every other value maps to "unknown".
    """
    for label in ("male", "female"):
        if g in (label, "mostly_" + label):
            return label
    return "unknown"
    
import re


def extract_first_names(names):
    """Return the usable first names found in a list of author name entries.

    Each entry is either a string (only the part before the first "&" is
    used) or a non-empty list (only its first element is used). The first
    whitespace-separated token of that author is taken as the first name.

    Entries are skipped, not reported, when they are of another type, when
    the author string is empty or blank, or when the first token is an
    initial ("J", "J.", "J.K.") or written entirely in upper case.

    Args:
        names: Iterable of author name entries (str, list, or anything else).

    Returns:
        List of first-name strings in input order; it may be shorter than
        the input.
    """
    first_names = []

    for n in names:
        if isinstance(n, str):
            first_author = n.split("&")[0]
        elif isinstance(n, list) and len(n) > 0:
            first_author = n[0]
        else:
            continue

        if not isinstance(first_author, str):
            continue

        # split() on an empty or blank string yields no tokens; indexing [0]
        # there would raise IndexError, so such entries are skipped.
        tokens = first_author.split()
        if not tokens:
            continue
        first_name = tokens[0]

        # Initials and all-caps tokens cannot be matched to a given name.
        if re.fullmatch(r"([A-Z]\.)+", first_name):
            continue
        if re.fullmatch(r"[A-Z]", first_name):
            continue
        if first_name.isupper():
            continue

        first_names.append(first_name)

    return first_names



def build_gender_df_per_year(df):
    """Build one row per usable first name with its inferred gender, by year.

    Args:
        df: pandas DataFrame with a "year" column and a "name" column.

    Returns:
        pandas DataFrame with columns "year", "gender" and "first_name".
        Names are taken from extract_first_names, classified with the
        module-level detector `d` and collapsed by normalize_gender; a falsy
        name is labelled "unknown".
    """
    records = [
        {
            "year": year,
            "gender": normalize_gender(d.get_gender(first)) if first else "unknown",
            "first_name": first,
        }
        for year, group in df.groupby("year")
        for first in extract_first_names(group["name"].to_list())
    ]
    return pd.DataFrame(records)




In [None]:
# Yearly share of each inferred gender among the ranked Q1 authors.
df_top_authors_pd = top_authors_per_year.to_pandas()
df_gender_all = build_gender_df_per_year(df_top_authors_pd)

# Rows per (year, gender), and rows per year as the denominator.
gender_counts = df_gender_all.groupby(["year", "gender"]).size().reset_index(name="count")
total_per_year = df_gender_all.groupby("year").size().reset_index(name="total")
gender_counts = (
    gender_counts
    .merge(total_per_year, on="year")
    .assign(percent=lambda t: 100 * t["count"] / t["total"])
)

gender_palette = {"male": "#3944f3", "female": "#f39cdb", "unknown": "#8c8c8c"}

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=gender_counts, x="year", y="percent", hue="gender", palette=gender_palette, ax=ax)
ax.set_title("Evolution of Gender Distribution Among Top 100 Cited Authors per Year - Q1 Journals")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Year")
ax.set_ylim(0, 100)
ax.set_xlim(1950, 2024)
ax.legend(title="Gender")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()


In [None]:
# Yearly share of each inferred gender among the ranked BJP authors.
df_top_authors_pd = top_authors_per_year_bjp.to_pandas()
df_gender_all = build_gender_df_per_year(df_top_authors_pd)

# Rows per (year, gender), and rows per year as the denominator.
gender_counts = df_gender_all.groupby(["year", "gender"]).size().reset_index(name="count")
total_per_year = df_gender_all.groupby("year").size().reset_index(name="total")
gender_counts = (
    gender_counts
    .merge(total_per_year, on="year")
    .assign(percent=lambda t: 100 * t["count"] / t["total"])
)

gender_palette = {"male": "#3944f3", "female": "#f39cdb", "unknown": "#8c8c8c"}

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=gender_counts, x="year", y="percent", hue="gender", palette=gender_palette, ax=ax)
ax.set_title("Evolution of Gender Distribution Among Top 100 Cited Authors per Year - BJP Journal")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Year")
ax.set_ylim(0, 100)
ax.set_xlim(1968, 2024)
ax.legend(title="Gender")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()


In [None]:

# For every publication year of the Q1 works: rank authors by the number of
# works they appear on that year, keep the first 100 and attach the name and
# overall citation count returned by OpenAlex.
all_results = []

for year in sorted(works["year"].unique().to_list()):
    print(f"\nYear {year}...")

    # One row per (work, filled author slot) for this year.
    df_y = (
        works
        .filter(pl.col("year") == year)
        .select("year", "cited_by_count", *cols_authors)
        .unpivot(
            index=["year", "cited_by_count"],
            on=cols_authors,
            variable_name="author_pos",
            value_name="author_id",
        )
        .drop_nulls("author_id")
    )

    top_authors_y = (
        df_y
        .group_by("author_id", maintain_order=True)
        .agg(total_publications=pl.len())
        .sort("total_publications", descending=True)
        .head(100)
    )

    author_ids = top_authors_y["author_id"].to_list()
    authors_info = fetch_authors(author_ids, batch_size=90)

    # Index the fetched records by id; non-subscriptable entries are ignored.
    id_to_info = {}
    for a in authors_info:
        try:
            id_to_info[a["id"]] = {
                "name": a.get("display_name", "Unknown"),
                "citations": a.get("cited_by_count", 0),
            }
        except TypeError:
            pass

    # Authors absent from the response get "Unknown" and 0.
    top_authors_y = top_authors_y.with_columns(
        name=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("name", "Unknown"),
            return_dtype=pl.Utf8,
        ),
        author_total_citations=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("citations", 0),
            return_dtype=pl.Int64,
        ),
        year=pl.lit(year),
    )

    all_results.append(top_authors_y)

top_authors_per_year = pl.concat(all_results)



In [None]:

# For every publication year of the BJP works: rank authors by the number of
# works they appear on that year, keep the first 100 and attach the name and
# overall citation count returned by OpenAlex.
all_results = []

for year in sorted(works_bjp["year"].unique().to_list()):
    print(f"\nYear {year}...")

    # One row per (work, filled author slot) for this year.
    df_y = (
        works_bjp
        .filter(pl.col("year") == year)
        .select("year", "cited_by_count", *bjp_cols_authors)
        .unpivot(
            index=["year", "cited_by_count"],
            on=bjp_cols_authors,
            variable_name="author_pos",
            value_name="author_id",
        )
        .drop_nulls("author_id")
    )

    top_authors_y = (
        df_y
        .group_by("author_id", maintain_order=True)
        .agg(total_publications=pl.len())
        .sort("total_publications", descending=True)
        .head(100)
    )

    author_ids = top_authors_y["author_id"].to_list()
    authors_info = fetch_authors(author_ids, batch_size=90)

    # Index the fetched records by id; non-subscriptable entries are ignored.
    id_to_info = {}
    for a in authors_info:
        try:
            id_to_info[a["id"]] = {
                "name": a.get("display_name", "Unknown"),
                "citations": a.get("cited_by_count", 0),
            }
        except TypeError:
            pass

    # Authors absent from the response get "Unknown" and 0.
    top_authors_y = top_authors_y.with_columns(
        name=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("name", "Unknown"),
            return_dtype=pl.Utf8,
        ),
        author_total_citations=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("citations", 0),
            return_dtype=pl.Int64,
        ),
        year=pl.lit(year),
    )

    all_results.append(top_authors_y)

top_authors_per_year_bjp = pl.concat(all_results)


In [None]:
# Fresh detector for this cell; build_gender_df_per_year reads it as `d`.
d = gender.Detector()

# Yearly share of each inferred gender among the ranked Q1 authors.
df_top_authors_pd = top_authors_per_year.to_pandas()
df_gender_all = build_gender_df_per_year(df_top_authors_pd)

# Rows per (year, gender), and rows per year as the denominator.
gender_counts = df_gender_all.groupby(["year", "gender"]).size().reset_index(name="count")
total_per_year = df_gender_all.groupby("year").size().reset_index(name="total")
gender_counts = (
    gender_counts
    .merge(total_per_year, on="year")
    .assign(percent=lambda t: 100 * t["count"] / t["total"])
)

gender_palette = {"male": "#3944f3", "female": "#f39cdb", "unknown": "#8c8c8c"}

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=gender_counts, x="year", y="percent", hue="gender", palette=gender_palette, ax=ax)
ax.set_title("Evolution of Gender Distribution Among Top 100 Active Authors per Year - Q1 Journals")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Year")
ax.set_ylim(0, 100)
ax.set_xlim(1950, 2024)
ax.legend(title="Gender")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()



In [None]:
# Fresh detector for this cell; build_gender_df_per_year reads it as `d`.
d = gender.Detector()

# Yearly share of each inferred gender among the ranked BJP authors.
df_top_authors_pd = top_authors_per_year_bjp.to_pandas()
df_gender_all = build_gender_df_per_year(df_top_authors_pd)

# Rows per (year, gender), and rows per year as the denominator.
gender_counts = df_gender_all.groupby(["year", "gender"]).size().reset_index(name="count")
total_per_year = df_gender_all.groupby("year").size().reset_index(name="total")
gender_counts = (
    gender_counts
    .merge(total_per_year, on="year")
    .assign(percent=lambda t: 100 * t["count"] / t["total"])
)

gender_palette = {"male": "#3944f3", "female": "#f39cdb", "unknown": "#8c8c8c"}

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=gender_counts, x="year", y="percent", hue="gender", palette=gender_palette, ax=ax)
ax.set_title("Evolution of Gender Distribution Among Top 100 Active Authors per Year - BJP Journal")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Year")
ax.set_ylim(0, 100)
ax.set_xlim(1968, 2024)
ax.legend(title="Gender")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()



In [None]:
# For every publication year of the Q1 works: rank authors by the summed
# citations of their works that year, keep the first 1000 and attach the
# name and overall citation count returned by OpenAlex.
all_results = []

for year in sorted(works["year"].unique().to_list()):
    print(f"\nYear {year}...")

    # One row per (work, filled author slot) for this year.
    df_y = (
        works
        .filter(pl.col("year") == year)
        .select("year", "cited_by_count", *cols_authors)
        .unpivot(
            index=["year", "cited_by_count"],
            on=cols_authors,
            variable_name="author_pos",
            value_name="author_id",
        )
        .drop_nulls("author_id")
    )

    top_authors_y = (
        df_y
        .group_by("author_id", maintain_order=True)
        .agg(total_citations=pl.col("cited_by_count").sum())
        .sort("total_citations", descending=True)
        .head(1000)  # top 1000 authors because of time issues
    )

    author_ids = top_authors_y["author_id"].to_list()
    authors_info = fetch_authors(author_ids, batch_size=90)

    # Index the fetched records by id; non-subscriptable entries are ignored.
    id_to_info = {}
    for a in authors_info:
        try:
            id_to_info[a["id"]] = {
                "name": a.get("display_name", "Unknown"),
                "citations": a.get("cited_by_count", 0),
            }
        except TypeError:
            pass

    # Authors absent from the response get "Unknown" and 0.
    top_authors_y = top_authors_y.with_columns(
        name=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("name", "Unknown"),
            return_dtype=pl.Utf8,
        ),
        author_total_citations=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("citations", 0),
            return_dtype=pl.Int64,
        ),
        year=pl.lit(year),
    )

    all_results.append(top_authors_y)

top_authors_per_year = pl.concat(all_results)



In [None]:
# For every publication year of the BJP works: rank authors by the summed
# citations of their works that year, keep the first 1000 and attach the
# name and overall citation count returned by OpenAlex.
all_results = []

for year in sorted(works_bjp["year"].unique().to_list()):
    print(f"\nYear {year}...")

    # One row per (work, filled author slot) for this year.
    df_y = (
        works_bjp
        .filter(pl.col("year") == year)
        .select("year", "cited_by_count", *bjp_cols_authors)
        .unpivot(
            index=["year", "cited_by_count"],
            on=bjp_cols_authors,
            variable_name="author_pos",
            value_name="author_id",
        )
        .drop_nulls("author_id")
    )

    top_authors_y = (
        df_y
        .group_by("author_id", maintain_order=True)
        .agg(total_citations=pl.col("cited_by_count").sum())
        .sort("total_citations", descending=True)
        .head(1000)  # top 1000 authors because of time issues
    )

    author_ids = top_authors_y["author_id"].to_list()
    authors_info = fetch_authors(author_ids, batch_size=90)

    # Index the fetched records by id; non-subscriptable entries are ignored.
    id_to_info = {}
    for a in authors_info:
        try:
            id_to_info[a["id"]] = {
                "name": a.get("display_name", "Unknown"),
                "citations": a.get("cited_by_count", 0),
            }
        except TypeError:
            pass

    # Authors absent from the response get "Unknown" and 0.
    top_authors_y = top_authors_y.with_columns(
        name=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("name", "Unknown"),
            return_dtype=pl.Utf8,
        ),
        author_total_citations=pl.col("author_id").map_elements(
            lambda aid: id_to_info.get(aid, {}).get("citations", 0),
            return_dtype=pl.Int64,
        ),
        year=pl.lit(year),
    )

    all_results.append(top_authors_y)

top_authors_per_year_bjp = pl.concat(all_results)


In [None]:
# Fresh detector for this cell; build_gender_df_per_year reads it as `d`.
d = gender.Detector()

# Yearly share of each inferred gender among the ranked Q1 authors.
df_top_authors_pd = top_authors_per_year.to_pandas()
df_gender_all = build_gender_df_per_year(df_top_authors_pd)

# Rows per (year, gender), and rows per year as the denominator.
gender_counts = df_gender_all.groupby(["year", "gender"]).size().reset_index(name="count")
total_per_year = df_gender_all.groupby("year").size().reset_index(name="total")
gender_counts = (
    gender_counts
    .merge(total_per_year, on="year")
    .assign(percent=lambda t: 100 * t["count"] / t["total"])
)

gender_palette = {"male": "#3944f3", "female": "#f39cdb", "unknown": "#8c8c8c"}

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=gender_counts, x="year", y="percent", hue="gender", palette=gender_palette, ax=ax)
ax.set_title("Evolution of Gender Distribution Among Top 1000 Cited Authors per Year - Q1 Journals")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Year")
ax.set_ylim(0, 100)
ax.set_xlim(1950, 2024)
ax.legend(title="Gender")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()



In [None]:
# Fresh detector for this cell; build_gender_df_per_year reads it as `d`.
d = gender.Detector()

# Yearly share of each inferred gender among the ranked BJP authors.
df_top_authors_pd = top_authors_per_year_bjp.to_pandas()
df_gender_all = build_gender_df_per_year(df_top_authors_pd)

# Rows per (year, gender), and rows per year as the denominator.
gender_counts = df_gender_all.groupby(["year", "gender"]).size().reset_index(name="count")
total_per_year = df_gender_all.groupby("year").size().reset_index(name="total")
gender_counts = (
    gender_counts
    .merge(total_per_year, on="year")
    .assign(percent=lambda t: 100 * t["count"] / t["total"])
)

gender_palette = {"male": "#3944f3", "female": "#f39cdb", "unknown": "#8c8c8c"}

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=gender_counts, x="year", y="percent", hue="gender", palette=gender_palette, ax=ax)
ax.set_title("Evolution of Gender Distribution Among Top 1000 Cited Authors per Year - BJP Journal")
ax.set_ylabel("Percentage (%)")
ax.set_xlabel("Year")
ax.set_ylim(0, 100)
ax.set_xlim(1968, 2024)
ax.legend(title="Gender")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()



In [None]:
import itertools
import networkx as nx
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# (author_a, author_b) -> number of works the two co-signed (pair stored in sorted order).
pairs_counter = {}
# author -> number of co-author links, summed over every work the author appears in.
author_collab_totals = {}

for index_work, group in authors.group_by("index_work"):
    # Null author names cannot be ordered against strings by sorted(); drop them
    # so a single missing name does not abort the whole pass.
    author_list = sorted(group["author_name"].drop_nulls().to_list())
    if len(author_list) < 2:
        continue
    for pair in itertools.combinations(author_list, 2):
        pairs_counter[pair] = pairs_counter.get(pair, 0) + 1
        author_collab_totals[pair[0]] = author_collab_totals.get(pair[0], 0) + 1
        author_collab_totals[pair[1]] = author_collab_totals.get(pair[1], 0) + 1

# Keep the 25 authors with the most co-author links.
top_authors = sorted(author_collab_totals.items(), key=lambda x: x[1], reverse=True)[:25]
top_authors = [a for a, _ in top_authors]

# Only string identifiers can be sent to the API.
author_ids = [aid for aid in top_authors if isinstance(aid, str)]

def fetch_authors(ids, batch_size=50):
    """Fetch author records from OpenAlex (via pyalex) in batches.

    Parameters
    ----------
    ids : list
        Author identifiers to look up.
    batch_size : int, default 50
        Number of identifiers sent per request.
        NOTE(review): pyalex is believed to reject lists longer than 100 ids — confirm.

    Returns
    -------
    list
        Concatenation of the records returned for every batch. Callers in this
        notebook index the records by their ``"id"`` field rather than relying
        on the order of the result.
    """
    results = []
    for i in range(0, len(ids), batch_size):
        batch = ids[i:i+batch_size]
        res = list(alex.Authors()[batch])
        results.extend(res)
    return results

authors_info = fetch_authors(author_ids)

# Map each returned author id to display name plus the institution id / country
# code of the FIRST entry in its `affiliations` list (None when absent).
id_to_info = {}
for a in authors_info:
    inst_id, country = None, None
    if a.get("affiliations"):
        inst = a["affiliations"][0].get("institution")
        if inst:
            inst_id = inst.get("id")
            country = inst.get("country_code")
    id_to_info[a["id"]] = {
        "name": a.get("display_name"),
        "institution": inst_id,
        "country": country
    }

# One row per top author; ids missing from the API response get null metadata.
top_25_authors_df = pl.DataFrame({"author_id": author_ids})
top_25_authors_df = top_25_authors_df.with_columns([
    pl.col("author_id").map_elements(lambda aid: id_to_info.get(aid, {}).get("name"), return_dtype=pl.Utf8).alias("name"),
    pl.col("author_id").map_elements(lambda aid: id_to_info.get(aid, {}).get("institution"), return_dtype=pl.Utf8).alias("institution"),
    pl.col("author_id").map_elements(lambda aid: id_to_info.get(aid, {}).get("country"), return_dtype=pl.Utf8).alias("country"),
])




In [None]:

top_25_ids = set(top_25_authors_df["author_id"].to_list())

# Undirected co-authorship graph restricted to pairs where both ends are top-25 authors.
G = nx.Graph()
for (a1, a2), count in pairs_counter.items():
    if a1 in top_25_ids and a2 in top_25_ids and a1 != a2:
        G.add_edge(a1, a2, weight=count)

if len(G.nodes) > 0 and len(G.edges) > 0:
    # Node size: weighted degree, min-max scaled into [200, 5000].
    # The 1e-9 keeps the division defined when every node has the same degree.
    node_vals = np.array([deg for node, deg in G.degree(weight='weight')])
    node_sizes = 200 + (node_vals - node_vals.min()) / (node_vals.max() - node_vals.min() + 1e-9) * (5000 - 200)

    # Edge width: number of shared works, min-max scaled into [0.5, 20].
    edge_vals = np.array([G[u][v]['weight'] for u, v in G.edges()])
    edge_weights = 0.5 + (edge_vals - edge_vals.min()) / (edge_vals.max() - edge_vals.min() + 1e-9) * (20 - 0.5)

    countries = top_25_authors_df["country"].to_list()
    # Sorted so that the country -> colour assignment is the same on every run
    # (iteration order of a set of strings varies between interpreter sessions).
    unique_countries = sorted({c for c in countries if c is not None})
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is still available.
    cmap = plt.get_cmap("tab20")
    color_cycle = cmap.colors if hasattr(cmap, "colors") else [cmap(i) for i in range(len(unique_countries))]
    country_to_color = {c: color_cycle[i % len(color_cycle)] for i, c in enumerate(unique_countries)}

    # Authors without a known country are drawn in light grey.
    node_colors = [country_to_color.get(id_to_info.get(node, {}).get("country"), (0.8, 0.8, 0.8))
                   for node in G.nodes()]

    pos = nx.spring_layout(G, k=1.1, weight='weight', seed=42)

    fig, ax = plt.subplots(figsize=(13, 11))
    nx.draw_networkx_edges(G, pos, ax=ax, width=edge_weights, edge_color='lightgray', alpha=0.5)
    nx.draw_networkx_nodes(G, pos, ax=ax, node_size=node_sizes, node_color=node_colors, alpha=0.9,
                           edgecolors='black', linewidths=0.5)

    # Fall back to the raw id when the display name is missing or None.
    labels = {node: id_to_info.get(node, {}).get("name") or node for node in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels=labels, ax=ax, font_size=9, font_weight='bold')

    # Empty scatter handles exist only to populate the country legend.
    for country, color in country_to_color.items():
        ax.scatter([], [], c=[color], label=country, s=200)
    ax.legend(title="Country", loc="upper right", fontsize=9)

    ax.set_title("Top 25 Authors Collaboration Network - Q1 Journals", fontsize=16, pad=20)
    ax.axis("off")
    plt.tight_layout()
    plt.show()


In [None]:
# (author_a, author_b) -> number of works the two co-signed (pair stored in sorted order).
pairs_counter = {}
# author -> number of co-author links, summed over every work the author appears in.
author_collab_totals = {}

for index_work, group in authors_bjp.group_by("index_work"):
    # Null author names cannot be ordered against strings by sorted(); drop them
    # so a single missing name does not abort the whole pass.
    author_list = sorted(group["author_name"].drop_nulls().to_list())
    if len(author_list) < 2:
        continue
    for pair in itertools.combinations(author_list, 2):
        pairs_counter[pair] = pairs_counter.get(pair, 0) + 1
        author_collab_totals[pair[0]] = author_collab_totals.get(pair[0], 0) + 1
        author_collab_totals[pair[1]] = author_collab_totals.get(pair[1], 0) + 1

# Keep the 25 authors with the most co-author links.
top_authors = sorted(author_collab_totals.items(), key=lambda x: x[1], reverse=True)[:25]
top_authors = [a for a, _ in top_authors]

# Only string identifiers can be sent to the API.
author_ids = [aid for aid in top_authors if isinstance(aid, str)]

# `fetch_authors` is already defined in the Q1 collaboration cell (the same cell
# that imports itertools, which this cell also needs); the byte-identical
# re-definition that used to sit here only shadowed it.
authors_info = fetch_authors(author_ids)

# Map each returned author id to display name plus the institution id / country
# code of the FIRST entry in its `affiliations` list (None when absent).
id_to_info = {}
for a in authors_info:
    inst_id, country = None, None
    if a.get("affiliations"):
        inst = a["affiliations"][0].get("institution")
        if inst:
            inst_id = inst.get("id")
            country = inst.get("country_code")
    id_to_info[a["id"]] = {
        "name": a.get("display_name"),
        "institution": inst_id,
        "country": country
    }

# One row per top author; ids missing from the API response get null metadata.
top_25_authors_df = pl.DataFrame({"author_id": author_ids})
top_25_authors_df = top_25_authors_df.with_columns([
    pl.col("author_id").map_elements(lambda aid: id_to_info.get(aid, {}).get("name"), return_dtype=pl.Utf8).alias("name"),
    pl.col("author_id").map_elements(lambda aid: id_to_info.get(aid, {}).get("institution"), return_dtype=pl.Utf8).alias("institution"),
    pl.col("author_id").map_elements(lambda aid: id_to_info.get(aid, {}).get("country"), return_dtype=pl.Utf8).alias("country"),
])




In [None]:
top_25_ids = set(top_25_authors_df["author_id"].to_list())

# Undirected co-authorship graph restricted to pairs where both ends are top-25 authors.
G = nx.Graph()
for (a1, a2), count in pairs_counter.items():
    if a1 in top_25_ids and a2 in top_25_ids and a1 != a2:
        G.add_edge(a1, a2, weight=count)

if len(G.nodes) > 0 and len(G.edges) > 0:
    # Node size: weighted degree, min-max scaled into [200, 5000].
    # The 1e-9 keeps the division defined when every node has the same degree.
    node_vals = np.array([deg for node, deg in G.degree(weight='weight')])
    node_sizes = 200 + (node_vals - node_vals.min()) / (node_vals.max() - node_vals.min() + 1e-9) * (5000 - 200)

    # Edge width: number of shared works, min-max scaled into [0.5, 20].
    edge_vals = np.array([G[u][v]['weight'] for u, v in G.edges()])
    edge_weights = 0.5 + (edge_vals - edge_vals.min()) / (edge_vals.max() - edge_vals.min() + 1e-9) * (20 - 0.5)

    countries = top_25_authors_df["country"].to_list()
    # Sorted so that the country -> colour assignment is the same on every run
    # (iteration order of a set of strings varies between interpreter sessions).
    unique_countries = sorted({c for c in countries if c is not None})
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is still available.
    cmap = plt.get_cmap("tab20")
    color_cycle = cmap.colors if hasattr(cmap, "colors") else [cmap(i) for i in range(len(unique_countries))]
    country_to_color = {c: color_cycle[i % len(color_cycle)] for i, c in enumerate(unique_countries)}

    # Authors without a known country are drawn in light grey.
    node_colors = [country_to_color.get(id_to_info.get(node, {}).get("country"), (0.8, 0.8, 0.8))
                   for node in G.nodes()]

    pos = nx.spring_layout(G, k=1.1, weight='weight', seed=42)

    fig, ax = plt.subplots(figsize=(13, 11))
    nx.draw_networkx_edges(G, pos, ax=ax, width=edge_weights, edge_color='lightgray', alpha=0.5)
    nx.draw_networkx_nodes(G, pos, ax=ax, node_size=node_sizes, node_color=node_colors, alpha=0.9,
                           edgecolors='black', linewidths=0.5)

    # Fall back to the raw id when the display name is missing or None.
    labels = {node: id_to_info.get(node, {}).get("name") or node for node in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels=labels, ax=ax, font_size=9, font_weight='bold')

    # Empty scatter handles exist only to populate the country legend.
    for country, color in country_to_color.items():
        ax.scatter([], [], c=[color], label=country, s=200)
    ax.legend(title="Country", loc="upper right", fontsize=9)

    ax.set_title("Top 25 Authors Collaboration Network - BJP Journal", fontsize=16, pad=20)
    ax.axis("off")
    plt.tight_layout()
    plt.show()


In [None]:
# For each (author, work) row, compute what the author had accumulated BEFORE
# that row: running total over the author's rows (in index_work order) minus
# the row's own contribution.
prior_total_specs = {
    "cumulative_count": "count",
    "cumulative_citations": "cited_by_count",
    "cumulative_mncs": "mncs",
}

authors = (
    authors
    .sort("index_work")
    .with_columns([
        (pl.col(source).cum_sum().over("author_name") - pl.col(source)).alias(target)
        for target, source in prior_total_specs.items()
    ])
)

# Mean MNCS of the author's earlier rows; rows with no earlier row get 1.
has_no_history = pl.col("cumulative_count") == 0
authors = authors.with_columns(
    mean_past_mncs_authors=(
        pl.when(has_no_history)
        .then(pl.lit(1))
        .otherwise(pl.col("cumulative_mncs") / pl.col("cumulative_count"))
    )
)

authors

In [None]:
# For each (author, work) row, compute what the author had accumulated BEFORE
# that row: running total over the author's rows (in index_work order) minus
# the row's own contribution.
prior_total_specs = {
    "cumulative_count": "count",
    "cumulative_citations": "cited_by_count",
    "cumulative_mncs": "mncs",
}

authors_bjp = (
    authors_bjp
    .sort("index_work")
    .with_columns([
        (pl.col(source).cum_sum().over("author_name") - pl.col(source)).alias(target)
        for target, source in prior_total_specs.items()
    ])
)

# Mean MNCS of the author's earlier rows; rows with no earlier row get 1.
has_no_history = pl.col("cumulative_count") == 0
authors_bjp = authors_bjp.with_columns(
    mean_past_mncs_authors=(
        pl.when(has_no_history)
        .then(pl.lit(1))
        .otherwise(pl.col("cumulative_mncs") / pl.col("cumulative_count"))
    )
)

authors_bjp

In [None]:
def authors_distinct_yearly_normalized(df_authors):
    """Count distinct authors per year and scale the counts by their maximum.

    Parameters
    ----------
    df_authors : pl.DataFrame
        Frame with a ``year`` column and an ``author_name`` column. Null author
        names are ignored.

    Returns
    -------
    years : list
        Distinct values of ``year`` in ascending order.
    authors_count : np.ndarray of float
        Number of distinct non-null ``author_name`` values for each year.
    authors_count_norm : np.ndarray of float
        ``authors_count`` divided by its maximum. When the maximum is not
        positive (including an empty input) the raw counts are returned as is.
    """
    # One grouped pass instead of re-filtering the whole frame once per year.
    per_year = (
        df_authors
        .group_by("year")
        .agg(pl.col("author_name").drop_nulls().n_unique().alias("n_authors"))
        .sort("year")
    )

    years = per_year["year"].to_list()
    authors_count = per_year["n_authors"].to_numpy().astype(float)

    # ndarray.max() raises on a zero-size array, so guard the empty case explicitly.
    peak = authors_count.max() if authors_count.size else 0.0
    authors_count_norm = authors_count / peak if peak > 0 else authors_count

    return years, authors_count, authors_count_norm


In [None]:
# Distinct authors per year for both corpora (raw counts are plotted; the
# normalized series are returned by the helper but not used in this figure).
years_q1, authors_q1_raw, authors_q1_norm = authors_distinct_yearly_normalized(authors)
years_bjp, authors_bjp_raw, authors_bjp_norm = authors_distinct_yearly_normalized(authors_bjp)

# Peak number of distinct Q1 authors and the year in which it occurs.
max_q1_val = authors_q1_raw.max()
max_q1_year = years_q1[int(authors_q1_raw.argmax())]

print(max_q1_val)
print(max_q1_year)
with aq.load_theme("scientific"):
    plt.figure(figsize=(10,6))

    plt.plot(years_q1, authors_q1_raw,
             color="#3944f3", label="Q1 distinct authors", linewidth=2)

    plt.plot(years_bjp, authors_bjp_raw,
             color="#f39c12", label="BJP distinct authors", linewidth=2)

    plt.xlabel("Year", fontsize=14)
    # The y axis was previously unlabeled; it shows the raw distinct-author counts.
    plt.ylabel("Number of distinct authors", fontsize=14)
    plt.title("Author Diversity Over Time", fontsize=16)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.3)
    plt.xlim(1950, 2024)
    plt.show()

# Same peak summary for BJP.
max_bjp_val = authors_bjp_raw.max()
max_bjp_year = years_bjp[int(authors_bjp_raw.argmax())]

print(max_bjp_val)
print(max_bjp_year)


In [None]:
# Summary statistics (mean, median, 2.5th and 97.5th percentiles) of each metric,
# computed per number of authors on a work.
cols_to_stats = ["mncs", "cited_by_count"]

agg_exprs = [
    expr
    for col in cols_to_stats
    for expr in (
        pl.mean(col).alias(f"{col}_mean"),
        pl.median(col).alias(f"{col}_median"),
        pl.col(col).quantile(0.025).alias(f"{col}_p2_5"),
        pl.col(col).quantile(0.975).alias(f"{col}_p97_5"),
    )
]

grouped_q1 = works.group_by("authors_count", maintain_order=True)
group_Nauthors_q1 = grouped_q1.agg(agg_exprs).sort("authors_count")

grouped_bjp = works_bjp.group_by("authors_count", maintain_order=True)
group_Nauthors_bjp = grouped_bjp.agg(agg_exprs).sort("authors_count")

print(group_Nauthors_q1.head())
print(group_Nauthors_bjp.head())

In [None]:
# Pull the per-team-size statistics out as NumPy arrays.
stat_columns = ("authors_count", "mncs_mean", "mncs_median", "mncs_p2_5", "mncs_p97_5")

authors_q1, mean_q1, median_q1, low_q1, high_q1 = (
    group_Nauthors_q1[name].to_numpy() for name in stat_columns
)
authors2_bjp, mean_bjp, median_bjp, low_bjp, high_bjp = (
    group_Nauthors_bjp[name].to_numpy() for name in stat_columns
)

# Only the mean curves are drawn; the median and percentile arrays extracted
# above are currently not plotted.
mean_curves = (
    (authors_q1, mean_q1, "#3944f3", "Q1 works mean"),
    (authors2_bjp, mean_bjp, "#f39c12", "BJP works mean"),
)

with aq.load_theme("scientific"):
    plt.figure(figsize=(7.2, 6))

    for team_sizes, mean_values, line_color, legend_label in mean_curves:
        plt.plot(team_sizes, mean_values, color=line_color, label=legend_label)

    plt.xlim(0, 20)
    plt.ylim(0, 5)
    plt.xticks(rotation=45, fontsize=16)
    plt.yticks(fontsize=16)
    plt.title("MNCS by Number of Authors - Q1 vs BJP", fontsize=20)
    plt.xlabel("Number of authors", fontsize=18)
    plt.ylabel("MNCS", fontsize=18)
    plt.legend(fontsize=14)
    plt.show()

In [None]:
# Linear (Pearson) and rank (Spearman) correlation between the number of
# authors of a Q1 work and its MNCS.
authors_count_values = works["authors_count"].to_numpy()
mncs_values = works["mncs"].to_numpy()

r_pearson, p_pearson = st.pearsonr(authors_count_values, mncs_values)
# `.3g` keeps very small p-values readable (e.g. 1.2e-45); `.3f` rounds them to 0.000.
print(f"Pearson: r = {r_pearson:.3f}, p-value = {p_pearson:.3g}")

r_spearman, p_spearman = st.spearmanr(authors_count_values, mncs_values)
print(f"Spearman: rho = {r_spearman:.3f}, p-value = {p_spearman:.3g}")

In [None]:
import re

# Sort the numbered columns by their integer suffix. A plain string sort puts
# author_10 before author_2, which made `pos` below disagree with the real
# author position as soon as there are 10 or more author columns.
cols_authors = sorted(
    (c for c in works.columns if re.match(r"author_\d+", c)),
    key=lambda c: int(re.match(r"author_(\d+)", c).group(1)),
)
cols_countries = sorted(
    (c for c in works.columns if re.match(r"country_\d+", c)),
    key=lambda c: int(re.match(r"country_(\d+)", c).group(1)),
)

base_cols = ["title", "year", "cited_by_count", "mncs"]
works_small = works.select(base_cols + cols_authors + cols_countries)

# Reshape wide -> long: one row per (work, author slot), keeping the slot rank
# in `pos` (1-based) and pairing the i-th author column with the i-th country column.
df_list = []

for i, author_col in enumerate(cols_authors):
    df_i = (
        works_small
        .select([
            *base_cols,
            pl.col(author_col).alias("author"),
            pl.col(cols_countries[i]).alias("country")
        ])
        .with_columns(pos=pl.lit(i + 1))
        .filter(pl.col("author").is_not_null())  # empty author slots are dropped
    )
    df_list.append(df_i)

authorsg = pl.concat(df_list)
# Constant 1 per row so that later sums of `count` give numbers of authorships.
authorsg = authorsg.with_columns(count=pl.lit(1, dtype=pl.Int8))

authorsg


In [None]:


# Sort the numbered columns by their integer suffix. A plain string sort puts
# author_10 before author_2, which made `pos` below disagree with the real
# author position as soon as there are 10 or more author columns.
# NOTE: this rebinds the notebook-wide `cols_authors` to the BJP column list.
cols_authors = sorted(
    (c for c in works_bjp.columns if re.match(r"author_\d+", c)),
    key=lambda c: int(re.match(r"author_(\d+)", c).group(1)),
)
cols_countries = sorted(
    (c for c in works_bjp.columns if re.match(r"country_\d+", c)),
    key=lambda c: int(re.match(r"country_(\d+)", c).group(1)),
)

base_cols = ["title", "year", "cited_by_count", "mncs"]
works_small = works_bjp.select(base_cols + cols_authors + cols_countries)

# Reshape wide -> long: one row per (work, author slot), keeping the slot rank
# in `pos` (1-based) and pairing the i-th author column with the i-th country column.
df_list = []

for i, author_col in enumerate(cols_authors):
    df_i = (
        works_small
        .select([
            *base_cols,
            pl.col(author_col).alias("author"),
            pl.col(cols_countries[i]).alias("country")
        ])
        .with_columns(pos=pl.lit(i + 1))
        .filter(pl.col("author").is_not_null())  # empty author slots are dropped
    )
    df_list.append(df_i)

authorsg_bjp = pl.concat(df_list)
# Constant 1 per row so that later sums of `count` give numbers of authorships.
authorsg_bjp = authorsg_bjp.with_columns(count=pl.lit(1, dtype=pl.Int8))

authorsg_bjp

In [None]:
import tqdm
import time
import os

# Per-decade co-authorship network metrics for the Q1 corpus.
# Results are checkpointed to SAVE_PATH after every decade so an interrupted
# run can resume without recomputing finished decades.
authorsg = authorsg.with_columns([
    pl.col("author").fill_null("Unknown"),
    pl.col("country").fill_null("Unknown")
])

authors_pd = authorsg.to_pandas()
authors_pd["decade"] = (authors_pd["year"] // 10) * 10
authors_pd = authors_pd[authors_pd["decade"] >= 1950]

decades = sorted(authors_pd["decade"].unique())
print(f"{len(decades)} Decades : {decades}")

SAVE_PATH = "results_network_metrics.csv" #save temp file to run in multiple sessions without losing progress

if os.path.exists(SAVE_PATH):
    df_results = pd.read_csv(SAVE_PATH)
    done_decades = set(df_results["decade"])
    print(f"{len(done_decades)} decades already done.")
else:
    df_results = pd.DataFrame()
    done_decades = set()

results = []

for decade in decades:
    if decade in done_decades:
        print(f"Decade {decade} already done")
        continue

    df_dec = authors_pd[authors_pd["decade"] == decade]
    # len(df_dec) counts (work, author) rows of the decade.
    print(f"\n{decade}, ({len(df_dec)} works)")
    t0 = time.time()

    # Count co-authored works for every unordered author pair.
    # NOTE(review): works are identified by title only, so distinct works that
    # share a title within a decade are merged into one author list.
    pairs_counter = {}
    authors_by_title = df_dec.groupby("title", sort=False)["author"]

    # One grouped pass instead of a boolean mask over the decade per title.
    # `tqdm` is bound to the module by `import tqdm`, hence tqdm.tqdm.
    for _, title_authors in tqdm.tqdm(authors_by_title, total=authors_by_title.ngroups,
                                      desc=f"→ Decade {decade}", leave=False):
        # Local name chosen so the notebook-wide `authors` DataFrame is not overwritten.
        work_authors = sorted(title_authors)
        if len(work_authors) >= 2:
            for key in itertools.combinations(work_authors, 2):
                pairs_counter[key] = pairs_counter.get(key, 0) + 1

    # No co-authored work in this decade: an empty edge table has no "source"
    # column and from_pandas_edgelist would raise, so skip the decade here.
    if not pairs_counter:
        continue

    edges = pd.DataFrame([{"source": a1, "target": a2, "weight": w}
                          for (a1, a2), w in pairs_counter.items()])
    G = nx.from_pandas_edgelist(edges, "source", "target", edge_attr="weight")

    total_nodes = G.number_of_nodes()

    largest = max(nx.connected_components(G), key=len)
    G_lcc = G.subgraph(largest)

    size_lcc = len(largest)
    ratio_lcc = size_lcc / total_nodes
    avg_degree = sum(dict(G.degree()).values()) / total_nodes
    density = nx.density(G)
    avg_clustering = nx.average_clustering(G)
    num_components = nx.number_connected_components(G)

    # Path-based metrics are only attempted on largest components under 3000
    # nodes (presumably to bound runtime); otherwise they stay None.
    avg_path_length = None
    diameter = None
    if size_lcc < 3000:
        # Best effort: a failure leaves the metric as None rather than aborting the run.
        try:
            avg_path_length = nx.average_shortest_path_length(G_lcc)
        except Exception:
            pass
        try:
            diameter = nx.diameter(G_lcc)
        except Exception:
            pass

    result = {
        "decade": decade,
        "total_nodes": total_nodes,
        "size_lcc": size_lcc,
        "ratio_lcc": ratio_lcc,
        "avg_degree": avg_degree,
        "density": density,
        "avg_clustering": avg_clustering,
        "num_components": num_components,
        "avg_path_length": avg_path_length,
        "diameter": diameter,
        "time_sec": round(time.time() - t0, 1)
    }

    results.append(result)
    df_results = pd.concat([df_results, pd.DataFrame([result])], ignore_index=True)

    # Checkpoint after every decade.
    df_results.to_csv(SAVE_PATH, index=False)
    print(f"{decade} traité ({result['time_sec']}s) — {total_nodes} nœuds, LCC={size_lcc}, ratio={ratio_lcc:.2%}")

print("\n Summuary:")
print(df_results.sort_values("decade"))



In [None]:

# (results column, figure title, optional transform applied before plotting)
metrics = [
    ("ratio_lcc", "Largest Connected Component (%)", lambda x: x * 100),
    ("size_lcc", "Size of Largest Connected Component", None),
    ("avg_degree", "Average Degree", None),
    ("density", "Network Density", None),
    ("avg_clustering", "Average Clustering Coefficient", None),
    ("num_components", "Number of Connected Components", None),
    ("avg_path_length", "Average Path Length (LCC only)", None),
    ("diameter", "Diameter (LCC only)", None),
]

# One figure per metric that is present in the results table.
for metric_col, metric_title, scale in metrics:
    if metric_col in df_results.columns:
        metric_values = df_results[metric_col]
        if scale:
            metric_values = scale(metric_values)

        plt.figure(figsize=(8, 5))
        plt.plot(df_results["decade"], metric_values, marker="o", linewidth=2, color="steelblue")

        plt.title(f"{metric_title} by Decade -Q1 Journals", fontsize=14, fontweight="bold")
        plt.xlabel("Decade")
        # The y label is the title without its parenthesised suffix.
        plt.ylabel(metric_title.split("(")[0].strip())
        plt.grid(True, linestyle="--", alpha=0.6)
        plt.tight_layout()
        plt.xlim(1950, 2020)
        plt.show()


In [None]:
import tqdm
import time
import os

# Per-decade co-authorship network metrics for the BJP corpus.
# Results are checkpointed to SAVE_PATH after every decade so an interrupted
# run can resume without recomputing finished decades.
authorsg_bjp = authorsg_bjp.with_columns([
    pl.col("author").fill_null("Unknown"),
    pl.col("country").fill_null("Unknown")
])

authors_pd = authorsg_bjp.to_pandas()
authors_pd["decade"] = (authors_pd["year"] // 10) * 10
authors_pd = authors_pd[authors_pd["decade"] >= 1950]

decades = sorted(authors_pd["decade"].unique())
print(f"{len(decades)} Decades : {decades}")

SAVE_PATH = "results_network_metrics_bjp.csv" #save temp file to run in multiple sessions without losing progress

if os.path.exists(SAVE_PATH):
    df_results = pd.read_csv(SAVE_PATH)
    done_decades = set(df_results["decade"])
    print(f"{len(done_decades)} decades already done.")
else:
    df_results = pd.DataFrame()
    done_decades = set()

results = []

for decade in decades:
    if decade in done_decades:
        print(f"Decade {decade} already done")
        continue

    df_dec = authors_pd[authors_pd["decade"] == decade]
    # len(df_dec) counts (work, author) rows of the decade.
    print(f"\n{decade}, ({len(df_dec)} works)")
    t0 = time.time()

    # Count co-authored works for every unordered author pair.
    # NOTE(review): works are identified by title only, so distinct works that
    # share a title within a decade are merged into one author list.
    pairs_counter = {}
    authors_by_title = df_dec.groupby("title", sort=False)["author"]

    # One grouped pass instead of a boolean mask over the decade per title.
    # `tqdm` is bound to the module by `import tqdm`, hence tqdm.tqdm.
    for _, title_authors in tqdm.tqdm(authors_by_title, total=authors_by_title.ngroups,
                                      desc=f"→ Decade {decade}", leave=False):
        # Local name chosen so the notebook-wide `authors` DataFrame is not overwritten.
        work_authors = sorted(title_authors)
        if len(work_authors) >= 2:
            for key in itertools.combinations(work_authors, 2):
                pairs_counter[key] = pairs_counter.get(key, 0) + 1

    # No co-authored work in this decade: an empty edge table has no "source"
    # column and from_pandas_edgelist would raise, so skip the decade here.
    if not pairs_counter:
        continue

    edges = pd.DataFrame([{"source": a1, "target": a2, "weight": w}
                          for (a1, a2), w in pairs_counter.items()])
    G = nx.from_pandas_edgelist(edges, "source", "target", edge_attr="weight")

    total_nodes = G.number_of_nodes()

    largest = max(nx.connected_components(G), key=len)
    G_lcc = G.subgraph(largest)

    size_lcc = len(largest)
    ratio_lcc = size_lcc / total_nodes
    avg_degree = sum(dict(G.degree()).values()) / total_nodes
    density = nx.density(G)
    avg_clustering = nx.average_clustering(G)
    num_components = nx.number_connected_components(G)

    # Path-based metrics are only attempted on largest components under 3000
    # nodes (presumably to bound runtime); otherwise they stay None.
    avg_path_length = None
    diameter = None
    if size_lcc < 3000:
        # Best effort: a failure leaves the metric as None rather than aborting the run.
        try:
            avg_path_length = nx.average_shortest_path_length(G_lcc)
        except Exception:
            pass
        try:
            diameter = nx.diameter(G_lcc)
        except Exception:
            pass

    result = {
        "decade": decade,
        "total_nodes": total_nodes,
        "size_lcc": size_lcc,
        "ratio_lcc": ratio_lcc,
        "avg_degree": avg_degree,
        "density": density,
        "avg_clustering": avg_clustering,
        "num_components": num_components,
        "avg_path_length": avg_path_length,
        "diameter": diameter,
        "time_sec": round(time.time() - t0, 1)
    }

    results.append(result)
    df_results = pd.concat([df_results, pd.DataFrame([result])], ignore_index=True)

    # Checkpoint after every decade.
    df_results.to_csv(SAVE_PATH, index=False)
    print(f"{decade} traité ({result['time_sec']}s) — {total_nodes} nœuds, LCC={size_lcc}, ratio={ratio_lcc:.2%}")

print("\n Summuary:")
print(df_results.sort_values("decade"))



In [None]:
import matplotlib.pyplot as plt

# (results column, figure title, optional transform applied before plotting)
metrics = [
    ("ratio_lcc", "Largest Connected Component (%)", lambda x: x * 100),
    ("size_lcc", "Size of Largest Connected Component", None),
    ("avg_degree", "Average Degree", None),
    ("density", "Network Density", None),
    ("avg_clustering", "Average Clustering Coefficient", None),
    ("num_components", "Number of Connected Components", None),
    ("avg_path_length", "Average Path Length (LCC only)", None),
    ("diameter", "Diameter (LCC only)", None),
]

# One figure per metric that is present in the results table.
for metric_col, metric_title, scale in metrics:
    if metric_col in df_results.columns:
        metric_values = df_results[metric_col]
        if scale:
            metric_values = scale(metric_values)

        plt.figure(figsize=(8, 5))
        plt.plot(df_results["decade"], metric_values, marker="o", linewidth=2, color="steelblue")

        plt.title(f"{metric_title} by Decade - BJP Journal", fontsize=14, fontweight="bold")
        plt.xlabel("Decade")
        # The y label is the title without its parenthesised suffix.
        plt.ylabel(metric_title.split("(")[0].strip())
        plt.grid(True, linestyle="--", alpha=0.6)
        plt.tight_layout()
        plt.xlim(1950, 2020)
        plt.show()
    
    

    



In [None]:
import bar_chart_race as bcr

print(works["year"].min(), works["year"].max())

# Derive the author columns from `works` itself: by this point the notebook-wide
# `cols_authors` has been rebound to the columns of `works_bjp`, which is only
# correct if both tables happen to share exactly the same author columns.
q1_cols_authors = [c for c in works.columns if re.match(r"author_\d+", c)]

# Long format: one row per (work, author) with the work's year and citations.
# Computed once and reused for both the ranking and the yearly series.
authors_long = (
    works
    .select(["year", *q1_cols_authors, "cited_by_count"])
    .unpivot(
        on=q1_cols_authors,
        index=["year", "cited_by_count"],
        variable_name="author_pos",
        value_name="author_id"
    )
    .filter(pl.col("author_id").is_not_null())
)

# Authors ranked by total citations over all their works.
authors_global = (
    authors_long
    .group_by("author_id")
    .agg(pl.sum("cited_by_count").alias("total_citations"))
    .sort("total_citations", descending=True)
)

top_authors_ids = authors_global.head(10)["author_id"].to_list()

authors_info = fetch_authors(top_authors_ids, batch_size=90)

id_to_name = {a["id"]: a.get("display_name", "Unknown") for a in authors_info if a is not None}

# Rows of the top authors only; ids not returned by the API are named "Unknown".
df_top = authors_long.filter(pl.col("author_id").is_in(top_authors_ids))
df_top = df_top.with_columns(
    pl.col("author_id").map_elements(lambda aid: id_to_name.get(aid, "Unknown"), return_dtype=pl.Utf8).alias("name")
)

# Citations gathered by each top author's works, per publication year.
df_yearly = (
    df_top.group_by(["year", "name"])
    .agg(pl.sum("cited_by_count").alias("value"))
    .sort(["year", "value"], descending=[False, True])
)

# Wide table: one row per year, one column per author.
# `on=` replaces the deprecated `columns=` keyword of DataFrame.pivot.
authors_pivot = df_yearly.pivot(
    on="name",
    index="year",
    values="value",
    aggregate_function="sum"
).fill_null(0).sort("year")

# bar_chart_race expects a datetime-like index.
authors_pivot_pd = authors_pivot.to_pandas().set_index("year")
authors_pivot_pd.index = pd.PeriodIndex(authors_pivot_pd.index, freq="Y").to_timestamp()

# Running totals so that bars grow over time.
authors_pivot_cumsum = authors_pivot_pd.cumsum()

bcr.bar_chart_race(
    df=authors_pivot_cumsum,
    filename='../results/q1/authors/top_authors_q1_cumulative.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=8,
    period_length=800,
    period_fmt='%Y',
    title='The changing ranks of the Top 10 authors in 2024 - Q1 by citations',
    bar_size=.95,
    interpolate_period=True
)

In [None]:



# Long format: one row per (work, author) with the work's year and citations.
# Computed once and reused for both the ranking and the yearly series.
# `bjp_cols_authors` is defined in an earlier cell.
authors_long = (
    works_bjp
    .select(["year", *bjp_cols_authors, "cited_by_count"])
    .unpivot(
        on=bjp_cols_authors,
        index=["year", "cited_by_count"],
        variable_name="author_pos",
        value_name="author_id"
    )
    .filter(pl.col("author_id").is_not_null())
)

# Authors ranked by total citations over all their works.
authors_global = (
    authors_long
    .group_by("author_id")
    .agg(pl.sum("cited_by_count").alias("total_citations"))
    .sort("total_citations", descending=True)
)

top_authors_ids = authors_global.head(10)["author_id"].to_list()

authors_info = fetch_authors(top_authors_ids, batch_size=90)

id_to_name = {a["id"]: a.get("display_name", "Unknown") for a in authors_info if a is not None}

# Rows of the top authors only; ids not returned by the API are named "Unknown".
df_top = authors_long.filter(pl.col("author_id").is_in(top_authors_ids))
df_top = df_top.with_columns(
    pl.col("author_id").map_elements(lambda aid: id_to_name.get(aid, "Unknown"), return_dtype=pl.Utf8).alias("name")
)

# Citations gathered by each top author's works, per publication year.
df_yearly = (
    df_top.group_by(["year", "name"])
    .agg(pl.sum("cited_by_count").alias("value"))
    .sort(["year", "value"], descending=[False, True])
)

# Wide table: one row per year, one column per author.
# `on=` replaces the deprecated `columns=` keyword of DataFrame.pivot.
authors_pivot = df_yearly.pivot(
    on="name",
    index="year",
    values="value",
    aggregate_function="sum"
).fill_null(0).sort("year")

# bar_chart_race expects a datetime-like index.
authors_pivot_pd = authors_pivot.to_pandas().set_index("year")
authors_pivot_pd.index = pd.PeriodIndex(authors_pivot_pd.index, freq="Y").to_timestamp()

# Running totals so that bars grow over time.
authors_pivot_cumsum = authors_pivot_pd.cumsum()

bcr.bar_chart_race(
    df=authors_pivot_cumsum,
    filename='../results/bjp/authors/top_authors_bjp_cumulative.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=8,
    period_length=800,
    period_fmt='%Y',
    title='The changing ranks of the Top 10 authors in 2024 - BJP by Citations',
    bar_size=.95,
    interpolate_period=True
)


In [None]:
import bar_chart_race as bcr

# Derive the author columns from `works` itself: by this point the notebook-wide
# `cols_authors` has been rebound to the columns of `works_bjp`, which is only
# correct if both tables happen to share exactly the same author columns.
q1_cols_authors = [c for c in works.columns if re.match(r"author_\d+", c)]

# Long format: one row per (work, author). Computed once and reused for both
# the ranking and the yearly series.
authors_long = (
    works
    .select(["year", *q1_cols_authors, "cited_by_count"])
    .unpivot(
        on=q1_cols_authors,
        index=["year", "cited_by_count"],
        variable_name="author_pos",
        value_name="author_id"
    )
    .filter(pl.col("author_id").is_not_null())
)

# Authors ranked by number of works they appear on.
authors_global = (
    authors_long
    .group_by("author_id")
    .agg(pl.count("author_id").alias("total_publications"))
    .sort("total_publications", descending=True)
)

# 11 ids are requested although 10 bars are drawn: rows whose name resolves to
# "Unknown" are removed before pivoting below.
top_authors_ids = authors_global.head(11)["author_id"].to_list()

authors_info = fetch_authors(top_authors_ids, batch_size=90)

id_to_name = {a["id"]: a.get("display_name", "Unknown") for a in authors_info if a is not None}

# Rows of the top authors only; ids not returned by the API are named "Unknown".
df_top = authors_long.filter(pl.col("author_id").is_in(top_authors_ids))
df_top = df_top.with_columns(
    pl.col("author_id").map_elements(lambda aid: id_to_name.get(aid, "Unknown"), return_dtype=pl.Utf8).alias("name")
)

# Number of works per top author and publication year.
df_yearly = (
    df_top.group_by(["year", "name"])
    .agg(pl.count("author_id").alias("value"))
    .sort(["year", "value"], descending=[False, True])
)

# Wide table: one row per year, one column per named author.
# `on=` replaces the deprecated `columns=` keyword of DataFrame.pivot.
authors_pivot = (
    df_yearly
    .filter(pl.col("name") != "Unknown")
    .pivot(
        on="name",
        index="year",
        values="value",
        aggregate_function="sum"
    )
    .fill_null(0)
    .sort("year")
)

# bar_chart_race expects a datetime-like index.
authors_pivot_pd = authors_pivot.to_pandas().set_index("year")
authors_pivot_pd.index = pd.PeriodIndex(authors_pivot_pd.index, freq="Y").to_timestamp()

# Running totals so that bars grow over time.
authors_pivot_cumsum = authors_pivot_pd.cumsum()

bcr.bar_chart_race(
    df=authors_pivot_cumsum,
    filename='../results/q1/authors/active_authors_q1_cumulative.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=8,
    period_length=800,
    period_fmt='%Y',
    title='The changing ranks of the Top 10 authors in 2024 - Q1 by Publications',
    bar_size=.95,
    interpolate_period=True
)

In [None]:
import bar_chart_race as bcr

# BJP journal: bar chart race of cumulative publication counts for the ten
# authors with the most authorship rows in `works_bjp`.

# Count authorship rows per author id across all author-slot columns.
authors_global = (
    works_bjp
    .select(["year", *bjp_cols_authors])
    .unpivot(
        on=bjp_cols_authors,
        index=["year"],
        variable_name="author_pos",
        value_name="author_id"
    )
    .filter(pl.col("author_id").is_not_null())
    .group_by("author_id")
    .agg(pl.count("author_id").alias("total_publications"))
    .sort("total_publications", descending=True)
)

top_authors_ids = authors_global.head(10)["author_id"].to_list()

# `fetch_authors` is defined elsewhere in the notebook. Its records are read as
# dicts exposing "id" and "display_name"; None entries are skipped.
authors_info = fetch_authors(top_authors_ids, batch_size=90)

# `or "Unknown"` also covers records whose display_name key exists but is
# null/empty, which would otherwise yield a null author name downstream.
id_to_name = {
    a["id"]: a.get("display_name") or "Unknown"
    for a in authors_info
    if a is not None
}

# Long format: one row per (work, author slot), restricted to the selected authors.
df_top = (
    works_bjp
    .select(["year", *bjp_cols_authors, "cited_by_count"])
    .unpivot(
        on=bjp_cols_authors,
        index=["year", "cited_by_count"],
        variable_name="author_pos",
        value_name="author_id"
    )
    .filter(pl.col("author_id").is_in(top_authors_ids))
)
df_top = df_top.with_columns(
    pl.col("author_id")
    .map_elements(lambda aid: id_to_name.get(aid, "Unknown"), return_dtype=pl.Utf8)
    .alias("name")
)

# Number of authorship rows per (year, author name).
df_yearly = (
    df_top.group_by(["year", "name"])
    .agg(pl.count("author_id").alias("value"))
    .sort(["year", "value"], descending=[False, True])
)

# Wide format: one row per year, one column per author name.
# Unresolved names are dropped so that several unrelated ids cannot be summed
# into a single "Unknown" bar (same rule as the Q1 publications chart); this is
# a no-op when every id resolves to a name.
# `on=` is the polars >= 1.0 name of the former `columns=` keyword.
authors_pivot = (
    df_yearly
    .filter(pl.col("name") != "Unknown")
    .pivot(
        on="name",
        index="year",
        values="value",
        aggregate_function="sum"
    )
    .fill_null(0)
    .sort("year")
)

# bar_chart_race receives a pandas frame indexed by timestamps (formatted with
# period_fmt='%Y' below).
authors_pivot_pd = authors_pivot.to_pandas().set_index("year")
authors_pivot_pd.index = pd.PeriodIndex(authors_pivot_pd.index, freq="Y").to_timestamp()

# Running total of publications per author.
authors_pivot_cumsum = authors_pivot_pd.cumsum()

bcr.bar_chart_race(
    df=authors_pivot_cumsum,
    filename='../results/bjp/authors/active_authors_bjp_cumulative.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=8,
    period_length=800,
    period_fmt='%Y',
    title='The changing ranks of the Top 10 authors in 2024 - BJP by Publications',
    bar_size=.95,
    interpolate_period=True
)



In [None]:
# Q1 journals: for every publication year, the authors ranked by the summed
# `cited_by_count` of the works they appear on in that year.

# Upper bound on the number of authors kept per year.
TOP_AUTHORS_PER_YEAR = 1500

# A single grouped aggregation over the whole table: `works` is unpivoted once
# instead of being re-filtered and re-unpivoted for each year.
df_top_yearly = (
    works
    .select(["year", "cited_by_count", *cols_authors])
    .unpivot(
        on=cols_authors,
        index=["year", "cited_by_count"],
        variable_name="author_pos",
        value_name="author_id"
    )
    .filter(pl.col("author_id").is_not_null())
    .group_by(["year", "author_id"])
    .agg(pl.col("cited_by_count").sum().alias("value"))
    # Years ascending, most-cited authors first inside each year, so that the
    # per-group head() below keeps the top authors of every year.
    .sort(["year", "value"], descending=[False, True])
    .group_by("year", maintain_order=True)
    .head(TOP_AUTHORS_PER_YEAR)
    .select(["author_id", "value", "year"])
)

# Resolve author ids to display names. `fetch_authors` is defined elsewhere in
# the notebook; its records are read as dicts exposing "id" and "display_name",
# and None entries are skipped.
unique_ids = df_top_yearly["author_id"].unique().to_list()
authors_info = fetch_authors(unique_ids, batch_size=90)

# `or "Unknown"` also covers records whose display_name key exists but is
# null/empty, which would otherwise yield a null author name downstream.
id_to_name = {
    a["id"]: a.get("display_name") or "Unknown"
    for a in authors_info
    if a is not None
}

df_top_yearly = df_top_yearly.with_columns(
    pl.col("author_id")
    .map_elements(lambda aid: id_to_name.get(aid, "Unknown"), return_dtype=pl.Utf8)
    .alias("name")
)




In [None]:

# Q1 journals: bar chart race of cumulative citations for every author who was
# among the ten most cited of at least one year.

# Authors whose name could not be resolved are excluded up front: they would
# otherwise take top-10 slots and be summed together into a single "Unknown"
# bar by the pivot below (same rule as the Q1 publications chart).
df_named = df_top_yearly.filter(pl.col("name") != "Unknown")

# Ten most cited authors of each year (rows are sorted by value inside a year,
# so head(10) per group keeps the largest values).
top10_per_year = (
    df_named
    .sort(["year", "value"], descending=[False, True])
    .group_by("year", maintain_order=True)
    .head(10)
)

top10_authors = top10_per_year["author_id"].unique().to_list()

# All yearly rows of those authors, not only the years in which they ranked top 10.
df_top_filtered = df_named.filter(pl.col("author_id").is_in(top10_authors))

# Wide format: one row per year, one column per author name. Distinct ids that
# share a display name are summed into the same column.
# `on=` is the polars >= 1.0 name of the former `columns=` keyword.
authors_pivot = (
    df_top_filtered
    .pivot(
        on="name",
        index="year",
        values="value",
        aggregate_function="sum"
    )
    .fill_null(0)
    .sort("year")
)

# bar_chart_race receives a pandas frame indexed by timestamps (formatted with
# period_fmt='%Y' below).
authors_pivot_pd = authors_pivot.to_pandas().set_index("year")
authors_pivot_pd.index = pd.PeriodIndex(authors_pivot_pd.index, freq="Y").to_timestamp()

# Running total of citations per author.
authors_pivot_cumsum = authors_pivot_pd.cumsum()

bcr.bar_chart_race(
    df=authors_pivot_cumsum,
    filename='../results/q1/authors/top10_q1_authors_evo.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=8,
    period_length=800,
    period_fmt='%Y',
    title='The evolution of the Top 10 most cited authors over the years - Q1 Journals',
    bar_size=.95,
    interpolate_period=True
)

In [None]:
# Q1 journals: bar chart race of citations summed over a trailing 5-year window.

# Width of the trailing window, in calendar years.
ROLLING_WINDOW_YEARS = 5

df_top_pd = df_top_filtered.to_pandas().sort_values(["name", "year"])

# Year x author matrix of yearly values. Building it *before* rolling matters:
# an author has no row for a year in which they do not appear in
# `df_top_filtered`, so rolling over an author's rows would span more than five
# calendar years and would show 0 for any year without a row.
yearly_citations = df_top_pd.pivot_table(
    index="year",
    columns="name",
    values="value",
    aggfunc="sum",
    fill_value=0
)

# Insert any missing calendar year as a zero row so that a window of
# ROLLING_WINDOW_YEARS rows is exactly ROLLING_WINDOW_YEARS years.
observed_years = yearly_citations.index
calendar_years = list(range(int(observed_years.min()), int(observed_years.max()) + 1))

pivot_5y = (
    yearly_citations
    .reindex(calendar_years, fill_value=0)
    .rolling(ROLLING_WINDOW_YEARS, min_periods=1)
    .sum()
    # Only years present in the data become frames of the animation.
    .loc[observed_years]
)

# Label each frame with the calendar window it covers.
years = pivot_5y.index
pivot_5y.index = [f"({y - (ROLLING_WINDOW_YEARS - 1)}–{y})" for y in years]

bcr.bar_chart_race(
    df=pivot_5y,
    filename='../results/q1/authors/top10_q1_authors_evo_5y.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=50,
    period_length=3000,
    title='The evolution of the Top 10 most cited authors over the years (5-year period) - Q1 Journals',
    bar_size=.95,
    interpolate_period=True
)



In [None]:
# BJP journal: for every publication year, the authors ranked by the summed
# `cited_by_count` of the works they appear on in that year.

# Upper bound on the number of authors kept per year.
TOP_AUTHORS_PER_YEAR = 1500

# A single grouped aggregation over the whole table: `works_bjp` is unpivoted
# once instead of being re-filtered and re-unpivoted for each year.
df_top_yearly = (
    works_bjp
    .select(["year", "cited_by_count", *bjp_cols_authors])
    .unpivot(
        on=bjp_cols_authors,
        index=["year", "cited_by_count"],
        variable_name="author_pos",
        value_name="author_id"
    )
    .filter(pl.col("author_id").is_not_null())
    .group_by(["year", "author_id"])
    .agg(pl.col("cited_by_count").sum().alias("value"))
    # Years ascending, most-cited authors first inside each year, so that the
    # per-group head() below keeps the top authors of every year.
    .sort(["year", "value"], descending=[False, True])
    .group_by("year", maintain_order=True)
    .head(TOP_AUTHORS_PER_YEAR)
    .select(["author_id", "value", "year"])
)

# Resolve author ids to display names. `fetch_authors` is defined elsewhere in
# the notebook; its records are read as dicts exposing "id" and "display_name",
# and None entries are skipped.
unique_ids = df_top_yearly["author_id"].unique().to_list()
authors_info = fetch_authors(unique_ids, batch_size=90)

# `or "Unknown"` also covers records whose display_name key exists but is
# null/empty, which would otherwise yield a null author name downstream.
id_to_name = {
    a["id"]: a.get("display_name") or "Unknown"
    for a in authors_info
    if a is not None
}

df_top_yearly = df_top_yearly.with_columns(
    pl.col("author_id")
    .map_elements(lambda aid: id_to_name.get(aid, "Unknown"), return_dtype=pl.Utf8)
    .alias("name")
)

In [None]:
# BJP journal: bar chart race of cumulative citations for every author who was
# among the ten most cited of at least one year.

# Authors whose name could not be resolved are excluded up front: they would
# otherwise take top-10 slots and be summed together into a single "Unknown"
# bar by the pivot below (same rule as the Q1 publications chart).
df_named = df_top_yearly.filter(pl.col("name") != "Unknown")

# Ten most cited authors of each year (rows are sorted by value inside a year,
# so head(10) per group keeps the largest values).
top10_per_year = (
    df_named
    .sort(["year", "value"], descending=[False, True])
    .group_by("year", maintain_order=True)
    .head(10)
)

top10_authors = top10_per_year["author_id"].unique().to_list()

# All yearly rows of those authors, not only the years in which they ranked top 10.
df_top_filtered = df_named.filter(pl.col("author_id").is_in(top10_authors))

# Wide format: one row per year, one column per author name. Distinct ids that
# share a display name are summed into the same column.
# `on=` is the polars >= 1.0 name of the former `columns=` keyword.
authors_pivot = (
    df_top_filtered
    .pivot(
        on="name",
        index="year",
        values="value",
        aggregate_function="sum"
    )
    .fill_null(0)
    .sort("year")
)

# bar_chart_race receives a pandas frame indexed by timestamps (formatted with
# period_fmt='%Y' below).
authors_pivot_pd = authors_pivot.to_pandas().set_index("year")
authors_pivot_pd.index = pd.PeriodIndex(authors_pivot_pd.index, freq="Y").to_timestamp()

# Running total of citations per author.
authors_pivot_cumsum = authors_pivot_pd.cumsum()

bcr.bar_chart_race(
    df=authors_pivot_cumsum,
    filename='../results/bjp/authors/top10_bjp_authors_evo.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=8,
    period_length=800,
    period_fmt='%Y',
    title='The evolution of the Top 10 most cited authors over the years - BJP Journal',
    bar_size=.95,
    interpolate_period=True
)

In [None]:

# BJP journal: bar chart race of citations summed over a trailing 5-year window.

# Width of the trailing window, in calendar years.
ROLLING_WINDOW_YEARS = 5

df_top_pd = df_top_filtered.to_pandas().sort_values(["name", "year"])

# Year x author matrix of yearly values. Building it *before* rolling matters:
# an author has no row for a year in which they do not appear in
# `df_top_filtered`, so rolling over an author's rows would span more than five
# calendar years and would show 0 for any year without a row.
yearly_citations = df_top_pd.pivot_table(
    index="year",
    columns="name",
    values="value",
    aggfunc="sum",
    fill_value=0
)

# Insert any missing calendar year as a zero row so that a window of
# ROLLING_WINDOW_YEARS rows is exactly ROLLING_WINDOW_YEARS years.
observed_years = yearly_citations.index
calendar_years = list(range(int(observed_years.min()), int(observed_years.max()) + 1))

pivot_5y = (
    yearly_citations
    .reindex(calendar_years, fill_value=0)
    .rolling(ROLLING_WINDOW_YEARS, min_periods=1)
    .sum()
    # Only years present in the data become frames of the animation.
    .loc[observed_years]
)

# Label each frame with the calendar window it covers.
years = pivot_5y.index
pivot_5y.index = [f"({y - (ROLLING_WINDOW_YEARS - 1)}–{y})" for y in years]

bcr.bar_chart_race(
    df=pivot_5y,
    filename='../results/bjp/authors/top10_bjp_authors_evo_5y.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=50,
    period_length=3000,
    title='The evolution of the Top 10 most cited authors over the years (5-year period) - BJP Journal',
    bar_size=.95,
    interpolate_period=True
)
