In [2]:
import ast
import pandas as pd
import numpy as np
from collections import Counter

# Load the combined Dimensions export; the first spreadsheet column is used as the index.
df = pd.read_excel(
    io='../data/dimensions/api/raw/combined/202511/df_dimensions.xlsx',
    index_col=0,
)

# ================================================================
# SAFE PARSING
# ================================================================
def parse(x):
    """Turn a stringified Python literal back into the object it spells.

    Non-string values are returned unchanged. Strings that
    ``ast.literal_eval`` cannot evaluate produce ``None``.
    """
    if not isinstance(x, str):
        return x
    try:
        parsed = ast.literal_eval(x)
    except Exception:
        parsed = None
    return parsed

# Decode the stringified list columns. Anything that does not decode to a list
# (NaN, malformed text, a scalar) becomes [] so later loops can iterate every row.
for list_col in ("researchers", "category_for_2020"):
    df[list_col] = (
        df[list_col]
        .apply(parse)
        .apply(lambda x: x if isinstance(x, list) else [])
    )

# Coerce the metric columns to numbers, treating unparseable or missing values as 0.
# A column that is absent altogether is created as all zeros: the previous
# df.get(col, 0) fallback passed a scalar to pd.to_numeric, which returns a scalar
# with no .fillna and therefore raised AttributeError instead of defaulting.
for metric_col in ("altmetric", "times_cited"):
    if metric_col in df.columns:
        df[metric_col] = pd.to_numeric(df[metric_col], errors="coerce").fillna(0)
    else:
        df[metric_col] = 0

# ================================================================
# L2 / L4 PARSING PER PAPER  (2-digit codes = L2, 4-digit codes = L4)
# ================================================================
def _l2_l4_pairs(categories):
    """Pair each 2-digit (L2) category of one paper with its 4-digit (L4) children.

    Parameters
    ----------
    categories : list of dict
        Entries whose ``"name"`` value is a numeric code followed by a label.
        Entries with a missing, empty or whitespace-only name, or whose first
        token is not a 2- or 4-digit number, are ignored.

    Returns
    -------
    list of (str, str or None)
        One ``(L2_label, L4_label)`` tuple for every L4 whose code starts with
        the L2 code. An L2 with no such L4 yields ``(L2_label, None)``. L4
        entries without a matching L2 in the same list are not emitted.
    """
    l2_entries = []
    l4_entries = []

    for entry in categories:
        # Strip first so that leading whitespace cannot shift the slice that
        # separates the code from the label.
        name = (entry.get("name") or "").strip()
        if not name:
            continue
        code = name.split()[0]
        if not code.isdigit():
            continue
        label = name[len(code):].strip()
        if len(code) == 2:
            l2_entries.append((code, label))
        elif len(code) == 4:
            l4_entries.append((code, label))

    pairs = []
    for l2_code, l2_label in l2_entries:
        children = [
            l4_label for l4_code, l4_label in l4_entries if l4_code.startswith(l2_code)
        ]
        if children:
            pairs.extend((l2_label, l4_label) for l4_label in children)
        else:
            pairs.append((l2_label, None))
    return pairs


L2L4_rows = []

# Iterate the two needed columns directly rather than materialising a Series per row.
for paper_id, categories in zip(df["id"], df["category_for_2020"]):
    if not isinstance(categories, list):
        continue
    for l2_label, l4_label in _l2_l4_pairs(categories):
        L2L4_rows.append(
            {
                "paper_id": paper_id,
                "L2_name": l2_label,
                "L4_name": l4_label,
            }
        )

# Explicit columns keep the frame mergeable on "paper_id" even when no paper
# produced a row (an empty list would otherwise give a column-less frame).
L2L4 = pd.DataFrame(L2L4_rows, columns=["paper_id", "L2_name", "L4_name"])

# ================================================================
# EXPLODE AUTHORS → DEDUPLICATED (author, paper)
# ================================================================
# One record per (researcher, paper), carrying the paper-level metrics along.
# The needed columns are iterated directly instead of building a Series per row.
author_rows = [
    {
        "author_id":  r.get("id"),
        "first_name": r.get("first_name"),
        "last_name":  r.get("last_name"),
        "paper_id":   paper_id,
        "times_cited": times_cited,
        "altmetric":   altmetric,
    }
    for paper_id, times_cited, altmetric, researchers in zip(
        df["id"], df["times_cited"], df["altmetric"], df["researchers"]
    )
    for r in researchers
]

# Explicit columns so that a dataset without any researcher entries still yields
# a frame with the expected schema; otherwise drop_duplicates(subset=...) below
# raises KeyError on the missing columns.
authors = pd.DataFrame(
    author_rows,
    columns=["author_id", "first_name", "last_name", "paper_id", "times_cited", "altmetric"],
)

# remove any exact duplicates of (author_id, paper_id)
authors = authors.drop_duplicates(subset=["author_id", "paper_id"])

# ================================================================
# AUTHOR-LEVEL CITATION METRICS (NO L2/L4 DUP INFLATION)
# ================================================================
def h_index(citations):
    """Count the papers whose citation count is at least their rank.

    Papers are ranked from most to least cited, starting at rank 1; the count
    of papers with ``citations >= rank`` is the h-index of the given list.
    """
    ranked = sorted(citations, reverse=True)
    meets_rank = [cites >= rank for rank, cites in enumerate(ranked, start=1)]
    return sum(meets_rank)

def g_index(citations):
    """Return the largest rank g whose top-g papers total at least g*g citations.

    Papers are taken from most to least cited. The result never exceeds the
    number of papers supplied and is 0 for an empty list.
    """
    ranked = sorted(citations, reverse=True)
    best = 0
    running_total = 0
    rank = 0
    while rank < len(ranked):
        running_total += ranked[rank]
        rank += 1
        if running_total >= rank * rank:
            best = rank
    return best

def i10_index(citations):
    """Count the entries of ``citations`` that are 10 or more."""
    reaches_ten = [cites >= 10 for cites in citations]
    return sum(reaches_ten)

# Aggregate per author_id only. Grouping on the name columns as well would
# (a) silently drop every author with a missing first or last name, because
# groupby discards rows whose key contains NaN by default, and (b) split one
# author_id into several rows whenever the name is spelled differently across
# papers. The displayed name is the first non-null value seen for the id.
author_stats = (
    authors.groupby("author_id")
           .agg(
               first_name       = ("first_name", "first"),
               last_name        = ("last_name", "first"),
               citations_list   = ("times_cited", lambda x: list(x)),
               papers_in_dataset=("paper_id", "nunique"),     # TRUE paper count
               total_citations  = ("times_cited", "sum"),
               total_altmetric  = ("altmetric", "sum"),
           )
           .reset_index()
)

# Denominator for citation_share: summed over (author, paper) rows, so a paper
# contributes once for each of its listed authors.
total_citations_global = authors["times_cited"].sum()

author_stats["full_name"] = author_stats["first_name"].fillna("") + " " + author_stats["last_name"].fillna("")
author_stats["full_name"] = author_stats["full_name"].str.strip()

# Descriptive statistics over each author's per-paper citation counts.
author_stats["mean_citations"]   = author_stats["citations_list"].apply(np.mean)
author_stats["median_citations"] = author_stats["citations_list"].apply(np.median)
author_stats["max_citations"]    = author_stats["citations_list"].apply(max)

# Bibliometric indices, computed only from the papers present in this dataset.
author_stats["i10_index"]        = author_stats["citations_list"].apply(i10_index)
author_stats["h_index_dataset"]  = author_stats["citations_list"].apply(h_index)
author_stats["g_index_dataset"]  = author_stats["citations_list"].apply(g_index)

author_stats["h_index_normalised"] = (
    author_stats["h_index_dataset"] / author_stats["papers_in_dataset"]
)

# np.var uses its default ddof=0 (population variance).
author_stats["citation_variance"] = author_stats["citations_list"].apply(np.var)
author_stats["citation_skewness"] = author_stats["citations_list"].apply(
    lambda c: pd.Series(c).skew()
)

# Fraction of the author's papers with exactly zero citations.
author_stats["proportion_uncited"] = (
    author_stats["citations_list"].apply(lambda c: sum(x == 0 for x in c))
    / author_stats["papers_in_dataset"]
)

author_stats["citation_share"] = (
    author_stats["total_citations"] / total_citations_global
)

# ================================================================
# MODAL L2 / L4 PER AUTHOR (USING L2L4, BUT NOT AFFECTING COUNTS)
# ================================================================
# link authors to L2/L4 via paper_id
# Attach every L2/L4 row of a paper to each of that paper's authors. The left
# join keeps (author, paper) pairs whose paper has no classification row.
author_L2L4 = pd.merge(
    authors.loc[:, ["author_id", "paper_id"]],
    L2L4,
    how="left",
    on="paper_id",
)

def mode_or_none(series):
    """Return the most frequent non-null value in ``series``, or ``None``.

    ``None`` is returned when nothing is left after dropping nulls. Among
    equally frequent values, the one encountered first is returned.
    """
    present = series.dropna()
    if len(present) == 0:
        return None
    top_value, _ = Counter(present).most_common(1)[0]
    return top_value

def _modal_per_author(links, value_col, out_col):
    """Most frequent ``value_col`` per author, counting each paper once per value.

    ``links`` holds one row per (author, paper, L2, L4) combination, so a paper
    whose L2 has several L4 children repeats that L2 on several rows. Dropping
    duplicate (author_id, paper_id, value) rows first stops such papers from
    being over-weighted when the mode is taken.

    Returns a frame with columns ``author_id`` and ``out_col``.
    """
    unique_links = links[["author_id", "paper_id", value_col]].drop_duplicates()
    return (
        unique_links.groupby("author_id")[value_col]
                    .agg(mode_or_none)
                    .reset_index()
                    .rename(columns={value_col: out_col})
    )


modal_L2 = _modal_per_author(author_L2L4, "L2_name", "modal_L2")
modal_L4 = _modal_per_author(author_L2L4, "L4_name", "modal_L4")

# Left joins: every author row is kept, with a null modal value where none exists.
author_stats = (
    author_stats
    .merge(modal_L2, on="author_id", how="left")
    .merge(modal_L4, on="author_id", how="left")
)

# ================================================================
# FINAL TABLE
# ================================================================
# Select the reporting columns in display order, then rank authors by their
# in-dataset h-index, highest first.
author_summary = (
    author_stats.loc[
        :,
        [
            "full_name",          # Name
            "papers_in_dataset",  # Papers
            "total_citations",    # Citations
            "i10_index",          # i10
            "g_index_dataset",    # G index
            "h_index_dataset",    # H index
            "total_altmetric",    # Total Altmetric
            "modal_L2",           # Modal L2
            "modal_L4",           # Modal L4
        ],
    ]
    .copy()
    .sort_values(by="h_index_dataset", ascending=False)
)

Unnamed: 0,full_name,papers_in_dataset,total_citations,i10_index,g_index_dataset,h_index_dataset,total_altmetric,modal_L2,modal_L4
21007,Jill P Pell,183,13197,141,112,63,28664.0,Health Sciences,Public Health
4824,George G Davey Smith,130,13120,111,114,55,10869.0,Health Sciences,Epidemiology
1987,Naveed A Sattar,119,9377,99,96,53,20661.0,Biomedical and Clinical Sciences,Public Health
31591,Carlos A Celis-Morales,154,9345,110,94,52,28275.0,Health Sciences,Public Health
36111,Andrew Mark Mcintosh,98,12363,88,98,50,9584.0,Psychology,Biological Psychology
...,...,...,...,...,...,...,...,...,...
9996,Huixiang Zhuang,1,0,0,0,0,1.0,Biomedical and Clinical Sciences,Biological Psychology
32023,Anders Ellekaer Junker,1,0,0,0,0,1.0,Biomedical and Clinical Sciences,Medical Biochemistry and Metabolomics
10003,Ming-Ju Tsai,1,0,0,0,0,2.0,Biological Sciences,Genetics
35668,Yuanwei Chen,1,0,0,0,0,0.0,Health Sciences,Epidemiology


In [4]:
# Write the ranked author table to the output workbook (the index is written too).
author_summary.to_excel(
    excel_writer='../output/tables/author_analytics.xlsx',
)