In [1]:
import os
import ast
import pandas as pd
import numpy as np
from collections import defaultdict
import ast, functools
from typing import Optional, List
from tqdm.notebook import tqdm
tqdm.pandas()

# Load the combined Dimensions export; the first spreadsheet column becomes the index.
# NOTE(review): df.columns still lists 'Unnamed: 0' after index_col=0 — possibly a second
# saved index column in the file; confirm.
DIMENSIONS_XLSX_PATH = '../data/dimensions/api/raw/combined/202511/df_dimensions.xlsx'
df = pd.read_excel(DIMENSIONS_XLSX_PATH, index_col=0)

In [4]:
df.columns

Index(['Unnamed: 0', 'id', 'title', 'abstract', 'acknowledgements',
       'altmetric', 'altmetric_id', 'authors', 'authors_count', 'category_bra',
       'category_for', 'category_for_2020', 'category_uoa', 'concepts',
       'concepts_scores', 'date', 'date_inserted', 'date_online', 'date_print',
       'dimensions_url', 'document_type', 'doi', 'field_citation_ratio',
       'funder_countries', 'funders', 'issn', 'issue', 'journal_lists',
       'journal_title_raw', 'linkout', 'mesh_terms', 'open_access', 'pages',
       'pmcid', 'pmid', 'publisher', 'recent_citations', 'reference_ids',
       'referenced_pubs', 'relative_citation_ratio', 'research_org_cities',
       'research_org_countries', 'research_org_country_names',
       'research_org_names', 'research_org_state_codes',
       'research_org_state_names', 'research_org_types', 'research_orgs',
       'researchers', 'score', 'supporting_grant_ids', 'times_cited', 'type',
       'volume', 'year', 'journal.id', 'journal.title', 

In [13]:
# ==================================================================================================
# FULL PIPELINE: EXTRACTION + GENDER + SUMMARY + 10 FASTEST-GROWING FIELDS + FINAL TABLE
# ==================================================================================================

import ast, functools
import pandas as pd
from typing import Optional

# ======================================================================
# PARSE INPUT COLUMNS
# ======================================================================
def parse(x):
    """Turn a stringified Python literal back into an object.

    Non-string values are returned unchanged. Strings are evaluated with
    ``ast.literal_eval``; if evaluation raises, ``None`` is returned instead.
    """
    if not isinstance(x, str):
        return x
    try:
        parsed = ast.literal_eval(x)
    except Exception:
        parsed = None
    return parsed

# List-like columns arrive as stringified Python literals; parse() turns them back into
# objects (None when a cell cannot be evaluated).
for _col in ("category_for_2020", "researchers", "research_org_names"):
    df[_col] = df[_col].apply(parse)

# Coerce the metric columns to numbers, treating unparseable or missing values as 0.
# The column is checked for explicitly: df.get(col, 0) would hand the scalar 0 to
# pd.to_numeric when the column is absent, and the scalar it returns has no .fillna().
for _col in ("altmetric", "times_cited"):
    if _col in df.columns:
        df[_col] = pd.to_numeric(df[_col], errors="coerce").fillna(0)
    else:
        df[_col] = 0


# ======================================================================
# EXTRACT L2 + L4 CODES (INCLUDING YEAR)
# ======================================================================
# Build one record per (paper, 2-digit code, 4-digit code) pair in which the 4-digit
# code starts with the 2-digit code. Papers whose category cell is not a list are skipped.
rows = []

for _, row in df.iterrows():
    cat = row["category_for_2020"]
    if not isinstance(cat, list):
        continue

    L2_list = []
    L4_list = []

    for d in cat:
        # Tolerate malformed entries instead of raising on .get() / .split().
        if not isinstance(d, dict):
            continue
        nm = d.get("name")
        if not isinstance(nm, str):
            continue

        # Split into "<code> <label>"; a whitespace-only name yields no parts.
        parts = nm.split(None, 1)
        if not parts:
            continue
        code = parts[0]
        if not code.isdigit():
            continue
        # Taking the label from the split (rather than slicing by len(code)) stays
        # correct even when the name carries leading whitespace.
        label = parts[1].strip() if len(parts) > 1 else ""

        if len(code) == 2:
            L2_list.append((code, label))
        elif len(code) == 4:
            L4_list.append((code, label))

    for l2c, l2n in L2_list:
        for l4c, l4n in L4_list:
            if l4c.startswith(l2c):
                rows.append({
                    "id": row["id"],
                    "year": row["year"],
                    "L2_code": l2c,
                    "L2_name": l2n,
                    "L4_name": l4n,
                    "times_cited": row["times_cited"],
                    "altmetric": row["altmetric"],
                    "researchers": row["researchers"],
                    "orgs": row["research_org_names"]
                })

# Explicit columns keep the frame usable (empty but well-formed) when no rows matched.
exploded = pd.DataFrame(rows, columns=[
    "id", "year", "L2_code", "L2_name", "L4_name",
    "times_cited", "altmetric", "researchers", "orgs",
])


# ======================================================================
# GENDER INFERENCE SETUP
# ======================================================================
# Primary detector: gender_guesser, constructed with case_sensitive=False.
import gender_guesser.detector as gender
_DETECTOR1 = gender.Detector(case_sensitive=False)

# Optional secondary detector, constructed with "uk". Any failure to import or build it
# just leaves the fallback disabled.
_DETECTOR2 = None
try:
    from gender_detector.gender_detector import GenderDetector
    _DETECTOR2 = GenderDetector("uk")
except Exception:
    _DETECTOR2 = None
_HAS_DETECTOR2 = _DETECTOR2 is not None

def _map_gender_guesser(label: Optional[str]) -> str:
    m = (label or "").lower()
    if m in {"male", "mostly_male"}:
        return "male"
    if m in {"female", "mostly_female"}:
        return "female"
    return "unknown"

def _map_gender_detector(label: Optional[str]) -> str:
    l = (label or "").lower()
    return l if l in {"male", "female"} else "unknown"

@functools.lru_cache(maxsize=8192)
def infer_gender_offline(name: Optional[str]) -> str:
    """Infer "male", "female" or "unknown" from a name using the local detectors.

    Only the first whitespace-delimited token of ``name`` is examined. The
    primary detector is asked first; if its mapped answer is "unknown" and the
    secondary detector is available, that one is tried. An exception raised by
    the secondary detector is treated as no answer. Non-string or blank input
    yields "unknown". Results are memoised (up to 8192 distinct inputs).
    """
    if not isinstance(name, str):
        return "unknown"
    tokens = name.split()
    if not tokens:
        return "unknown"
    first = tokens[0]

    primary = _map_gender_guesser(_DETECTOR1.get_gender(first))
    if primary != "unknown":
        return primary

    # No usable fallback detector: nothing more to try.
    if not (_HAS_DETECTOR2 and _DETECTOR2 is not None):
        return "unknown"

    try:
        raw_guess = _DETECTOR2.guess(first)
    except Exception:
        raw_guess = None
    return _map_gender_detector(raw_guess)

def extract_forenames_from_researchers(lst):
    """Collect the truthy "first_name" values from a list of researcher dicts.

    Returns an empty list when ``lst`` is not a list. Entries that are not
    dicts, or whose "first_name" is missing or falsy, are skipped; order is
    preserved.
    """
    if not isinstance(lst, list):
        return []
    candidates = (entry.get("first_name") for entry in lst if isinstance(entry, dict))
    return [forename for forename in candidates if forename]


# ======================================================================
# APPLY GENDER INFERENCE
# ======================================================================
# progress_apply is only available once tqdm.pandas() has been called.
# Step 1: forenames per row; step 2: one inferred label per forename, in the same order.
exploded["forenames"] = exploded["researchers"].progress_apply(
    extract_forenames_from_researchers
)
exploded["gender_list"] = exploded["forenames"].progress_apply(
    lambda forenames: list(map(infer_gender_offline, forenames))
)


# ======================================================================
# L4 SUMMARY (ALL FIELDS)
# ======================================================================
def _h_index(citation_counts):
    """Return the largest h such that at least h of the counts are >= h."""
    h = 0
    for rank, count in enumerate(sorted(citation_counts, reverse=True), start=1):
        if count < rank:
            break
        h = rank
    return h


def _researcher_full_names(researcher_lists):
    """Flatten lists of researcher dicts into "first last" strings (blank names dropped)."""
    names = []
    for lst in researcher_lists:
        if not isinstance(lst, list):
            continue
        for r in lst:
            if not isinstance(r, dict):
                continue
            # `or ""` also covers keys that are present but null, which would
            # otherwise be formatted as the literal text "None".
            fn = r.get("first_name") or ""
            ln = r.get("last_name") or ""
            nm = f"{fn} {ln}".strip()
            if nm:
                names.append(nm)
    return names


def _mode_or_none(values):
    """Return the first modal value of ``values``, or None when there is none."""
    if not values:
        return None
    modes = pd.Series(values).mode()
    return modes.iloc[0] if not modes.empty else None


top_L4 = exploded["L4_name"].unique().tolist()
results = []

# One pass over the groups instead of re-filtering the whole frame per field;
# sort=False keeps the fields in order of first appearance.
for l4, sub in exploded.groupby("L4_name", sort=False):
    L2_name = sub["L2_name"].mode().iloc[0]
    papers  = len(sub)

    citations = sub["times_cited"].astype(int)
    total_citations = citations.sum()
    mean_citations  = citations.mean()

    total_altmetric = sub["altmetric"].sum()

    h = _h_index(citations)

    # researchers
    researcher_list     = _researcher_full_names(sub["researchers"])
    modal_researcher    = _mode_or_none(researcher_list)
    unique_researchers  = len(set(researcher_list))

    # orgs
    org_list = []
    for lst in sub["orgs"]:
        if isinstance(lst, list):
            org_list.extend(lst)

    modal_org   = _mode_or_none(org_list)
    unique_orgs = len(set(org_list))

    # gender %
    flat_genders = []
    for lst in sub["gender_list"]:
        if isinstance(lst, list):
            flat_genders.extend(lst)

    # NOTE(review): the denominator counts every label in gender_list, so any
    # non-"female" label (e.g. "unknown") lowers the percentage — confirm this is intended.
    percent_female = (
        100 * sum(g == "female" for g in flat_genders) / len(flat_genders)
        if flat_genders else 0.0
    )

    results.append([
        l4, L2_name, papers, total_citations, mean_citations,
        total_altmetric, h, modal_researcher, unique_researchers,
        modal_org, unique_orgs, percent_female
    ])

res_df = pd.DataFrame(results, columns=[
    "L4_name","L2_name","papers","total_citations","mean_citations",
    "total_altmetric","h_index","modal_researcher","unique_researchers",
    "modal_org","unique_orgs","percent_female"
])


# ======================================================================
# COMPUTE 10 FASTEST-GROWING L4 FIELDS (2024→2025)
# ======================================================================
# Papers per (L4 field, year).
counts = (
    exploded
    .groupby(["L4_name", "year"])
    .size()
    .rename("papers")
    .reset_index()
)

# Wide table: one row per L4 field, one column per year, missing combinations as 0.
pivot = counts.pivot(index="L4_name", columns="year", values="papers").fillna(0).astype(int)

# Make sure both comparison years exist as columns so the lookups below cannot raise
# KeyError when the data holds no papers for one of them.
for _yr in (2024, 2025):
    if _yr not in pivot.columns:
        pivot[_yr] = 0
pivot = pivot.rename(columns={2024: "n_2024", 2025: "n_2025"})

# Growth is undefined for fields with no 2024 papers, so those are excluded.
valid = pivot[pivot["n_2024"] > 0].copy()
valid["growth_pct"] = (valid["n_2025"] - valid["n_2024"]) / valid["n_2024"] * 100

top10_growth = valid.sort_values("growth_pct", ascending=False).head(10)


# ======================================================================
# MERGE SUMMARY WITH TOP 10 GROWTH
# ======================================================================
# Keep only the summary rows whose L4 field is among the top-growth fields, attaching
# the growth percentage to each.
merged = res_df.merge(
    top10_growth[["growth_pct"]], left_on="L4_name", right_index=True, how="inner",
)

# Internal column -> display name, listed in the order the final table presents them.
# Columns absent from this mapping (e.g. mean_citations) are left out of the table.
_DISPLAY_NAMES = {
    "L4_name":            "Level Four Field of Research",
    "papers":             "Total Papers",
    "L2_name":            "Level Two Field of Research",
    "total_citations":    "Total Citations",
    "h_index":            "H-Index",
    "total_altmetric":    "Total Altmetric",
    "modal_researcher":   "Modal Researcher",
    "unique_researchers": "Unique Researchers",
    "percent_female":     "Percent Female",
    "modal_org":          "Modal Organisation",
    "unique_orgs":        "Unique Organisations",
    "growth_pct":         "Growth %",
}

# Select, relabel, then order by growth (highest first).
final_table = (
    merged[list(_DISPLAY_NAMES)]
    .rename(columns=_DISPLAY_NAMES)
    .sort_values("Growth %", ascending=False)
)

final_table


  0%|          | 0/14870 [00:00<?, ?it/s]

  0%|          | 0/14870 [00:00<?, ?it/s]

Unnamed: 0,Level Four Field of Research,Total Papers,Level Two Field of Research,Total Citations,H-Index,Total Altmetric,Modal Researcher,Unique Researchers,Percent Female,Modal Organisation,Unique Organisations,Growth %
30,Human Geography,18,Human Society,513,10,2137.0,Charlotte J Roscoe,84,26.315789,Harvard University,47,300.0
37,Econometrics,6,Economics,17,3,1.0,Alexander Giessing,13,23.076923,Columbia University,8,200.0
77,Atmospheric Sciences,4,Earth Sciences,1,1,2.0,Bin Bin Su,30,3.333333,Tongji Medical College of Huazhong University ...,7,200.0
47,Analytical Chemistry,5,Chemical Sciences,480,3,73.0,Adam S Butterworth,41,24.390244,Baker IDI Heart and Diabetes Institute,15,100.0
46,Distributed Computing and Systems Software,5,Information and Computing Sciences,43,3,1.0,Alauddin Bhuiyan,20,20.0,Colorado School of Public Health,10,100.0
44,Electrical Engineering,5,Engineering,43,3,1.0,Alauddin Bhuiyan,20,20.0,Colorado School of Public Health,10,100.0
53,Information Systems,4,Information and Computing Sciences,47,2,13.0,Abdullah Mesut Erzurumluoglu,31,19.354839,Bayer AG,12,100.0
32,Medical Biotechnology,13,Biomedical and Clinical Sciences,436,5,278.0,Adam R Johnson,127,21.428571,Harvard University,46,66.666667
33,Paediatrics,18,Biomedical and Clinical Sciences,271,9,810.0,John R B Perry,144,21.37931,University of Cambridge,67,50.0
17,Bioinformatics and Computational Biology,111,Biological Sciences,10423,36,5994.0,Alkes L Price,1023,25.669836,Harvard University,485,33.333333
