In [1]:
import os
import ast
import pandas as pd
import numpy as np
from collections import defaultdict
import ast, functools
from typing import Optional, List
from tqdm.notebook import tqdm
# Register tqdm's pandas integration so Series/DataFrame objects gain
# `.progress_apply`, which is used further down when deriving gender columns.
tqdm.pandas()

# Load the combined Dimensions export; `index_col=0` uses the first spreadsheet
# column as the DataFrame index. The path is relative to the notebook location.
df = pd.read_excel('../data/dimensions/api/raw/combined/202511/df_dimensions.xlsx', index_col=0)

In [31]:
def parse(x):
    """Decode a stringified Python literal back into an object.

    Non-string values are passed through untouched. Strings are evaluated with
    ``ast.literal_eval``; if that fails for any reason the result is ``None``
    (best-effort decoding, no exception escapes).
    """
    if not isinstance(x, str):
        return x
    try:
        decoded = ast.literal_eval(x)
    except Exception:
        # Unparseable text is treated as missing rather than aborting the run.
        return None
    return decoded

# Decode the columns that the later loops iterate over as Python containers;
# cells that cannot be parsed become None (see `parse`).
df["category_for_2020"]   = df["category_for_2020"].apply(parse)
df["researchers"]         = df["researchers"].apply(parse)
df["research_org_names"]  = df["research_org_names"].apply(parse)

# Force the metric columns to numeric, treating blanks / unparseable values as 0.
# A missing column is created as all-zero: the previous `df.get(col, 0)` form
# handed a scalar to `pd.to_numeric`, which returns a scalar with no `.fillna`,
# so the intended fallback raised AttributeError instead of defaulting.
for _metric in ("altmetric", "times_cited"):
    if _metric in df.columns:
        df[_metric] = pd.to_numeric(df[_metric], errors="coerce").fillna(0)
    else:
        df[_metric] = 0


# --------------------------------------------------------------------------------------------------
# EXTRACT L2 + L4 CODES
# --------------------------------------------------------------------------------------------------
# Column order of the exploded frame. Passing it explicitly keeps the columns
# present even when no (L2, L4) pair is found, so later cells that index
# `exploded["researchers"]` etc. do not fail with KeyError on an empty result.
_EXPLODED_COLUMNS = [
    "id", "L2_code", "L2_name", "L4_name",
    "times_cited", "altmetric", "researchers", "orgs",
]

rows = []

# One output row per (paper, L2 category, L4 category) where the 4-digit code
# starts with the 2-digit code, i.e. the L4 field sits under that L2 division.
for _, row in df.iterrows():
    cat = row["category_for_2020"]
    if not isinstance(cat, list):
        # Missing or unparseable category cell: nothing to extract.
        continue

    L2_list = []
    L4_list = []

    for d in cat:
        # Skip malformed entries instead of crashing on `.get`.
        if not isinstance(d, dict):
            continue
        nm = d.get("name")
        if not isinstance(nm, str):
            continue

        # Names look like "<digits> <label>"; split once on whitespace so the
        # label keeps its internal spacing. Whitespace-only names yield no parts.
        parts = nm.split(None, 1)
        if not parts:
            continue
        code = parts[0]
        if not code.isdigit():
            continue
        label = parts[1].strip() if len(parts) > 1 else ""

        if len(code) == 2:
            L2_list.append((code, label))
        elif len(code) == 4:
            L4_list.append((code, label))

    for l2c, l2n in L2_list:
        for l4c, l4n in L4_list:
            if l4c.startswith(l2c):
                rows.append({
                    "id": row["id"],
                    "L2_code": l2c,
                    "L2_name": l2n,
                    "L4_name": l4n,
                    "times_cited": row["times_cited"],
                    "altmetric": row["altmetric"],
                    "researchers": row["researchers"],
                    "orgs": row["research_org_names"]
                })

exploded = pd.DataFrame(rows, columns=_EXPLODED_COLUMNS)


# ==================================================================================================
# GENDER INFERENCE SETUP (offline, first-name based)
# ==================================================================================================
# Primary detector: gender_guesser, configured for case-insensitive lookups.
import gender_guesser.detector as gender
_DETECTOR1 = gender.Detector(case_sensitive=False)

# Secondary detector is optional: if the package cannot be imported or the
# "uk" detector cannot be constructed, fall back to running without it.
try:
    from gender_detector.gender_detector import GenderDetector
    _DETECTOR2 = GenderDetector("uk")
except Exception:
    _DETECTOR2, _HAS_DETECTOR2 = None, False
else:
    _HAS_DETECTOR2 = True


def _map_gender_guesser(label: Optional[str]) -> str:
    m = (label or "").lower()
    if m in {"male", "mostly_male"}:
        return "male"
    if m in {"female", "mostly_female"}:
        return "female"
    return "unknown"


def _map_gender_detector(label: Optional[str]) -> str:
    l = (label or "").lower()
    return l if l in {"male", "female"} else "unknown"


@functools.lru_cache(maxsize=8192)
def infer_gender_offline(name: Optional[str]) -> str:
    """Infer "male" / "female" / "unknown" from the first token of ``name``.

    Non-string, empty or whitespace-only input yields "unknown". Otherwise the
    first whitespace-separated token is looked up in the primary detector; only
    when that is inconclusive, and the optional secondary detector is
    available, is the secondary detector consulted. Results are memoised per
    distinct ``name`` value.
    """
    tokens = name.split() if isinstance(name, str) else []
    if not tokens:
        return "unknown"
    first = tokens[0]

    primary = _map_gender_guesser(_DETECTOR1.get_gender(first))
    if primary != "unknown":
        return primary

    if not _HAS_DETECTOR2 or _DETECTOR2 is None:
        return "unknown"

    try:
        secondary_raw = _DETECTOR2.guess(first)
    except Exception:
        # A failing secondary lookup is treated the same as "no answer".
        secondary_raw = None
    # The mapper already returns "unknown" for anything inconclusive.
    return _map_gender_detector(secondary_raw)


def extract_forenames_from_researchers(lst):
    """Return the non-empty ``"first_name"`` values from a list of researcher dicts.

    Anything that is not a list gives ``[]``. Within the list, entries that are
    not dicts, or whose ``"first_name"`` is missing or falsy, are skipped. The
    original order is preserved.
    """
    if not isinstance(lst, list):
        return []
    candidates = (entry.get("first_name") for entry in lst if isinstance(entry, dict))
    return [forename for forename in candidates if forename]


# --------------------------------------------------------------------------------------------------
# APPLY GENDER INFERENCE TO exploded
# --------------------------------------------------------------------------------------------------
# First names per row, taken from that row's researcher records.
exploded["forenames"] = exploded["researchers"].progress_apply(
    extract_forenames_from_researchers
)

# One inferred gender label per forename, in the same order as `forenames`.
exploded["gender_list"] = exploded["forenames"].progress_apply(
    lambda names: list(map(infer_gender_offline, names))
)


# --------------------------------------------------------------------------------------------------
# AGGREGATE TO L4 SUMMARY
# --------------------------------------------------------------------------------------------------
# Summarise the ten most frequent L4 fields (by number of exploded rows).
top_L4 = exploded["L4_name"].value_counts().head(10).index.tolist()
results = []

for l4 in top_L4:
    sub = exploded[exploded["L4_name"] == l4]

    # Most common parent L2 label among the rows carrying this L4 label.
    L2_name = sub["L2_name"].mode().iloc[0]
    papers  = len(sub)

    citations = sub["times_cited"].astype(int)
    total_citations = citations.sum()
    mean_citations  = citations.mean()

    total_altmetric = sub["altmetric"].sum()

    # h-index: largest h such that h papers each have at least h citations.
    h = 0
    for i, c in enumerate(sorted(citations, reverse=True), start=1):
        if c >= i:
            h = i
        else:
            break

    # researcher metrics
    # Researchers are identified by their "first last" string here, so distinct
    # people sharing a name are counted as one.
    researcher_list = []
    for lst in sub["researchers"]:
        if not isinstance(lst, list):
            continue
        for r in lst:
            # Same tolerance as extract_forenames_from_researchers: ignore
            # entries that are not dicts rather than crashing on `.get`.
            if not isinstance(r, dict):
                continue
            # `or ""` also covers keys that are present with a null value,
            # which would otherwise be formatted as the literal text "None".
            fn = r.get("first_name") or ""
            ln = r.get("last_name") or ""
            nm = f"{fn} {ln}".strip()
            if nm:
                researcher_list.append(nm)

    modal_researcher    = pd.Series(researcher_list).mode().iloc[0] if researcher_list else None
    unique_researchers  = len(set(researcher_list))

    # org metrics
    org_list = []
    for lst in sub["orgs"]:
        if isinstance(lst, list):
            org_list.extend(lst)

    modal_org  = pd.Series(org_list).mode().iloc[0] if org_list else None
    unique_orgs = len(set(org_list))

    # --------- percent female within this L4 field ----------
    # NOTE(review): the denominator is every inferred label, including
    # "unknown", so this is the share of all listed forenames classed female,
    # not the share among names whose gender could be inferred.
    flat_genders = []
    for lst in sub["gender_list"]:
        if isinstance(lst, list):
            flat_genders.extend(lst)

    if flat_genders:
        percent_female = 100 * sum(g == "female" for g in flat_genders) / len(flat_genders)
    else:
        percent_female = 0.0

    results.append([
        l4, L2_name, papers, total_citations, mean_citations,
        total_altmetric, h, modal_researcher, unique_researchers,
        modal_org, unique_orgs, percent_female
    ])

res_df = pd.DataFrame(results, columns=[
    "L4_name","L2_name","papers","total_citations","mean_citations",
    "total_altmetric","h_index","modal_researcher","unique_researchers",
    "modal_org","unique_orgs","percent_female"
]).sort_values(by="papers", ascending=False)

res_df



  0%|          | 0/14870 [00:00<?, ?it/s]

  0%|          | 0/14870 [00:00<?, ?it/s]

Unnamed: 0,L4_name,L2_name,papers,total_citations,mean_citations,total_altmetric,h_index,modal_researcher,unique_researchers,modal_org,unique_orgs,percent_female
0,Clinical Sciences,Biomedical and Clinical Sciences,2635,74813,28.39203,216391.0,121,Jill P Pell,13330,University of Oxford,3206,22.928976
1,Epidemiology,Health Sciences,2425,107235,44.220619,156783.0,151,George G Davey Smith,10967,Harvard University,2679,22.046014
2,Genetics,Biological Sciences,2145,170400,79.440559,154988.0,196,Kari Stefansson,12336,Harvard University,3578,27.725268
3,Public Health,Health Sciences,1067,47390,44.414246,146271.0,99,Carlos A Celis-Morales,4585,University of Oxford,1303,23.698297
4,Cardiovascular Medicine and Haematology,Biomedical and Clinical Sciences,967,32233,33.332989,81468.0,89,Steffen Erhard Petersen,5612,Harvard University,1606,22.479662
5,Biological Psychology,Psychology,817,40563,49.648715,92264.0,90,Andrew Mark Mcintosh,4337,University of Edinburgh,1428,27.033609
6,Nutrition and Dietetics,Biomedical and Clinical Sciences,678,16145,23.812684,70287.0,66,Carlos A Celis-Morales,3351,Harvard University,1143,21.288465
7,Oncology and Carcinogenesis,Biomedical and Clinical Sciences,648,16104,24.851852,32716.0,60,Susanna C Larsson,3749,Harvard University,1166,24.7843
8,Health Services and Systems,Health Sciences,581,18740,32.254733,57967.0,61,Jill P Pell,3447,University of Oxford,1052,23.716134
9,Neurosciences,Biomedical and Clinical Sciences,459,14862,32.379085,46171.0,53,Wei Cheng,2967,University of Oxford,956,24.705605
