In [1]:
# Cell 1 — imports and tqdm setup
from __future__ import annotations
import os, ast, functools
from pathlib import Path
from typing import Optional, List
import pandas as pd
from tqdm.notebook import tqdm  # notebook-friendly tqdm
tqdm.pandas()  # enables .progress_apply on pandas objects

# gender-guesser (aka "gender-guesser" package: gender.Detector)
import gender_guesser.detector as gender

# optional secondary detector
try:
    from gender_detector.gender_detector import GenderDetector
    _HAS_DETECTOR2 = True
except Exception:
    GenderDetector = None  # type: ignore
    _HAS_DETECTOR2 = False

# Cell 2 — detector initialization (global so cached inference can see them)
_DETECTOR1 = gender.Detector(case_sensitive=False)
_DETECTOR2 = GenderDetector('uk') if _HAS_DETECTOR2 else None


# Cell 3 — helpers (unchanged semantics; notebook-safe)
def _map_gender_guesser(label: Optional[str]) -> str:
    """
    Map gender-guesser outputs to {'male','female','unknown'}.
    """
    m = (label or "").strip().lower()
    if m in {"male", "mostly_male"}:
        return "male"
    if m in {"female", "mostly_female"}:
        return "female"
    # 'andy' (androgynous) and anything else -> unknown
    return "unknown"

def _map_gender_detector(label: Optional[str]) -> str:
    """
    Map gender-detector outputs to {'male','female','unknown'}.
    """
    l = (label or "").strip().lower()
    return l if l in {"male", "female"} else "unknown"

@functools.lru_cache(maxsize=8192)
def infer_gender_offline(name: Optional[str]) -> str:
    """
    Deterministic offline gender inference with strict precedence:
        1) gender-guesser (primary)
        2) gender-detector (secondary; only if primary yields 'unknown' and available)
    Returns one of {'male','female','unknown'}.
    """
    if not isinstance(name, str) or not name.strip():
        return "unknown"
    first = name.strip().split()[0]

    # Primary
    g1_raw = _DETECTOR1.get_gender(first)
    g1 = _map_gender_guesser(g1_raw)
    if g1 != "unknown":
        return g1

    # Fallback
    if _HAS_DETECTOR2 and _DETECTOR2 is not None:
        try:
            g2_raw = _DETECTOR2.guess(first) if hasattr(_DETECTOR2, "guess") else _DETECTOR2.get_gender(first)  # type: ignore[attr-defined]
        except Exception:
            g2_raw = None
        g2 = _map_gender_detector(g2_raw)
        if g2 != "unknown":
            return g2

    return "unknown"

def _infer_list_gender(forenames: Optional[List[str]]) -> List[str]:
    """
    Apply infer_gender_offline elementwise to a list of forenames.
    Non-list inputs are treated as empty.
    """
    if not isinstance(forenames, list):
        return []
    # tqdm for lists (gives a small per-row bar; can be verbose — usually better to keep only the Series/DataFrame bars)
    return [infer_gender_offline(x if isinstance(x, str) else "") for x in forenames]

def _counts(gs: List[str]) -> pd.Series:
    return pd.Series({
        "number_male":    sum(g == "male" for g in gs),
        "number_female":  sum(g == "female" for g in gs),
        "number_unknown": sum(g == "unknown" for g in gs),
        "number_people":  len(gs),
    })

def extract_forenames(author_list_str):
    """Safely parse the author list and extract forenames."""
    try:
        authors = ast.literal_eval(author_list_str)
        return [
            a["first_name"]
            for a in authors
            if isinstance(a, dict) and a.get("first_name")
        ]
    except (ValueError, SyntaxError, TypeError):
        return []


# Cell 4 — main function with tqdm at each heavy step
def return_merged(identifier: str) -> pd.DataFrame:
    base_path = f"/home/jinx/Dropbox/ics_work/ics_taxonomies/data/dimensions_outputs/api/raw/{identifier}/202510"

    # Build list of CSVs with a progress bar over directories
    # (os.walk itself is a generator; materialize and wrap with tqdm)
    walk_list = list(os.walk(base_path))
    csv_files = []
    for root, _, files in tqdm(walk_list, desc=f"[{identifier}] scanning dirs"):
        for f in files:
            if f.endswith(".csv"):
                csv_files.append(os.path.join(root, f))

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {base_path}")

    # Read CSVs with a progress bar; attach source filename
    frames = []
    for f in tqdm(csv_files, desc=f"[{identifier}] reading CSVs"):
        frames.append(pd.read_csv(f).assign(source_file=os.path.basename(f)))

    # Concatenate (fast; no need for tqdm here)
    df_all = pd.concat(frames, ignore_index=False)

    # Apply functions with pandas-native progress bars
    df_all["author_forenames"] = df_all["authors"].progress_apply(
        extract_forenames
    )

    df_all["author_genders"] = df_all["author_forenames"].progress_apply(
        _infer_list_gender
    )

    df_all[["number_male",
            "number_female",
            "number_unknown",
            "number_people"]] = df_all["author_genders"].progress_apply(_counts)

    # Optional quick peek without printing entire frame
    return df_all


# Cell 5 — run with progress
df_dois = return_merged("doi")
df_isbn = return_merged("isbn")


[doi] scanning dirs:   0%|          | 0/1 [00:00<?, ?it/s]

[doi] reading CSVs:   0%|          | 0/1615 [00:00<?, ?it/s]

  0%|          | 0/138381 [00:00<?, ?it/s]

  0%|          | 0/138381 [00:00<?, ?it/s]

  0%|          | 0/138381 [00:00<?, ?it/s]

[isbn] scanning dirs:   0%|          | 0/1 [00:00<?, ?it/s]

[isbn] reading CSVs:   0%|          | 0/238 [00:00<?, ?it/s]

  0%|          | 0/23800 [00:00<?, ?it/s]

  0%|          | 0/23800 [00:00<?, ?it/s]

  0%|          | 0/23800 [00:00<?, ?it/s]

In [31]:
df_outputs = pd.read_excel('../../ics_taxonomies/data/raw/raw_ref_outputs_data.xlsx', skiprows=4)

In [32]:
df_outputs_w_doi = pd.merge(df_outputs, df_dois, how='left', left_on='DOI', right_on='doi')

In [44]:
df_outputs_mdoi = pd.merge(df_outputs, df_dois, how='left',
                           left_on='DOI', right_on='doi')
df_outputs_w_doi = df_outputs_mdoi[df_outputs_mdoi['doi'].notnull()]
df_outputs_wo_doi = df_outputs_mdoi[df_outputs_mdoi['doi'].isnull()]

In [45]:
df_outputs_wo_doi = df_outputs_wo_doi.drop(['Unnamed: 0', 'id', 'authors_count', 'category_for_2020',
       'dimensions_url', 'doi', 'isbn', 'year', 'authors', 'source_file',
       'author_forenames', 'author_genders', 'number_male', 'number_female',
       'number_unknown', 'number_people'], axis=1
       )

In [46]:
import re
def normalize_isbn(isbn):
    if pd.isna(isbn):
        return None
    if isinstance(isbn, list):  # normalize each element if list
        return [re.sub(r'[^0-9Xx]', '', str(x)).upper() for x in isbn]
    else:
        return re.sub(r'[^0-9Xx]', '', str(isbn)).upper()

df_outputs_wo_doi['ISBN_norm'] = df_outputs_wo_doi['ISBN'].apply(normalize_isbn)
df_isbn['isbn_norm'] = df_isbn['isbn'].apply(normalize_isbn)
df_isbn_exploded = df_isbn.explode('isbn_norm')
df_outputs_wo_doi_w_isbn = pd.merge(
    df_outputs_wo_doi,
    df_isbn_exploded,
    how='left',
    left_on='ISBN_norm',
    right_on='isbn_norm'
)
df_outputs_wo_doi_w_isbn = df_outputs_wo_doi_w_isbn[df_outputs_wo_doi_w_isbn['isbn_norm'].notnull()].drop('ISBN_norm', axis=1)

In [51]:
print(len(df_outputs_wo_doi_w_isbn), len(df_outputs_w_doi), len(df_outputs))

3553 135203 185286


In [48]:
df_concat = pd.concat([df_outputs_w_doi, df_outputs_wo_doi_w_isbn])

In [56]:
df_concat = df_concat[['Institution UKPRN code', 'Institution name', 'Main panel',
                       'Unit of assessment number', 'Unit of assessment name', 'REF2ID', 'authors',
                       'category_for_2020', 'year', 'doi', 'isbn',
                       'isbn_norm','authors_count', 'author_forenames', 'author_genders',
                       'number_male', 'number_female', 'number_unknown', 'number_people']]

In [57]:
df_concat.to_csv('../../ics_taxonomies/data/dimensions_outputs/outputs_concat_with_any_number_authors.csv')
print(len(df_concat))

136016


In [58]:
df_concat = df_concat[df_concat['number_people']>0]
df_concat.to_csv('../../ics_taxonomies/data/dimensions_outputs/outputs_concat_with_positive_authors.csv')

136016
