In [1]:
import os
import sys
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta, date
import numpy as np
import pandas as pd
from numbers import Number
from jsonschema import ValidationError

# --- your utils ---
sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils\\Questionnaire\\R0\\scripts"))
from common_utils import validate_data, load_schema, get_config
sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils"))
from utilities import connect_DB, createLogger, read_data
from config import r0_json_path, out_json_path, Delivery_log_path, test_server

In [2]:
def rounding(x, dig):
    """
    Round a numeric value to ``dig`` decimal places.

    - ``None`` and non-numeric inputs are handed back untouched.
    - Numeric inputs are converted to ``float`` first; finite values are
      rounded, non-finite ones (NaN / +-inf) are returned as that float.
    """
    if x is not None and isinstance(x, Number):
        as_float = float(x)
        return round(as_float, dig) if np.isfinite(as_float) else as_float
    return x

In [3]:
def _to_float_or_none(x):
    if x in (None, ''):
        return None
    try:
        return float(str(x).strip())
    except Exception:
        return None

In [4]:
# HELPERS

def _safe_parse_date(d):
    """Parse many date inputs to date; returns None on failure."""
    if d is None or d == "" or (isinstance(d, float) and np.isnan(d)):
        return None
    dt = pd.to_datetime(d, errors="coerce")
    if pd.isna(dt):
        return None
    # return a date (not datetime) to keep math clean
    return dt.to_pydatetime().date()

def _add_years(d, years):
    """Add whole years; clamp Feb 29 → Feb 28 when needed."""
    if d is None:
        return None
    try:
        return d.replace(year=d.year + years)
    except ValueError:
        return d.replace(month=2, day=28, year=d.year + years)

def _intervals_overlap(a_start, a_end, b_start, b_end):
    """Half-open overlap: [a_start, a_end) vs [b_start, b_end)."""
    if any(x is None for x in (a_start, a_end, b_start, b_end)):
        return False
    return (a_start < b_end) and (b_start < a_end)
    
def _is_neg(x) -> bool:
    if x is None or x is pd.NA:
        return False
    if isinstance(x, (float, np.floating)) and pd.isna(x):
        return False
    try:
        return float(x) < 0.0
    except Exception:
        return False
        
def _is_zero(x) -> bool:
    if x is None or x is pd.NA:
        return False
    if isinstance(x, (float, np.floating)) and pd.isna(x):
        return False
    try:
        return float(x) == 0.0
    except Exception:
        return False
        
def _is_int_eq(x, target:int) -> bool:
    if x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x)):
        return False
    try:
        return int(x) == target
    except Exception:
        return False

def _norm_site(x):
    """
    Normalize a coded/site string for equality checks. 
      - lower case
      - strip surrounding whitespace
    Returns None for null/NA/non-strings.
    """
    if x is None:
        return None
    try:
        s = str(x).strip().lower()
        return s if s else None
    except Exception:
        return None

In [5]:
"""
Derive ethnicity variables:

  • R0_Ethnicity: 1 = White, 2 = Black, 3 = Asian, 4 = Other / mixed / none-of-these
  • R0_AshkenaziAncestry: 1 = Ashkenazi ancestry present, 0 = no Ashkenazi ancestry, NA = no ethnicity info

-------------------------------------------------------------------------------
1) Collapsed ethnicity (R0_Ethnicity)
-------------------------------------------------------------------------------
Methodology:
  1. Primary (“fast”) source:
     Use the single-choice ethnicity field:
       - R0_EthnGroup

     Map R0_EthnGroup into collapsed groups using:
       - 1 -> White
       - 2 -> Asian
       - 3 -> White        (Jewish is collapsed into White for R0_Ethnicity)
       - 4 -> Black
       - 5 -> Asian
       - 6 -> White
       - 7 -> Black
       - 8 -> Asian
       - 9 -> Other        (“None apply”, but may be overridden using components)
       - 10 -> Black
       - 11 -> Asian
       - 0 -> Other        (Other/unspecified)

     After this step, R0_Ethnicity is populated wherever R0_EthnGroup is present
     and maps to a collapsed group.

  2. Targeted override when “none apply” was selected:
     If R0_EthnGroup == 9 (“none apply”) but any disaggregated ethnicity fields
     are non-missing, replace the fast-derived “Other” with a component-derived
     classification.

     Component fields considered:
       White umbrella:
         - R0_EthnGroupWhite
         - R0_EthnGroupJAshk
         - R0_EthnGroupJSeph
       Black umbrella:
         - R0_EthnGroupBlkCarib
         - R0_EthnGroupBlkAfr
         - R0_EthnGroupBlkOther
       Asian umbrella:
         - R0_EthnGroupIndian
         - R0_EthnGroupPakistani
         - R0_EthnGroupBngldsh
         - R0_EthnGroupChinese
       Other umbrella:
         - R0_EthnGroupNone

     Override classification rule (when overriding is triggered):
       - If exactly one umbrella group is represented by non-missing components,
         assign that collapsed group.
       - If multiple umbrella groups are represented, classify as Other.

  3. Fallback for missing fast ethnicity:
     For participants where R0_Ethnicity is still missing after Steps 1–2
     (e.g., R0_EthnGroup missing or unmapped), derive R0_Ethnicity using the
     same component-field logic as in Step 2:
       - Identify which umbrella group(s) have any non-missing component values
       - If exactly one umbrella group is present, assign it
       - If multiple umbrella groups are present, assign Other
       - If no component information is present, leave as NA

Decisions / cutoffs:
  • “Fast first, components second” approach:
      - R0_EthnGroup is the primary source for collapsed ethnicity.
      - Component fields are used as a fallback, and also as an override only when
        R0_EthnGroup explicitly indicates “none apply” (9) but components contain data.
  • Handling of mixed/multiple umbrella groups:
      - If more than one umbrella group is indicated by components, assign Other (4).
  • Missingness:
      - If neither fast nor component information is present, R0_Ethnicity remains NA
        (no special numeric missing code is applied).

-------------------------------------------------------------------------------
2) Ashkenazi ancestry flag (R0_AshkenaziAncestry)
-------------------------------------------------------------------------------
Methodology:
  1. Identify Ashkenazi ancestry using either:
       - Fast indicator: R0_EthnGroup == 3 (Jewish)
       - Component indicator: R0_EthnGroupJAshk is non-missing

  2. Create the output flag:
       - R0_AshkenaziAncestry = 1 if either indicator is present
       - R0_AshkenaziAncestry = 0 if ethnicity information exists but neither indicator is present
       - R0_AshkenaziAncestry = NA if no ethnicity information exists at all

  3. “Any ethnicity information exists” is defined as:
       - R0_EthnGroup is non-missing OR
       - any of the component ethnicity fields (listed above) is non-missing

Decisions / cutoffs:
  • A participant is considered to have Ashkenazi ancestry if:
      - R0_EthnGroup indicates Jewish (code 3), OR
      - the Ashkenazi component field R0_EthnGroupJAshk is non-missing.
  • If there is no ethnicity data (fast and all components missing), the flag is left NA
    rather than being forced to 0.
"""

'\nDerive ethnicity variables:\n\n  • R0_Ethnicity: 1 = White, 2 = Black, 3 = Asian, 4 = Other / mixed / none-of-these\n  • R0_AshkenaziAncestry: 1 = Ashkenazi ancestry present, 0 = no Ashkenazi ancestry, NA = no ethnicity info\n\n-------------------------------------------------------------------------------\n1) Collapsed ethnicity (R0_Ethnicity)\n-------------------------------------------------------------------------------\nMethodology:\n  1. Primary (“fast”) source:\n     Use the single-choice ethnicity field:\n       - R0_EthnGroup\n\n     Map R0_EthnGroup into collapsed groups using:\n       - 1 -> White\n       - 2 -> Asian\n       - 3 -> White        (Jewish is collapsed into White for R0_Ethnicity)\n       - 4 -> Black\n       - 5 -> Asian\n       - 6 -> White\n       - 7 -> Black\n       - 8 -> Asian\n       - 9 -> Other        (“None apply”, but may be overridden using components)\n       - 10 -> Black\n       - 11 -> Asian\n       - 0 -> Other        (Other/unspecified)\n\

In [6]:
def derive_ethnicity_simple(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add R0_Ethnicity and R0_AshkenaziAncestry to ``df`` and return it.

    R0_Ethnicity (Int64: 1 White, 2 Black, 3 Asian, 4 Other, NA unknown):
      1. Map R0_EthnGroup through FAST_TO_GROUP where the column exists.
      2. Where R0_EthnGroup == 9 ("none apply") and any component field is
         non-missing, replace the value with the component-derived group.
      3. Where the value is still missing, use the component-derived group.
      The component-derived group is the single umbrella group that has a
      non-missing component, or Other (4) when several umbrella groups do.

    R0_AshkenaziAncestry (Int64):
      1  if R0_EthnGroup == 3 or R0_EthnGroupJAshk is non-missing
      0  otherwise, provided R0_EthnGroup or any component field is non-missing
      NA when there is no ethnicity information at all

    Columns that are absent from ``df`` are treated as entirely missing.
    ``df`` is modified in place (the two columns are added or overwritten) and
    the same object is returned.
    """
    fast = "R0_EthnGroup"
    comps = {
        "White": ["R0_EthnGroupWhite", "R0_EthnGroupJAshk", "R0_EthnGroupJSeph"],
        "Black": ["R0_EthnGroupBlkCarib", "R0_EthnGroupBlkAfr", "R0_EthnGroupBlkOther"],
        "Asian": ["R0_EthnGroupIndian", "R0_EthnGroupPakistani",
                  "R0_EthnGroupBngldsh", "R0_EthnGroupChinese"],
        "Other": ["R0_EthnGroupNone"],
    }

    FAST_TO_GROUP = {
        1: 1,  # White
        2: 3,  # Asian
        3: 1,  # Jewish (collapsed into White)
        4: 2,  # Black
        5: 3,  # Asian
        6: 1,  # White
        7: 2,  # Black
        8: 3,  # Asian
        9: 4,  # None apply -> Other (unless overridden by components)
        10: 2, # Black
        11: 3, # Asian
        0: 4,  # Other / unspecified -> Other
    }

    GROUP_CODE = {"White": 1, "Black": 2, "Asian": 3, "Other": 4}

    def _all_false() -> pd.Series:
        return pd.Series(False, index=df.index)

    def _as_flag(mask: pd.Series) -> pd.Series:
        # Comparisons on nullable dtypes can yield NA; treat those as False.
        return mask.fillna(False).astype(bool)

    has_fast = fast in df.columns
    # Computed unconditionally so the Ashkenazi "any info" check still sees
    # the component columns when R0_EthnGroup itself is absent.
    comp_cols = [c for cols in comps.values() for c in cols if c in df.columns]

    # --- component-derived collapsed group (vectorised) ---
    present = {}
    for group, cols in comps.items():
        available = [c for c in cols if c in df.columns]
        present[group] = df[available].notna().any(axis=1) if available else _all_false()
    n_groups = sum(flag.astype(int) for flag in present.values())

    comp_eth = pd.Series(pd.NA, index=df.index, dtype="Int64")
    for group, flag in present.items():
        comp_eth = comp_eth.mask(flag & (n_groups == 1), GROUP_CODE[group])
    # Multiple umbrella groups ticked -> Other
    comp_eth = comp_eth.mask(n_groups > 1, GROUP_CODE["Other"])

    # --- 1) fast variable, 2) "none apply" override, 3) fallback ---
    if has_fast:
        out_eth = df[fast].map(FAST_TO_GROUP).astype("Int64")
        none_apply = _as_flag(df[fast] == 9)
        fast_is_ashk = _as_flag(df[fast] == 3)
    else:
        out_eth = pd.Series(pd.NA, index=df.index, dtype="Int64")
        none_apply = _all_false()
        fast_is_ashk = _all_false()

    use_components = (none_apply | out_eth.isna()) & comp_eth.notna()
    out_eth = out_eth.mask(use_components, comp_eth)

    # --- 4) Ashkenazi ancestry ---
    if "R0_EthnGroupJAshk" in df.columns:
        comp_ashk_present = df["R0_EthnGroupJAshk"].notna()
    else:
        comp_ashk_present = _all_false()
    has_ashk = fast_is_ashk | comp_ashk_present

    # "Any ethnicity info at all" = fast or any component non-missing
    any_info_cols = ([fast] if has_fast else []) + comp_cols
    if any_info_cols:
        any_info = df[any_info_cols].notna().any(axis=1)
    else:
        any_info = _all_false()

    out_aj = pd.Series(pd.NA, index=df.index, dtype="Int64")
    out_aj = out_aj.mask(~has_ashk & any_info, 0)
    out_aj = out_aj.mask(has_ashk, 1)

    df["R0_Ethnicity"] = out_eth
    df["R0_AshkenaziAncestry"] = out_aj

    return df


In [7]:
"""
Derive body size variables (baseline entry + age 20; waist/hip measures are documented in their own section):

  • R0_Height (cm), R0_Weight (kg), R0_BMI
  • R0_Height20 (cm), R0_Weight20 (kg), R0_BMI20


-------------------------------------------------------------------------------
1) Entry height / weight (R0_Height, R0_Weight)
-------------------------------------------------------------------------------
Methodology:
  1. Read baseline (current-at-entry) height inputs from PhysicalDevelopment:
       - R0_CurrentHght_Cm
       - R0_CurrentHght_Ft, R0_CurrentHght_In
     and baseline weight inputs:
       - R0_CurrentWght_Kg
       - R0_CurrentWght_St, R0_CurrentWght_Lbs

  2. Coerce inputs to numeric where possible (blank/None -> missing). Any parsing
     failures are treated as missing.

  3. Height conversion and selection:
       - Imperial height (if provided): ft * 30.48 + inches * 2.54  -> cm
       - Metric height: cm as provided
     Selection rule:
       - Prefer metric height if it is present and > 0
       - Otherwise use imperial-converted height if present and > 0
       - Otherwise height is missing (None)

  4. Weight conversion and selection:
       - Imperial weight (if provided): stone * 6.35029318 + pounds * 0.45359237 -> kg
       - Metric weight: kg as provided
     Selection rule:
       - Prefer metric weight if it is present and > 0
       - Otherwise use imperial-converted weight if present and > 0
       - Otherwise weight is missing (None)

Decisions / cutoffs:
  • Validity check is positivity only:
      - Height must be > 0 cm to be retained
      - Weight must be > 0 kg to be retained
    (No additional upper-bound plausibility filters are applied in the cleaning step.)
  • Metric measurements take priority over imperial-derived values when both exist.

-------------------------------------------------------------------------------
2) Entry BMI (R0_BMI)
-------------------------------------------------------------------------------
Methodology:
  1. Compute BMI as kg / (m^2) using:
       - weight_kg = R0_Weight
       - height_m  = R0_Height / 100
  2. Return BMI rounded to 1 decimal place.

Decisions / cutoffs:
  • If R0_CurrentPreg == 1 (currently pregnant at entry), set:
      - R0_BMI = 999
    (Pregnancy status is stored as R0_PregAtEntry = R0_CurrentPreg.)
  • If height or weight is missing, non-numeric, or non-positive:
      - R0_BMI = None
  • Otherwise:
      - R0_BMI = round(BMI, 1)

-------------------------------------------------------------------------------
3) Age-20 height / weight / BMI (R0_Height20, R0_Weight20, R0_BMI20)
-------------------------------------------------------------------------------
Methodology:
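  1. Read age-20 height inputs from the PhysicalDevelopment RecordedHeights[] entry
     whose R0_RecHght_Num == '20':
       - R0_HghtAge_Cm
       - R0_HghtAge_Ft, R0_HghtAge_In
     and age-20 weight inputs from the RecordedWeights[] entry whose
     R0_RecWght_Num == '20':
       - R0_WghtAge_Kg
       - R0_WghtAge_St, R0_WghtAge_Lbs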

  2. Apply the same cleaning and unit-conversion logic as entry values:
       - Height: prefer positive cm; else ft/in converted to cm
       - Weight: prefer positive kg; else st/lbs converted to kg

  3. Compute BMI20 using the same BMI function as entry BMI, rounded to 1 dp.

Decisions / cutoffs:
  • If R0_AgeEntry < 20, set:
      - R0_BMI20 = 999
    Rationale: participant cannot have an age-20 value within study context if they
    entered younger than 20.
  • If R0_PregAt20 == 1, set:
      - R0_BMI20 = 999
    Rationale: BMI at age 20 is not considered comparable when pregnant at 20.
  • If neither exclusion applies:
      - R0_BMI20 = round(BMI(height20_cm, weight20_kg), 1) if both measures are valid
      - otherwise R0_BMI20 = None (missing/invalid height or weight)
"""

'\nDerive body size variables (baseline entry + age 20 + waist/hip):\n\n  • R0_Height (cm), R0_Weight (kg), R0_BMI\n  • R0_Height20 (cm), R0_Weight20 (kg), R0_BMI20\n\n\n-------------------------------------------------------------------------------\n1) Entry height / weight (R0_Height, R0_Weight)\n-------------------------------------------------------------------------------\nMethodology:\n  1. Read baseline (current-at-entry) height inputs from PhysicalDevelopment:\n       - R0_CurrentHght_Cm\n       - R0_CurrentHght_Ft, R0_CurrentHght_In\n     and baseline weight inputs:\n       - R0_CurrentWght_Kg\n       - R0_CurrentWght_St, R0_CurrentWght_Lbs\n\n  2. Coerce inputs to numeric where possible (blank/None -> missing). Any parsing\n     failures are treated as missing.\n\n  3. Height conversion and selection:\n       - Imperial height (if provided): ft * 30.48 + inches * 2.54  -> cm\n       - Metric height: cm as provided\n     Selection rule:\n       - Prefer metric height if it is 

In [8]:
# =========================
# Body size derivations
# =========================

def _get_recorded_weight(pdata: dict, which: str):
    """
    Look up a recorded weight entry by flag ('Cur', '20', '40', '60')
    in the PhysicalDevelopment record.

    Returns (stone, lbs, kg) or (None, None, None)
    """
    items = pdata.get("RecordedWeights") or []

    for idx, entry in enumerate(items):
        num = entry.get("R0_RecWght_Num")
        if num == which:
            st  = entry.get("R0_WghtAge_St")
            lbs = entry.get("R0_WghtAge_Lbs")
            kg  = entry.get("R0_WghtAge_Kg")
            return st, lbs, kg
    return None, None, None

def _get_recorded_height(pdata: dict, which: str):
    """
    Look up a recorded height entry by flag ('Cur', '20').

    Returns (feet, inches, cm) or (None, None, None)
    """
    items = pdata.get("RecordedHeights") or []

    for idx, entry in enumerate(items):
        num = entry.get("R0_RecHght_Num")
        if num == which:
            ft = entry.get("R0_HghtAge_Ft")
            inc = entry.get("R0_HghtAge_In")
            cm = entry.get("R0_HghtAge_Cm")
            return ft, inc, cm
    return None, None, None
    
def clean_body_size(feet, inches, cm, stone, pounds, kg):
    """
    Clean raw height/weight fields and return ``(weight_kg, height_cm)``.

    Each raw field goes through ``_to_float_or_none``. For each measure:
      • the metric value wins when it is present and > 0
      • otherwise the imperial conversion is used when it is > 0
        (a missing major or minor part counts as 0 once the other is present)
      • otherwise the result is None (no sentinel codes)

    Conversions: ft * 30.48 + in * 2.54 -> cm; st * 6.35029318 + lb * 0.45359237 -> kg.
    """
    def _pick(major_raw, minor_raw, metric_raw, major_factor, minor_factor):
        major = _to_float_or_none(major_raw)
        minor = _to_float_or_none(minor_raw)
        metric = _to_float_or_none(metric_raw)

        imperial = None
        if not (major is None and minor is None):
            major = 0.0 if major is None else major
            minor = 0.0 if minor is None else minor
            imperial = major * major_factor + minor * minor_factor

        # take any positive metric value over imperial; imperial only as fallback
        if metric is not None and 0 < metric:
            return metric
        if imperial is not None and 0 < imperial:
            return imperial
        return None

    height_cm = _pick(feet, inches, cm, 30.48, 2.54)
    weight_kg = _pick(stone, pounds, kg, 6.35029318, 0.45359237)

    return weight_kg, height_cm

def _bmi_value(weight_kg, height_cm):
    """Compute BMI (kg/m^2) if both provided & plausible; else None."""
    try:
        if weight_kg is None or height_cm is None:
            return None
        w = float(weight_kg)
        h_m = float(height_cm) / 100.0
        if w <= 0 or h_m <= 0:
            return None
        bmi = w / (h_m * h_m)
        return round(bmi, 1)
    except Exception:
        return None

In [9]:
"""
Derive waist/hip circumference and waist-hip ratio (WHR):

  • R0_WaistCircum     (cm)
  • R0_HipCircum       (cm)
  • R0_WaistHipRatio   (waist/hip), or 999 if currently pregnant at questionnaire

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: PhysicalDevelopment / physical measurements record (by TCode), using:
    - Waist:
        • R0_WaistCircum_In
        • R0_WaistCircum_Cm
    - Hip:
        • R0_HipCircum_In
        • R0_HipCircum_Cm

  Pregnancy source (by TCode), used only for WHR pregnancy sentinel:
    - R0_CurrentPreg   (1 = Yes)

Decisions / cutoffs:
  • Waist and hip circumferences are derived independently; missingness can differ across them.

-------------------------------------------------------------------------------
2) Convert to cm and select value (_cm_from_inches_or_cm)
-------------------------------------------------------------------------------
Methodology:
  For each measurement (waist and hip), choose a single cm value using:
    1) If imperial inches is present and > 0:
         cm = inches * 2.54
    2) Else if metric cm is present and > 0:
         cm = cm_value
    3) Else:
         cm = None

  Output cm values are rounded to 2 decimal places using rounding(val, 2).

Decisions / cutoffs:
  • Priority rule is imperial-first: inches (converted) overrides cm when both exist.
  • Validity check implemented is lower-bound only:
      - values must be > 0
    (Note: although the helper docstring references an upper bound (<=350), the implemented
     logic does not enforce an upper bound; it only checks val > 0.)

-------------------------------------------------------------------------------
3) Compute WHR (R0_WaistHipRatio)
-------------------------------------------------------------------------------
Methodology:
  1. Determine pregnancy flag for the same TCode:
      - If preg_flag == 1 (currently pregnant):
          R0_WaistHipRatio = 999
      - Else:
          If waist_cm and hip_cm are present and hip_cm > 0:
              R0_WaistHipRatio = rounding(waist_cm / hip_cm, 2)
          Else:
              R0_WaistHipRatio = None

Decisions / cutoffs:
  • Pregnancy sentinel:
      - WHR is set to 999 when R0_CurrentPreg == 1, regardless of waist/hip availability.
  • WHR is only computed when both waist and hip are available and hip > 0.
  • WHR is rounded to 2 decimal places.
"""



'\nDerive waist/hip circumference and waist-hip ratio (WHR):\n\n  • R0_WaistCircum     (cm)\n  • R0_HipCircum       (cm)\n  • R0_WaistHipRatio   (waist/hip), or 999 if currently pregnant at questionnaire\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: PhysicalDevelopment / physical measurements record (by TCode), using:\n    - Waist:\n        • R0_WaistCircum_In\n        • R0_WaistCircum_Cm\n    - Hip:\n        • R0_HipCircum_In\n        • R0_HipCircum_Cm\n\n  Pregnancy source (by TCode), used only for WHR pregnancy sentinel:\n    - R0_CurrentPreg   (1 = Yes)\n\nDecisions / cutoffs:\n  • Waist and hip circumferences are derived independently; missingness can differ across them.\n\n-------------------------------------------------------------------------------\n2) Convert to cm and select value (_cm_from_inches_or_cm)\n-------------

In [10]:
# =========================
# Waist / Hip circumference + WHR
# =========================
def _cm_from_inches_or_cm(inches_val, cm_val):
    """
    Pick a single circumference in cm from an imperial and a metric field.

    Priority:
      1) inches, when present and > 0, converted with in * 2.54
      2) cm, when present and > 0
    Returns the chosen value passed through ``rounding(value, 2)``, or None
    when neither field holds a positive number.

    NOTE(review): no upper plausibility bound is applied here; earlier notes
    mentioned a schema exclusiveMaximum of 350 — confirm whether it should be
    enforced.
    """
    def _coerce(raw):
        # None, empty string and float NaN count as missing; so does anything
        # that cannot be parsed as a float.
        try:
            blank = raw is None or raw == "" or (isinstance(raw, float) and np.isnan(raw))
            return None if blank else float(str(raw).strip())
        except Exception:
            return None

    inches = _coerce(inches_val)
    cm = _coerce(cm_val)

    if inches is not None and inches > 0:
        chosen = inches * 2.54
    elif cm is not None and cm > 0:
        chosen = cm
    else:
        return None

    return rounding(chosen, 2)

def derive_waist_hip(physical_by_tcode, pregnancies_by_tcode):
    """
    Build waist/hip outputs per TCode.

    For every dict-valued record in ``physical_by_tcode`` the result holds:
      - R0_WaistCircum   (cm) from R0_WaistCircum_In / R0_WaistCircum_Cm
      - R0_HipCircum     (cm) from R0_HipCircum_In / R0_HipCircum_Cm
      - R0_WaistHipRatio 999 when the matching pregnancy record has
                         R0_CurrentPreg == 1; otherwise waist/hip rounded to
                         2 dp when both are available and hip > 0; else None
    Records that are not dicts are skipped.
    Returns {TCode: {...}}.
    """
    derived = {}

    for tcode, rec in physical_by_tcode.items():
        if not isinstance(rec, dict):
            continue

        waist_cm = _cm_from_inches_or_cm(
            rec.get("R0_WaistCircum_In"), rec.get("R0_WaistCircum_Cm")
        )
        hip_cm = _cm_from_inches_or_cm(
            rec.get("R0_HipCircum_In"), rec.get("R0_HipCircum_Cm")
        )

        # Pregnancy flag for WHR (1 = Yes); only read from dict-shaped records
        preg_rec = pregnancies_by_tcode.get(tcode)
        preg_flag = preg_rec.get("R0_CurrentPreg", None) if isinstance(preg_rec, dict) else None

        if preg_flag == 1:
            whr = 999
        elif waist_cm is None or hip_cm is None or not (hip_cm > 0):
            whr = None
        else:
            whr = rounding(waist_cm / hip_cm, 2)

        derived[tcode] = {
            "R0_WaistCircum": waist_cm,
            "R0_HipCircum": hip_cm,
            "R0_WaistHipRatio": whr,
        }

    return derived


In [11]:
def calculate_derived_variables_by_tcode(physical_data_by_tcode, pregnancies_data_by_tcode):
    """
    Derive entry height, weight and BMI for every TCode in ``physical_data_by_tcode``.

    Returns a list of dicts with:
      - R0_TCode
      - R0_PregAtEntry  R0_CurrentPreg from the pregnancy record; 2 when the
                        record is missing, not a dict, or lacks the key
      - R0_Height (cm) / R0_Weight (kg) from ``clean_body_size`` (None if invalid)
      - R0_BMI          999 if R0_PregAtEntry == 1, else ``_bmi_value(weight, height)``
    """
    results = []
    # Tolerate a missing pregnancy mapping as well as null / non-dict records;
    # chaining .get() on the looked-up record would raise AttributeError there.
    preg_lookup = pregnancies_data_by_tcode or {}

    for tcode, pdata in physical_data_by_tcode.items():
        preg_rec = preg_lookup.get(tcode)
        if isinstance(preg_rec, dict):
            preg_status = preg_rec.get('R0_CurrentPreg', 2)
        else:
            preg_status = 2

        # cleaned entry height/weight
        feet   = pdata.get('R0_CurrentHght_Ft', '')
        inches = pdata.get('R0_CurrentHght_In', '')
        cm     = pdata.get('R0_CurrentHght_Cm', '')
        stone  = pdata.get('R0_CurrentWght_St', '')
        pounds = pdata.get('R0_CurrentWght_Lbs', '')
        kg     = pdata.get('R0_CurrentWght_Kg', '')

        weight_kg, height_cm = clean_body_size(feet, inches, cm, stone, pounds, kg)

        rec = {
            "R0_TCode": tcode,
            "R0_PregAtEntry": preg_status,
            "R0_Height": height_cm,   # None or cm
            "R0_Weight": weight_kg,   # None or kg
        }

        # Entry BMI rules:
        #  - 999 if currently pregnant
        #  - None if height/weight missing/invalid
        #  - else calculated BMI
        if preg_status == 1:
            rec["R0_BMI"] = 999
        else:
            rec["R0_BMI"] = _bmi_value(weight_kg, height_cm)

        results.append(rec)
    return results

def calculate_bmi20_by_tcode(physical_data_by_tcode,
                              core_map_by_tcode,
                              pregnancies_data_by_tcode):
    """
    Produce R0_Height20, R0_Weight20 and R0_BMI20 per TCode.

    Inputs:
      • physical_data_by_tcode:    dict[TCode] -> PhysicalDevelopment record; the
                                   age-20 values are looked up with
                                   ``_get_recorded_height(pdata, "20")`` and
                                   ``_get_recorded_weight(pdata, "20")``
      • core_map_by_tcode:         dict[TCode] -> dict providing R0_AgeEntry and
                                   R0_PregAt20 (may be None/empty)
      • pregnancies_data_by_tcode: not used by this function

    Rules:
      • R0_BMI20 = 999 if R0_AgeEntry < 20 or R0_PregAt20 == 1
        (an R0_AgeEntry that cannot be converted to float is ignored)
      • otherwise R0_BMI20 = ``_bmi_value(weight20, height20)`` (None if invalid)

    Returns dict[TCode] -> {"R0_TCode", "R0_Height20", "R0_Weight20", "R0_BMI20"}.
    """
    derived = {}
    for tcode, pdata in physical_data_by_tcode.items():
        # Age-20 raw values: (ft, in, cm) and (st, lb, kg)
        height_parts = _get_recorded_height(pdata, "20")
        weight_parts = _get_recorded_weight(pdata, "20")
        w20, h20 = clean_body_size(*height_parts, *weight_parts)

        core = (core_map_by_tcode.get(tcode, {}) if core_map_by_tcode else {}) or {}
        age_entry = core.get("R0_AgeEntry")
        preg_at_20 = core.get("R0_PregAt20")

        # Entered before age 20? A malformed AgeEntry does not trigger the sentinel.
        try:
            entered_under_20 = age_entry is not None and float(age_entry) < 20
        except Exception:
            entered_under_20 = False

        not_applicable = entered_under_20 or preg_at_20 == 1

        derived[tcode] = {
            "R0_TCode": tcode,
            "R0_Height20": h20,  # None or cm
            "R0_Weight20": w20,  # None or kg
            "R0_BMI20": 999 if not_applicable else _bmi_value(w20, h20),
        }
    return derived

In [None]:
"""
Derive pregnancy-at-age-20 flag:

  • R0_PregAt20: 1 = pregnant during age-20 window, 0 = not pregnant during age-20 window,
                 999 = not applicable (entered study before age 20)

-------------------------------------------------------------------------------
1) Pregnancy at age 20 (R0_PregAt20)
-------------------------------------------------------------------------------
Methodology:
  1. Inputs / sources (NO Mailing DB fields; no ADOB/Random):
     Raw derivation output (per person; SHIFTED timeline):
       - DOB        (shifted date of birth)
       - AgeEntry   (age at study entry, years)

     Pregnancies S4 JSON (per person; SHIFTED timeline):
       - R0_CurrentPreg
       - Pregnancies[] episode array with:
           • R0_PregnancyEndDate   (shifted end date)
           • R0_Preg_DurationWks   (duration in weeks; may be missing)
           • R0_Preg_Outcome       (used to infer term duration if duration is missing)

     NOTE: DOB and R0_PregnancyEndDate are shifted by the same participant-specific offset,
           so comparisons are valid without “unshifting”.

  2. Age-at-entry gating (applicability):
     If AgeEntry is present and < 20, the variable is treated as not applicable → 999.

  3. Immediate “currently pregnant at entry” rule:
     If R0_CurrentPreg == 1 AND AgeEntry == 20, classify as pregnant at 20 (1).
     (Direct short-circuit without needing episode dates.)

  4. Construct the age-20 time window (SHIFTED):
     - Parse DOB to a date (invalid/unparseable → treated as missing).
     - Define:
         start20 = DOB + 20 years
         end21   = DOB + 21 years
     - The age-20 window is the half-open interval: [start20, end21)

  5. Evaluate pregnancy episodes for overlap with the age-20 window (SHIFTED):
     For each pregnancy episode:
       a) Parse shifted end date:
            end = R0_PregnancyEndDate
       b) Determine duration days:
            - If R0_Preg_DurationWks is present and numeric:
                dur_days = round(R0_Preg_DurationWks * 7)
            - Else, if outcome indicates pregnancy carried to (near) term
              (e.g., live birth / stillbirth categories), assume 40 weeks (280 days).
            - Otherwise (duration unknown and not clearly term), skip the episode.
       c) Back-calculate start date:
            start = end - dur_days
       d) If the half-open pregnancy interval [start, end) overlaps the age-20 window
          [start20, end21), return 1.

  6. If no qualifying overlap is found across episodes (and no earlier rule triggered),
     return 0.

Decisions / cutoffs:
  • Age-20 window definition:
      - “Pregnant at 20” means any overlap with the interval from 20th birthday up to
        (but not including) 21st birthday.
  • Applicability:
      - If AgeEntry < 20 → R0_PregAt20 = 999.
  • “Currently pregnant” short-circuit:
      - If R0_CurrentPreg == 1 and AgeEntry == 20 → R0_PregAt20 = 1.
  • Missing DOB handling:
      - If DOB cannot be parsed (cannot form the [20,21) window) → default output is 0.
  • Conservative handling of missing duration:
      - Duration is inferred only for outcomes consistent with term; otherwise the episode
        is ignored to avoid false positives.
"""

"\nDerive pregnancy-at-age-20 flag:\n\n  • R0_PregAt20: 1 = pregnant during age-20 window, 0 = not pregnant during age-20 window,\n                 999 = not applicable (entered study before age 20)\n\n-------------------------------------------------------------------------------\n1) Pregnancy at age 20 (R0_PregAt20)\n-------------------------------------------------------------------------------\nMethodology:\n  1. Inputs / sources:\n     Mailing / core (per person):\n       - ADOB            (date of birth)\n       - Random          (per-person date-shift value, in days; used to undo shifting)\n       - R0_AgeEntry     (age at study entry, years)\n\n     Pregnancies (per person):\n       - R0_CurrentPreg\n       - Pregnancies[] episode array with:\n           • R0_PregnancyEndDate   (shifted end date in S4 JSON)\n           • R0_Preg_DurationWks   (duration in weeks)\n\n  2. Age-at-entry gating (applicability):\n     If R0_AgeEntry is present and < 20, the variable is treated as not

In [13]:
# Outcomes that imply the pregnancy was carried to (near) term.
# Based on schema enumDescriptions for R0_Preg_Outcome.
# NOTE(review): the code meanings listed in the trailing comment are not visible in this notebook — confirm against the schema.
_TERM_OUTCOMES = {1, 2, 3, 4, 5, 8, 10}  # live birth(s), stillborn, presumed live birth, twins neither live born

def _derive_R0_PregAt20_for_person(entry_rec: dict, preg_rec: dict) -> int:
    """
    R0_PregAt20:
      1 = pregnant at any time during age 20 (DOB+20y .. DOB+21y),
      0 = not pregnant during that window,
      999 = not applicable (AgeEntry < 20)

    IMPORTANT:
      - Uses SHIFTED DOB from raw derivation output: entry_rec["DOB"]
      - Uses SHIFTED pregnancy end date from S4: ep["R0_PregnancyEndDate"]
      - No ADOB, no Random, no unshifting.
    """

    if not isinstance(entry_rec, dict):
        return 0

    # --- applicability gate ---
    age_entry = entry_rec.get("AgeEntry") or entry_rec.get("R0_AgeEntry")
    try:
        if age_entry is not None and float(age_entry) < 20:
            return 999
    except Exception:
        pass

    # --- current pregnancy at entry shortcut (kept consistent with your original intent) ---
    curr_preg = (preg_rec or {}).get("R0_CurrentPreg")
    try:
        if curr_preg == 1 and age_entry is not None and float(age_entry) == 20:
            return 1
    except Exception:
        pass

    # --- build the age-20 window from SHIFTED DOB (raw derivation output) ---
    dob_shifted = _safe_parse_date(entry_rec.get("DOB") or entry_rec.get("R0_DOB"))
    start20 = _add_years(dob_shifted, 20)
    end21  = _add_years(dob_shifted, 21)
    if not (start20 and end21):
        return 0

    # --- check each episode for overlap with [start20, end21) in the SHIFTED timeline ---
    episodes = (preg_rec or {}).get("Pregnancies") or []
    for ep in episodes:
        if not isinstance(ep, dict):
            continue

        end_shifted = _safe_parse_date(ep.get("R0_PregnancyEndDate"))
        if not end_shifted:
            continue

        dur_days = _preg_duration_days(ep)
        if dur_days is None:
            # duration truly unknown -> cannot form interval safely
            continue

        start_shifted = end_shifted - timedelta(days=dur_days)

        if _intervals_overlap(start_shifted, end_shifted, start20, end21):
            return 1

    return 0


def _preg_duration_days(ep: dict) -> int | None:
    """
    Returns pregnancy duration in days.
    Priority:
      1) R0_Preg_DurationWks if present
      2) If missing AND outcome suggests term -> assume 40 weeks
      3) Else return None (unknown)
    """
    # 1) explicit duration
    wks = ep.get("R0_Preg_DurationWks")
    try:
        if wks is not None:
            return int(round(float(wks) * 7.0))
    except Exception:
        pass

    # 2) infer from outcome
    outcome = ep.get("R0_Preg_Outcome")
    try:
        if outcome is not None and int(outcome) in _TERM_OUTCOMES:
            return 40 * 7
    except Exception:
        pass

    # (optional) you could decide to treat outcome-unknown (14) as term,
    # but I'm leaving it as truly unknown to avoid false positives.
    return None

In [14]:
"""
Derive age at menarche:

  • R0_AgeMenarche (Int64, years)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: Menstrual / menopause history record (MenstrualMenopause schema), using:
    - R0_EverHadPeriods   (1 = Yes, 2 = No, else unknown)
    - R0_FirstPeriodAge   (integer years or null)

Decisions / cutoffs:
  • Age is treated as “integer years” per schema; the derivation attempts int() coercion.

-------------------------------------------------------------------------------
2) Derivation logic (as implemented)
-------------------------------------------------------------------------------
Methodology:
  1. If R0_EverHadPeriods == 2 (“No”):
       - Set R0_AgeMenarche = NA

  2. Else:
       - Attempt to coerce R0_FirstPeriodAge to an integer:
           age_int = int(R0_FirstPeriodAge) if present and not NA
       - If age_int is available:
           return it as R0_AgeMenarche
       - Otherwise:
           set R0_AgeMenarche = NA

Decisions / cutoffs:
  • “Never had periods” rule:
      - EverHadPeriods == 2 forces R0_AgeMenarche to missing (NA).
  • No plausibility range is enforced in the implemented logic:
      - Although the function docstring states “keep only if 7 <= age <= 25”, the current
        implementation returns any successfully parsed integer age without applying that filter.
  • Missingness is stored as pandas NA (nullable Int64), with no special numeric missing code.
"""

'\nDerive age at menarche:\n\n  • R0_AgeMenarche (Int64, years)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: Menstrual / menopause history record (MenstrualMenopause schema), using:\n    - R0_EverHadPeriods   (1 = Yes, 2 = No, else unknown)\n    - R0_FirstPeriodAge   (integer years or null)\n\nDecisions / cutoffs:\n  • Age is treated as “integer years” per schema; the derivation attempts int() coercion.\n\n-------------------------------------------------------------------------------\n2) Derivation logic (as implemented)\n-------------------------------------------------------------------------------\nMethodology:\n  1. If R0_EverHadPeriods == 2 (“No”):\n       - Set R0_AgeMenarche = NA\n\n  2. Else:\n       - Attempt to coerce R0_FirstPeriodAge to an integer:\n           age_int = int(R0_FirstPeriodAge) if present and not NA\n

In [15]:
def derive_R0_AgeMenarche_from_menstrual(rec: dict) -> pd.Series:
    """
    Derive R0_AgeMenarche for a single menstrual-history record.

    Returns a one-element nullable-integer ("Int64") Series labelled
    "R0_AgeMenarche". The value is NA when R0_EverHadPeriods == 2 or when
    R0_FirstPeriodAge is absent, NA, or cannot be converted with int();
    otherwise it is int(R0_FirstPeriodAge). No plausibility range is applied.
    """

    def _wrap(value) -> pd.Series:
        return pd.Series({"R0_AgeMenarche": value}, dtype="Int64")

    # "Never had periods" forces a missing age regardless of the age field.
    if rec.get("R0_EverHadPeriods", None) == 2:
        return _wrap(pd.NA)

    reported_age = rec.get("R0_FirstPeriodAge", None)

    # Any failure while checking or converting the value means "missing".
    try:
        if reported_age is None or pd.isna(reported_age):
            parsed_age = None
        else:
            parsed_age = int(reported_age)
    except Exception:
        parsed_age = None

    return _wrap(pd.NA if parsed_age is None else parsed_age)


def derive_R0_AgeMenarche_df(df: pd.DataFrame) -> pd.Series:
    """
    Vectorised derivation of R0_AgeMenarche for a whole DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns "R0_EverHadPeriods" and "R0_FirstPeriodAge".

    Returns
    -------
    pd.Series
        Nullable-integer ("Int64") Series on df.index. A row is NA when
        R0_EverHadPeriods == 2 or when R0_FirstPeriodAge is missing or not a
        finite number; otherwise it holds the age truncated toward zero
        (the same effect int() has on a numeric age in the per-record function).

    Raises
    ------
    KeyError
        If either required column is absent.
    """
    ever = df["R0_EverHadPeriods"]
    # Non-numeric ages become NaN instead of aborting the whole column.
    age_num = pd.to_numeric(df["R0_FirstPeriodAge"], errors="coerce").astype("float64")

    # A missing "ever had periods" answer is not a "No": treat NA comparisons as False
    # so the boolean mask never carries NA into .loc.
    never_had_periods = (ever == 2).fillna(False).astype(bool)
    has_usable_age = np.isfinite(age_num)
    keep = ~never_had_periods & has_usable_age

    out = pd.Series(pd.NA, index=df.index, dtype="Int64")
    # Truncate before casting: "Int64" refuses non-integral floats.
    out.loc[keep] = np.trunc(age_num.loc[keep]).astype("Int64")
    return out

In [16]:
"""
Derive menopause variables (from menstrual / menopause history + ovary/uterus operations):

  • R0_Menopause        (menopausal status; multi-level code)
  • R0_AgeMenopause     (age at menopause, years; only populated for confirmed postmenopause)
  • R0_MenopauseReason  (derived reason / classification code)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Primary menstrual/menopause history inputs (MenstrualMenopause schema):
    - R0_EverHadPeriods        (ever had periods)
    - R0_PeriodsStoppedPerm    (periods now stopped completely?)
    - R0_HRTPeriodsUnclear     (HRT obscures menopause timing)
    - R0_MenopauseAge          (age when periods stopped completely)
    - R0_PeriodsStoppedReason  (natural / surgery / chemo / DK / other)

  Surgery history inputs (OvaryUterusOperations array):
    - R0_OvaryOp_BothOvaries   (1 = both ovaries removed)
    - R0_OvaryOp_Uterus        (4 = uterus removed / hysterectomy)
    - R0_OvaryOp_Age           (single age at operation)
    - OvaryUterusOperationsExtra[0].R0_OvaryOp_FromAge / ToAge (age range, used if single age missing)

  Additional person-level input:
    - age_entry (R0_AgeEntry), used as “qage” (age at questionnaire/entry)

Decisions / cutoffs:
  • R0_PeriodsStoppedPerm values treated as missing:
      - 0 (“multi”) and 4 (“Not applicable”) are set to missing for menopause status input.
  • All numeric-like fields are coerced to float; non-parsable values become missing (NaN).

-------------------------------------------------------------------------------
2) Constants / age cutoffs used in menopausal classification
-------------------------------------------------------------------------------
Methodology:
  The derivation uses three fixed age thresholds:
    - age_min = 45  : minimum age threshold used for some “assumed” postmenopause rules
    - age_as  = 51  : assumed typical age of menopause for “assumed pre/post” classification
    - age_la  = 62  : maximum age at which someone is still considered plausibly premenopausal

Decisions / cutoffs:
  • If a reported/derived menopause age exceeds age_la, it is capped at age_la.
  • If age at entry (qage) exceeds age_la, menopausal status is forced to postmenopausal.

-------------------------------------------------------------------------------
3) Derive R0_MenopauseReason (new_menoreason)
-------------------------------------------------------------------------------
Methodology:
  Step A — Recode questionnaire “periods stopped reason” into menoreason codes:
    - R0_PeriodsStoppedReason:
        1 -> 1  (natural)
        2 -> 4  (surgery, type unknown)
        3 -> 5  (chemo/radiotherapy)
        4 -> 6  (does not know reason)
        5 -> 7  (other reported reason)
      Missing / 0 -> missing

  Step B — Incorporate HRT “timing unclear” flags:
    - menopause_nk_hrt is set when:
        R0_PeriodsStoppedPerm indicates “Do not know” (3) or “Not sure”/unclear (4 treated missing earlier)
        AND R0_HRTPeriodsUnclear == 1
      (Implemented as: menopause in (3,4) and age_nk_hrt == 1.)

  Step C — Resolve / infer new_menoreason using menopause status, age, and flags:
    1) If “does not know reason” (menoreason == 6):
         - If menopause == 1 (stopped) -> new_menoreason = 1 (treat as natural)
         - If menopause in (2,8)       -> new_menoreason = 16 (premenopausal/peri grouping)
         - If menopause unknown but menopause age is present -> keep as 6

    2) If reason is directly available from questionnaire (1,4,5,7), carry through:
         new_menoreason = menoreason

    3) If HRT obscures menopause timing:
         - If menopause_nk_hrt == 1 and reason is not clearly surgical,
           assign new_menoreason = 9 (“not known: on HRT”)

    4) If age is unknown because of HRT timing:
         - If R0_HRTPeriodsUnclear == 1 and reason is not clearly surgical,
           assign new_menoreason = 13 (“natural menopause on HRT/OC” style category)

    5) If “never had periods”:
         - new_menoreason = 19 (overrides all other logic)

    6) If still missing after steps above, infer using menopause status and menopause age:
         - If menopause == 1 -> new_menoreason = 1 (natural default)
         - If menopause == 2 -> new_menoreason = 16 (premenopausal/peri grouping)
         - If menopause unknown AND menopause age missing -> new_menoreason = 17 (status unknown)
         - If menopause unknown AND menopause age present -> new_menoreason = 6 (does not know reason)

Decisions / cutoffs:
  • “Never had periods” (R0_EverHadPeriods == 2) always forces:
      - R0_MenopauseReason = 19
  • HRT-related assignments (9 or 13) are NOT applied if the person is clearly surgical
    (surgical codes are handled in the surgery adjustment step below).

-------------------------------------------------------------------------------
4) Surgery adjustments (use ovary/uterus operations to refine reason and age)
-------------------------------------------------------------------------------
Methodology:
  Step A — Extract earliest operation ages:
    - op_bothovariesage = earliest age where R0_OvaryOp_BothOvaries == 1
    - op_hysterectomyage = earliest age where R0_OvaryOp_Uterus == 4
    Age source precedence:
      - Use R0_OvaryOp_Age if present
      - Otherwise, use the “Extra” range fields: the midpoint of FromAge and ToAge when both
        are present, else whichever of the two is present

  Step B — If a menopause age is reported, reconcile it with surgery ages:
    - If menopause age exists, set menoage = min(menoage, op_bothovariesage, op_hysterectomyage)
    - If menoage matches (or is +1 year from) the ovary removal age:
        new_menoreason = 2  (bilateral oophorectomy)
    - Else if menoage matches (or is +1 year from) the hysterectomy age:
        new_menoreason = 3  (hysterectomy)

  Step C — If menopause age is missing but reason suggests surgery:
    - If menopause age is missing and menoreason in (2,3,4) (surgery-type categories),
      then set menoage to the earliest available surgery age.
      Assign reason 2 if it matches ovary removal; 3 if it matches hysterectomy.

  Step D — “Upgrade” hysterectomy/unknown surgery to bilateral oophorectomy if evidence exists:
    - If current reason indicates hysterectomy/unknown/other surgery (3,4,18)
      AND there is a valid bilateral oophorectomy age before age_as (51),
      then:
        new_menoreason = 2
        menoage = op_bothovariesage

Decisions / cutoffs:
  • “Match within 1 year” rule:
      - Surgery is considered consistent with menopause age if:
          menoage == op_age OR menoage == op_age + 1
  • When surgery evidence exists, surgical classifications take precedence over non-surgical/HRT-derived reason codes.

-------------------------------------------------------------------------------
5) Cap implausible menopause ages and final menopausal status (R0_Menopause)
-------------------------------------------------------------------------------
Methodology:
  Step A — Cap ages and adjust reason when menopause age exceeds age_la:
    - If menoage > age_la:
        menoage = age_la
        new_menoreason = 1 (natural)
    - Similarly cap surgery-derived lower/upper bounds if they exceed age_la.

  Step B — Derive R0_Menopause (new_menopause):
    Output codes used:
      - 1 = postmenopausal (known/confirmed)
      - 2 = pre/perimenopausal grouping (explicitly indicated)
      - 3 = assumed postmenopausal (age-based assumption)
      - 4 = assumed premenopausal (age-based assumption)
      - 9 = never had periods

    Rules applied in order:
      1) If new_menoreason in (1,2,5,6,13) -> new_menopause = 1 (known post)
      2) If reason is missing but menopause age exists -> new_menopause = 1
      3) If menopause == 2 and reason is missing -> new_menopause = 2
         OR if new_menoreason indicates pre/peri grouping (e.g., 16) -> new_menopause = 2
      4) If reason is “uncertain/ambiguous” category, classify assumed pre/post by age:
         - For stress/illness/other-style reasons (subset), use age_min (45):
             qage < 45 -> 4 (assumed pre), else 3 (assumed post)
         - For hysterectomy/unknown surgery/on hormones/status unknown-style reasons, use age_as (51):
             qage < 51 -> 4 (assumed pre), else 3 (assumed post)
      5) If qage > age_la (62) -> force new_menopause = 1 (post)
      6) If new_menoreason == 19 -> new_menopause = 9 (never had periods)

Decisions / cutoffs:
  • age-based assumptions use two thresholds:
      - 45 for certain “other/stress/illness”-like uncertainty cases
      - 51 for surgery/unknown/hormone/unclear-status uncertainty cases
  • Everyone older than 62 is classified as postmenopausal, regardless of other uncertainty.

-------------------------------------------------------------------------------
6) Derive R0_AgeMenopause (single age output)
-------------------------------------------------------------------------------
Methodology:
  1. Start with the reconciled menopause age (menoage).
  2. Apply a final cap function:
       - new_menoage = min(new_menoage, qage, age_la) when qage is available
       - new_menoage = min(new_menoage, age_la) when qage missing

  3. Only output an age at menopause for confirmed postmenopausal (status == 1):
       - If R0_Menopause in (2,3,4,9), set R0_AgeMenopause = missing
       - Else R0_AgeMenopause = new_menoage

Decisions / cutoffs:
  • R0_AgeMenopause is intentionally suppressed (set to missing) for:
      - 2 (pre/peri), 3 (assumed post), 4 (assumed pre), 9 (never had periods)
    and is only retained when menopausal status is confirmed postmenopausal (1).
  • Age at menopause is capped to not exceed:
      - the participant's age at entry (qage), if known
      - age_la (62) in all cases
"""

"\nDerive menopause variables (from menstrual / menopause history + ovary/uterus operations):\n\n  • R0_Menopause        (menopausal status; multi-level code)\n  • R0_AgeMenopause     (age at menopause, years; only populated for confirmed postmenopause)\n  • R0_MenopauseReason  (derived reason / classification code)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Primary menstrual/menopause history inputs (MenstrualMenopause schema):\n    - R0_EverHadPeriods        (ever had periods)\n    - R0_PeriodsStoppedPerm    (periods now stopped completely?)\n    - R0_HRTPeriodsUnclear     (HRT obscures menopause timing)\n    - R0_MenopauseAge          (age when periods stopped completely)\n    - R0_PeriodsStoppedReason  (natural / surgery / chemo / DK / other)\n\n  Surgery history inputs (OvaryUterusOperations array):\n    - R0_OvaryOp_BothOvaries 

In [17]:
def derive_R0_menopause_from_menstrual(rec: dict, age_entry) -> dict:
    """
    Derive menopausal status, age at menopause and menopause reason for one record.

    Parameters
    ----------
    rec : dict
        Menstrual/menopause record. Keys read here:
          - "R0_PeriodsStoppedPerm", "R0_HRTPeriodsUnclear", "R0_MenopauseAge",
            "R0_PeriodsStoppedReason", "R0_EverHadPeriods"
          - "OvaryUterusOperations": list of dicts, each read for
            "R0_OvaryOp_Age", "R0_OvaryOp_BothOvaries", "R0_OvaryOp_Uterus" and
            "OvaryUterusOperationsExtra" (first element's "R0_OvaryOp_FromAge" /
            "R0_OvaryOp_ToAge").
    age_entry
        Age at entry ("qage"); coerced with float(), None/unparsable -> NaN.

    Returns
    -------
    dict
        {"R0_Menopause", "R0_AgeMenopause", "R0_MenopauseReason"}. NaN values are
        emitted as None and integral floats are converted to int (see clean()).

    Processing order (later steps can overwrite earlier ones):
      1. recode inputs and collect earliest surgery ages,
      2. derive new_menoreason,
      3. reconcile with surgery ages,
      4. cap ages above age_la,
      5. derive new_menopause,
      6. cap and conditionally suppress the age output.
    """

    age_min = 45   # minimum age for some assumed postmenopause classifications
    age_as  = 51   # assumed typical age for menopause
    age_la  = 62   # max age to still be plausibly premenopausal

    def as_num(x):
        # Coerce to float; None and values float() rejects become NaN.
        try:
            if x is None:
                return np.nan
            return float(x)
        except (TypeError, ValueError):
            return np.nan

    # R0_PeriodsStoppedPerm: 1=Yes, 2=No, 3=Do not know, 4=Not applicable, 0=multi
    menopause_raw = rec.get("R0_PeriodsStoppedPerm", None)
    # treat 0 & 4 as missing
    menopause = menopause_raw if menopause_raw not in (0, 4, None) else np.nan

    # R0_HRTPeriodsUnclear: 1=Yes, 0 or null otherwise
    age_nk_hrt = 1.0 if rec.get("R0_HRTPeriodsUnclear", None) == 1 else 0.0

    # Age when periods stopped completely
    menoage = as_num(rec.get("R0_MenopauseAge", None))

    # Recode questionnaire reason (1..5) into menoreason codes (1,4,5,6,7);
    # 0, None and any other value leave menoreason as NaN.
    rsr = rec.get("R0_PeriodsStoppedReason", None)
    menoreason = np.nan
    if rsr in (0, None):
        menoreason = np.nan
    elif rsr == 1:
        menoreason = 1
    elif rsr == 2:
        menoreason = 4
    elif rsr == 3:
        menoreason = 5
    elif rsr == 4:
        menoreason = 6
    elif rsr == 5:
        menoreason = 7

    # Ever had periods?
    ever_periods = rec.get("R0_EverHadPeriods", None)

    # Age at entry
    qage = as_num(age_entry)

    # --- Surgery ages from OvaryUterusOperations -------------------------

    op_bothovariesage = np.nan
    op_hysterectomyage = np.nan

    for op in rec.get("OvaryUterusOperations", []) or []:
        if not isinstance(op, dict):
            continue

        age = as_num(op.get("R0_OvaryOp_Age", None))

        # Try range ages if single-point age is missing
        # Only the first element of the "Extra" list is consulted.
        extra = (op.get("OvaryUterusOperationsExtra") or [{}])
        if age is np.nan or np.isnan(age):
            age_from = as_num(extra[0].get("R0_OvaryOp_FromAge") if extra else None)
            age_to   = as_num(extra[0].get("R0_OvaryOp_ToAge")   if extra else None)
            # Both ends present -> midpoint (may be non-integral); else whichever exists.
            if not np.isnan(age_from) and not np.isnan(age_to):
                age = (age_from + age_to) / 2.0
            elif not np.isnan(age_from):
                age = age_from
            elif not np.isnan(age_to):
                age = age_to

        # Bilateral oophorectomy (both ovaries removed)
        # Keeps the smallest age seen; a NaN age only fills an empty slot.
        if op.get("R0_OvaryOp_BothOvaries", None) == 1:
            if np.isnan(op_bothovariesage) or (not np.isnan(age) and age < op_bothovariesage):
                op_bothovariesage = age

        # Hysterectomy (uterus removed, any extent)
        if op.get("R0_OvaryOp_Uterus", None) == 4:
            if np.isnan(op_hysterectomyage) or (not np.isnan(age) and age < op_hysterectomyage):
                op_hysterectomyage = age

    # Convert numerics
    menopause = as_num(menopause)
    menoage   = as_num(menoage)
    menoreason = as_num(menoreason)
    age_nk_hrt = as_num(age_nk_hrt)
    qage      = as_num(qage)
    op_bothovariesage = as_num(op_bothovariesage)
    op_hysterectomyage = as_num(op_hysterectomyage)

    # Status "do not know" combined with the HRT-unclear flag.
    # A numeric raw value of 4 was already mapped to NaN above.
    menopause_nk_hrt = 1 if (menopause in (3, 4) and age_nk_hrt == 1) else 0

    # --- Step 1: new_menoreason ----------------------------------------

    new_menoreason = np.nan

    # 6: does not know reason → resolve using menopause status and age
    if np.isnan(new_menoreason) and menoreason == 6:
        if menopause == 1:
            new_menoreason = 1
        elif menopause in (2, 8):
            new_menoreason = 16
        elif (menopause in (np.nan, 0, 3, 4) or np.isnan(menopause)) and not np.isnan(menoage):
            new_menoreason = 6

    # Basic 1,4,5,7 from questionnaire
    if np.isnan(new_menoreason) and menoreason in (1, 2, 3, 4, 5, 7):
        # note: 2 & 3 here are never actually set by our recode, but keep for completeness
        new_menoreason = menoreason

    # HRT obscuring natural menopause → “not known: on HRT” (9)
    # This overwrites any non-surgical reason assigned so far.
    if menopause_nk_hrt == 1 and new_menoreason not in (2, 3, 4, 13, 18):
        new_menoreason = 9

    # “Age unknown because started HRT around then” → 13 (natural on HRT/OC),
    # unless definitely surgical (2,3,4,18)
    # Only fills a still-missing reason; an existing reason is kept.
    if age_nk_hrt == 1 and new_menoreason not in (2, 3, 4, 18):
        if np.isnan(new_menoreason):
            new_menoreason = 13

    # Never had periods overrides everything
    if ever_periods == 2:
        new_menoreason = 19

    # If still missing, infer from menopause + age at menopause / age at entry
    if np.isnan(new_menoreason):
        if np.isnan(menoreason) and menopause == 1:
            new_menoreason = 1
        elif menopause == 2:
            new_menoreason = 16
        elif (menopause in (np.nan, 0, 3, 4) or np.isnan(menopause)) and np.isnan(menoage):
            new_menoreason = 17
        elif (menopause in (np.nan, 0, 3, 4) or np.isnan(menopause)) and not np.isnan(menoage):
            new_menoreason = 6

    # --- Step 2: surgery adjustments (op_bothovariesage / op_hysterectomyage) ----

    # Lower/upper bound ages; computed and capped below but not part of the returned dict.
    menoage_l = np.nan
    menoage_u = np.nan

    # Reported menopause age: pull it down to the earliest surgery age, then
    # relabel the reason when it equals a surgery age or that age + 1.
    if not np.isnan(menoage):
        candidate_ages = [x for x in [menoage, op_bothovariesage, op_hysterectomyage] if not np.isnan(x)]
        if candidate_ages:
            menoage = min(candidate_ages)

        if not np.isnan(op_bothovariesage) and (menoage == op_bothovariesage or menoage == op_bothovariesage + 1):
            new_menoreason = 2  # bilateral oophorectomy
        elif not np.isnan(op_hysterectomyage) and (menoage == op_hysterectomyage or menoage == op_hysterectomyage + 1):
            new_menoreason = 3  # hysterectomy

    # No reported age but a surgical questionnaire reason: take the earliest surgery age.
    if np.isnan(menoage) and menoreason in (2, 3, 4):
        candidate_ages = [x for x in [op_hysterectomyage, op_bothovariesage] if not np.isnan(x)]
        if candidate_ages:
            menoage = min(candidate_ages)
            if menoage == op_bothovariesage:
                new_menoreason = 2
            elif menoage == op_hysterectomyage:
                new_menoreason = 3

    # Upgrade to bilateral oophorectomy when one is recorded before age_as.
    # The chained comparison is False when op_bothovariesage is NaN.
    if new_menoreason in (3, 4, 18) and (0 < op_bothovariesage < age_as):
        new_menoreason = 2
        menoage = op_bothovariesage

    if new_menoreason in (3, 4, 18):
        menoage_l = menoage
    elif new_menoreason == 2:
        menoage_u = menoage

    # --- Step 3: cap ages > age_la and adjust reason ------------------------

    if not np.isnan(menoage) and menoage > age_la:
        menoage = min(menoage, age_la)
        new_menoreason = 1

    if not np.isnan(menoage_l) and menoage_l > age_la:
        menoage_l = min(menoage_l, age_la)
    if not np.isnan(menoage_u) and menoage_u > age_la:
        menoage_u = min(menoage_u, age_la)

    if np.isnan(menoage) and not np.isnan(qage) and qage > age_la and new_menoreason == 16:
        new_menoreason = 1   # premenopausal but >62: assume natural menopause

    # --- Step 4: new_menopause (status) --------------------------------------

    new_menopause = np.nan

    # 1) Known postmenopausal
    if new_menoreason in (1, 2, 5, 6, 13):
        new_menopause = 1

    # 2) If no reason but age at menopause given, treat as post
    if np.isnan(new_menoreason) and not np.isnan(menoage):
        new_menopause = 1

    # 3) Known premenopausal, including perimenopausal
    if np.isnan(new_menopause):
        if menopause == 2 and np.isnan(new_menoreason):
            new_menopause = 2
        if new_menoreason in (11, 12, 16):
            new_menopause = 2

    # 4) Assumed premenopausal or postmenopausal, depending on age
    # A missing qage falls into the "assumed post" (3) branch of each test.
    if np.isnan(new_menopause) and new_menoreason in (3, 4, 7, 8, 9, 10, 14, 15, 17, 18):

        # other, stress, eating disorder, illness → use age_min (45) cut-off
        if new_menoreason in (7, 10, 14, 15):
            if not np.isnan(qage) and qage < age_min:
                new_menopause = 4   # assumed pre
            else:
                new_menopause = 3   # assumed post

        # hysterectomy only, surgery type NK, other surgery, other reason,
        # on hormones, on HRT, status NK → use age_as (51) cut-off
        # NOTE(review): code 7 is in both lists; this block runs second, so for 7
        # the age_as result replaces the age_min result — confirm this is intended.
        if new_menoreason in (3, 4, 7, 8, 9, 17, 18):
            if not np.isnan(qage) and qage < age_as:
                new_menopause = 4   # assumed pre
            else:
                new_menopause = 3   # assumed post

    # 5) Everyone older than age_la is postmenopausal
    if not np.isnan(qage) and qage > age_la:
        new_menopause = 1

    # 6) Never had periods
    if new_menoreason == 19:
        new_menopause = 9

    # --- Step 5: single age-at-menopause output -----------------------------

    new_menoage = np.nan if np.isnan(menoage) else float(menoage)

    def cap_age(val):
        # Limit an age to min(val, qage, age_la); qage is skipped when NaN.
        if np.isnan(val):
            return np.nan
        v = val
        if not np.isnan(qage):
            v = min(v, qage)
        v = min(v, age_la)
        return v

    new_menoage = cap_age(new_menoage)

    # Age is suppressed for statuses 2, 3, 4 and 9; it is kept for status 1
    # and also when the status itself is still NaN.
    if new_menopause in (2, 3, 4, 9):
        R0_AgeMenopause = np.nan
    else:
        R0_AgeMenopause = new_menoage

    def clean(v):
        # NaN -> None (JSON null); integral floats -> int; anything else unchanged.
        if isinstance(v, float) and np.isnan(v):
            return None
        return int(v) if isinstance(v, float) and v.is_integer() else v

    return {
        "R0_Menopause": clean(new_menopause),
        "R0_AgeMenopause": clean(R0_AgeMenopause),
        "R0_MenopauseReason": clean(new_menoreason),
    }

In [18]:
"""
Derive oral contraceptive (OC) variables (Contraceptive/HRT section):

  • R0_OralContraceptiveStatus  (0 = Never, 1 = Former, 2 = Current, None = Unknown)
  • R0_AgeStartedOC             (years; integer, None if missing/invalid)
  • R0_AgeLastUsedOC            (years; integer, None if missing/invalid)
  • R0_OCLength                 (total duration of OC use in years; float, 1 dp)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: Contraceptive/HRT questionnaire record (ContraceptiveHRT schema), using:
    - R0_ContracepPillUse              (ever used contraceptive pill; coded)
    - ContracepPill[] episode array:
        • R0_ContracepPill_StartAge    (age started that episode)
        • R0_ContracepPill_StopAge     (age stopped that episode; may be missing)
        • R0_ContracepPill_CurrentUse  (1 = currently using in that episode)
  Plus:
    - age_entry (R0_AgeEntry), used to censor/truncate ongoing use at study entry.

Decisions / cutoffs:
  • All ages are coerced to floats where possible; non-parsable values become missing (None).
  • Negative ages are treated as invalid during interval building (implicitly excluded by checks).

-------------------------------------------------------------------------------
2) Build OC use intervals (“spans”) and starts list
-------------------------------------------------------------------------------
Methodology:
  1. Initialize:
      - spans_raw = []  (list of (start_age, effective_stop_age))
      - starts    = []  (list of start ages)
      - current_flag = False  (set True if any episode indicates current use)

  2. For each ContracepPill episode:
      a) Parse:
          s = StartAge
          t = StopAge
      b) If episode has CurrentUse == 1, set current_flag = True.
      c) If start age is missing, skip the episode.
      d) Define an “effective stop” age (eff_stop):
          - If age_entry is known AND (stop is missing OR stop > age_entry),
              eff_stop = age_entry   (right-censor at entry)
          - Else:
              eff_stop = stop age t
      e) If eff_stop is present and s <= eff_stop:
          - append (s, eff_stop) to spans_raw
          - append s to starts

Decisions / cutoffs:
  • Censoring rule:
      - Any episode extending past study entry is truncated at age_entry for duration calculation.
  • Basic validity rule:
      - Only keep spans where start <= effective stop.

-------------------------------------------------------------------------------
3) Derive OC status (R0_OralContraceptiveStatus)
-------------------------------------------------------------------------------
Methodology:
  1. Read “ever used” indicator:
      - ever = R0_ContracepPillUse
      - said_never = (ever == 2)
      - said_yes   = (ever == 1)

  2. If there are NO episodes and participant explicitly said never:
      - status = 0 (Never)

  3. Otherwise, determine whether OC use covers age at entry:
      - covers_entry = True if any episode satisfies:
          • start_age <= age_entry <= stop_age
            (with stop treated as infinity when missing), OR
          • episode CurrentUse == 1
      - Also treat current_flag (set during episode parsing) as evidence of current use.

  4. Assign status using the following priority:
      - If covers_entry OR current_flag:      status = 2 (Current)
      - Else if spans_raw non-empty:          status = 1 (Former)
      - Else if said_never:                   status = 0 (Never)
      - Else if said_yes:                     status = 1 (Former)
      - Else:                                 status = None (Unknown)

Decisions / cutoffs:
  • Episode evidence overrides the single “ever used” field when available.
  • Missing/ambiguous information:
      - If neither episodes nor ever-used response clearly indicate use, status is None.

-------------------------------------------------------------------------------
4) Age started OC (R0_AgeStartedOC)
-------------------------------------------------------------------------------
Methodology:
  - If any valid start ages were collected:
      R0_AgeStartedOC = floor_to_int(min(starts))
    (Implemented as int(min(starts)) after float parsing.)

Decisions / cutoffs:
  • If computed age is exactly 0, it is set to None (treated as invalid placeholder).
  • If no valid starts exist, value remains None.

-------------------------------------------------------------------------------
5) Age last used OC (R0_AgeLastUsedOC)
-------------------------------------------------------------------------------
Methodology:
  1. If status == 2 (Current):
      - R0_AgeLastUsedOC = int(age_entry)  (censored at entry)

  2. Else if status == 1 (Former) and episodes exist:
      - Collect stop ages t that satisfy:
          • t is present AND s is present AND s <= t
          • and if age_entry is known: t <= age_entry
      - If any qualifying stops exist:
          R0_AgeLastUsedOC = int(max(qualifying_stops))

Decisions / cutoffs:
  • Former users only:
      - Last-used age is defined as the latest recorded stop age that is not after entry.
  • If computed age is exactly 0, it is set to None.
  • If no qualifying stop ages exist, value remains None.

-------------------------------------------------------------------------------
6) Total duration of OC use (R0_OCLength)
-------------------------------------------------------------------------------
Methodology:
  1. If spans_raw is non-empty:
      a) Merge overlapping/adjacent intervals in age space:
           - Sort by start age
           - If next.start <= current.end, merge into one interval
      b) Compute total duration:
           total_years = sum(max(0, end - start) across merged intervals)
      c) Round to 1 decimal place:
           R0_OCLength = round(total_years, 1)

  2. If spans_raw is empty:
      - If status == 0 (Never):  R0_OCLength = 0.0
      - Else (Former/Current/Unknown): R0_OCLength = None

Decisions / cutoffs:
  • Overlap handling:
      - Overlapping intervals are merged so time is not double-counted.
  • “True never” encoding:
      - Only participants classified as Never (status 0) receive 0.0 duration;
        otherwise, missing episode information results in None rather than 0.

-------------------------------------------------------------------------------
7) Final cleanup
-------------------------------------------------------------------------------
Methodology:
  - Store status in output as 0/1/2/None.
  - For R0_AgeStartedOC and R0_AgeLastUsedOC:
      - if computed value equals 0, set to None.

Decisions / cutoffs:
  • Zero ages are treated as invalid placeholders and converted to missing.
"""

'\nDerive oral contraceptive (OC) variables (Contraceptive/HRT section):\n\n  • R0_OralContraceptiveStatus  (0 = Never, 1 = Former, 2 = Current, None = Unknown)\n  • R0_AgeStartedOC             (years; integer, None if missing/invalid)\n  • R0_AgeLastUsedOC            (years; integer, None if missing/invalid)\n  • R0_OCLength                 (total duration of OC use in years; float, 1 dp)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: Contraceptive/HRT questionnaire record (ContraceptiveHRT schema), using:\n    - R0_ContracepPillUse              (ever used contraceptive pill; coded)\n    - ContracepPill[] episode array:\n        • R0_ContracepPill_StartAge    (age started that episode)\n        • R0_ContracepPill_StopAge     (age stopped that episode; may be missing)\n        • R0_ContracepPill_CurrentUse  (1 = currently using in

In [19]:
def _merge_age_intervals(intervals):
    if not intervals:
        return []
    srt = sorted(intervals, key=lambda ab: ab[0])
    merged = [list(srt[0])]
    for s, e in srt[1:]:
        ms, me = merged[-1]
        if s <= me:
            merged[-1][1] = max(me, e)
        else:
            merged.append([s, e])
    return [tuple(x) for x in merged]

def derive_oc_from_contraceptive_section(chrt_rec: dict, age_entry):
    """
    Derive oral contraceptive (OC) status, start / last-use ages and total duration.

    Parameters
    ----------
    chrt_rec : dict
        Contraceptive/HRT record. Reads "R0_ContracepPillUse" (compared against
        1 = yes and 2 = never) and the "ContracepPill" episode list; each episode
        supplies "R0_ContracepPill_StartAge", "R0_ContracepPill_StopAge" and
        "R0_ContracepPill_CurrentUse" (1 = currently using).
    age_entry :
        Age at entry, coerced with _to_float_or_none. Missing or non-finite
        values (NaN / inf) are treated as unknown.

    Returns
    -------
    dict
        "R0_OralContraceptiveStatus" : 0 never, 1 former, 2 current, None unknown
        "R0_AgeStartedOC"            : int or None
        "R0_AgeLastUsedOC"           : int or None
        "R0_OCLength"                : float years (1 dp), 0.0 for never-users, or None

    Missing values are Python None so the JSON output has proper nulls.
    Non-dict items in the episode list are ignored everywhere.
    """
    out = {
        "R0_OralContraceptiveStatus": None,
        "R0_AgeStartedOC": None,
        "R0_AgeLastUsedOC": None,
        "R0_OCLength": None,
    }
    if not isinstance(chrt_rec, dict):
        return out

    ever = chrt_rec.get("R0_ContracepPillUse", None)
    episodes = chrt_rec.get("ContracepPill", []) or []

    age_entry_f = _to_float_or_none(age_entry)
    # A NaN/inf entry age cannot truncate spans and would crash int() below,
    # so handle it exactly like a missing entry age.
    if age_entry_f is not None and not np.isfinite(age_entry_f):
        age_entry_f = None

    # Parse every episode once as (start_age, stop_age, is_current); non-dict
    # items are dropped here so no later step can trip over them.
    parsed = []
    for ep in episodes:
        if not isinstance(ep, dict):
            continue
        parsed.append((
            _to_float_or_none(ep.get("R0_ContracepPill_StartAge")),
            _to_float_or_none(ep.get("R0_ContracepPill_StopAge")),
            _is_int_eq(ep.get("R0_ContracepPill_CurrentUse"), 1),
        ))

    # Build spans truncated at entry age & collect their start ages
    spans_raw, starts = [], []
    current_flag = False
    for s, t, is_current in parsed:
        if is_current:
            current_flag = True
        if s is None:
            continue
        # Open-ended episodes, or episodes running past entry, stop at entry age.
        if age_entry_f is not None and (t is None or t > age_entry_f):
            eff_stop = age_entry_f
        else:
            eff_stop = t
        if eff_stop is not None and s <= eff_stop:
            spans_raw.append((s, eff_stop))
            starts.append(s)

    # Decide status (avoid pd.NA)
    said_never = _is_int_eq(ever, 2)
    said_yes = _is_int_eq(ever, 1)

    if (not episodes) and said_never:
        status = 0
    else:
        # An episode "covers" entry when entry age lies in [start, stop];
        # a missing stop age is treated as still ongoing.
        covers_entry = age_entry_f is not None and any(
            s is not None
            and s <= age_entry_f <= (t if t is not None else float("inf"))
            for s, t, _ in parsed
        )
        if covers_entry or current_flag:
            status = 2
        elif spans_raw:
            status = 1
        elif said_never:
            status = 0
        elif said_yes:
            status = 1
        else:
            status = None  # unknown

    # Age started
    if starts:
        out["R0_AgeStartedOC"] = int(min(starts))

    # Age last used: current → AgeEntry; former → latest stop ≤ entry
    if status == 2:
        out["R0_AgeLastUsedOC"] = int(age_entry_f) if age_entry_f is not None else None
    elif status == 1 and age_entry_f is not None:
        ended_before = [
            t
            for s, t, _ in parsed
            if s is not None and t is not None and s <= t <= age_entry_f
        ]
        if ended_before:
            out["R0_AgeLastUsedOC"] = int(max(ended_before))

    # Length (merge + sum) so overlapping episodes are not double-counted
    if spans_raw:
        merged = _merge_age_intervals(spans_raw)
        total = sum(max(0.0, e - s) for s, e in merged)
        out["R0_OCLength"] = round(total, 1)
    else:
        out["R0_OCLength"] = 0.0 if status == 0 else None

    out["R0_OralContraceptiveStatus"] = status  # 0/1/2 or None

    # Zero ages are placeholders, not real ages → store as missing.
    for k in ("R0_AgeStartedOC", "R0_AgeLastUsedOC"):
        if _is_zero(out.get(k)):
            out[k] = None

    return out

In [20]:
"""
Derive parity and breastfeeding variables (from pregnancy history section + pregnancy episodes):

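  • R0_Parous                 (SAS-style parity status: 1 = parous, 0 = nulliparous,
                               -1 = never pregnant, 9 = parity unknown, None = insufficient evidence)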
  • R0_Parity                 (count of live-birth pregnancies, with -1 for never pregnant)
  • R0_AgeBirthFirst          (age at first live birth; years)
  • R0_AgeBirthLast           (age at last live birth; years)
  • R0_BreastfeedingDuration  (total breastfeeding duration across live births; weeks)
  • R0_Breastfed              (breastfed indicator: 1 = yes, 0 = no, 999 = not applicable, None = unknown)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Person / questionnaire-level inputs (Pregnancy section record):
    - R0_EverPregnant        (coded yes/no/don't know)
    - R0_TotalPregnancies    (reported number of pregnancies)
    - Pregnancies[]          (episode array)

  Pregnancy episode inputs (Pregnancies[*]):
    - R0_Preg_Outcome            (pregnancy outcome code)
    - R0_PregnancyEndDate        (end date; used to compute age at birth)
    - R0_Preg_BreastfeedingWks   (breastfeeding duration for that pregnancy; weeks)

  DOB input:
    - dob_value (date of birth), used to compute ages at live births.

Decisions / cutoffs:
  • Only pregnancy episodes with outcomes in the “live birth” set contribute to:
      - parity count
      - age at first/last birth
      - breastfeeding duration
  • Pregnancy episodes with “unknown” outcome contribute to an “unknown outcome count”
    used to classify parity status as unknown (R0_Parous = 9).

-------------------------------------------------------------------------------
2) Outcome groupings used for classification
-------------------------------------------------------------------------------
Methodology:
  Pregnancy outcomes are grouped into:

  A) Live birth outcomes:
      _LIVE_BIRTH_OUTCOMES = {1, 2, 3, 5, 10}

  B) Unknown outcome:
      _UNKNOWN_PREG_OUTCOMES = {14}

  C) Other (non-live-birth, known outcomes):
      - do not contribute to parity count
      - do not contribute to breastfeeding duration

Decisions / cutoffs:
  • If an episode outcome is missing (None) it is treated like “unknown outcome”
    for parity-status purposes (counts toward unknown_outcome_count).

-------------------------------------------------------------------------------
3) Derive parity status (R0_Parous)
-------------------------------------------------------------------------------
Methodology:
  1. Build episode-level evidence:
     Iterate over Pregnancies[] episodes:
       - Flag any_preg_episodes = True if at least one valid dict episode exists.
       - For each episode:
           • Parse outcome = int(R0_Preg_Outcome) when possible
           • Parse end_date = date(R0_PregnancyEndDate) when possible
           • If outcome is a live-birth code:
               - append end_date into parous_rows
               - append breastfeeding weeks into bf_weeks
           • Else if outcome is unknown (14) OR missing:
               - increment unknown_outcome_count
           • Else:
               - ignore for parity and breastfeeding

     Define:
       parity_count = len(parous_rows)  (number of live-birth pregnancies)

  2. Assign R0_Parous using a SAS-aligned logic:
     A) If parity_count > 0:
          R0_Parous = 1   (parous; at least one live birth)

     B) Else (no observed live births):
        i) Identify a “never pregnant” pattern:
           never_preg_pattern is True if either:
             - R0_EverPregnant explicitly indicates “No” (ever_preg == 2), OR
             - ever_preg is in (2, 3, None) AND
               (R0_TotalPregnancies is missing or 0) AND
               there are no pregnancy episodes (any_preg_episodes == False)

           If never_preg_pattern:
              R0_Parous = -1   (never pregnant)

        ii) Otherwise, check evidence of ever being pregnant:
            ever_preg_evidence is True if any of:
              - R0_EverPregnant == 1 (“Yes”)
              - R0_TotalPregnancies > 0
              - any_preg_episodes == True

            If ever_preg_evidence:
              - If unknown_outcome_count > 0:
                  R0_Parous = 9   (ever pregnant but parity unknown)
              - Else:
                  R0_Parous = 0   (nulliparous: ever pregnant, no live births)

            Else:
              R0_Parous = None    (insufficient or inconsistent evidence)

Decisions / cutoffs:
  • “Never pregnant” is a distinct code:
      - R0_Parous = -1 only when the “never pregnant pattern” is met.
  • Parity unknown is explicitly coded:
      - If there is evidence of ever pregnancy AND unknown outcomes exist AND
        there are no observed live births, then R0_Parous = 9.
  • If there is no coherent evidence either way, R0_Parous remains None.

-------------------------------------------------------------------------------
4) Derive parity count variable (R0_Parity)
-------------------------------------------------------------------------------
Methodology:
  R0_Parity is derived from R0_Parous and parity_count:

    - If R0_Parous == -1:
         R0_Parity = -1

    - If R0_Parous in (0, 1, 9):
         R0_Parity = parity_count
         (This is 0 for nulliparous and also 0 for parity-unknown cases where no
          live births are observed; it is >=1 for parous.)

    - If R0_Parous is None:
         R0_Parity = None

Decisions / cutoffs:
  • R0_Parity uses -1 to explicitly represent “never pregnant” (aligned to R0_Parous = -1).
  • For R0_Parous = 9 (parity unknown), R0_Parity is still set to parity_count (typically 0),
    rather than a separate missing code.

-------------------------------------------------------------------------------
5) Non-parous handling: NA codes for birth- and breastfeeding-derived variables
-------------------------------------------------------------------------------
Methodology:
  If R0_Parous != 1 (i.e., not confirmed parous / no live birth), then:
    - R0_AgeBirthFirst          = 999
    - R0_AgeBirthLast           = 999
    - R0_BreastfeedingDuration  = 999
    - R0_Breastfed              = 999
  and the derivation stops.

Decisions / cutoffs:
  • 999 is used as an explicit “not applicable” code for variables that can only be
    defined for those with at least one live birth.

-------------------------------------------------------------------------------
6) Age at first and last live birth (R0_AgeBirthFirst, R0_AgeBirthLast)
-------------------------------------------------------------------------------
Methodology:
  Only for R0_Parous == 1:

  1. Collect end dates from live-birth pregnancies:
      dates = [end_date for each live-birth episode if end_date is not missing]

  2. If DOB and at least one valid end date exist:
      first_birth = min(dates)
      last_birth  = max(dates)

      Compute age (in whole years) at each date using:
        age = (year difference) minus 1 if birthday has not yet occurred that year.

  3. Post-cleaning:
      - If computed age is 0 or negative, set it to None.

Decisions / cutoffs:
  • No censoring by entry date is applied to pregnancy end dates in this derivation
    (explicitly noted in code as a future enhancement).
  • “Clearly invalid” ages (<= 0) are set back to None rather than kept.

-------------------------------------------------------------------------------
7) Breastfeeding duration and indicator (R0_BreastfeedingDuration, R0_Breastfed)
-------------------------------------------------------------------------------
Methodology:
  Only for R0_Parous == 1:

  1. Aggregate breastfeeding duration (weeks):
     - Extract breastfeeding weeks for each live-birth episode (bf_weeks list).
     - Convert parsable values to float and sum them:
         total_weeks = max(0.0, sum(parsed_week_values))
     - If there are no parsable values, total is treated as missing.

  2. Set outputs:
     A) If total_weeks is not missing:
         - R0_BreastfeedingDuration = total_weeks
         - R0_Breastfed = 1 if total_weeks > 0 else 0

     B) If total_weeks is missing:
         - If at least one breastfeeding value was reported AND all reported values are 0:
             R0_Breastfed = 0
           Else:
             R0_Breastfed = None
         - R0_BreastfeedingDuration remains missing (None)

Decisions / cutoffs:
  • Total breastfeeding duration is stored in WEEKS (sum of pregnancy-level weeks).
  • Negative totals are prevented via max(0.0, sum(...)).
  • If breastfeeding information is missing or inconsistent, R0_Breastfed is left as None
    (rather than forcing 0), unless there is explicit evidence that all reported values are 0.
"""

"\nDerive parity and breastfeeding variables (from pregnancy history section + pregnancy episodes):\n\n  • R0_Parous                 (SAS-style parity status)\n  • R0_Parity                 (count of live-birth pregnancies, with -1 for never pregnant)\n  • R0_AgeBirthFirst          (age at first live birth; years)\n  • R0_AgeBirthLast           (age at last live birth; years)\n  • R0_BreastfeedingDuration  (total breastfeeding duration across live births; weeks)\n  • R0_Breastfed              (breastfed indicator)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Person / questionnaire-level inputs (Pregnancy section record):\n    - R0_EverPregnant        (coded yes/no/don't know)\n    - R0_TotalPregnancies    (reported number of pregnancies)\n    - Pregnancies[]          (episode array)\n\n  Pregnancy episode inputs (Pregnancies[*]):\n    -

In [21]:
def _is_live_birth(outcome) -> bool:
    try:
        return int(outcome) in _LIVE_BIRTH_OUTCOMES
    except Exception:
        return False

_LIVE_BIRTH_OUTCOMES = {1, 2, 3, 5, 10}
_UNKNOWN_PREG_OUTCOMES = {14}  # outcome unknown

def _age_on(dob: date | None, on: date | None) -> int | None:
    if not dob or not on:
        return None
    yrs = on.year - dob.year
    return yrs - 1 if (on.month, on.day) < (dob.month, dob.day) else yrs

def _weeks_to_total(weeks) -> float | None:
    vals = []
    for w in (weeks or []):
        if w is None or (isinstance(w, float) and pd.isna(w)):
            continue
        try:
            vals.append(float(w))
        except Exception:
            pass
    if not vals:
        return None
    total_weeks = max(0.0, sum(vals))
    return total_weeks

def _as_int(value) -> int | None:
    try:
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return None
        return int(value)
    except Exception:
        return None

def derive_parity_and_breastfeeding(preg_section: dict, dob_value) -> dict:
    """
    Derive parity, age at first/last live birth and breastfeeding variables.

    Parameters
    ----------
    preg_section : dict
        Pregnancy section record. Reads "R0_EverPregnant", "R0_TotalPregnancies"
        and the "Pregnancies" episode list; each episode supplies
        "R0_Preg_Outcome", "R0_PregnancyEndDate" and "R0_Preg_BreastfeedingWks".
    dob_value :
        Date of birth; a `date` is used as-is, anything else is parsed with
        _safe_parse_date.

    Returns
    -------
    dict
        "R0_Parous"  : 1 parous, 0 nulliparous, -1 never pregnant,
                       9 ever pregnant but outcome unknown, None undetermined
        "R0_Parity"  : number of live-birth episodes, -1 for never pregnant,
                       None when R0_Parous is None
        "R0_AgeBirthFirst" / "R0_AgeBirthLast" : whole years, None if not
                       derivable, 999 when R0_Parous != 1
        "R0_BreastfeedingDuration" : total weeks, None if no usable value,
                       999 when R0_Parous != 1
        "R0_Breastfed" : 1 / 0 from the total, None if no usable value,
                       999 when R0_Parous != 1

    If `preg_section` is not a dict every output stays None.
    """
    out = {
        "R0_Parous": None,
        "R0_Parity": None,
        "R0_AgeBirthFirst": None,
        "R0_AgeBirthLast": None,
        "R0_BreastfeedingDuration": None,
        "R0_Breastfed": None,
    }

    if not isinstance(preg_section, dict):
        return out

    # Questionnaire-level info
    ever_preg = _as_int(preg_section.get("R0_EverPregnant"))
    total_preg = _as_int(preg_section.get("R0_TotalPregnancies"))

    pregnancies = preg_section.get("Pregnancies") or []
    if not isinstance(pregnancies, list):
        pregnancies = []

    dob = _safe_parse_date(dob_value) if not isinstance(dob_value, date) else dob_value

    live_birth_dates = []  # one entry per live-birth episode; None if end date is missing
    bf_weeks = []          # raw breastfeeding weeks, aligned with live_birth_dates
    unknown_outcome_count = 0
    any_preg_episodes = False

    for p in pregnancies:
        if not isinstance(p, dict):
            continue
        any_preg_episodes = True

        outcome = _as_int(p.get("R0_Preg_Outcome"))

        # NOTE: currently not censoring by entry date – can be added later
        if outcome in _LIVE_BIRTH_OUTCOMES:
            # IMPORTANT: use the original key so ages work
            live_birth_dates.append(_safe_parse_date(p.get("R0_PregnancyEndDate")))
            bf_weeks.append(p.get("R0_Preg_BreastfeedingWks"))
        elif outcome in _UNKNOWN_PREG_OUTCOMES or outcome is None:
            unknown_outcome_count += 1
        # any other known outcome does not contribute to parity or breastfeeding

    parity_count = len(live_birth_dates)

    # --- Determine R0_Parous (SAS-style) ---
    parous_value: int | None

    if parity_count > 0:
        # At least one live birth
        parous_value = 1
    else:
        # No live births observed
        never_preg_pattern = (
            ever_preg == 2  # explicitly "No"
            or (
                ever_preg in (2, 3, None)
                and (total_preg is None or total_preg == 0)
                and not any_preg_episodes
            )
        )

        if never_preg_pattern:
            parous_value = -1  # never pregnant
        else:
            ever_preg_evidence = (
                ever_preg == 1
                or (total_preg is not None and total_preg > 0)
                or any_preg_episodes
            )

            if not ever_preg_evidence:
                parous_value = None  # no consistent evidence either way
            elif unknown_outcome_count > 0:
                parous_value = 9     # ever pregnant but parity unknown
            else:
                parous_value = 0     # nulliparous: ever pregnant, no live births

    out["R0_Parous"] = parous_value

    # --- R0_Parity with -1 for never pregnant ---
    if parous_value == -1:
        out["R0_Parity"] = -1
    elif parous_value in (0, 1, 9):
        out["R0_Parity"] = parity_count
    else:
        out["R0_Parity"] = None

    # If no parous pregnancy (no live birth), use 999 NA codes and stop
    if parous_value != 1:
        out["R0_AgeBirthFirst"] = 999
        out["R0_AgeBirthLast"] = 999
        out["R0_BreastfeedingDuration"] = 999
        out["R0_Breastfed"] = 999
        return out

    # At least one live birth → derive ages at first and last live birth
    known_dates = [d for d in live_birth_dates if d is not None]
    if dob and known_dates:
        out["R0_AgeBirthFirst"] = _age_on(dob, min(known_dates))
        out["R0_AgeBirthLast"] = _age_on(dob, max(known_dates))

    # Breastfeeding (only for live-birth pregnancies); values are in WEEKS
    total_weeks = _weeks_to_total(bf_weeks)
    if total_weeks is not None:
        out["R0_BreastfeedingDuration"] = total_weeks
        out["R0_Breastfed"] = 1 if total_weeks > 0 else 0
    # else: no usable value was reported, so duration and indicator both stay
    # None. An explicit "all reported values are 0" case always produces a
    # total of 0.0 and is therefore handled by the branch above; re-parsing the
    # raw values here could only raise on unparsable text.

    # Clearly invalid ages (<= 0) are stored as missing.
    for k in ("R0_AgeBirthFirst", "R0_AgeBirthLast"):
        if _is_zero(out.get(k)) or _is_neg(out.get(k)):
            out[k] = None

    return out


In [22]:
"""
Derive benign breast disease (BBD) variable (from BreastDisease section only):

  • R0_BBD: 1 = evidence of benign breast disease / abnormality, 0 = explicitly no, None = unknown

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: BreastDisease questionnaire record (BreastDisease schema), using:
    - R0_BBD_Abnormality        (ever had breast abnormality / benign breast disease; coded)
    - BBD[] episode array:
        • R0_BBD_Type           (episode “type” field; presence indicates an episode was recorded)
        • R0_BBD_ProcedureType  (procedure type recorded within an episode)
    - BBD_Extra[] procedure/flag fields, read from inside each BBD[] episode (binary tick boxes):
        • R0_BBD_Biopsy
        • R0_BBD_Lump
        • R0_BBD_Mastec
        • R0_BBD_BothRem
        • R0_BBD_Other
    - Additional “more/further” tick boxes:
        • R0_BBD_Further
        • R0_BBD_MoreThan4

Decisions / cutoffs:
  • The derivation intentionally treats multiple sources of “evidence” as sufficient to
    classify as BBD=1 (see section 3), even if the main abnormality question is missing.

-------------------------------------------------------------------------------
2) Primary “ever abnormality” response (R0_BBD_Abnormality)
-------------------------------------------------------------------------------
Methodology:
  1. Read R0_BBD_Abnormality (“ever had abnormality/BBD”):
       - If R0_BBD_Abnormality == 1 (Yes) → set R0_BBD = 1 and stop.
       - Otherwise continue to evidence-based checks.

Decisions / cutoffs:
  • This is the highest-priority rule:
      - A direct “Yes” sets R0_BBD to 1 regardless of episode/flag completeness.

-------------------------------------------------------------------------------
3) Evidence-based classification from episodes / procedures / extra flags
-------------------------------------------------------------------------------
Methodology:
  If the direct “Yes” rule does not apply, then classify as BBD=1 if ANY of the following
  evidence is present:

  A) Any BBD episode recorded (BBD[]):
     - At least one BBD[] item contains either:
         • R0_BBD_Type present, OR
         • R0_BBD_ProcedureType present

  B) Any procedure/operation ticked (BBD_Extra[] flags):
     - At least one BBD_Extra[] record has any of the following == 1:
         • R0_BBD_Biopsy
         • R0_BBD_Lump
         • R0_BBD_Mastec
         • R0_BBD_BothRem
         • R0_BBD_Other

  C) “Further / more than 4” boxes ticked:
     - R0_BBD_Further == 1 OR R0_BBD_MoreThan4 == 1

  If any of A-C is true → set R0_BBD = 1 and stop.

Decisions / cutoffs:
  • Presence-based evidence rule:
      - The code treats “any recorded episode detail” or “any relevant procedure tick”
        as sufficient evidence of BBD (R0_BBD=1), even if the main abnormality question
        is not explicitly answered “Yes”.
  • Tick-box interpretation:
      - Any of the listed BBD_Extra[] flags set to 1 is interpreted as evidence of BBD.

-------------------------------------------------------------------------------
4) Explicit “No” rule (R0_BBD_Abnormality)
-------------------------------------------------------------------------------
Methodology:
  If no evidence-based rule sets BBD=1, then:
    - If R0_BBD_Abnormality == 2 (No) → set R0_BBD = 0

Decisions / cutoffs:
  • “No” is only used after checking for evidence:
      - This avoids classifying someone as BBD=0 if they have episode/procedure evidence
        but selected “No” (or left the main question blank).

-------------------------------------------------------------------------------
5) Missingness and unknown classification
-------------------------------------------------------------------------------
Methodology:
  If none of the above rules apply (no “Yes”, no evidence, no explicit “No”):
    - Set R0_BBD = None (unknown)

Decisions / cutoffs:
  • No special numeric missing code is used for R0_BBD; missingness is stored as None.
  • If the BreastDisease record is absent or not a dict-like structure, R0_BBD remains None.
"""


'\nDerive benign breast disease (BBD) variable (from BreastDisease section only):\n\n  • R0_BBD: 1 = evidence of benign breast disease / abnormality, 0 = explicitly no, None = unknown\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: BreastDisease questionnaire record (BreastDisease schema), using:\n    - R0_BBD_Abnormality        (ever had breast abnormality / benign breast disease; coded)\n    - BBD[] episode array:\n        • R0_BBD_Type           (episode “type” field; presence indicates an episode was recorded)\n        • R0_BBD_ProcedureType  (procedure type recorded within an episode)\n    - BBD_Extra[] procedure/flag fields (binary tick boxes):\n        • R0_BBD_Biopsy\n        • R0_BBD_Lump\n        • R0_BBD_Mastec\n        • R0_BBD_BothRem\n        • R0_BBD_Other\n    - Additional “more/further” tick boxes:\n        • R0_BB

In [23]:
def derive_R0_BBD_from_breast_disease(bd_rec: dict) -> dict:
    """
    Derive R0_BBD (benign breast disease) from a BreastDisease record.

    Rules, in priority order:
      1. "R0_BBD_Abnormality" == 1                         -> 1
      2. Any evidence in the record                        -> 1
           - a "BBD" episode with "R0_BBD_Type" or "R0_BBD_ProcedureType" set
           - a "BBD_Extra" entry (inside a "BBD" episode) with any procedure
             flag equal to 1
           - "R0_BBD_Further" or "R0_BBD_MoreThan4" equal to 1
      3. "R0_BBD_Abnormality" == 2                         -> 0
      4. Otherwise                                         -> None

    Returns {"R0_BBD": 1 | 0 | None}; a non-dict record yields None.
    """
    result = {"R0_BBD": None}
    if not isinstance(bd_rec, dict):
        return result

    # 1) Direct answer to the abnormality question
    answer = bd_rec.get("R0_BBD_Abnormality")
    if answer == 1:
        result["R0_BBD"] = 1
        return result

    # 2) Evidence from episodes, nested procedure flags and the extra tick boxes
    flag_keys = ("R0_BBD_Biopsy", "R0_BBD_Lump", "R0_BBD_Mastec", "R0_BBD_BothRem", "R0_BBD_Other")
    valid_episodes = [ep for ep in (bd_rec.get("BBD") or []) if isinstance(ep, dict)]

    has_episode = any(
        ep.get("R0_BBD_Type") is not None or ep.get("R0_BBD_ProcedureType") is not None
        for ep in valid_episodes
    )
    has_proc_flag = any(
        extra.get(key, None) == 1
        for ep in valid_episodes
        for extra in (ep.get("BBD_Extra") or [])
        if isinstance(extra, dict)
        for key in flag_keys
    )
    ticked_more = (
        bd_rec.get("R0_BBD_Further") == 1 or bd_rec.get("R0_BBD_MoreThan4") == 1
    )

    if has_episode or has_proc_flag or ticked_more:
        result["R0_BBD"] = 1
    elif answer == 2:
        # 3) Explicit "No", only honoured when no evidence contradicts it
        result["R0_BBD"] = 0

    # 4) Otherwise unknown (None)
    return result

In [24]:
"""
Derive diabetes variables (from medical history illnesses record):

  • R0_DiabetesStatus   (1 = Yes, 0 = No, None = Unknown/ambiguous)
  • R0_AgeDiabetes      (age at diabetes diagnosis in years; int, None if not applicable/invalid)
  • R0_DiabetesInsulin  (1 = treated with insulin, 0 = not treated with insulin, None = unknown/not applicable)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: MH_Illnesses_anon record (single person-level medical history record), using:
    - R0_Diabetes            ∈ {1: Yes, 2: No, 0: Multiple, None}
    - R0_DiabetesAge         ∈ {integer >= 0, None}
    - R0_TreatedWithInsulin  ∈ {1: Yes, 2: No, 0: Multiple, None}

Decisions / cutoffs:
  • Values are treated as coded questionnaire responses (not free text).
  • Non-parsable ages are treated as missing.

-------------------------------------------------------------------------------
2) Diabetes status (R0_DiabetesStatus)
-------------------------------------------------------------------------------
Methodology:
  1. Read R0_Diabetes (ever had diabetes).
  2. Map to a binary status:
       - If R0_Diabetes == 1 → R0_DiabetesStatus = 1
       - If R0_Diabetes == 2 → R0_DiabetesStatus = 0
       - Otherwise (0 “Multiple”, missing, or any non-standard value) → R0_DiabetesStatus = None

Decisions / cutoffs:
  • Code 0 (“Multiple”) is not forced to Yes or No; it is treated as ambiguous and mapped to None.
  • Missing/unknown is kept as None (no special numeric missing code is used).

-------------------------------------------------------------------------------
3) Insulin treatment flag (R0_DiabetesInsulin)
-------------------------------------------------------------------------------
Methodology:
  1. Read R0_TreatedWithInsulin.
  2. Map to a binary insulin flag, and only retain it when diabetes status indicates diabetes:
       - If diabetes status is not Yes (R0_DiabetesStatus != 1) → R0_DiabetesInsulin = None
       - Else if R0_TreatedWithInsulin == 1 → R0_DiabetesInsulin = 1
       - Else if R0_TreatedWithInsulin == 2 → R0_DiabetesInsulin = 0
       - Else → R0_DiabetesInsulin = None

Decisions / cutoffs:
  • Insulin use is treated as not applicable unless the participant is classified as diabetic (status == 1).
  • Code 0 (“Multiple”) is treated as ambiguous and mapped to None.

-------------------------------------------------------------------------------
4) Age at diabetes diagnosis (R0_AgeDiabetes)
-------------------------------------------------------------------------------
Methodology:
  1. Read R0_DiabetesAge and attempt to coerce to integer.
  2. Apply applicability and validity rules:
       - If R0_DiabetesStatus != 1 → R0_AgeDiabetes = None
       - If age cannot be parsed → None
       - If age < 0 → None
       - Otherwise keep parsed integer age.

Decisions / cutoffs:
  • Age is only retained for confirmed diabetes cases (status == 1).
  • Negative ages are treated as invalid and set to missing.
  • No upper-bound plausibility filter is applied in the derivation step (only non-negative constraint).
"""


'\nDerive diabetes variables (from medical history illnesses record):\n\n  • R0_DiabetesStatus   (1 = Yes, 0 = No, None = Unknown/ambiguous)\n  • R0_AgeDiabetes      (age at diabetes diagnosis in years; int, None if not applicable/invalid)\n  • R0_DiabetesInsulin  (1 = treated with insulin, 0 = not treated with insulin, None = unknown/not applicable)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: MH_Illnesses_anon record (single person-level medical history record), using:\n    - R0_Diabetes            ∈ {1: Yes, 2: No, 0: Multiple, None}\n    - R0_DiabetesAge         ∈ {integer >= 0, None}\n    - R0_TreatedWithInsulin  ∈ {1: Yes, 2: No, 0: Multiple, None}\n\nDecisions / cutoffs:\n  • Values are treated as coded questionnaire responses (not free text).\n  • Non-parsable ages are treated as missing.\n\n-----------------------------

In [25]:
def derive_diabetes_from_mh(mh: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """
    Derive diabetes status, age at diagnosis and insulin treatment.

    Inputs (from MH_Illnesses_anon record 'mh'):
      - R0_Diabetes            ∈ {1:Yes, 2:No, 0:Multiple, None}
      - R0_DiabetesAge         ∈ {integer >=0, None}
      - R0_TreatedWithInsulin  ∈ {1:Yes, 2:No, 0:Multiple, None}

    Rules:
      - Status: 1 -> 1, 2 -> 0, anything else (including 0 = "Multiple") -> None.
      - Age and insulin are only meaningful for confirmed diabetes (status == 1);
        for every other status both are returned as None.
      - Age must coerce to an int and be non-negative, otherwise None.
        No upper-bound plausibility filter is applied here.

    Returns:
      {
        "R0_DiabetesStatus": {1,0,None},
        "R0_AgeDiabetes": {int,None},
        "R0_DiabetesInsulin": {1,0,None}
      }
      A non-dict or empty 'mh' yields all three values as None.
    """
    out: Dict[str, Optional[int]] = {
        "R0_DiabetesStatus": None,
        "R0_AgeDiabetes": None,
        "R0_DiabetesInsulin": None,
    }

    if not isinstance(mh, dict) or not mh:
        return out

    def yn_map(v):
        """Map questionnaire Yes/No codes (1/2) to 1/0; everything else is None."""
        if v == 1:
            return 1
        if v == 2:
            return 0
        # 0 (multiple) or anything else -> None
        return None

    status = yn_map(mh.get("R0_Diabetes"))
    out["R0_DiabetesStatus"] = status

    # Age at diagnosis and insulin use are not applicable unless the
    # participant is classified as diabetic, so stop here for any other status.
    if status != 1:
        return out

    out["R0_DiabetesInsulin"] = yn_map(mh.get("R0_TreatedWithInsulin"))

    raw_age = mh.get("R0_DiabetesAge")
    try:
        age = int(raw_age) if raw_age is not None else None
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, NaN, infinity or an unsupported type -> missing.
        age = None

    if age is not None and age >= 0:
        out["R0_AgeDiabetes"] = age

    return out

In [26]:


# # Acceptable exact site labels (normalized) for breast cancer
# _BREAST_SITES_EQ = {"breast", "male breast"}

# # Sibling relation codes (from your schema)
# # 1: Brother, 3: Sister, 5: Sex unknown  → FULL siblings
# # 2: Half-brother, 4: Half-sister       → HALF siblings (EXCLUDE)
# _FULL_SIB_REL = {1, 3, 5}

# def derive_R0_FamHistBC_from_cancer_relatives_eq(cr: dict) -> dict:
#     """
#     R0_FamHistBC (first-degree family history of breast cancer) mirroring SAS-style logic:
#       • Uses equality on coded/normalized site labels (no regex).
#       • Counts first-degree relatives: parents, FULL siblings, and children.
#       • Excludes half-siblings from the first-degree roll-up.
#       • Does not try to infer from 'any cancer' flags when site is unknown.

#     Returns: {"R0_FamHistBC": 0|1|None}
#       1  → any first-degree relative has site exactly 'breast' (or 'male breast')
#       0  → all available first-degree sites provide no breast match AND
#             available first-degree “any cancer” flags are explicitly 'No'
#       None → unknown / no usable information / only 'Yes' without sites
#     """
#     out = {"R0_FamHistBC": None}
#     if not isinstance(cr, dict):
#         return out

#     # ---- Parents: arrays with coded/normalized site in the *CancerType* field in your schema
#     parent_sites = []
#     for arr_key, type_key in (("FatherCancers", "R0_FatherCancerType"),
#                               ("MotherCancers", "R0_MotherCancerType")):
#         arr = cr.get(arr_key) or []
#         if isinstance(arr, list):
#             for it in arr:
#                 if isinstance(it, dict):
#                     parent_sites.append(_norm_site(it.get(type_key)))

#     parent_bc = any(s in _BREAST_SITES_EQ for s in parent_sites if s is not None)

#     # Parent any-cancer flags (site-agnostic; only used to possibly set 0 when all are explicit No)
#     father_any = cr.get("R0_FatherCancerHistory", None)  # 1 Yes, 2 No, 3 DK
#     mother_any = cr.get("R0_MotherCancerHistory", None)  # 1 Yes, 2 No, 3 DK

#     # ---- Siblings: include FULL siblings only
#     sib_sites = []
#     sib_any_flags = []
#     for s in (cr.get("Siblings") or []):
#         if not isinstance(s, dict):
#             continue
#         rel = s.get("R0_SiblingRelation", None)
#         if rel in _FULL_SIB_REL:  # exclude half-sibs
#             sib_sites.append(_norm_site(s.get("R0_SiblingCancerType")))
#             sib_any_flags.append(s.get("R0_SiblingCancerHistory"))  # 1 Yes, 2 No, 3 DK

#     sib_bc = any(s in _BREAST_SITES_EQ for s in sib_sites if s is not None)

#     # ---- Children
#     child_sites = []
#     for c in (cr.get("Children") or []):
#         if isinstance(c, dict):
#             child_sites.append(_norm_site(c.get("R0_ChildCancerType")))
#     child_bc = any(s in _BREAST_SITES_EQ for s in child_sites if s is not None)

#     child_any = cr.get("R0_ChildCancerHistory", None)  # 2 Yes, 1 No, 3 N/A in your schema

#     # ---- Decision: prefer coded site equality; do not use regex/contains
#     if parent_bc or sib_bc or child_bc:
#         out["R0_FamHistBC"] = 1
#         return out

#     # If we reach here, there is no explicit breast site among first-degree relatives.
#     # Decide if we can say 0 based on explicit 'No' first-degree any-cancer flags:
#     any_parent_no = (father_any == 2) or (mother_any == 2)
#     any_sib_no = any(f == 2 for f in sib_any_flags if f is not None)
#     child_no = (child_any == 1)  # in your schema: 1=No, 2=Yes
#     if any_parent_no or any_sib_no or child_no:
#         out["R0_FamHistBC"] = 0
#         return out

#     # Otherwise: unknown (e.g., Yes but site not given, DK, or no info)
#     out["R0_FamHistBC"] = None
#     return out

In [27]:
"""
Derive hormone replacement therapy (HRT) variables (from ContraceptiveHRT section + episodes):

  • R0_HRTStatus    (0 = Never, 1 = Former, 2 = Current, None = Unknown/ambiguous)
  • R0_HRTStartAge  (age at first HRT use in years; int, None if missing/invalid)
  • R0_HRTStopAge   (age at last HRT use in years; int, None if missing/invalid)
  • R0_HRTDuration  (total duration of HRT use in years; float, 1 dp; 0.0 for true never)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: ContraceptiveHRT questionnaire record (ContraceptiveHRT schema), using:
    - R0_HRT_Use          (ever used HRT; coded)
    - HRT[] episode array:
        • R0_HRT_StartAge
        • R0_HRT_StopAge
        • R0_HRT_CurrentUse  (1 = currently using in that episode)
  Plus:
    - age_entry (R0_AgeEntry), used to censor/truncate ongoing HRT use at study entry.

Decisions / cutoffs:
  • All ages are coerced to floats where possible; non-parsable values become missing (None).
  • Negative ages are treated as invalid and excluded from span-building and stop-age selection.

-------------------------------------------------------------------------------
2) Build HRT use intervals (“spans”) and starts list
-------------------------------------------------------------------------------
Methodology:
  1. Initialise containers:
      - starts = []       (all valid start ages)
      - spans_raw = []    (list of (start_age, effective_stop_age))
      - current_flag = False  (True if any episode has CurrentUse == 1)

  2. For each HRT episode:
      a) Parse:
          s = R0_HRT_StartAge
          t = R0_HRT_StopAge
      b) If CurrentUse == 1, set current_flag = True.
      c) If start age s is missing, skip the episode.
      d) Append s to starts (for later first-use age derivation).
      e) Define an “effective stop” age (eff_stop):
          - Start with eff_stop = t
          - If age_entry is known AND (eff_stop is missing OR eff_stop > age_entry):
                eff_stop = age_entry   (right-censor at entry)
      f) If eff_stop is still missing, the episode cannot contribute to duration (skip).
      g) If s <= eff_stop, append (s, eff_stop) to spans_raw.

Decisions / cutoffs:
  • Censoring rule:
      - Episodes that are open-ended or extend beyond entry are truncated at age_entry.
  • Duration requires a bounded interval:
      - Episodes without a usable stop age after censoring do not contribute to duration.

-------------------------------------------------------------------------------
3) Derive HRT status (R0_HRTStatus)
-------------------------------------------------------------------------------
Methodology:
  1. Read “ever used” indicator:
      - ever = R0_HRT_Use
      - said_never = (ever == 2)   # questionnaire: 2 = "No"
      - said_yes   = (ever == 1)   # questionnaire: 1 = "Yes"

  2. Base status logic:
      - If there are no episodes AND said_never:
            status = 0 (Never)

      - Otherwise, determine whether HRT use covers age at entry:
          covers_entry = True if age_entry known and any episode satisfies:
              • start_age <= age_entry <= stop_age (stop treated as infinity when missing), OR
              • episode CurrentUse == 1

        Then assign:
          • If covers_entry OR current_flag:  status = 2 (Current)
          • Else if spans_raw non-empty:      status = 1 (Former)
          • Else if said_never:               status = 0 (Never)
          • Else if said_yes:                 status = 1 (Former; “yes” but episodes unusable)
          • Else:                             status = None (Unknown)

  3. Store output:
      - R0_HRTStatus = status

Decisions / cutoffs:
  • Episode evidence is prioritised:
      - “Current” is assigned when coverage-at-entry is detected from episodes, even if ever-use
        is missing/ambiguous.
  • If the questionnaire says “Yes” but no usable episode information exists:
      - classify as Former (1) rather than Unknown.
  • If neither episodes nor ever-use response provide clear evidence:
      - status remains None.

-------------------------------------------------------------------------------
4) Age at first and last HRT use (R0_HRTStartAge, R0_HRTStopAge)
-------------------------------------------------------------------------------
Methodology:
  A) Age at first HRT use (StartAge):
     - If starts list is non-empty:
          R0_HRTStartAge = int(min(starts))

  B) Age at last HRT use (StopAge):
     - If status == 2 (Current):
          R0_HRTStopAge = int(age_entry)  (censored at entry)

     - Else if status == 1 (Former):
          • Collect stop ages t from episodes where:
              - s and t are present
              - t is non-negative
              - s <= t
              - if age_entry is known: t <= age_entry
          • If any qualifying stops exist:
              R0_HRTStopAge = int(max(qualifying_stops))

Decisions / cutoffs:
  • StopAge for current users is always set to age_entry (right-censoring).
  • StopAge for former users is the latest valid stop age not after entry.
  • If a derived StartAge or StopAge equals literal 0, it is normalised to None
    (treated as an invalid placeholder).

-------------------------------------------------------------------------------
5) Post-hoc consistency rules (status checks against ages and episode count)
-------------------------------------------------------------------------------
Methodology:
  If age_entry is known (ae):
    1) If derived StopAge > ae:
         - set R0_HRTStopAge = None
         - set R0_HRTStatus  = None
       (This indicates inconsistent data, because spans should have been truncated at entry.)

    2) Otherwise refine status based on derived ages and episode count:
         - If StopAge == ae OR StartAge == ae:  force status = Current (2)
         - Else if StopAge < ae:                force status = Former (1)
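         - Else if StopAge is missing AND the HRT episode list is empty:
                                                force status = Never (0)
         - Otherwise the status derived in section 3 is left unchanged.

  If age_entry is unknown, none of these rules are applied.

-------------------------------------------------------------------------------
6) Total HRT duration (R0_HRTDuration)
-------------------------------------------------------------------------------
Methodology:
  - If spans_raw is non-empty: merge overlapping spans, sum (stop - start) over the
    merged spans, and round to 1 decimal place.
  - If spans_raw is empty: R0_HRTDuration = 0.0 when the final status is Never (0),
    otherwise None.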
"""

'\nDerive hormone replacement therapy (HRT) variables (from ContraceptiveHRT section + episodes):\n\n  • R0_HRTStatus    (0 = Never, 1 = Former, 2 = Current, None = Unknown/ambiguous)\n  • R0_HRTStartAge  (age at first HRT use in years; int, None if missing/invalid)\n  • R0_HRTStopAge   (age at last HRT use in years; int, None if missing/invalid)\n  • R0_HRTDuration  (total duration of HRT use in years; float, 1 dp; 0.0 for true never)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: ContraceptiveHRT questionnaire record (ContraceptiveHRT schema), using:\n    - R0_HRT_Use          (ever used HRT; coded)\n    - HRT[] episode array:\n        • R0_HRT_StartAge\n        • R0_HRT_StopAge\n        • R0_HRT_CurrentUse  (1 = currently using in that episode)\n  Plus:\n    - age_entry (R0_AgeEntry), used to censor/truncate ongoing HRT use at 

In [28]:
def derive_hrt_from_contraceptive_section(chrt_rec: dict, age_entry):
    """
    Derive HRT status, age at first/last use and total duration up to study entry.

    Parameters
    ----------
    chrt_rec : dict
        ContraceptiveHRT record. Reads ``R0_HRT_Use`` (1 = Yes, 2 = No) and the
        ``HRT`` episode list, whose items carry ``R0_HRT_StartAge``,
        ``R0_HRT_StopAge`` and ``R0_HRT_CurrentUse`` (1 = currently using).
    age_entry
        Age at study entry, used to right-censor ongoing use. Anything that does
        not parse to a finite number (None, '', text, NaN, inf) is treated as
        unknown.

    Returns
    -------
    dict
        ``R0_HRTStatus``   : 0 = never, 1 = former, 2 = current, None = unknown
        ``R0_HRTStartAge`` : int or None (minimum valid start age)
        ``R0_HRTStopAge``  : int or None (entry age for current users, latest
                             valid stop age not after entry for former users)
        ``R0_HRTDuration`` : float (1 dp) over merged spans; 0.0 for never-users;
                             None when no bounded span exists otherwise.
        All four values are None when ``chrt_rec`` is not a dict.
    """
    out = {
        "R0_HRTStatus": None,
        "R0_HRTStartAge": None,
        "R0_HRTStopAge": None,
        "R0_HRTDuration": None,
    }
    if not isinstance(chrt_rec, dict):
        return out

    def _finite_age(value):
        """Parse an age to float; non-parsable or non-finite (NaN/inf) -> None."""
        parsed = _to_float_or_none(value)
        if parsed is None or not np.isfinite(parsed):
            return None
        return parsed

    ever = chrt_rec.get("R0_HRT_Use", None)
    episodes = chrt_rec.get("HRT", []) or []
    # A NaN/inf entry age would otherwise crash the int() conversions below.
    age_entry_f = _finite_age(age_entry)

    spans_raw, starts = [], []
    parsed_eps = []  # (start, stop) per dict episode, as parsed (no validity filter)
    current_flag = False

    # Build truncated spans (for duration) & collect starts
    for ep in episodes:
        if not isinstance(ep, dict):
            continue

        # Record current use BEFORE validating the start age: an episode flagged
        # as current still means "using at entry" even if its start age is
        # missing or invalid.
        if _is_int_eq(ep.get("R0_HRT_CurrentUse"), 1):
            current_flag = True

        s = _finite_age(ep.get("R0_HRT_StartAge"))
        t = _finite_age(ep.get("R0_HRT_StopAge"))
        parsed_eps.append((s, t))

        # discard clearly invalid starts
        if s is None or _is_neg(s):
            continue
        # treat negative stops as missing
        if t is not None and _is_neg(t):
            t = None

        # record any plausible start age
        starts.append(s)

        # duration only counts time up to entry:
        # open-ended or post-entry episodes stop at AgeEntry
        eff_stop = t
        if age_entry_f is not None and (eff_stop is None or eff_stop > age_entry_f):
            eff_stop = age_entry_f

        # if we still don't have a stop, we cannot bound this episode
        if eff_stop is None:
            continue
        if s <= eff_stop:
            spans_raw.append((s, eff_stop))

    # Decide status
    said_never = _is_int_eq(ever, 2)  # questionnaire: 2 = "No"
    said_yes = _is_int_eq(ever, 1)    # questionnaire: 1 = "Yes"

    status = None
    if (not episodes) and said_never:
        status = 0
    else:
        # Does any episode's [start, stop] window contain the entry age?
        # A missing stop is treated as open-ended (infinity).
        # NOTE(review): unlike span-building, this check does not discard
        # negative start/stop ages — confirm whether that is intended.
        covers_entry = False
        if age_entry_f is not None:
            for s, t in parsed_eps:
                if s is None:
                    continue
                end_for_status = t if t is not None else float("inf")
                if s <= age_entry_f <= end_for_status:
                    covers_entry = True
                    break

        if covers_entry or current_flag:
            status = 2  # current user at entry
        elif spans_raw:
            status = 1  # some use in the past
        elif said_never:
            status = 0  # explicitly never
        elif said_yes:
            status = 1  # said yes but episodes unusable → treat as former
        else:
            status = None  # unknown

    out["R0_HRTStatus"] = status

    # Age at first HRT use
    if starts:
        out["R0_HRTStartAge"] = int(min(starts))

    # Age at last HRT use
    if status == 2:
        # current → last use age is age at entry (censored)
        out["R0_HRTStopAge"] = int(age_entry_f) if age_entry_f is not None else None
    elif status == 1:
        # former → latest well-formed stop age that is not after entry
        stops_before = []
        for s, t in parsed_eps:
            if s is None or t is None or _is_neg(t):
                continue
            if s <= t and (age_entry_f is None or t <= age_entry_f):
                stops_before.append(t)

        if stops_before:
            out["R0_HRTStopAge"] = int(max(stops_before))

    # Post-hoc consistency rules based on derived ages and episode count
    start_age = out.get("R0_HRTStartAge")
    stop_age = out.get("R0_HRTStopAge")
    n_episodes = len(episodes) if isinstance(episodes, list) else 0
    ae = int(age_entry_f) if age_entry_f is not None else None

    if ae is not None:
        # 1) If HRT age last is greater than age at entry, treat as invalid:
        #    last age and status should be null (data should have been truncated at entry).
        if stop_age is not None and stop_age > ae:
            out["R0_HRTStopAge"] = None
            out["R0_HRTStatus"] = None
            status = None
        else:
            # 2) If last age == age at entry OR first age == age at entry → current
            if ((stop_age is not None and stop_age == ae) or
                    (start_age is not None and start_age == ae)):
                out["R0_HRTStatus"] = 2
                status = 2

            # 3) If last age < age at entry → former
            elif stop_age is not None and stop_age < ae:
                out["R0_HRTStatus"] = 1
                status = 1

            # 4) If last age missing and number of HRT episodes missing → never
            # NOTE(review): this also turns "said yes, no episodes" (status 1)
            # and unknown (None) into never — confirm this is intended.
            elif stop_age is None and n_episodes == 0:
                out["R0_HRTStatus"] = 0
                status = 0

    # Total duration (years), merged across overlapping spans
    if spans_raw:
        merged = _merge_age_intervals(spans_raw)
        total = sum(max(0.0, e - s) for s, e in merged)
        out["R0_HRTDuration"] = round(total, 1)
    else:
        out["R0_HRTDuration"] = 0.0 if status == 0 else None

    # normalise any literal zeros for ages to None
    for k in ("R0_HRTStartAge", "R0_HRTStopAge"):
        if _is_zero(out.get(k)):
            out[k] = None

    return out

In [29]:
"""
Derive alcohol variables (from AlcoholSmokingDiet_anon):

  • R0_AlcoholStatus        (0 = never, 1 = former, 2 = current, None = unknown/inconsistent)
  • R0_AgeStartedDrinking   (int years, None if missing/invalid)
  • R0_AgeStoppedDrinking   (int years, None if missing/invalid)
  • R0_AlcoholUnitsPerWeek  (float units/week at age of entry; CURRENT drinkers only)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: AlcoholSmokingDiet_anon record (per participant), using:
    - R0_AlcoholRegularUse        (coded indicator: regular drinking)
    - R0_AlcoholCurrentUse        (coded indicator: current drinking)
    - R0_AlcoholStartAge          (age started drinking)
    - R0_AlcoholStopAge           (age stopped drinking)
    - AlcoholConsumption[] array with one or more rows, each containing:
        • R0_AlcoholConsumption_Num  (age band label: "18-24", "25-49", "50+")
        • R0_AlcoholBeer
        • R0_AlcoholRedWine
        • R0_AlcoholWhiteWine
        • R0_AlcoholSherry
        • R0_AlcoholSpirits
        • R0_AlcoholOther
  Plus:
    - age_entry (R0_AgeEntry), used for (a) “ever” inference from ages and (b) selecting
      units/week at entry band for current drinkers.

Decisions / cutoffs:
  • Only AlcoholConsumption rows with band exactly in {"18-24","25-49","50+"} are used.
  • All numeric-like values are coerced to float where possible; non-parsable values are treated as missing.

-------------------------------------------------------------------------------
2) Clean start/stop drinking ages
-------------------------------------------------------------------------------
Methodology:
  1. Parse R0_AlcoholStartAge and R0_AlcoholStopAge to floats.
  2. Retain only plausible values:
      - If 0 < age < 120, keep and cast to int.
      - Otherwise set to None.

Decisions / cutoffs:
  • Validity filter is strictly:
      - 0 < age < 120
    (0, negatives, and extreme values are treated as invalid and set to None.)
  • A final “zero normalisation” step converts any literal zero values back to None.

-------------------------------------------------------------------------------
3) Derive weekly alcohol units per band (from AlcoholConsumption[])
-------------------------------------------------------------------------------
Methodology:
  1. For each AlcoholConsumption episode row:
     - Extract beverage counts (missing values treated as 0.0):
         beer, red, white, sherry, spirits, other
  2. Convert beverage counts to units/week using fixed multipliers:
       units = (
         3.0 * beer +
         2.1 * (red + white) +
         1.0 * sherry +
         1.4 * spirits +
         1.5 * other
       )
     Then round to 1 decimal place.

  3. If multiple rows exist for the same band, keep the MAX units value observed for that band:
       units_by_band[band] = max(units_by_band[band], units)

Decisions / cutoffs:
  • Multipliers are fixed constants (aligned to the implementation comment: “Same unit conversions as SAS (updated NHS values)”).
  • When multiple entries exist for a band, the derivation uses the maximum (not the mean or sum).

-------------------------------------------------------------------------------
4) Derive ever/current drinking and alcohol status (R0_AlcoholStatus)
-------------------------------------------------------------------------------
Methodology:
  1. Construct “ever drank” evidence from multiple sources:
      - ever_from_regular  = (R0_AlcoholRegularUse == 1)
      - ever_from_current  = (R0_AlcoholCurrentUse == 1)
      - ever_from_units    = any(band units > 0.0)
      - ever_from_age      = True if age_entry is known AND:
            • AgeStartedDrinking is not None and age_entry >= AgeStartedDrinking, OR
            • AgeStoppedDrinking is not None and age_entry >= AgeStoppedDrinking

     ever = OR across all four evidence flags.

  2. Determine current drinking flag:
      - current_flag = (R0_AlcoholCurrentUse == 1)

  3. Assign status (mirrors SAS-style intent in code comments):
      - If not ever:                    status = 0  (never)
      - Else if current_flag:           status = 2  (current)
      - Else if ever and curr is not None and not current_flag:
                                      status = 1  (former)
      - Else:                           status = None (unknown / inconsistent)

Decisions / cutoffs:
  • “Ever” is inferred broadly:
      - A participant can be classified as ever-drinker even if the direct “ever/regular” field
        is missing, provided there is evidence from consumption units or ages.
  • “Former” requires that the current-use field is explicitly answered (curr is not None)
    and indicates not current.
  • Otherwise, ambiguous combinations are left as None (unknown).

-------------------------------------------------------------------------------
5) Units/week at age of entry (R0_AlcoholUnitsPerWeek)
-------------------------------------------------------------------------------
Methodology:
  Only computed for CURRENT drinkers (status == 2) with a known age_entry.

  1. Map age_entry to the relevant AlcoholConsumption band:
      - if 16 <= age_entry < 25  -> "18-24"
      - if 25 <= age_entry < 50  -> "25-49"
      - if 50 <= age_entry < 120 -> "50+"
      - else band = None

  2. If a band is identified and units_by_band has a value for that band:
      - set R0_AlcoholUnitsPerWeek = float(units_by_band[band])

Decisions / cutoffs:
  • Units/week is only populated for status == 2 (current drinkers).
  • Age banding uses:
      - lower bound 16 for mapping into the "18-24" band
      - upper bound 120 for the overall age-entry plausibility window
  • If band-specific units are missing, R0_AlcoholUnitsPerWeek remains None (no imputation).
"""

'\nDerive alcohol variables (from AlcoholSmokingDiet_anon):\n\n  • R0_AlcoholStatus        (0 = never, 1 = former, 2 = current, None = unknown/inconsistent)\n  • R0_AgeStartedDrinking   (int years, None if missing/invalid)\n  • R0_AgeStoppedDrinking   (int years, None if missing/invalid)\n  • R0_AlcoholUnitsPerWeek  (float units/week at age of entry; CURRENT drinkers only)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: AlcoholSmokingDiet_anon record (per participant), using:\n    - R0_AlcoholRegularUse        (coded indicator: regular drinking)\n    - R0_AlcoholCurrentUse        (coded indicator: current drinking)\n    - R0_AlcoholStartAge          (age started drinking)\n    - R0_AlcoholStopAge           (age stopped drinking)\n    - AlcoholConsumption[] array with one or more rows, each containing:\n        • R0_AlcoholConsumpti

In [30]:
def derive_alcohol_from_alcohol_section(alc_rec: dict, age_entry):
    """
    Build the participant-level alcohol variables from an AlcoholSmokingDiet_anon
    record.

    Returned keys:
      - R0_AlcoholStatus          0 = never, 1 = former, 2 = current, None = unknown
      - R0_AgeStartedDrinking     int, or None when missing / outside (0, 120)
      - R0_AgeStoppedDrinking     int, or None when missing / outside (0, 120)
      - R0_AlcoholUnitsPerWeek    float units/week for the age band containing
                                  age_entry; only filled for current drinkers

    A non-dict ``alc_rec`` yields all four values as None.
    """
    result = {
        "R0_AlcoholStatus": None,
        "R0_AgeStartedDrinking": None,
        "R0_AgeStoppedDrinking": None,
        "R0_AlcoholUnitsPerWeek": None,
    }
    if not isinstance(alc_rec, dict):
        return result

    entry_age = _to_float_or_none(age_entry)
    regular_raw = alc_rec.get("R0_AlcoholRegularUse")
    current_raw = alc_rec.get("R0_AlcoholCurrentUse")

    # Start/stop ages: keep only values strictly between 0 and 120, as ints.
    for source_key, target_key in (
        ("R0_AlcoholStartAge", "R0_AgeStartedDrinking"),
        ("R0_AlcoholStopAge", "R0_AgeStoppedDrinking"),
    ):
        parsed_age = _to_float_or_none(alc_rec.get(source_key))
        if parsed_age is not None and 0 < parsed_age < 120:
            result[target_key] = int(parsed_age)

    def _count(row, key):
        # Missing / non-parsable beverage counts contribute nothing.
        return _to_float_or_none(row.get(key)) or 0.0

    # Weekly units per age band; when a band appears more than once the
    # largest value wins.
    weekly_units = {}
    for row in alc_rec.get("AlcoholConsumption", []) or []:
        if not isinstance(row, dict):
            continue
        band = row.get("R0_AlcoholConsumption_Num")
        if band not in ("18-24", "25-49", "50+"):
            continue

        wine = _count(row, "R0_AlcoholRedWine") + _count(row, "R0_AlcoholWhiteWine")
        # Same unit conversions as SAS (updated NHS values)
        units = round(
            3.0 * _count(row, "R0_AlcoholBeer")
            + 2.1 * wine
            + 1.0 * _count(row, "R0_AlcoholSherry")
            + 1.4 * _count(row, "R0_AlcoholSpirits")
            + 1.5 * _count(row, "R0_AlcoholOther"),
            1,
        )

        best_so_far = weekly_units.get(band)
        if best_so_far is None or units > best_so_far:
            weekly_units[band] = units

    # --- Ever / current logic (mirrors SAS) ---------------------------------
    drinks_now = _is_int_eq(current_raw, 1)
    started = result["R0_AgeStartedDrinking"]
    stopped = result["R0_AgeStoppedDrinking"]
    age_evidence = entry_age is not None and (
        (started is not None and entry_age >= started)
        or (stopped is not None and entry_age >= stopped)
    )
    has_ever_drunk = (
        _is_int_eq(regular_raw, 1)
        or drinks_now
        or any(u is not None and u > 0.0 for u in weekly_units.values())
        or age_evidence
    )

    if not has_ever_drunk:
        status = 0          # never
    elif drinks_now:
        status = 2          # current
    elif current_raw is not None:
        status = 1          # former: current-use answered, and not "yes"
    else:
        status = None       # unknown / inconsistent
    result["R0_AlcoholStatus"] = status

    # --- Units/week at age of entry (current drinkers only) ------------------
    if status == 2 and entry_age is not None:
        for lower, upper, label in (
            (16, 25, "18-24"),
            (25, 50, "25-49"),
            (50, 120, "50+"),
        ):
            if lower <= entry_age < upper:
                entry_units = weekly_units.get(label)
                if entry_units is not None:
                    result["R0_AlcoholUnitsPerWeek"] = float(entry_units)
                break

    # Normalise literal zeros for ages back to None (if any)
    for age_key in ("R0_AgeStartedDrinking", "R0_AgeStoppedDrinking"):
        if _is_zero(result.get(age_key)):
            result[age_key] = None

    return result

In [31]:
"""
Derive smoking variables (from AlcoholSmokingDiet_anon):

  • R0_SmokingStatus       (0 = never, 1 = former, 2 = current, None = unknown/inconsistent)
  • R0_AgeStartedSmoking   (int years, None if missing/invalid)
  • R0_AgeStoppedSmoking   (int years, None if missing/invalid)
  • R0_CigsPerDay          (float cigs/day at age of entry; current smokers only)
  • R0_PackYears           (float pack-years up to entry; 0.0 for never-smokers, None if insufficient info)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: AlcoholSmokingDiet_anon record, using:
    - R0_SmokingRegularUse
    - R0_SmokingCurrentUse
    - R0_SmokingStartAge
    - R0_SmokingStopAge
    - R0_SmokingDuringPregnancy
    - R0_SmokingRestartUse
    - R0_SmokingInhale
    - CigaretteConsumption[] episode array with:
        • R0_CigaretteConsumption_Num  (band: "16-24", "25-49", "50+")
        • R0_Cigarettes               (cigarettes/day in that band)

  Plus:
    - age_entry (R0_AgeEntry), used for (a) status at entry, (b) band selection, and (c) pack-years.

Decisions / cutoffs:
  • Ages are treated as valid only if 0 < age < 120.
  • Cigarettes/day entries are used only if > 0 and band is one of the three allowed labels.

-------------------------------------------------------------------------------
2) Clean start/stop ages (R0_AgeStartedSmoking, R0_AgeStoppedSmoking)
-------------------------------------------------------------------------------
Methodology:
  1. Parse start_age and stop_age as floats.
  2. If 0 < start_age < 120:
       R0_AgeStartedSmoking = int(start_age)
     Else leave None.
  3. If 0 < stop_age < 120:
       R0_AgeStoppedSmoking = int(stop_age)
     Else leave None.

Decisions / cutoffs:
  • Out-of-range values (<=0 or >=120) are treated as missing.
  • A final cleanup step converts any stray literal “0” ages back to None.

-------------------------------------------------------------------------------
3) Cigarettes/day by age band (CigaretteConsumption → cigs_by_band)
-------------------------------------------------------------------------------
Methodology:
  1. Iterate CigaretteConsumption episodes.
  2. Keep only episodes where:
       - band ∈ {"16-24","25-49","50+"}
       - cigs = float(R0_Cigarettes) is present and cigs > 0
  3. For each band, retain the maximum observed cigs/day:
       cigs_by_band[band] = max(cigs_by_band[band], cigs)

Decisions / cutoffs:
  • Multiple entries per band are resolved by taking the maximum (not mean/sum).
  • any_cigs is True if any band has a retained value.

-------------------------------------------------------------------------------
4) Effective “never-smoker” override from “all items no/blank”
-------------------------------------------------------------------------------
Methodology:
  The function constructs an “all_items_no_or_blank” pattern:
    - regular-use missing or “No”
    - start/stop ages missing
    - pregnancy smoking is blank/No/Not applicable-style codes
    - current-use is blank/No
    - restart-use is blank/No
    - no CigaretteConsumption cigs evidence
    - inhale blank

  If that pattern is met:
    - smoke_eff is forced to 2 (No), even if smoke_raw was blank.

Decisions / cutoffs:
  • This rule treats comprehensive “no/blank everywhere” as an explicit never-smoker response.

-------------------------------------------------------------------------------
5) Current-use effective override using stop age
-------------------------------------------------------------------------------
Methodology:
  If the raw current-use response is 1 (“Yes”), age_entry is known, and
  0 < stop_age < age_entry:
    - curr_eff is forced to 2 (“not current”)
  Otherwise:
    - curr_eff = raw current-use response.

Decisions / cutoffs:
  • This ensures “stopped before entry” is not classified as current at entry.

-------------------------------------------------------------------------------
6) Ever-smoker (ever) and current-smoker at entry (current)
-------------------------------------------------------------------------------
Methodology:
  Ever-smoker (ever):
    1) Start with smoke_eff:
        - 1 -> ever = True
        - 2 -> ever = False
    2) Use age evidence when age_entry is known:
        - If smoke_eff is blank AND (start_age <= entry OR stop_age <= entry): ever = True
        - If smoke_eff is “No” BUT ages indicate smoking: ever = True
        - If start_age exists and start_age > entry: ever = False (never at entry)
    3) Add evidence:
        - If current-use indicates “Yes”: ever = True
        - If any_cigs and age_entry known: ever = True

  Current-smoker at entry (current):
    1) Start with curr_eff:
        - 1 -> current = True
        - 2 -> current = False
    2) Use start/stop ages when age_entry and start_age valid:
        - If started on/before entry:
            • If stop missing OR stop >= entry -> current = True
            • If stop < entry -> current = False
        - If started after entry -> current = False
    3) If ever is False -> force current = False

Decisions / cutoffs:
  • Ages can override inconsistent questionnaire flags to set status at entry.
  • “Started after entry” is treated as never/current False at entry.

-------------------------------------------------------------------------------
7) Smoking status (R0_SmokingStatus)
-------------------------------------------------------------------------------
Methodology:
  Map (ever, current) into:
    - ever == False                     -> status = 0 (never)
    - ever == True and current == True  -> status = 2 (current)
    - ever == True and current == False -> status = 1 (former)
    - otherwise                         -> status = None

Decisions / cutoffs:
  • If ever/current cannot be resolved from available evidence, status remains None.

-------------------------------------------------------------------------------
8) Cigarettes/day at age of entry (R0_CigsPerDay)
-------------------------------------------------------------------------------
Methodology:
  Only for status == 2 (current smoker) and age_entry known:
    - Select band from age_entry:
        • 16 <= age < 25  -> "16-24"
        • 25 <= age < 50  -> "25-49"
        • 50 <= age < 120 -> "50+"
    - If cigs_by_band has a value for that band:
        R0_CigsPerDay = float(value)

Decisions / cutoffs:
  • No imputation across bands: if the entry band is missing, R0_CigsPerDay remains None.

-------------------------------------------------------------------------------
9) Pack-years up to entry (R0_PackYears)
-------------------------------------------------------------------------------
Methodology:
  Only when ever == True and age_entry is known and age_entry >= 16:
    1) Define smoking interval:
        - s = start_age
        - t = stop_age
        - If current == True and stop age is missing/invalid:
            set t = age_entry (censor at entry)
    2) Require a valid interval:
        - s and t must exist and satisfy 0 < s < t <= age_entry
    3) Clamp interval to [16, age_entry]:
        - s_eff = max(s, 16.0)
        - t_eff = min(t, age_entry)
    4) Piecewise pack-years by age band using available band-specific cigs/day:
        bands = [
          (16, 25, cigs_by_band["16-24"]),
          (25, 50, cigs_by_band["25-49"]),
          (50, 120, cigs_by_band["50+"]),
        ]
        For each band segment overlapping [s_eff, t_eff]:
          years = seg_end - seg_start
          total += years * (cigs_per_day / 20.0)
    5) If total > 0:
         R0_PackYears = round(total, 2)
       Else leave missing.

  Final assignment:
    - If ever is False: R0_PackYears = 0.0
    - Else:            R0_PackYears = packyears (may be None)

Decisions / cutoffs:
  • Pack-years are only computed when enough information exists (start/stop + band cigs).
  • Never-smokers are explicitly assigned 0.0 pack-years; others with insufficient info remain None.
"""

'\nDerive smoking variables (from AlcoholSmokingDiet_anon):\n\n  • R0_SmokingStatus       (0 = never, 1 = former, 2 = current, None = unknown/inconsistent)\n  • R0_AgeStartedSmoking   (int years, None if missing/invalid)\n  • R0_AgeStoppedSmoking   (int years, None if missing/invalid)\n  • R0_CigsPerDay          (float cigs/day at age of entry; current smokers only)\n  • R0_PackYears           (float pack-years up to entry; 0.0 for never-smokers, None if insufficient info)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: AlcoholSmokingDiet_anon record, using:\n    - R0_SmokingRegularUse\n    - R0_SmokingCurrentUse\n    - R0_SmokingStartAge\n    - R0_SmokingStopAge\n    - R0_SmokingDuringPregnancy\n    - R0_SmokingRestartUse\n    - R0_SmokingInhale\n    - CigaretteConsumption[] episode array with:\n        • R0_CigaretteConsumption_N

In [32]:
def derive_smoking_from_alcohol_section(alc_rec: dict, age_entry) -> dict:
    """
    Derive per-participant smoking variables from AlcoholSmokingDiet_anon:
      - R0_SmokingStatus       (0=never, 1=former, 2=current, None=unresolved)
      - R0_AgeStartedSmoking   (int or None)
      - R0_AgeStoppedSmoking   (int or None)
      - R0_CigsPerDay          (float cigs/day at age of entry, current smokers only)
      - R0_PackYears           (float pack-years up to entry; 0 for never)

    Args:
        alc_rec: AlcoholSmokingDiet_anon record. Anything that is not a dict
            yields the all-None result.
        age_entry: age at entry; parsed with _to_float_or_none. When it cannot
            be parsed, every age-based rule below is skipped.

    Returns:
        dict containing exactly the five keys listed above.

    Note:
        Relies on _is_int_eq and _is_zero, which are defined elsewhere in this
        notebook (presumably type-tolerant integer comparisons — TODO confirm).
        The rules are applied in a fixed order and later rules deliberately
        override earlier ones; do not reorder them.
    """
    out = {
        "R0_SmokingStatus": None,
        "R0_AgeStartedSmoking": None,
        "R0_AgeStoppedSmoking": None,
        "R0_CigsPerDay": None,
        "R0_PackYears": None,
    }
    # Non-dict input (e.g. participant without this section): nothing to derive.
    if not isinstance(alc_rec, dict):
        return out

    age_entry_f = _to_float_or_none(age_entry)

    # ---- Raw questionnaire fields ------------------------------------------
    # Coded responses; the rules below treat 1 as "yes" and 2 as "no".
    smoke_raw   = alc_rec.get("R0_SmokingRegularUse")
    curr_raw    = alc_rec.get("R0_SmokingCurrentUse")
    start_raw   = alc_rec.get("R0_SmokingStartAge")
    stop_raw    = alc_rec.get("R0_SmokingStopAge")
    preg_raw    = alc_rec.get("R0_SmokingDuringPregnancy")
    restart_raw = alc_rec.get("R0_SmokingRestartUse")
    inhale_raw  = alc_rec.get("R0_SmokingInhale")

    # Note: start_age / stop_age keep their parsed value even when out of
    # range; each rule below re-checks the range it cares about.
    start_age = _to_float_or_none(start_raw)
    stop_age  = _to_float_or_none(stop_raw)

    # Clean ages into outputs (0 or out-of-range -> None).
    # int() truncates, so a fractional age in (0, 1) becomes 0 here and is
    # reset to None by the final cleanup loop.
    if start_age is not None and 0 < start_age < 120:
        out["R0_AgeStartedSmoking"] = int(start_age)
    if stop_age is not None and 0 < stop_age < 120:
        out["R0_AgeStoppedSmoking"] = int(stop_age)

    # ---- Per-band cigs/day from CigaretteConsumption -----------------------
    cigs_by_band = {}  # band ("16-24","25-49","50+") -> max cigs/day
    # `or []` also covers an explicit null stored under the key.
    episodes = alc_rec.get("CigaretteConsumption", []) or []
    for ep in episodes:
        if not isinstance(ep, dict):
            continue
        band = ep.get("R0_CigaretteConsumption_Num")
        if band not in ("16-24", "25-49", "50+"):
            continue
        cigs = _to_float_or_none(ep.get("R0_Cigarettes"))
        if cigs is None or cigs <= 0:
            continue
        # Several episodes for one band: keep the largest value.
        prev = cigs_by_band.get(band)
        if prev is None or cigs > prev:
            cigs_by_band[band] = cigs

    # True as soon as one band retained a positive cigs/day value.
    any_cigs = any(v is not None for v in cigs_by_band.values())

    def _is_no(x) -> bool:
        """True when x is the "No" code (2), as judged by _is_int_eq."""
        return _is_int_eq(x, 2)

    # ---- 1) "All items no or missing" -> explicit never-smoker -------------
    # NOTE(review): the `in (...)` membership tests below are type-strict
    # (a string "2" does not match 2), whereas smoke_raw goes through
    # _is_int_eq — confirm the inputs are always numeric codes.
    # NOTE(review): start_age / stop_age must be None here; a parsed 0 does
    # not count as missing for this pattern.
    all_items_no_or_blank = (
        (smoke_raw in (None, "") or _is_no(smoke_raw))
        and (start_age is None)
        and (stop_age is None)
        and (preg_raw in (None, "", 2, 3))
        and (curr_raw in (None, "", 2))
        and (restart_raw in (None, "", 2))
        and (not any_cigs)
        and (inhale_raw in (None, ""))
    )
    if all_items_no_or_blank:
        # Treated as an explicit "No", even when smoke_raw itself was blank.
        smoke_eff = 2
    else:
        smoke_eff = smoke_raw

    # Fix inconsistent "current smoker" vs stop age < entry:
    # only a raw "yes" (1) is overridden; blank / other codes pass through.
    if (
        age_entry_f is not None
        and _is_int_eq(curr_raw, 1)
        and stop_age is not None
        and 0 < stop_age < age_entry_f
    ):
        curr_eff = 2
    else:
        curr_eff = curr_raw

    # ---- 2) Ever-smoker flag (SAS-style) -----------------------------------
    # Tri-state: True / False / None (unresolved).
    ever = None
    if _is_int_eq(smoke_eff, 1):
        ever = True
    elif _is_int_eq(smoke_eff, 2):
        ever = False

    if age_entry_f is not None:
        # A positive start or stop age at/before entry counts as evidence of smoking.
        has_age_evidence = (
            (start_age is not None and 0 < start_age <= age_entry_f)
            or (stop_age is not None and 0 < stop_age <= age_entry_f)
        )
        # Blank "ever" but ages indicate smoking
        if smoke_eff in (None, "") and has_age_evidence:
            ever = True
        # Answered "no" but ages indicate smoking
        if _is_int_eq(smoke_eff, 2) and has_age_evidence:
            ever = True
        # Started after entry -> never-smoker at entry
        # (applied last in this group, so it wins over the two rules above)
        if start_age is not None and age_entry_f < start_age < 120:
            ever = False

    # Additional evidence from current flag and cigs.
    # Runs when ever is None or False, so it can also overturn the
    # "started after entry" rule above.
    if ever is not True:
        if _is_int_eq(curr_eff, 1):
            ever = True
        elif any_cigs and age_entry_f is not None:
            ever = True

    # ---- 3) Current-smoker flag at entry -----------------------------------
    # Start from the (possibly overridden) questionnaire flag...
    current = None
    if _is_int_eq(curr_eff, 1):
        current = True
    elif _is_int_eq(curr_eff, 2):
        current = False

    # ...then let a valid start age (with the stop age) override it.
    if age_entry_f is not None and start_age is not None and 0 < start_age < 120:
        if start_age <= age_entry_f:
            # An out-of-range stop_age that is >= age_entry_f also lands in
            # this branch and is treated as "still smoking at entry".
            if stop_age is None or stop_age >= age_entry_f:
                current = True
            elif 0 < stop_age < age_entry_f:
                current = False
            # NOTE(review): stop_age <= 0 matches neither branch, so `current`
            # keeps the questionnaire-derived value — confirm this is intended.
        else:
            # Started after entry => not current at entry
            current = False

    # A never-smoker cannot be a current smoker.
    if ever is False:
        current = False

    # ---- 4) Map to R0_SmokingStatus (0/1/2) --------------------------------
    if ever is False:
        status = 0   # never
    elif ever is True and current is True:
        status = 2   # current
    elif ever is True and current is False:
        status = 1   # former
    else:
        # ever unresolved, or ever True with current unresolved
        status = None

    out["R0_SmokingStatus"] = status

    # ---- 5) Cigs/day at entry (current smokers only) -----------------------
    if status == 2 and age_entry_f is not None:
        # Pick the consumption band containing the entry age; ages below 16
        # or >= 120 map to no band.
        band = None
        if 16 <= age_entry_f < 25:
            band = "16-24"
        elif 25 <= age_entry_f < 50:
            band = "25-49"
        elif 50 <= age_entry_f < 120:
            band = "50+"

        if band is not None:
            # No fallback to another band when this one has no value.
            cigs = cigs_by_band.get(band)
            if cigs is not None:
                out["R0_CigsPerDay"] = float(cigs)

    # ---- 6) Pack-years up to entry ----------------------------------------
    packyears = None
    if ever is True and age_entry_f is not None and age_entry_f >= 16:
        s = start_age
        t = stop_age

        # If still smoking and no valid stop age, treat stop as entry age
        if (t is None or t <= 0 or t >= 120) and current is True:
            t = age_entry_f

        # Requires start strictly before stop and stop no later than entry;
        # a stop age after entry therefore yields no pack-years.
        if s is not None and t is not None and 0 < s < t <= age_entry_f:
            # Clamp to >=16 and <=entry
            s_eff = max(s, 16.0)
            # NOTE(review): given the guard above (t <= age_entry_f) this
            # min() always returns t.
            t_eff = min(t, age_entry_f)
            if t_eff > s_eff:
                c16 = cigs_by_band.get("16-24")
                c25 = cigs_by_band.get("25-49")
                c50 = cigs_by_band.get("50+")

                # (band start age, band end age, cigs/day in that band)
                bands = [
                    (16.0, 25.0, c16),
                    (25.0, 50.0, c25),
                    (50.0, 120.0, c50),
                ]

                total = 0.0
                for b_start, b_end, c in bands:
                    # Bands without a cigs/day value contribute nothing.
                    if c is None or c <= 0:
                        continue
                    # Overlap of the smoking interval with this band.
                    seg_start = max(s_eff, b_start)
                    seg_end   = min(t_eff, b_end)
                    if seg_end <= seg_start:
                        continue
                    years = seg_end - seg_start
                    # cigs/day divided by 20 gives packs/day.
                    total += years * (c / 20.0)

                # A zero total means no usable band data -> leave as None.
                if total > 0:
                    packyears = round(total, 2)

    # Never-smokers: explicit 0; others with insufficient info: None
    if ever is False:
        out["R0_PackYears"] = 0.0
    else:
        out["R0_PackYears"] = packyears

    # Clean any stray zero ages back to None
    # (reachable when a fractional age in (0, 1) was truncated to 0 above)
    for k in ("R0_AgeStartedSmoking", "R0_AgeStoppedSmoking"):
        if _is_zero(out.get(k)):
            out[k] = None

    return out

In [33]:
physical_activity_derivation_explanation = """
Derive physical activity variable (from PhysicalActivity_anon questionnaire):

  • R0_PhysicalActivity  (baseline physical activity in MET-hours/week; float, 1 dp)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: PhysicalActivity_anon section record, using the following hour-based fields:
    - R0_SportsSweatHours
    - R0_WorkSweatHours
    - R0_OtherSweatHours
    - R0_ActiveHouseworkHours
    - R0_ManualLaborHours
    - R0_WalkingHours
    - R0_CyclingHours
    - R0_DancingHours
    - R0_OtherModerateExerciseHours

Decisions / cutoffs:
  • Inputs are interpreted as “hours per week” for each activity domain.
  • Values are coerced to float where possible; non-parsable values are ignored (treated as missing).

-------------------------------------------------------------------------------
2) MET values and source (as defined in-script)
-------------------------------------------------------------------------------
Methodology:
  The function uses the following MET multipliers, explicitly documented in the script:

    Uses updated MET values based on averages from the 2024 Adult Compendium:

      - R0_SportsSweatHours                * 7.2
      - R0_WorkSweatHours                  * 5.5
      - R0_OtherSweatHours                 * 5.5
      - R0_ActiveHouseworkHours           * 3.4
      - R0_ManualLaborHours               * 3.9
      - R0_WalkingHours                   * 3.4
      - R0_CyclingHours                   * 5.0
      - R0_DancingHours                   * 4.8
      - R0_OtherModerateExerciseHours     * 3.8

Decisions / cutoffs:
  • MET multipliers are fixed constants (not participant-specific).
  • Only domains with a non-missing numeric hour value contribute to the total.

-------------------------------------------------------------------------------
3) Compute total MET-hours/week (R0_PhysicalActivity)
-------------------------------------------------------------------------------
Methodology:
  1. Initialise:
      - total_met_hours = 0.0
      - any_non_null = False

  2. For each activity domain in the mapping:
      - Parse hours (h) as float
      - If h is missing: skip
      - Else:
          any_non_null = True
          total_met_hours += h * MET_value

  3. Output:
      - If any_non_null is True:
          R0_PhysicalActivity = rounding(total_met_hours, 1)
      - Else:
          R0_PhysicalActivity remains missing (None)

Decisions / cutoffs:
  • Rounding is to 1 decimal place.
  • There is no additional plausibility filter (e.g., max hours/week) applied in this step.
"""


In [34]:
def derive_R0_PhysicalActivity_from_physical_section(pa_rec: dict) -> dict:
    """
    Derive baseline physical activity (MET-hours/week) from a
    PhysicalActivity_anon record.

    Each hour field is multiplied by a fixed MET value and the products are
    summed. Fields that are missing, non-numeric, or non-finite (NaN / inf)
    are skipped.

    Args:
        pa_rec: PhysicalActivity_anon record; anything that is not a dict
            yields {"R0_PhysicalActivity": None}.

    Returns:
        {"R0_PhysicalActivity": total rounded to 1 decimal place}, or None as
        the value when no field contributed a usable number.
    """
    out = {"R0_PhysicalActivity": None}

    if not isinstance(pa_rec, dict):
        return out

    # (hour field, MET multiplier)
    mapping = [
        ("R0_SportsSweatHours",            7.2),
        ("R0_WorkSweatHours",              5.5),
        ("R0_OtherSweatHours",             5.5),
        ("R0_ActiveHouseworkHours",        3.4),
        ("R0_ManualLaborHours",            3.9),
        ("R0_WalkingHours",                3.4),
        ("R0_CyclingHours",                5.0),
        ("R0_DancingHours",                4.8),
        ("R0_OtherModerateExerciseHours",  3.8),
    ]

    total_met_hours = 0.0
    any_non_null = False

    for var, met in mapping:
        h = _to_float_or_none(pa_rec.get(var))
        # _to_float_or_none lets "nan"/"inf"/float NaN through as floats.
        # One such value would turn the whole sum into NaN/inf, so it is
        # treated as missing here.
        if h is None or not np.isfinite(h):
            continue
        any_non_null = True
        total_met_hours += h * met

    if any_non_null:
        # Round to 1 decimal place
        out["R0_PhysicalActivity"] = rounding(total_met_hours, 1)

    return out

In [35]:
"""
Derive diet variables (from AlcoholSmokingDiet_anon):

  • R0_GreenVegDailyServings  (integer servings/day; None if missing)
  • R0_FruitDailyServings     (integer servings/day; None if missing)

-------------------------------------------------------------------------------
1) Inputs used
-------------------------------------------------------------------------------
Methodology:
  Source: AlcoholSmokingDiet_anon record (per participant), using:
    - R0_GreenVegDailyServings
    - R0_FruitDailyServings

  The derivation assumes AlcoholSmokingDiet_anon has already been validated against
  AlcoholSmokingDiet_Schema.json prior to derivation, so these values should already
  be cleaned to integer-or-null and constrained to be >= 0 at the schema layer.

Decisions / cutoffs:
  • No additional cleaning rules (beyond schema validation) are applied inside the
    derivation function itself.

-------------------------------------------------------------------------------
2) Derivation logic (as implemented)
-------------------------------------------------------------------------------
Methodology:
  1. Initialise outputs:
       - R0_GreenVegDailyServings = None
       - R0_FruitDailyServings    = None

  2. If the AlcoholSmokingDiet_anon record is not a dict-like object:
       - return the initial outputs (both None)

  3. Otherwise:
       - Read the raw values:
           gv_raw = alc_rec["R0_GreenVegDailyServings"]
           fr_raw = alc_rec["R0_FruitDailyServings"]

       - Normalise each using the existing helper:
           gv = _to_float_or_none(gv_raw)
           fr = _to_float_or_none(fr_raw)

       - If gv is not None, cast to int and store:
           R0_GreenVegDailyServings = int(gv)

       - If fr is not None, cast to int and store:
           R0_FruitDailyServings = int(fr)

Decisions / cutoffs:
  • Type coercion:
      - Values are normalised to float/None via _to_float_or_none, then cast to int
        to match DerivedVariables_Schema (integer/null).
  • Missingness:
      - Missing or non-parsable values remain None.
  • No upper-bound plausibility cutoffs are applied in this function.
"""


'\nDerive diet variables (from AlcoholSmokingDiet_anon):\n\n  • R0_GreenVegDailyServings  (integer servings/day; None if missing)\n  • R0_FruitDailyServings     (integer servings/day; None if missing)\n\n-------------------------------------------------------------------------------\n1) Inputs used\n-------------------------------------------------------------------------------\nMethodology:\n  Source: AlcoholSmokingDiet_anon record (per participant), using:\n    - R0_GreenVegDailyServings\n    - R0_FruitDailyServings\n\n  The derivation assumes AlcoholSmokingDiet_anon has already been validated against\n  AlcoholSmokingDiet_Schema.json prior to derivation, so these values should already\n  be cleaned to integer-or-null and constrained to be >= 0 at the schema layer.\n\nDecisions / cutoffs:\n  • No additional cleaning rules (beyond schema validation) are applied inside the\n    derivation function itself.\n\n------------------------------------------------------------------------------

In [36]:
def derive_diet_from_alcohol_section(alc_rec: dict):
    """
    Derive per-participant diet variables from AlcoholSmokingDiet_anon:
      - R0_GreenVegDailyServings
      - R0_FruitDailyServings

    Assumes AlcoholSmokingDiet_anon has already been validated
    against AlcoholSmokingDiet_Schema.json, so values are already
    cleaned (integer or null, >= 0).

    Args:
        alc_rec: AlcoholSmokingDiet_anon record; anything that is not a dict
            yields both outputs as None.

    Returns:
        dict with the two keys above; each value is an int, or None when the
        source value is missing, non-numeric, or non-finite (NaN / inf).
    """
    out = {
        "R0_GreenVegDailyServings": None,
        "R0_FruitDailyServings": None,
    }

    if not isinstance(alc_rec, dict):
        return out

    # Source and output field names are the same.
    for key in ("R0_GreenVegDailyServings", "R0_FruitDailyServings"):
        # Normalise to float/None with the existing helper.
        value = _to_float_or_none(alc_rec.get(key))
        # int(nan) raises ValueError and int(inf) raises OverflowError, so
        # non-finite values are treated as missing instead of crashing.
        if value is None or not np.isfinite(value):
            continue
        # Cast to int to match DerivedVariables_Schema (integer/null);
        # int() truncates toward zero.
        out[key] = int(value)

    return out

In [37]:
# =========================
# IO helpers (s4 in, s5 out)
# =========================
# Both stage directories live under out_json_path (imported from config).
# S4 is the directory searched by _load_section_json_s4 for section JSON files.
S4 = Path(out_json_path) / "s4_anon"
# S5 is the directory searched by _load_raw_derived_s5 for the raw-derivation output.
S5 = Path(out_json_path) / "s5_derived"

def _load_first_existing(paths: List[Path]) -> Optional[Any]:
    for p in paths:
        if p.exists():
            with open(p, "r") as f:
                return json.load(f)
    return None

In [38]:
def _load_raw_derived_s5() -> List[Dict[str, Any]]:
    """
    Read the raw-derivation (entry variables) output from the S5 directory.

    Expected output from RawDerivation.ipynb:
      [{ "TCode": "...", "DOB": "YYYY-MM-DD", "YOB": 19xx, "EntryDate": "...", "AgeEntry": n, ... }, ...]

    The first existing file among the known names is used. A top-level list
    is returned as-is, a single dict is wrapped in a list, and anything else
    (including no file found) gives an empty list.
    """
    file_names = ("RawDerivedVariables.json", "Output_RawDerivedVariables.json")
    payload = _load_first_existing([S5 / name for name in file_names])

    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        return [payload]
    # None (no file) or an unexpected top-level JSON type.
    return []

def _map_raw_by_tcode(raw_list: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    out = {}
    for rec in raw_list or []:
        if not isinstance(rec, dict):
            continue
        tc = rec.get("TCode") or rec.get("R0_TCode")
        if isinstance(tc, str) and tc:
            out[tc] = rec
    return out

In [39]:
def _load_section_json_s4(section_base: str, alt_names: List[str] = None) -> List[Dict[str, Any]]:
    """
    Load one questionnaire section's records from the S4 directory.

    Files are tried in this order: "<section_base>.json",
    "Output_<section_base>.json", then each name in `alt_names` (if given).
    A top-level list is returned as-is, a single dict is wrapped in a list,
    and anything else (including no file found) gives an empty list.
    """
    lookup = [S4 / f"{section_base}.json", S4 / f"Output_{section_base}.json"]
    if alt_names:
        lookup.extend(S4 / alt for alt in alt_names)

    payload = _load_first_existing(lookup)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        return [payload]
    # None (no file) or an unexpected top-level JSON type.
    return []

In [40]:
def run():
    # Load schema for validation
    derived_schema = load_schema(r0_json_path, 'DerivedVariables_Schema')

    # ---- Load s4_anon inputs ----
    physical_list     = _load_section_json_s4("PhysicalDevelopment_anon")
    pregnancies_list  = _load_section_json_s4("Pregnancies_anon")
    generalinfo_list  = _load_section_json_s4("GeneralInformation_anon")
    menstrual_list    = _load_section_json_s4("MenstrualMenopause_anon")
    chrt_list         = _load_section_json_s4("ContraceptiveHRT_anon")
    preg_list         = _load_section_json_s4("Pregnancies_anon")
    illnesses_list    = _load_section_json_s4("MH_Illnesses_anon")
    alcohol_list      = _load_section_json_s4("AlcoholSmokingDiet_anon")
    physical_activity_list = _load_section_json_s4("PhysicalActivity_anon")
    
    # Map by R0_TCode (s4_anon uses TCode)
    phys_by_tcode = {rec.get("R0_TCode"): rec for rec in physical_list if isinstance(rec, dict) and rec.get("R0_TCode")}
    preg_by_tcode = {rec.get("R0_TCode"): rec for rec in pregnancies_list if isinstance(rec, dict) and rec.get("R0_TCode")}
    gi_by_tcode   = {rec.get("R0_TCode"): rec for rec in generalinfo_list if isinstance(rec, dict) and rec.get("R0_TCode")}
    mm_by_tcode   = {rec.get("R0_TCode"): rec for rec in menstrual_list if isinstance(rec, dict) and rec.get("R0_TCode")}
    chrt_by_tcode = {rec.get("R0_TCode"): rec for rec in chrt_list if isinstance(rec, dict) and rec.get("R0_TCode")}
    preg_by_tcode = {rec.get("R0_TCode"): rec for rec in preg_list if isinstance(rec, dict) and rec.get("R0_TCode")}
    ill_by_tcode  = {rec.get("R0_TCode"): rec for rec in illnesses_list if isinstance(rec, dict) and rec.get("R0_TCode")}
    alcohol_by_tcode = {rec.get("R0_TCode"): rec for rec in alcohol_list  if isinstance(rec, dict) and rec.get("R0_TCode")}
    pa_by_tcode = {rec.get("R0_TCode"): rec for rec in physical_activity_list if isinstance(rec, dict) and rec.get("R0_TCode")}

    # Ensure any TCodes seen in S4 exist in by_tcode even if raw derivation is missing
    all_s4_tcodes = set()
    for m in [phys_by_tcode, preg_by_tcode, gi_by_tcode, mm_by_tcode, chrt_by_tcode, ill_by_tcode, alcohol_by_tcode, pa_by_tcode]:
        all_s4_tcodes.update([tc for tc in (m or {}).keys() if isinstance(tc, str) and tc])

    # 1) Raw derivation first (no Mailing DB): provides shifted DOB + AgeEntry etc.
    raw_list = _load_raw_derived_s5()
    raw_by_tcode = _map_raw_by_tcode(raw_list)

    by_tcode = {}
    for tcode, v in raw_by_tcode.items():
        by_tcode[tcode] = {
            "R0_TCode": tcode,
            "R0_DOB": v.get("DOB"),         # shifted DOB from raw derivation output
            "R0_YOB": v.get("YOB"),
            "R0_EntryDate": v.get("EntryDate"),
            "R0_AgeEntry": v.get("AgeEntry"),
        }

    # Ensure any TCodes seen in S4 exist in by_tcode even if raw derivation is missing
    all_s4_tcodes = set()
    for m in [phys_by_tcode, preg_by_tcode, gi_by_tcode, mm_by_tcode, chrt_by_tcode, ill_by_tcode, alcohol_by_tcode, pa_by_tcode]:
        all_s4_tcodes.update([tc for tc in (m or {}).keys() if isinstance(tc, str) and tc])

    for tc in all_s4_tcodes:
        if tc not in by_tcode:
            by_tcode[tc] = {
                "R0_TCode": tc,
                "R0_DOB": None,
                "R0_YOB": None,
                "R0_EntryDate": None,
                "R0_AgeEntry": None,
            }

    # 2) Ethnicity/AJ next
    if generalinfo_list:
        gi_df = pd.DataFrame.from_records(generalinfo_list)
        if not gi_df.empty:
            gi_out = derive_ethnicity_simple(gi_df)
            eth_map = gi_out.set_index("R0_TCode")[["R0_Ethnicity", "R0_AshkenaziAncestry"]].to_dict(orient="index")
            for tcode, e in eth_map.items():
                if tcode not in by_tcode:
                    # create with mailing fields missing but still keep ordering (TCode first)
                    by_tcode[tcode] = {
                        "R0_TCode": tcode,
                        "R0_DOB": None,
                        "R0_YOB": None,
                        "R0_EntryDate": None,
                        "R0_AgeEntry": None,
                    }
                # Insert ethnicity keys now (come after the 4 entry fields)
                by_tcode[tcode]["R0_Ethnicity"] = int(e["R0_Ethnicity"]) if pd.notna(e["R0_Ethnicity"]) else None
                by_tcode[tcode]["R0_AshkenaziAncestry"] = int(e["R0_AshkenaziAncestry"]) if pd.notna(e["R0_AshkenaziAncestry"]) else None
    else:
        print("GeneralInformation not found in s4_anon — skipping ethnicity/AJ derivation.")

    for tcode, out_rec in by_tcode.items():
        entry_rec = raw_by_tcode.get(tcode, {})  # raw derivation record with DOB/AgeEntry
        preg_rec  = preg_by_tcode.get(tcode)
        out_rec["R0_PregAt20"] = _derive_R0_PregAt20_for_person(entry_rec, preg_rec)   

    # ---- Body size: clean → compute (ENTRY) -----------------------------
    # Uses clean_body_size(feet, inches, cm, stone, pounds, kg) and _bmi_value(w, h)
    if phys_by_tcode:
        for tcode, pdata in phys_by_tcode.items():
            if tcode not in by_tcode:
                by_tcode[tcode] = {"R0_TCode": tcode}

            # ENTRY height/weight cleaned: use RecordedHeights/RecordedWeights with flag "Cur"
            ft_cur, in_cur, cm_cur = _get_recorded_height(pdata, "Cur")
            st_cur, lb_cur, kg_cur = _get_recorded_weight(pdata, "Cur")

            wt, ht = clean_body_size(ft_cur, in_cur, cm_cur, st_cur, lb_cur, kg_cur)

            by_tcode[tcode]["R0_Height"] = rounding(ht, 1) if ht is not None else None
            by_tcode[tcode]["R0_Weight"] = rounding(wt, 1) if wt is not None else None

            # ENTRY BMI
            curr_preg = preg_by_tcode.get(tcode, {}).get("R0_CurrentPreg", 0) if preg_by_tcode else 0
            if curr_preg != 1:
                curr_preg = 0
            by_tcode[tcode]["R0_PregAtEntry"] = curr_preg
            if curr_preg == 1:
                by_tcode[tcode]["R0_BMI"] = 999
            else:
                by_tcode[tcode]["R0_BMI"] = _bmi_value(by_tcode[tcode]["R0_Weight"], by_tcode[tcode]["R0_Height"])

            # ---- Age-20 height/weight cleaned ---------------------------
            ft20, in20, cm20 = _get_recorded_height(pdata, "20")
            st20, lb20, kg20 = _get_recorded_weight(pdata, "20")

            wt20, ht20 = clean_body_size(ft20, in20, cm20, st20, lb20, kg20)

            by_tcode[tcode]["R0_Height20"] = rounding(ht20, 1)   # cm or None
            by_tcode[tcode]["R0_Weight20"] = rounding(wt20, 1)   # kg or None

            # BMI at 20 rules:
            #   999 if (R0_AgeEntry < 20) or (R0_PregAt20 == 1)
            #   else _bmi_value(w20, h20) or None
            age_entry  = by_tcode[tcode].get("R0_AgeEntry")
            preg_at_20 = by_tcode[tcode].get("R0_PregAt20")
            na_bmi20 = False
            try:
                if age_entry is not None and float(age_entry) < 20:
                    na_bmi20 = True
            except Exception:
                pass
            if preg_at_20 == 1:
                na_bmi20 = True

            if na_bmi20:
                by_tcode[tcode]["R0_BMI20"] = 999
            else:
                by_tcode[tcode]["R0_BMI20"] = _bmi_value(wt20, ht20)
    else:
        print("PhysicalDevelopment not found in s4_anon — skipping body size derivations.")

        # ---- Waist/Hip + WHR -----------------------------------------------
    try:
        wh_by_tcode = derive_waist_hip(phys_by_tcode, preg_by_tcode)
        for tc, wh in wh_by_tcode.items():
            if tc not in by_tcode:
                by_tcode[tc] = {"R0_TCode": tc}
            by_tcode[tc].update({
                "R0_WaistCircum": wh.get("R0_WaistCircum"),
                "R0_HipCircum": wh.get("R0_HipCircum"),
                "R0_WaistHipRatio": wh.get("R0_WaistHipRatio"),
            })
    except Exception as e:
        print(f"Waist/Hip derivation failed: {e}")

    # ---- Menstrual/Menopause: Age at Menarche -------------------------------
    try:
        for tcode in list(by_tcode.keys()):
            mrec = mm_by_tcode.get(tcode, {})
            ser  = derive_R0_AgeMenarche_from_menstrual(mrec) if mrec else pd.Series({"R0_AgeMenarche": pd.NA}, dtype="Int64")
            by_tcode[tcode].update(ser.to_dict())
    except Exception as e:
        print(f"R0_AgeMenarche derivation failed: {e}")

    # ---- Menstrual/Menopause: R0_Menopause + R0_AgeMenopause + R0_MenopauseReason ----
    try:
        for tcode, outrec in by_tcode.items():
            mrec = mm_by_tcode.get(tcode, {})
            age_entry = outrec.get("R0_AgeEntry")
            if mrec:
                vals = derive_R0_menopause_from_menstrual(mrec, age_entry)
            else:
                vals = {
                    "R0_Menopause": None,
                    "R0_AgeMenopause": None,
                    "R0_MenopauseReason": None,
                }
            outrec.update(vals)
    except Exception as e:
        print(f"R0_Menopause derivation failed for {tcode}: {e}")

    # Derive the four OC variables and update each record
    for tc, outrec in by_tcode.items():
        chrt = chrt_by_tcode.get(tc, {})
        age_entry = outrec.get("R0_AgeEntry")
        try:
            oc = derive_oc_from_contraceptive_section(chrt, age_entry)
            outrec.update(oc)
        except Exception as e:
            print(f"OC derivation failed for {tc}: {e}")

    # ---- Pregnancy: parity + breastfeeding, derived and updated per participant
    #      (after Mailing/DOB are in by_tcode, since R0_DOB is passed to the derivation)
    for tcode, outrec in by_tcode.items():
        preg_sec = preg_by_tcode.get(tcode, {})
        
        dob = outrec.get("R0_DOB")
        try:
            preg_vars = derive_parity_and_breastfeeding(preg_sec, dob)

            outrec.update(preg_vars)
        except Exception as e:
            print(f"Preg derivation failed for {tcode}: {e}")

    # ---- BreastDisease -> R0_BBD --------------------------------------------
    try:
        bd_list = _load_section_json_s4("BreastDisease_anon")  # or "BreastDisease_PII_anon" in your project
    except Exception:
        bd_list = []
    bd_by_tcode = {r.get("R0_TCode"): r for r in bd_list if isinstance(r, dict) and r.get("R0_TCode")}

    for tc, outrec in by_tcode.items():
        bd = bd_by_tcode.get(tc, {})
        try:
            outrec.update(derive_R0_BBD_from_breast_disease(bd))
        except Exception as e:
            print(f"BBD derivation failed for {tc}: {e}")

    # # ---- Family history (first-degree, SAS-style equality) -> R0_FamHistBC ---
    # try:
    #     cr_list = _load_section_json_s4("CancerRelatives_anon")  # adjust to your exact name if needed
    # except Exception:
    #     cr_list = []
    # cr_by_tcode = {r.get("R0_TCode"): r for r in cr_list if isinstance(r, dict) and r.get("R0_TCode")}

    # for tc, outrec in by_tcode.items():
    #     try:
    #         cr = cr_by_tcode.get(tc, {})
    #         outrec.update(derive_R0_FamHistBC_from_cancer_relatives_eq(cr))
    #     except Exception as e:
    #         print(f"FamHist derivation failed for {tc}: {e}")

    # ---- Diabetes derivations (from MH_Illnesses_anon) ----------------------
    for tc, outrec in by_tcode.items():
        try:
            mh_rec = ill_by_tcode.get(tc, {})  # section record for this TCode (may be empty)
            outrec.update(derive_diabetes_from_mh(mh_rec))
        except Exception as e:
            print(f"Diabetes derivation failed for {tc}: {e}")

    # Derive the four HRT variables and update each record
    for tc, outrec in by_tcode.items():
        chrt = chrt_by_tcode.get(tc, {})
        age_entry = outrec.get("R0_AgeEntry")
        try:
            hrt = derive_hrt_from_contraceptive_section(chrt, age_entry)
            outrec.update(hrt)
        except Exception as e:
            print(f"HRT derivation failed for {tc}: {e}")

    # ---- Alcohol derivations (from AlcoholSmokingDiet_anon) -----------------
    for tc, outrec in by_tcode.items():
        alc_rec = alcohol_by_tcode.get(tc, {})  # may be {}
        age_entry = outrec.get("R0_AgeEntry")
        try:
            alc_vars = derive_alcohol_from_alcohol_section(alc_rec, age_entry)
            outrec.update(alc_vars)
        except Exception as e:
            print(f"Alcohol derivation failed for {tc}: {e}")

    # ---- Smoking derivations (from AlcoholSmokingDiet_anon) -----------------
    for tc, outrec in by_tcode.items():
        smk_rec = alcohol_by_tcode.get(tc, {})  # same section; uses smoking subset
        age_entry = outrec.get("R0_AgeEntry")
        try:
            smk_vars = derive_smoking_from_alcohol_section(smk_rec, age_entry)
            outrec.update(smk_vars)
        except Exception as e:
            print(f"Smoking derivation failed for {tc}: {e}")

    
    # ---- Fruit & vegetable derivations (from AlcoholSmokingDiet_anon) -----
    for tc, outrec in by_tcode.items():
        diet_rec = alcohol_by_tcode.get(tc, {})  # same section, diet subset
        try:
            diet_vars = derive_diet_from_alcohol_section(diet_rec)
            outrec.update(diet_vars)
        except Exception as e:
            print(f"Diet (fruit/veg) derivation failed for {tc}: {e}")

    # ---- Physical activity -> R0_PhysicalActivity --------------------------
    for tc, outrec in by_tcode.items():
        pa_rec = pa_by_tcode.get(tc, {})
        try:
            pa_vars = derive_R0_PhysicalActivity_from_physical_section(pa_rec)
            outrec.update(pa_vars)
        except Exception as e:
            print(f"Physical activity derivation failed for {tc}: {e}")

    # ---- Finalize & validate/save --------------------------------------
    derived_results = list(by_tcode.values())

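    # TEMP: family-history fields are merged in from an external fam_hist.json file,
    # keyed by R0_TCode, while the in-notebook R0_FamHistBC derivation is commented out.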
    output_path = r"N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Data_Output_Testing\temp_fam_hist\fam_hist.json"

    with open(output_path, "r") as f:
        famhist = json.load(f)

    lookup = {
        item["R0_TCode"]: {k: v for k, v in item.items() if k != "R0_TCode"}
        for item in famhist
    }

    for item in derived_results:
        item.update(lookup.get(item["R0_TCode"], {}))

    keys_to_delete = ['R0_AgeEntry', 'R0_DOB', 'R0_EntryDate', 'R0_YOB']
    
    cleaned_results = [
        {k: v for k, v in item.items() if k not in keys_to_delete}
        for item in derived_results
    ]

    try:
        validate_data(cleaned_results, derived_schema, os.path.join(r0_json_path, 'DerivedVariables_Schema.json'))
    except ValidationError as e:
        print("✗ Validation error:", e)

    S5.mkdir(parents=True, exist_ok=True)
    out_path = S5 / "DerivedVariables.json"
    with open(out_path, "w") as f:
        json.dump(cleaned_results, f, indent=2)
    print(f"✓ Saved derived output: {out_path}")

In [None]:
# Entry point for the notebook/script: execute the derivation pipeline in run().
# The __name__ guard means run() is only triggered when this code is executed
# as the main program, not when it is imported from another module.
if __name__ == "__main__":
    run()