In [1]:
# Imports
import sys
import os
import json
import re
import pandas as pd
from typing import Any, Dict, List, Optional, Union

sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils\\Questionnaire\\R0\\scripts"))
from common_utils import get_config, createLogger, load_schema, validate_data, save_output
from pseudo_anon_utils import load_sid_codes, pseudo_anonymize_studyid

sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils"))
from config import Delivery_log_path, live_server

sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils\\Pathology\\scripts"))
from building_utils import make_json_safe

In [2]:
# detects strings like 2021-01-19T00:00:00Z or 2021-01-19 00:00:00
_ISO_DT_PREFIX = re.compile(r"^\d{4}-\d{2}-\d{2}[T\s].*")

def _datetime_str_to_date_str(s: str) -> str:
    s2 = s.strip()
    if _ISO_DT_PREFIX.match(s2):
        return s2[:10]
    return s

def coerce_dates_by_schema_debug(
    data: Any,
    schema: Dict[str, Any],
    path: str = "<root>",
    *,
    max_prints: int = 300,
    _state: Optional[dict] = None
) -> Any:
    """
    Coerce ISO datetime strings to YYYY-MM-DD *only where schema expects format:'date'*.

    `data` and `schema` are walked in parallel. Cases are tried in this order:

      1. `data` is a LIST and `schema` is an OBJECT schema (type == "object"):
         the object schema is applied to each list element.
      2. `schema` declares format == "date": string values are passed through
         `_datetime_str_to_date_str`; non-string values are returned unchanged.
      3. `data` is a dict: keys found in schema["properties"] are recursed into,
         keys not in the schema are copied through untouched. If the schema has
         no dict "properties", `data` is returned as-is.
      4. `data` is a list and schema["items"] is a dict: the items schema is
         applied to each element; otherwise the list is returned as-is.
      5. Anything else is returned unchanged.

    Args:
        data: Value to coerce (list, dict or leaf value).
        schema: JSON-schema fragment describing `data`.
        path: Human-readable location of `data`, used only in debug messages.
        max_prints: Maximum number of debug lines printed across the whole
            recursion. 0 disables printing entirely.
        _state: Shared counters ("prints", "converted", "date_fields_seen").
            Created on the outermost call; pass a dict to read the counters
            back afterwards.

    Returns:
        The coerced value. Lists and dicts that are descended into are rebuilt
        as new containers; the input containers are not modified.
    """
    if _state is None:
        _state = {"prints": 0, "converted": 0, "date_fields_seen": 0}

    def _p(msg):
        # `msg` is either a ready string or a zero-argument callable that builds one.
        # Building lazily means repr()/f-string work is skipped once the print
        # budget is used up (or when max_prints == 0).
        if _state["prints"] < max_prints:
            print(msg() if callable(msg) else msg)
            _state["prints"] += 1

    schema_is_dict = isinstance(schema, dict)

    # --- LIST + OBJECT-SCHEMA: a list of records validated against one object schema ---
    if isinstance(data, list) and schema_is_dict and schema.get("type") == "object":
        _p(lambda: f"[LIST of records] path={path} | len={len(data)} | schema.type=object -> applying to each element")
        return [
            coerce_dates_by_schema_debug(item, schema, path=f"{path}[{i}]", max_prints=max_prints, _state=_state)
            for i, item in enumerate(data)
        ]

    # --- DATE leaf ---
    if schema_is_dict and schema.get("format") == "date":
        _state["date_fields_seen"] += 1
        _p(lambda: f"[DATE FIELD] path={path} | type={type(data).__name__} | value={repr(data)}")
        if not isinstance(data, str):
            _p("  -> no change (not a string)")
            return data

        converted = _datetime_str_to_date_str(data)
        if converted != data:
            _state["converted"] += 1
            _p(lambda: f"  -> CONVERT {repr(data)} ==> {repr(converted)} (converted_count={_state['converted']})")
        else:
            _p("  -> no change (not datetime-like string)")
        return converted

    # --- OBJECT ---
    if isinstance(data, dict):
        props = schema.get("properties") if schema_is_dict else None
        if not isinstance(props, dict):
            _p(lambda: f"[OBJECT no schema.properties] path={path} | leaving unchanged")
            return data

        out = {}
        for k, v in data.items():
            if k in props:
                out[k] = coerce_dates_by_schema_debug(v, props[k], path=f"{path}.{k}", max_prints=max_prints, _state=_state)
            else:
                # note: schema has additionalProperties:false, so these would also fail validation
                # (the lambda is invoked immediately inside _p, so k/v are the current pair)
                _p(lambda: f"[KEY NOT IN SCHEMA] path={path}.{k} | type={type(v).__name__} | value={repr(v)[:120]}")
                out[k] = v
        return out

    # --- ARRAY schema case ---
    if isinstance(data, list):
        items_schema = schema.get("items") if schema_is_dict else None
        if isinstance(items_schema, dict):
            _p(lambda: f"[ARRAY with items] path={path} | len={len(data)}")
            return [
                coerce_dates_by_schema_debug(item, items_schema, path=f"{path}[{i}]", max_prints=max_prints, _state=_state)
                for i, item in enumerate(data)
            ]
        _p(lambda: f"[ARRAY no items and schema.type != object] path={path} | len={len(data)} | leaving unchanged")
        return data

    return data


In [3]:
def _shift_dt(val, shift_days: int, fmt: str | None):
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return None

    dt = pd.to_datetime(val, errors="coerce")
    if pd.isna(dt):
        return val  # leave unparseable values unchanged

    dt2 = dt + pd.Timedelta(days=int(shift_days))

    if fmt == "date":
        return dt2.strftime("%Y-%m-%d")
    elif fmt == "date-time":
        return dt2.isoformat()
    else:
        return dt2.strftime("%Y-%m-%d")


def rename_and_shift_schema_shifted_fields(data: list[dict], schema: dict, sid_df: pd.DataFrame) -> list[dict]:
    """
    Rename and date-shift every field the schema declares with a '_shifted' suffix.

    For each schema property ending with '_shifted':
      - expects the source field in data WITHOUT the suffix (e.g. 'DOD')
      - renames it to 'DOD_shifted'
      - shifts the value by SIDCodes.Random days (matched on TCode)
      - if the record already carries the '_shifted' key, that value is shifted
        in place of the rename

    The input records are NOT modified: each record is shallow-copied before it
    is changed, so calling this function again on the same input (for example
    when re-running a notebook cell) gives the same result instead of shifting
    the dates a second time.

    Assumes:
      - data is list[dict] with top-level 'TCode'
      - sid_df has columns: 'TCode', 'Random'
      - schema has top-level properties (flat)

    Args:
        data: Records to process.
        schema: JSON schema whose top-level "properties" name the shifted fields.
        sid_df: Lookup table mapping 'TCode' to the per-participant shift 'Random'.

    Returns:
        A new list of new dicts with the shifted fields applied.

    Raises:
        KeyError: if sid_df lacks the 'TCode' or 'Random' column.

    Warns:
        UserWarning: when one or more records had a date field processed but
            their TCode is not present in sid_df. Those records are shifted by
            0 days, i.e. their dates are effectively left unshifted.
    """
    import warnings  # local import: only needed for the unmatched-TCode notice

    if "TCode" not in sid_df.columns or "Random" not in sid_df.columns:
        raise KeyError("sid_df must contain 'TCode' and 'Random' columns.")

    tcode_to_random = sid_df.set_index("TCode")["Random"].astype(int).to_dict()

    # Map base_field -> (shifted_field, format)
    suffix = "_shifted"
    base_to_shifted = {}
    for k, v in schema.get("properties", {}).items():
        if k.endswith(suffix):
            base = k[: -len(suffix)]
            # Property schemas are normally dicts; tolerate anything else (no format).
            fmt = v.get("format") if isinstance(v, dict) else None
            base_to_shifted[base] = (k, fmt)

    shifted_records = []
    unmatched = 0
    for rec in data:
        new_rec = dict(rec)  # shallow copy keeps the caller's record untouched
        tcode = new_rec.get("TCode")
        shift_days = int(tcode_to_random.get(tcode, 0))

        touched = False
        for base, (shifted_name, fmt) in base_to_shifted.items():
            # Only rename if base exists and shifted doesn't already
            if base in new_rec and shifted_name not in new_rec:
                new_rec[shifted_name] = _shift_dt(new_rec.pop(base), shift_days, fmt)
                touched = True
            # If shifted already exists, still shift it
            elif shifted_name in new_rec:
                new_rec[shifted_name] = _shift_dt(new_rec.get(shifted_name), shift_days, fmt)
                touched = True

        if touched and tcode not in tcode_to_random:
            unmatched += 1
        shifted_records.append(new_rec)

    if unmatched:
        warnings.warn(
            f"{unmatched} record(s) have a TCode that is not in sid_df; "
            "their '_shifted' fields were shifted by 0 days.",
            stacklevel=2,
        )

    return shifted_records

In [4]:
# Config
# Assumes common_utils.get_config() is the single source of truth for servers/paths.
config = get_config()

# Choose which SQL server to use: look up the entry keyed by `live_server` in the config.
# NOTE(review): SERVER is not referenced by any later cell visible in this notebook
# (load_sid_codes is called with `live_server` directly) — confirm whether it is still needed.
SERVER = config.get(live_server)

# Logger named "Outcomes"; Delivery_log_path comes from the shared config module.
logger = createLogger("Outcomes", Delivery_log_path)

# Data: input CSV of outcomes; must contain a 'StudyID' column (checked when it is loaded).
CSV_PATH = r'N:\NOBACKUP\Martina\ndrs_data_2025\data_outputs\outcomes_df_20260219.csv'

# Schema location: directory and base name (without '.json') of the JSON schema
# used for date coercion, field shifting and validation.
SCHEMA_DIR = r'N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Schema_and_Derivation_utils\Questionnaire\R0\json_schemas\outcomes'
SCHEMA_NAME = 'Outcomes_Schema'

# Output: destination of the derived, pseudo-anonymised Outcomes JSON.
# Raw string so the Windows backslashes are taken literally (as for CSV_PATH and
# SCHEMA_DIR); without the r-prefix, sequences such as "\C" and "\s" are invalid
# escape sequences that Python warns about.
OUT_JSON_PATH = r'N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Data_Output_Testing\s5_derived\Outcomes.json'

In [5]:
# Load the outcomes CSV with pandas' default dtype inference.
# NOTE(review): the recorded cell output echoes this read_csv line, which looks like a
# pandas warning raised while parsing — confirm, and consider passing explicit dtypes.
df = pd.read_csv(CSV_PATH)

# Basic sanity checks
if "StudyID" not in df.columns:
    raise ValueError("CSV must contain a 'StudyID' column to map to TCode.")

# Ensure StudyID is int (SIDCodes expects int StudyID mapping)
# errors="raise" makes any non-numeric StudyID fail loudly here rather than downstream.
df["StudyID"] = pd.to_numeric(df["StudyID"], errors="raise").astype(int)


  df = pd.read_csv(CSV_PATH)


In [6]:
# Load the SID code lookup table; later cells rely on its 'TCode' and 'Random' columns.
# NOTE(review): this passes `live_server` itself, not the SERVER value read from the
# config — confirm which one load_sid_codes expects.
sid_df = load_sid_codes(live_server, logger)

# Convert the dataframe rows to a list-of-dicts suitable for pseudo_anonymize_studyid
records = df.to_dict(orient="records")

# Replace StudyID with TCode, preserving all other keys
pseudo_records = pseudo_anonymize_studyid(records, sid_df)


In [7]:
# Pass the records through make_json_safe (imported from building_utils).
# NOTE(review): presumably converts values json.dump cannot serialise (NaN, numpy/pandas
# scalars, timestamps) — confirm against building_utils.
pseudo_records = make_json_safe(pseudo_records)

In [8]:
# Load the Outcomes JSON schema by directory and base name.
schema = load_schema(SCHEMA_DIR, SCHEMA_NAME)

# Truncate datetime-like strings to YYYY-MM-DD wherever the schema declares format 'date'.
# max_prints=0 suppresses all of the function's debug printing.
pseudo_records = coerce_dates_by_schema_debug(pseudo_records, schema, max_prints=0)

In [9]:
# Rename each base date field to its schema '<name>_shifted' counterpart and shift it
# by the per-TCode 'Random' number of days taken from sid_df.
out_data = rename_and_shift_schema_shifted_fields(
    data=pseudo_records,
    schema=schema,
    sid_df=sid_df
)

In [10]:
# validate_data prints progress and summary
# Validate every output record against the schema before writing anything to disk.
# schema_path points at '<SCHEMA_DIR>/<SCHEMA_NAME>.json'.
validate_data(out_data, schema, schema_path=os.path.join(SCHEMA_DIR, f"{SCHEMA_NAME}.json"))

Validating 116,641 items...
100% - Validation completed in 73.68 seconds
✓ All items are valid


In [11]:
# Make sure the output directory exists; `or "."` falls back to the current directory
# when OUT_JSON_PATH has no directory component.
os.makedirs(os.path.dirname(OUT_JSON_PATH) or ".", exist_ok=True)
# Write the validated records as UTF-8 JSON, pretty-printed with a 2-space indent.
with open(OUT_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(out_data, f, indent=2)

# Echo the destination so the cell output records where the file was written.
print("Wrote:", OUT_JSON_PATH)

Wrote: N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Data_Output_Testing\s5_derived\Outcomes.json
