In [1]:
import sys
import os
import json
from pathlib import Path

sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils\\Pathology\\scripts"))
from loader_utils import load_pathology_dfs
from building_utils import build_nested_breast_tumour_link
from pseudo_anon_path import apply_pathology_privacy_transforms, write_pseudoanon_schema

sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils"))
from utilities import createLogger
from config import live_server, Delivery_log_path

sys.path.append(os.path.abspath("N:\\CancerEpidem\\BrBreakthrough\\DeliveryProcess\\Schema_and_Derivation_utils\\Questionnaire\\R0\\scripts"))
from common_utils import validate_data

In [2]:
# Notebook-level logger named "Pathology"; the second argument is Delivery_log_path
# imported from config. This is the logger later passed to the privacy transforms.
logger = createLogger("Pathology", Delivery_log_path)

In [3]:
# Root folder of the Pathology JSON schemas; the raw\ and pseudo_anon\ sub-folders
# referenced in this notebook are both addressed relative to it.
schema_base = r"N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Schema_and_Derivation_utils\Pathology\json_schemas"
# Full path of the raw BreastTumourLink schema (Windows separators, joined explicitly).
SCHEMA_PATH = "\\".join((schema_base, r"raw\BreastTumourLink_Schema.json"))

In [4]:
def run_breast_tumour_link(server: str, pathology_db: str = "Pathology"):
    """Load the pathology tables and assemble the nested BreastTumourLink output.

    Args:
        server: Forwarded to ``load_pathology_dfs`` as ``server``.
        pathology_db: Forwarded to ``load_pathology_dfs`` as ``pathology_db``
            (defaults to ``"Pathology"``).

    Returns:
        The object returned by ``build_nested_breast_tumour_link``; its length
        is logged as the number of output StudyIDs.
    """
    # Dedicated logger for this run; named differently from the notebook-level
    # `logger` so the global is not shadowed inside the function.
    run_log = createLogger("BreastTumourLink", r"N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Logs")

    link_schema, tumour_tracking, lab_tracking, tmas_all = load_pathology_dfs(
        schema_path=SCHEMA_PATH,
        server=server,
        logger=run_log,
        pathology_db=pathology_db,
    )

    linked = build_nested_breast_tumour_link(link_schema, tumour_tracking, lab_tracking, tmas_all)

    # Quick sanity checks: row count per source table (a missing table counts as 0).
    source_tables = (
        ("TumourTracking rows: ", tumour_tracking),
        ("LabTracking rows:    ", lab_tracking),
        ("TMAs_All rows:       ", tmas_all),
    )
    for label, frame in source_tables:
        row_count = len(frame) if frame is not None else 0
        run_log.info(f"{label}{row_count:,}")
    run_log.info(f"Output StudyIDs:     {len(linked):,}")

    return linked

In [None]:
# Build the nested BreastTumourLink output using the server given by config.live_server.
# The validation and privacy-transform cells below read `nested`, so run this cell first.
nested = run_breast_tumour_link(server=live_server)

In [6]:
# Validate the raw (not yet pseudo-anonymised) nested output against the raw schema.
# SCHEMA_PATH is reused so this check reads exactly the schema file that was handed
# to load_pathology_dfs; the previous r"raw\\..." raw string produced a doubled
# backslash and therefore a different path string for the same file.
schema_path = SCHEMA_PATH
with open(schema_path, 'r') as f:
    schema = json.load(f)
validate_data(nested, schema, schema_path=schema_path)

Validating 2,936 items...
100% - Validation completed in 11.55 seconds
✓ All items are valid


In [7]:
# Apply the pathology privacy transforms to the raw nested output, keeping the result
# under a separate name (`nested_pa`); logging goes through the notebook-level `logger`.
nested_pa = apply_pathology_privacy_transforms(nested, server=live_server, logger=logger)

In [None]:
# Write the pseudo-anon schema file under pseudo_anon\, using the raw schema as input
# (the transformation itself lives in pseudo_anon_path.write_pseudoanon_schema).
write_pseudoanon_schema(
    in_schema_path=SCHEMA_PATH,
    out_schema_path="\\".join((schema_base, r"pseudo_anon\BreastTumourLink_Schema_PseudoAnon.json")),
)

In [9]:
# Validate pseudo-anonymised output against pseudo-anon schema.
# Note the single backslash: inside a raw string, r"pseudo_anon\\..." yields two
# literal backslashes, which no longer matches the path the schema was written to.
pseudo_schema_path = schema_base + "\\" + r"pseudo_anon\BreastTumourLink_Schema_PseudoAnon.json"
with open(pseudo_schema_path, 'r') as f:
    pseudo_schema = json.load(f)
validate_data(nested_pa, pseudo_schema, schema_path=pseudo_schema_path)

Validating 2,936 items...
0%

100% - Validation completed in 4.88 seconds
✓ All items are valid
