In [1]:
import os
import sys
import math
import json
from jsonschema import validate, ValidationError, FormatChecker, RefResolver
from pathlib import Path

  from jsonschema import validate, ValidationError, FormatChecker, RefResolver


In [2]:
# Make the shared schema/derivation utilities importable.
# A raw string keeps every Windows backslash literal, so the path no longer
# depends on Python tolerating an unrecognised escape sequence.
utils_dir = os.path.abspath(r"N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Schema_and_Derivation_utils")
# Guard the append so re-running this cell does not add duplicate entries.
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
from config import r0_json_path, out_json_path

In [3]:
def clean_body_size(raw_ft, raw_in, raw_cm, raw_st, raw_lbs, raw_kg):
    """
    Cleans and converts raw height and weight inputs using SAS logic.

    Args:
        raw_ft, raw_in: height given as feet and inches.
        raw_cm: height given in centimetres.
        raw_st, raw_lbs: weight given as stones and pounds.
        raw_kg: weight given in kilograms.

    An input is treated as missing when it is None, an empty or
    whitespace-only string, or one of the codes 99999, 88888, 77777.

    Returns:
        tuple (weight_kg, height_cm), each rounded to one decimal place.
        A value that cannot be determined is returned as 99999.
    """
    missing_codes = {99999, 88888, 77777}

    def to_none(x):
        # Map every "no value" representation onto None.
        if x is None:
            return None
        if isinstance(x, str):
            # Blank strings are what callers pass for absent fields; without
            # this they would reach the arithmetic below and raise TypeError.
            return x if x.strip() else None
        return None if x in missing_codes else x

    ft = to_none(raw_ft)
    inch = to_none(raw_in)
    cm_val = to_none(raw_cm)
    st_val = to_none(raw_st)
    lbs_val = to_none(raw_lbs)
    kg_val = to_none(raw_kg)

    # Convert imperial weight to kg; stones alone are enough (pounds default to 0)
    w_imp = None
    if st_val is not None:
        lbs_val = lbs_val if lbs_val is not None else 0
        w_imp = st_val * 6.350293 + lbs_val * 0.45359237
    w_met = kg_val

    # Convert imperial height to cm; both feet and inches are required
    h_imp = None
    if ft is not None and inch is not None:
        h_imp = ft * 30.48 + inch * 2.54
    h_met = cm_val

    # SAS selection logic - prefer the imperial value, fall back to metric
    w_now = w_imp if w_imp is not None else w_met
    h_now = h_imp if h_imp is not None else h_met

    # Resolve height discrepancies >5cm: keep whichever single value lies in
    # the 145-185 cm window; if both or neither do, the imperial value stays.
    if h_imp is not None and h_met is not None and abs(h_imp - h_met) > 5:
        if 145 < h_imp < 185 and not (145 < h_met < 185):
            h_now = h_imp
        elif 145 < h_met < 185 and not (145 < h_imp < 185):
            h_now = h_met

    # Handle missing values
    if w_now is None:
        w_now = 99999  # Unknown weight
    if h_now is None:
        h_now = 99999  # Unknown height

    # Round results
    weight_kg = round(w_now, 1)
    height_cm = round(h_now, 1)

    return weight_kg, height_cm

In [4]:
def calculate_derived_variables(physical_data, pregnancies_data):
    """
    Build the derived baseline height, weight, BMI and pregnancy fields.

    Args:
        physical_data: mapping of participant id -> record holding the raw
            R0_CurrentHght_* / R0_CurrentWght_* fields.
        pregnancies_data: mapping of participant id -> record that may hold
            'R0_CurrentPreg'. Participants without one default to 2 (No).

    Returns:
        dict mapping participant id -> record with keys "baselineHeight",
        "baselineWeight", "baselineBMI" and "baselineCurrPreg".
        Codes written here: 88888 for weight and BMI when currently pregnant,
        77777 for an out-of-range BMI, 99999 when BMI cannot be computed.
    """
    unusable_codes = {77777, 99999}
    derived_data = {}

    for participant_id, data in physical_data.items():
        # Extract pregnancy status (default to 2=No if not available)
        preg_status = pregnancies_data.get(participant_id, {}).get('R0_CurrentPreg', 2)

        # Extract raw values. Absent keys yield None, which the cleaning step
        # treats as missing; an empty-string default would reach its arithmetic.
        feet = data.get('R0_CurrentHght_Ft')
        inches = data.get('R0_CurrentHght_In')
        cm = data.get('R0_CurrentHght_Cm')
        stone = data.get('R0_CurrentWght_St')
        pounds = data.get('R0_CurrentWght_Lbs')
        kg = data.get('R0_CurrentWght_Kg')

        # Apply cleaning and conversion
        weight_kg, height_cm = clean_body_size(feet, inches, cm, stone, pounds, kg)

        # Initialize derived record
        derived_record = {
            "baselineHeight": height_cm,
            "baselineWeight": weight_kg,
            "baselineBMI": None,
            "baselineCurrPreg": preg_status
        }

        if preg_status == 1:
            # Currently pregnant: weight and BMI are replaced by the 88888 code
            derived_record["baselineWeight"] = 88888
            derived_record["baselineBMI"] = 88888
        elif height_cm in unusable_codes or weight_kg in unusable_codes:
            # No usable measurement: propagate the matching code to BMI
            derived_record["baselineBMI"] = 77777 if height_cm == 77777 else 99999
        elif height_cm <= 0:
            # A non-positive height would make the BMI formula divide by zero
            # (or be meaningless), so flag it with the out-of-range code.
            # NOTE(review): confirm 77777 is the intended code for this case.
            derived_record["baselineBMI"] = 77777
        else:
            height_m = height_cm / 100
            bmi = weight_kg / (height_m ** 2)
            # Only BMI values within 15-60 are kept; others are out-of-range
            derived_record["baselineBMI"] = round(bmi, 1) if 15 <= bmi <= 60 else 77777

        derived_data[participant_id] = derived_record

    return derived_data

In [5]:
# Read the cleaned physical-development output into memory
with Path(out_json_path, 'Output_PhysicalDevelopment.json').open('r') as f:
    processed_data = json.load(f)

In [6]:
# Read the JSON schema describing the derived variables
with Path(r0_json_path, 'DerivedVariables_JSON.json').open('r') as f:
    derived_schema = json.load(f)

In [7]:
# Read the physical-development JSON schema (registered with the resolver below)
with Path(r0_json_path, 'PhysicalDevelopment_JSON.json').open('r') as f:
    physical_schema = json.load(f)

In [8]:
# Read the processed pregnancies output (supplies R0_CurrentPreg per participant)
with Path(out_json_path, 'Output_Pregnancies.json').open('r') as f:
    pregnancies_data = json.load(f)

In [9]:
# Resolve the schema folder once; it anchors every schema URI below
schema_dir = Path(r0_json_path).resolve()
# NOTE(review): this URI is assembled by hand from the OS path. On Windows
# that embeds backslashes and a drive letter - confirm it matches how the
# schemas' $ref values are written before changing it.
base_uri = f'file://{schema_dir}/'

# Pre-register both already-loaded schemas under their URIs in the store
store = {}
store[f'{base_uri}DerivedVariables_JSON.json'] = derived_schema
store[f'{base_uri}PhysicalDevelopment_JSON.json'] = physical_schema

# NOTE(review): the import cell emits a warning; presumably RefResolver's
# deprecation in recent jsonschema releases - verify against the installed version.
resolver = RefResolver(base_uri, derived_schema, store=store)

In [11]:
# Main processing flow: load -> derive -> validate
if __name__ == "__main__":
    # Read the cleaned physical-development records
    with open(os.path.join(out_json_path, 'Output_PhysicalDevelopment.json'), 'r') as f:
        physical_data = json.load(f)

    # Derive height / weight / BMI, using pregnancy status for the override
    derived_results = calculate_derived_variables(physical_data, pregnancies_data)

    # Check the derived records against the schema; a failure is reported, not raised
    try:
        validate(
            instance=derived_results,
            schema=derived_schema,
            resolver=resolver,
            format_checker=FormatChecker(),
        )
    except ValidationError as exc:
        print('Validation Error:', exc)
    else:
        print('JSON data validated successfully')

JSON data validated successfully


In [None]:
# Save final output
outJSON = 'Output_Derivation.json'
file_path = os.path.join(out_json_path, outJSON)

# exist_ok=True creates the folder when it is absent and does nothing when it
# is already there, replacing the separate exists() check-then-create step.
os.makedirs(out_json_path, exist_ok=True)

with open(file_path, 'w') as f:
    json.dump(derived_results, f, indent=4)

# Confirm where the file went; the recorded cell output shows this message
print(f'Successfully saved processed data to {file_path}')

Successfully saved processed data to N:\CancerEpidem\BrBreakthrough\DeliveryProcess\Data_Output_Testing\Output_Derivation.json


In [13]:
# Print the id of every participant whose derived record has baselineCurrPreg == 1
for participant_id, record in derived_results.items():
    if record['baselineCurrPreg'] == 1:
        print(participant_id)

105081
105089
108847
112540
112660
112721
112771
112845
113219
113223
113433
113530
113547
113717
113795
113829
114235
114617
114645
114699
114750
115212
115221
115289
115381
115387
115417
115603
115615
115616
115638
115647
115668
115728
115729
115858
115914
115950
115968
116045
116076
116196
116307
116434
116537
116777
116944
117100
117390
117426
117452
117584
117636
118281
118610
118621
118971
119165
119649
119766
119836
120101
120324
120475
120672
120700
120770
120917
120948
121094
121217
121304
121307
121367
121505
121513
121779
122045
122206
122480
122566
122715
122837
122992
123047
123185
123594
123833
123939
123996
124071
124339
124356
124386
124730
124859
124942
124992
124996
125166
125561
125822
125896
126060
126070
126125
126157
126282
126618
126935
127263
127328
127416
127608
127690
127712
127730
127755
127820
128029
128040
128087
128090
128150
128226
128388
128441
128455
128478
128571
128715
128741
128771
128797
128868
128879
128901
128932
128934
129012
129015
129078
129162