# Audit Sprint 4: Clinical Data Integrity Verification

**Objective:** Verify the integrity and biological plausibility of the new NHANES clinical dataset.
**Key Changes:**
- Loads configuration from `models/model_config.json`.
- Applies **Biological Range Checks** (sanity checks) to the clinical biomarkers.
- Verifies that the numeric features listed in the configuration have numeric data types.

In [None]:
import pandas as pd
import numpy as np
import os
import json

# 1. Load Configuration (Critical Step)
# Try the parent-relative location first; if it does not exist, fall back to
# the location relative to the current working directory.
config_path = "../models/model_config.json"
if not os.path.exists(config_path):
    config_path = "models/model_config.json"

try:
    # JSON text is UTF-8 by specification, so decode it explicitly instead of
    # relying on the platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    print("✅ Configuration loaded successfully.")
    print("Features:", config.get('features'))
except json.JSONDecodeError as e:
    # Malformed JSON gets its own message so it is distinguishable from
    # filesystem problems; the audit cannot continue without a config.
    print(f"❌ JSON Syntax Error in {config_path}: {e}")
    raise
except Exception as e:
    # Anything else (missing file, permissions, decoding, ...) is reported
    # and re-raised unchanged.
    print(f"❌ Error loading config: {e}")
    raise

In [None]:
# Biological plausibility limits (sanity checks).
# Maps each feature name to a (lowest accepted, highest accepted) pair; the
# unit of every pair is noted on the line above it.
# NOTE(review): the SystolicBP lower bound is 0, so only negative readings
# fall below it — confirm this is intended.
BIOLOGICAL_RANGES = {
    # mmHg
    'SystolicBP': (0, 300),
    # mg/dL
    'TotalCholesterol': (50, 600),
    # mg/dL
    'LDL': (20, 400),
    # mg/dL
    'Triglycerides': (20, 2000),
    # %
    'HbA1c': (2, 20),
    # mg/dL
    'Glucose': (20, 600),
    # mg/dL
    'UricAcid': (1, 20),
    # mg/dL
    'Creatinine': (0.1, 15),
    # kg/m2
    'BMI': (10, 100),
    # cm
    'WaistCircumference': (30, 200),
}

## 2. Load and Harmonize Data

In [None]:
# Locate the processed dataset: prefer the parent-relative path, otherwise use
# the path relative to the current working directory.
data_path = "../data/02_intermediate/process_data.parquet"
if not os.path.exists(data_path):
    data_path = "data/02_intermediate/process_data.parquet"

# Renaming dictionary (source column name -> English schema name).
# Entries that map a name to itself leave that column unchanged.
spanish_cols = {
    'Presion_Sistolica': 'SystolicBP',
    'Colesterol_Total': 'TotalCholesterol',
    'LDL': 'LDL',
    'Triglycerides': 'Triglycerides',
    'HbA1c': 'HbA1c',
    'Glucosa': 'Glucose',
    'Acido_Urico': 'UricAcid',
    'Creatinina': 'Creatinine',
    'BMI': 'BMI',
    'Cintura': 'WaistCircumference',
    'Sexo': 'Sex',
    'Fumador': 'Smoking',
    'Actividad_Fisica': 'PhysicalActivity',
    'TARGET': 'HeartDisease',
}

try:
    df = pd.read_parquet(data_path)
    print(f"✅ Data loaded. Shape: {df.shape}")

    # Bring the column names onto the English schema.
    df = df.rename(columns=spanish_cols)

    # Features the config expects but the loaded file does not provide.
    missing_cols = [feature for feature in config['features'] if feature not in df.columns]
    if missing_cols:
        print(f"⚠️ Warning: Missing columns from config: {missing_cols}")
        # Add each absent feature as an all-NaN column so the audit can
        # still run over the full expected schema.
        for col in missing_cols:
            df[col] = np.nan

    print("✅ Columns standardized.")
    print(list(df.columns))

except Exception as e:
    # Best-effort: any failure above is reported and replaced by an empty
    # frame, which the later `if not df.empty` guards skip over.
    print(f"❌ Error loading data: {e}")
    df = pd.DataFrame()

## 3. Data Type Verification

In [None]:
# Report every configured numeric feature whose column is stored with a
# dtype that is neither float nor integer. Skipped when no data was loaded.
if not df.empty:
    type_errors = []
    for col in config['numeric_features']:
        # Features absent from the frame are not type-checked.
        if col not in df.columns:
            continue
        dtype = df[col].dtype
        if not pd.api.types.is_float_dtype(dtype) and not pd.api.types.is_integer_dtype(dtype):
            type_errors.append(f"{col} has type {dtype} (Expected Numeric)")

    if not type_errors:
        print("✅ All numeric variables have correct types.")
    else:
        print("❌ Data Type Errors Found:")
        for err in type_errors:
            print(f"  - {err}")

## 4. Biological Sanity Checks

In [None]:
# Count, per feature in BIOLOGICAL_RANGES, the rows whose value lies outside
# its plausibility limits, then print a summary. Skipped when no data was
# loaded.
if not df.empty:
    sanity_issues = {}

    for col, (min_val, max_val) in BIOLOGICAL_RANGES.items():
        # Features absent from the frame are not range-checked.
        if col not in df.columns:
            continue

        # Comparisons against NaN are False, so missing values are never
        # counted as out of range.
        too_low = df[col] < min_val
        too_high = df[col] > max_val
        outliers = df[too_low | too_high]
        count = len(outliers)
        if not count:
            continue

        # Percentage is relative to all rows of the frame.
        sanity_issues[col] = {
            'count': count,
            'percentage': (count / len(df)) * 100,
            'range': (min_val, max_val),
        }

    if not sanity_issues:
        print("✅ Biological Sanity Check Passed.")
    else:
        print(f"⚠️ Found biological anomalies in {len(sanity_issues)} variables:\n")
        for col, info in sanity_issues.items():
            print(f"🔸 {col}: {info['count']} rows ({info['percentage']:.2f}%) outside range {info['range']}")