# Part 2b: Add Validation

Load and process patient data with BMI calculations.

**Your task:** Add schema and bounds validation to catch data quality issues early.

---

## Load data

In [6]:
import pandas as pd

# TODO: Define a function validate_schema(df, required_columns) that:
#       - Checks if all required columns are present
#       - Raises ValueError with list of missing columns if any are missing
def validate_schema(df, required_columns):
    """Confirm that a DataFrame carries every column the pipeline needs.

    Args:
        df: DataFrame to inspect.
        required_columns: Iterable of column names that must be present.

    Returns:
        True when every required column exists (a confirmation line is
        printed first).

    Raises:
        ValueError: If one or more required columns are absent; the message
            lists them in the order they were requested.
    """
    missing = []
    for name in required_columns:
        if name in df.columns:
            continue
        missing.append(name)

    if not missing:
        print("required columns checked")
        return True

    raise ValueError(f"Missing Required Columns:{missing}")



# TODO: Define a function validate_bounds(df, bounds_dict) that:
#       - For each column in bounds_dict, check if values are within (min, max)
#       - Use df[col].between(min, max) to find out-of-bounds values
#       - Raises ValueError showing patient_id and value for any out-of-bounds rows
def validate_bounds(df, bounds_dict):
    """Check that each bounded column only holds values inside its allowed range.

    Args:
        df: DataFrame to validate.
        bounds_dict: Mapping of column name -> (min, max). Both ends are
            inclusive.

    Returns:
        None. One confirmation line is printed for every column that passes.

    Raises:
        ValueError: On the first column that contains out-of-bounds values.
            The message lists only ``patient_id`` and the offending value for
            each bad row, so unrelated columns (e.g. patient names) are not
            copied into error output.
    """
    # Fall back to the row index as the identifier when there is no patient_id.
    has_patient_id = "patient_id" in df.columns

    for column_name, (min_val, max_val) in bounds_dict.items():
        # Series.between is inclusive on both ends and yields False for NaN,
        # so missing values are reported as out of bounds as well.
        out_bounds = ~df[column_name].between(min_val, max_val)

        if out_bounds.any():
            report_cols = [column_name]
            if has_patient_id and column_name != "patient_id":
                report_cols.insert(0, "patient_id")

            offenders = df.loc[out_bounds, report_cols]
            raise ValueError(
                f"out of bounds in {column_name}: "
                f"allowed range is [{min_val}, {max_val}]\n"
                f"{offenders.to_string(index=not has_patient_id)}"
            )

        print(f"no values out of bounds in {column_name}")

        

# Read the raw patient intake records.
df = pd.read_csv("data/patient_intake.csv")

# Schema check: fail fast if any column used by the later cells is missing.
required_columns = ["patient_id", "weight_kg", "height_cm", "age"]
validate_schema(df, required_columns)

# Bounds check: allowed (min, max) per numeric column.
bounds_dict = {
    "weight_kg": (30, 250),
    "height_cm": (120, 230),
    "age": (0, 110),
}
validate_bounds(df, bounds_dict)

df.head()

required columns checked
no values out of bounds in weight_kg
no values out of bounds in height_cm
no values out of bounds in age


Unnamed: 0,patient_id,first_name,last_name,weight_kg,height_cm,age,sex
0,P001,Mark,Johnson,91.5,177,46,M
1,P002,Donald,Walker,80.5,164,29,M
2,P003,Nancy,Rhodes,74.3,163,47,F
3,P004,Steven,Miller,64.4,171,71,M
4,P005,Javier,Johnson,72.8,178,18,M


---

## Calculate BMI

In [None]:
# BMI is weight (kg) divided by height (m) squared, so convert cm to m first.
df["height_m"] = df["height_cm"].div(100)

# Compute BMI and round to one decimal place in a single step.
df["bmi"] = df["weight_kg"].div(df["height_m"].pow(2)).round(1)

df.loc[:, ["patient_id", "weight_kg", "height_cm", "bmi"]].head()

---

## Categorize BMI

In [None]:
# right=False makes every bin closed on the left and open on the right, so a
# boundary value falls into the higher category (a BMI of exactly 25 is
# "Overweight", exactly 30 is "Obese").
df["bmi_category"] = pd.cut(
    x=df["bmi"],
    bins=[0, 18.5, 25, 30, float("inf")],
    labels=["Underweight", "Normal", "Overweight", "Obese"],
    right=False,
)

df.loc[:, ["patient_id", "bmi", "bmi_category"]].head()

---

## Summary statistics

In [None]:
# Count patients per BMI category. bmi_category is categorical, so pass
# observed=False explicitly: it keeps categories with zero patients in the
# table and avoids the pandas FutureWarning about the changing default.
summary = df.groupby("bmi_category", observed=False)["patient_id"].count()
print("\nBMI category distribution:")
print(summary)

# NOTE(review): the strict "> 30" excludes a BMI of exactly 30.0, which the
# categorisation cell labels "Obese" -- confirm that the difference is intended.
high_risk = df[df["bmi"] > 30]
print(f"\nHigh-risk patients (BMI > 30): {len(high_risk)}")