## Real-World Case Studies

### Healthcare - Medical Prediction Errors:
**Description**: Implement automated data quality checks (validation rules) on a healthcare
dataset so that invalid records are caught before they cause errors in predictive models.

In [2]:
import pandas as pd
from great_expectations.dataset import PandasDataset

# STEP 1: Create healthcare dataset
def create_healthcare_data():
    """Build the four-row sample patient table used by the validation demo.

    The sample contains invalid records: an age of 140, a missing heart
    rate, a missing glucose level, and one diagnosis date (1970-01-01)
    that precedes the patient's birth date (1975-10-02).

    Returns:
        A pandas DataFrame whose ``birth_date`` and ``diagnosis_date``
        columns have been converted to datetimes; values that cannot be
        parsed are coerced to NaT rather than raising.
    """
    date_columns = ("birth_date", "diagnosis_date")
    patients = pd.DataFrame(
        {
            "patient_id": [1, 2, 3, 4],
            "age": [25, 140, 35, 50],
            "heart_rate": [72, 80, None, 88],
            "blood_pressure": [120, 130, 115, 140],
            "glucose_level": [90, 110, 85, None],
            "birth_date": ["1998-03-12", "1880-01-01", "1989-07-22", "1975-10-02"],
            "diagnosis_date": ["2022-05-10", "2021-11-03", "2020-08-15", "1970-01-01"],
        }
    )
    # Parse the ISO date strings so the two columns can be compared as dates.
    for column in date_columns:
        patients[column] = pd.to_datetime(patients[column], errors="coerce")
    return patients

# STEP 2: Extend PandasDataset with expectations
class HealthcareDataset(PandasDataset):
    """PandasDataset subclass that bundles the healthcare data-quality rules."""

    def run_validations(self):
        """Invoke every healthcare expectation on this dataset, in a fixed order.

        The rules are: age within 0-120, no nulls in the three vital-sign
        columns, and birth date earlier than diagnosis date. The results of
        the individual expectation calls are not returned.
        """
        age_bounds = {"min_value": 0, "max_value": 120}
        required_vitals = ("heart_rate", "blood_pressure", "glucose_level")

        self.expect_column_values_to_be_between("age", **age_bounds)
        for vital in required_vitals:
            self.expect_column_values_to_not_be_null(vital)
        # A patient cannot be diagnosed before being born.
        self.expect_column_pair_values_A_to_be_less_than_B("birth_date", "diagnosis_date")

# STEP 3: Run validation with error handling
def run_validation():
    """Validate the sample healthcare data and print a per-expectation report.

    Prints the overall success flag followed by one line per expectation
    result (expectation type and whether it passed).

    Returns:
        The object returned by ``validate()`` when everything runs, or
        ``None`` if any exception is raised while validating or printing
        (the error message is printed instead of propagating).
    """
    dataset = HealthcareDataset(create_healthcare_data())

    try:
        dataset.run_validations()
        report = dataset.validate()
        print("✅ Validation Success:", report["success"])
        for outcome in report["results"]:
            expectation_type = outcome["expectation_config"]["expectation_type"]
            print(f"{expectation_type}: {outcome['success']}")
    except Exception as e:
        # Top-level boundary of the demo: report the problem and signal
        # failure to the caller with None rather than crashing.
        print("❌ Error during validation:", e)
        return None
    else:
        return report

# STEP 4: Basic unit test
def test_validation():
    """Check that validation runs and rejects the deliberately bad sample data.

    Raises:
        AssertionError: if validation did not run, reported overall success,
            or reported no failing expectation.
    """
    outcome = run_validation()
    assert outcome is not None, "Validation failed to run."
    assert outcome["success"] is False, "Expected validation to fail due to invalid data."
    failure_count = sum(1 for item in outcome["results"] if not item["success"])
    assert failure_count >= 1, "At least one validation rule should fail."


# Script entry point: run the unit test; the success message is printed only
# if no assertion in test_validation() fails.
if __name__ == "__main__":
    test_validation()
    print("✅ All unit tests passed.")

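ModuleNotFoundError: No module named 'great_expectations.dataset'

Note: the error above is the output of running this cell. The import of `PandasDataset`
from `great_expectations.dataset` fails because the installed Great Expectations package
does not provide that module. To run the example, either install a release that still
ships the legacy `great_expectations.dataset` API or port the expectations to the API of
the installed version.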