## Real-World Case Studies

### Healthcare - Medical Prediction Errors:
**Description**: Implement validation rules using a healthcare dataset to reduce errors in
predictive models by automating data quality checks.

In [1]:
# write your code from here
import pandas as pd
import numpy as np

# Small hand-built patient table; several rows deliberately break a rule so
# that each validation check has something to catch.
data = dict(
    patient_id=[1, 2, 3, 4],
    # Valid when 0 < age < 120.
    age=[25, -5, 70, 130],
    # Plausible range: 50-200 mmHg.
    blood_pressure=[120, 85, 300, 90],
    # Plausible range: 40-180 bpm.
    heart_rate=[80, 0, 55, 180],
    admission_date=['2025-05-01', '2025-05-05', '2025-05-10', '2025-05-08'],
    discharge_date=['2025-05-10', '2025-05-04', '2025-05-15', '2025-05-12'],
    # Must not be null.
    diagnosis_code=['A10', None, 'B20', 'C30'],
)

# Build the frame and parse both date columns to datetime in one chained step,
# so the date comparison in the validation works on real timestamps rather
# than on strings.
df = pd.DataFrame(data).assign(
    admission_date=lambda frame: pd.to_datetime(frame['admission_date']),
    discharge_date=lambda frame: pd.to_datetime(frame['discharge_date']),
)

# Validation rules
def validate_data(df):
    """Check a patient DataFrame against the data-quality rules.

    Rules applied (a row is reported when it breaks the rule):

    * ``age``: must satisfy ``0 < age < 120`` (both bounds exclusive).
    * ``blood_pressure``: must satisfy ``50 <= bp <= 200``.
    * ``heart_rate``: must satisfy ``40 <= hr <= 180``.
    * ``date_consistency``: ``admission_date`` must not be later than
      ``discharge_date`` (same-day discharge is accepted).
    * ``missing_diagnosis_code``: ``diagnosis_code`` must not be null.

    Args:
        df: DataFrame with the columns ``age``, ``blood_pressure``,
            ``heart_rate``, ``admission_date``, ``discharge_date`` and
            ``diagnosis_code``. The two date columns must be comparable
            with each other (e.g. already converted to datetime).

    Returns:
        A dict mapping each violated rule name to the list of index labels
        of the offending rows. Rules with no violations are omitted, so an
        empty dict means the data passed every check.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    age = df['age']
    blood_pressure = df['blood_pressure']
    heart_rate = df['heart_rate']

    # Each mask is True for rows that VIOLATE the rule. Insertion order
    # fixes the order in which rules appear in the returned dict.
    # NOTE: comparisons against NaN/NaT evaluate to False, so rows with a
    # missing numeric or date value are not reported by the range/date rules.
    violations = {
        # Upper bound is exclusive: an age of exactly 120 is invalid.
        'age': (age <= 0) | (age >= 120),
        'blood_pressure': (blood_pressure < 50) | (blood_pressure > 200),
        'heart_rate': (heart_rate < 40) | (heart_rate > 180),
        'date_consistency': df['admission_date'] > df['discharge_date'],
        'missing_diagnosis_code': df['diagnosis_code'].isnull(),
    }

    errors = {}
    for rule, mask in violations.items():
        bad_rows = df[mask].index.tolist()
        if bad_rows:
            errors[rule] = bad_rows
    return errors

# Validate the sample frame and print a per-rule report of offending rows.
validation_errors = validate_data(df)

if not validation_errors:
    # An empty dict means every rule passed.
    print("No validation errors found. Data is clean.")
else:
    print("Data validation errors found:")
    # One line per violated rule, listing the index labels of the bad rows.
    for field, rows in validation_errors.items():
        print(f"- {field}: rows {rows}")



Data validation errors found:
- age: rows [1, 3]
- blood_pressure: rows [2]
- heart_rate: rows [1]
- date_consistency: rows [1]
- missing_diagnosis_code: rows [1]
