### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [6]:
import math
import numbers

import pandas as pd

# Step 1: Load metadata
# One entry per column; each entry lists the rules a value must satisfy
# (expected type, whether it is mandatory, numeric bounds).
metadata = dict(
    name=dict(type="str", required=True),
    age=dict(type="int", min=0, max=120, required=True),
    email=dict(type="str", required=False),
)

# Step 2: Load data (example data), written out column by column
data = pd.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie", ""],
        "age": [29, -5, 135, 45],
        "email": [
            "alice@example.com",
            "bob@example.com",
            "charlie@example.com",
            None,
        ],
    }
)

# Step 3: Validate data using metadata
def validate_row(row, metadata):
    """Check one record against the per-column rules in ``metadata``.

    Args:
        row: Mapping-like record exposing ``.get(column)`` -- a ``dict`` or a
            pandas ``Series`` such as the rows ``DataFrame.apply(axis=1)``
            passes in.
        metadata: Dict mapping a column name to its rule dict. Recognised
            rule keys are ``"required"`` (bool), ``"type"`` (``"int"``,
            ``"float"`` or ``"str"``; any other type name is not checked)
            and the inclusive numeric bounds ``"min"`` / ``"max"``.

    Returns:
        ``True`` when every column satisfies its rules, ``False`` otherwise.
        A missing value (``None`` or NaN) fails a required column and is
        accepted, without further checks, for an optional one.
    """
    for col, rules in metadata.items():
        value = row.get(col)

        # pandas stores absent cells as NaN rather than None, so both count
        # as "no value".
        is_nan = isinstance(value, float) and math.isnan(value)
        missing = value is None or is_nan
        blank = isinstance(value, str) and not value.strip()

        # Check required
        if rules.get("required") and (missing or blank):
            return False
        if missing:
            # An absent optional value has nothing to type- or range-check;
            # without this skip, "required": False would have no effect.
            continue

        # bool is a subclass of int, so it must be excluded explicitly or
        # True/False would be accepted as numbers.
        is_bool = isinstance(value, bool)

        # Check type
        expected_type = rules.get("type")
        if expected_type == "int":
            # numbers.Integral also matches NumPy integer scalars, which is
            # what a row of an all-integer DataFrame yields.
            if is_bool or not isinstance(value, numbers.Integral):
                return False
        elif expected_type == "float":
            if is_bool or not isinstance(value, numbers.Real):
                return False
        elif expected_type == "str":
            if not isinstance(value, str):
                return False

        # Check min/max (inclusive bounds) for any real number
        if isinstance(value, numbers.Real) and not is_bool:
            if "min" in rules and value < rules["min"]:
                return False
            if "max" in rules and value > rules["max"]:
                return False
    return True

# Apply validation: keep only the rows for which validate_row returns True.
# The metadata is forwarded to validate_row as an extra positional argument.
valid_data = data[data.apply(validate_row, axis=1, args=(metadata,))]

# Step 4: Show valid data (heading and table on separate lines)
print("✅ Valid Data:", valid_data, sep="\n")


✅ Valid Data:
    name  age              email
0  Alice   29  alice@example.com
