### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [None]:
import pandas as pd
import numpy as np


# --- Step 1: Load Metadata ---
# Expected schema, keyed by column name. Each entry carries the rules that the
# validation step reads: a declared type, optional numeric bounds or a list of
# allowed values, and whether nulls are acceptable.
metadata = dict(
    age=dict(type='int', min=0, max=120, nullable=False),
    salary=dict(type='float', min=10000.0, max=1000000.0, nullable=False),
    department=dict(type='str', allowed_values=['HR', 'IT', 'Finance'], nullable=False),
)

# --- Step 2: Load Sample Data (simulating raw data input) ---
# The sample deliberately contains bad records: an age above the maximum, a
# missing age, a missing salary and a department outside the allowed list.
data = dict(
    age=[25, 45, 130, None, 35],
    salary=[50000.0, 150000.0, 25000.0, 80000.0, None],
    department=['HR', 'IT', 'Finance', 'Admin', 'Finance'],
)

df = pd.DataFrame(data)
print("Original Data:\n", df)

# --- Step 3: Validate Data Using Metadata ---
def _matches_declared_type(value, type_name):
    """Return True when ``value`` is acceptable for the metadata type ``type_name``.

    Recognised type names are ``'int'``, ``'float'`` and ``'str'``; any other
    name is not enforced and always passes.
    """
    if type_name == 'str':
        return isinstance(value, str)
    if type_name in ('int', 'float'):
        # bool is a subclass of int in Python, but a flag is not a number here.
        if isinstance(value, (bool, np.bool_)):
            return False
        if isinstance(value, (int, np.integer)):
            return True
        if isinstance(value, (float, np.floating)):
            # pandas upcasts an integer column that contains nulls to float,
            # so a whole-number float such as 25.0 still counts as an 'int'.
            return type_name == 'float' or float(value).is_integer()
        return False
    return True


def validate_row(row, metadata):
    """Validate one record against the column rules in ``metadata``.

    Args:
        row: Mapping-like record (a pandas Series or a dict) indexed by
            column name.
        metadata: Dict mapping column name to a rules dict. Supported rule
            keys are ``'type'`` (``'int'``, ``'float'`` or ``'str'``),
            ``'min'``, ``'max'``, ``'allowed_values'`` and ``'nullable'``.
            A column without a ``'nullable'`` key is treated as required.

    Returns:
        A list of human-readable error strings; empty when the row is valid.
        Data problems are always reported in this list rather than raised,
        so one malformed value cannot abort validation of the whole frame.
    """
    errors = []
    for col, rules in metadata.items():
        if col not in row:
            errors.append(f"{col} is missing")
            continue
        value = row[col]
        # Check for nulls; a null value has nothing further to validate.
        if pd.isnull(value):
            if not rules.get('nullable', False):
                errors.append(f"{col} is null")
            continue
        # Explicit type check: without it, a string in a numeric column would
        # raise TypeError in the range comparison below.
        expected = rules.get('type')
        if expected is not None and not _matches_declared_type(value, expected):
            errors.append(
                f"{col} has wrong type (expected {expected}, got {type(value).__name__})"
            )
            continue
        try:
            below_min = 'min' in rules and value < rules['min']
            above_max = 'max' in rules and value > rules['max']
        except TypeError:
            # Reached only when no (or an unrecognised) type is declared and
            # the value cannot be ordered against the bounds.
            errors.append(f"{col} cannot be compared with min/max bounds: {value!r}")
            below_min = above_max = False
        if below_min:
            errors.append(f"{col} below min ({value} < {rules['min']})")
        if above_max:
            errors.append(f"{col} above max ({value} > {rules['max']})")
        if 'allowed_values' in rules and value not in rules['allowed_values']:
            errors.append(f"{col} has invalid value: {value}")
    return errors

# Apply validation: run the row validator across the frame (axis=1 hands each
# row to validate_row as a Series) and keep the per-row error lists.
df['errors'] = df.apply(validate_row, axis=1, args=(metadata,))
# A row is valid exactly when its error list is empty.
df['is_valid'] = df['errors'].map(lambda row_errors: not row_errors)

# --- Step 4: Show Valid Data ---
report_columns = ['age', 'salary', 'department', 'is_valid', 'errors']
print("\nValidation Results:\n", df[report_columns])

# Keep only the rows that passed, then strip the bookkeeping columns.
passing_rows = df.loc[df['is_valid']]
valid_data = passing_rows.drop(['errors', 'is_valid'], axis=1)
print("\n✅ Valid Data:\n", valid_data)

Original Data:
      age    salary department
0   25.0   50000.0         HR
1   45.0  150000.0         IT
2  130.0   25000.0    Finance
3    NaN   80000.0      Admin
4   35.0       NaN    Finance

Validation Results:
      age    salary department  is_valid  \
0   25.0   50000.0         HR      True   
1   45.0  150000.0         IT      True   
2  130.0   25000.0    Finance     False   
3    NaN   80000.0      Admin     False   
4   35.0       NaN    Finance     False   

                                              errors  
0                                                 []  
1                                                 []  
2                      [age above max (130.0 > 120)]  
3  [age is null, department has invalid value: Ad...  
4                                   [salary is null]  

✅ Valid Data:
     age    salary department
0  25.0   50000.0         HR
1  45.0  150000.0         IT
