### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [2]:
import pandas as pd
import os

# -----------------------
# Step 0: Seed the working directory with sample CSV files when they are absent
# -----------------------
# Maps each file name to the exact CSV text written for it. Insertion order
# matters only for the order of creation: metadata.csv first, then data.csv.
sample_files = {
    "metadata.csv": """\
column_name,expected_dtype,allow_null,min_value,max_value
id,int,False,,
name,str,False,,
age,int,True,0,120
salary,float,True,0,
department,str,True,,
""",
    "data.csv": """\
id,name,age,salary,department
1,Alice,30,70000,HR
2,Bob,25,55000,Engineering
3,Charlie,150,60000,Sales
4,David,,40000,
5,Eve,29,,Marketing
6,,40,50000,HR
""",
}

for file_name, csv_text in sample_files.items():
    if os.path.exists(file_name):
        continue  # never overwrite a file that is already on disk
    with open(file_name, "w") as handle:
        handle.write(csv_text)

# -----------------------
# Step 1: Load metadata
# -----------------------
# The validation step later reads the columns column_name, expected_dtype,
# allow_null, min_value and max_value from this frame.
metadata = pd.read_csv("metadata.csv")
print("📋 Metadata:")
# end=" \n\n" emits the same trailing space + blank line as print(frame, "\n").
print(metadata, end=" \n\n")

# -----------------------
# Step 2: Load data
# -----------------------
# The rows to be validated against the metadata rules.
data = pd.read_csv("data.csv")
print("📊 Raw Data:")
print(data, end=" \n\n")

# -----------------------
# Step 3: Use metadata to validate data quality
# -----------------------
# valid_mask[i] stays True only while row i of `data` passes every rule below.
# Each metadata row describes one column: its expected dtype ('int', 'float'
# or 'str'), whether nulls are allowed, and optional inclusive min/max bounds.
valid_mask = pd.Series(True, index=data.index)

for _, row in metadata.iterrows():
    col = row['column_name']
    expected_dtype = row['expected_dtype']
    allow_null = row['allow_null']
    min_val = row['min_value']
    max_val = row['max_value']

    # Check if column exists; a declared-but-absent column invalidates every row.
    if col not in data.columns:
        print(f" Column '{col}' missing from data")
        valid_mask &= False
        continue

    values = data[col]
    nulls = values.isnull()

    # Check for nulls if not allowed
    if not allow_null and nulls.any():
        print(f" Column '{col}' contains null values where not allowed")
        valid_mask &= ~nulls  # Mark rows with nulls as invalid

    if expected_dtype in ('int', 'float'):
        # Coerce once: cells that cannot be parsed as numbers become NaN.
        # Comparing the coerced result (instead of `isin` on the raw column)
        # keeps numeric-looking strings valid when a single bad cell forces the
        # whole column to be loaded as object dtype.
        numeric = pd.to_numeric(values, errors='coerce')
        valid_rows = nulls | numeric.notna()
        if expected_dtype == 'int':
            # An int column must not carry a fractional part (e.g. 30.5).
            valid_rows &= nulls | numeric.mod(1).eq(0)
        if not valid_rows.all():
            print(f" Column '{col}' has invalid {expected_dtype} values")
            valid_mask &= valid_rows

        # Range checks run on the coerced values so an object-dtype column
        # cannot raise a TypeError. Null and unparseable cells compare False
        # here; they are handled by the null and type rules above.
        if pd.notna(min_val):
            too_low = numeric.lt(float(min_val))
            if too_low.any():
                print(f" Column '{col}' has values below the minimum of {min_val}")
            valid_mask &= ~too_low
        if pd.notna(max_val):
            too_high = numeric.gt(float(max_val))
            if too_high.any():
                print(f" Column '{col}' has values above the maximum of {max_val}")
            valid_mask &= ~too_high
    elif expected_dtype == 'str':
        # Check if non-null values are strings. The mask is built with an
        # explicit bool dtype so an all-null (or empty) column yields a proper
        # boolean Series rather than an empty float Series that `~` rejects.
        is_str = pd.Series(
            [isinstance(value, str) for value in values],
            index=data.index,
            dtype=bool,
        )
        non_str_mask = ~nulls & ~is_str
        if non_str_mask.any():
            print(f" Column '{col}' has non-string values")
            valid_mask &= ~non_str_mask

# -----------------------
# Step 4: Show valid data only
# -----------------------
# Select the rows whose entry in valid_mask is still True after all checks.
print("\n Valid rows after metadata-based validation:")
print(data.loc[valid_mask])


📋 Metadata:
  column_name expected_dtype  allow_null  min_value  max_value
0          id            int       False        NaN        NaN
1        name            str       False        NaN        NaN
2         age            int        True        0.0      120.0
3      salary          float        True        0.0        NaN
4  department            str        True        NaN        NaN 

📊 Raw Data:
   id     name    age   salary   department
0   1    Alice   30.0  70000.0           HR
1   2      Bob   25.0  55000.0  Engineering
2   3  Charlie  150.0  60000.0        Sales
3   4    David    NaN  40000.0          NaN
4   5      Eve   29.0      NaN    Marketing
5   6      NaN   40.0  50000.0           HR 

 Column 'name' contains null values where not allowed

 Valid rows after metadata-based validation:
   id   name   age   salary   department
0   1  Alice  30.0  70000.0           HR
1   2    Bob  25.0  55000.0  Engineering
3   4  David   NaN  40000.0          NaN
4   5    Eve  29.0    