### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [1]:
# write your code from here
import pandas as pd
import numpy as np

# -------------------------------------
# Step 1: Define and Load Metadata
# -------------------------------------
# Simulating metadata as a DataFrame (could be from a JSON or config file).
# Each metadata row describes one column of the dataset:
#   column         - name of the column in the data
#   type           - expected logical type: 'string', 'int' or 'float'
#   required       - True if a missing value makes the whole row invalid
#   allowed_values - list of permitted values, or None for "unrestricted"

metadata = pd.DataFrame({
    'column': ['Name', 'Age', 'Department', 'Salary'],
    'type': ['string', 'int', 'string', 'float'],
    'required': [True, True, True, False],
    'allowed_values': [None, None, ['HR', 'IT', 'Finance'], None]
})

print("📘 Metadata:\n")
print(metadata)

# -------------------------------------
# Step 2: Load Actual Data
# -------------------------------------
# Sample records that deliberately contain quality problems: a missing name,
# a non-numeric age, a department outside the allowed list and a missing
# department.

data = {
    'Name': ['Alice', 'Bob', None, 'David'],
    'Age': [25, 'Thirty', 30, 28],
    'Department': ['HR', 'IT', 'Marketing', None],
    'Salary': [50000, 60000, None, 52000]
}

df = pd.DataFrame(data)
print("\n📦 Actual Data:\n")
print(df)

# -------------------------------------
# Step 3: Validate Data Using Metadata
# -------------------------------------


def is_missing(val):
    """Return True if *val* is a scalar null (None, NaN, NaT, pd.NA).

    Non-scalar cells (e.g. a list stored in a cell) are never "missing";
    checking scalar-ness first avoids ``pd.isna`` returning an array whose
    truth value would be ambiguous.
    """
    return bool(pd.api.types.is_scalar(val) and pd.isna(val))


def matches_type(val, expected_type):
    """Return True if the non-null *val* satisfies *expected_type*.

    'string' requires a ``str`` instance. 'int' and 'float' accept anything
    the corresponding builtin can convert (so numeric strings pass), except
    that a float with a fractional part (or inf/NaN) is not an 'int'.
    Unknown type names are not checked and always pass.
    """
    if expected_type == 'string':
        return isinstance(val, str)

    if expected_type == 'int':
        # int() would silently truncate 30.5 to 30; integral floats such as
        # 30.0 are still accepted because pandas upcasts int columns that
        # contain nulls to float.
        if isinstance(val, float) and not val.is_integer():
            return False
        converter = int
    elif expected_type == 'float':
        converter = float
    else:
        return True

    try:
        converter(val)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


def is_row_valid(row, rules):
    """Return True if *row* satisfies every metadata rule in *rules*.

    *row* is any mapping-like object with a ``get`` method (a pandas Series
    or a dict). *rules* is a list of dicts with the keys 'column', 'type',
    'required' and 'allowed_values'.
    """
    for rule in rules:
        # A column absent from the row is treated the same as a null value.
        val = row.get(rule['column'])

        if is_missing(val):
            if rule['required']:
                return False
            # Optional and empty: nothing further to validate for this column.
            continue

        if not matches_type(val, rule['type']):
            return False

        allowed = rule['allowed_values']
        # Only a real collection restricts values; None (or NaN coming from
        # metadata loaded out of a file) means "unrestricted".
        if isinstance(allowed, (list, tuple, set, frozenset)) and val not in allowed:
            return False

    return True


# Convert the metadata once instead of re-iterating it for every data row.
rules = metadata.to_dict(orient='records')

valid_rows = [idx for idx, row in df.iterrows() if is_row_valid(row, rules)]

# -------------------------------------
# Step 4: Show Valid Data
# -------------------------------------
print("\n✅ Valid Data Based on Metadata Rules:\n")
print(df.loc[valid_rows])


📘 Metadata:

       column    type  required     allowed_values
0        Name  string      True               None
1         Age     int      True               None
2  Department  string      True  [HR, IT, Finance]
3      Salary   float     False               None

📦 Actual Data:

    Name     Age Department   Salary
0  Alice      25         HR  50000.0
1    Bob  Thirty         IT  60000.0
2   None      30  Marketing      NaN
3  David      28       None  52000.0

✅ Valid Data Based on Metadata Rules:

    Name Age Department   Salary
0  Alice  25         HR  50000.0
