### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [2]:
# write your code from here
import pandas as pd

def load_metadata(metadata_path):
    """
    Load the data quality rules from a CSV or JSON metadata file.

    The format is chosen from the file name: a name ending in ``.json``
    (case-insensitive) is parsed with ``pd.read_json``; anything else,
    including file-like objects, is parsed as CSV with ``pd.read_csv``.

    The rules are expected to provide the columns
    ``column_name``, ``data_type`` and ``required`` (True/False).
    For JSON, an array of objects with those keys is the natural layout,
    e.g. ``[{"column_name": "id", "data_type": "int", "required": true}]``.

    Args:
        metadata_path: Path (or file-like object for CSV) of the rules file.

    Returns:
        A DataFrame with one row per rule.
    """
    # str() also copes with pathlib.Path; file-like objects never end in
    # ".json", so they keep the original CSV behaviour.
    if str(metadata_path).lower().endswith('.json'):
        return pd.read_json(metadata_path)
    return pd.read_csv(metadata_path)

def load_data(data_path):
    """Read the CSV at ``data_path`` and return it as the DataFrame to validate."""
    dataset = pd.read_csv(data_path)
    return dataset

def _is_required(value):
    """Interpret one metadata ``required`` cell as a boolean.

    A CSV column is only parsed as bool when every cell is True/False;
    otherwise the cells arrive as text (or NaN when blank). Text is compared
    against a small set of affirmative spellings, blanks count as not
    required, and anything else falls back to ordinary truthiness.
    """
    if isinstance(value, str):
        return value.strip().lower() in ('true', '1', 'yes', 'y')
    if pd.isna(value):
        return False
    return bool(value)


def _dtype_matches(expected_type, dtype):
    """Return True when the pandas ``dtype`` satisfies the metadata type name.

    Uses the ``pd.api.types`` predicates rather than matching on the dtype's
    name, so every width and the nullable extension dtypes are accepted.
    Unknown type names never match.
    """
    kind = str(expected_type).strip().lower()
    if kind == 'int':
        return pd.api.types.is_integer_dtype(dtype)
    if kind == 'float':
        return pd.api.types.is_float_dtype(dtype)
    if kind == 'bool':
        return pd.api.types.is_bool_dtype(dtype)
    if kind == 'str':
        # Text columns are either plain object columns or a pandas string
        # extension dtype, depending on pandas version and options.
        return (pd.api.types.is_object_dtype(dtype)
                or isinstance(dtype, pd.StringDtype))
    return False


def validate_data(data, metadata):
    """
    Validate ``data`` against the rules in ``metadata``.

    Checks performed, in this order:
    - every required column exists in ``data``
    - each described column that exists has the expected data type
      ('int', 'float', 'str' or 'bool'; any other name is reported
      as a mismatch)
    - required columns that exist contain no missing values

    Args:
        data: DataFrame to validate.
        metadata: DataFrame with the columns ``column_name``, ``data_type``
            and ``required``.

    Returns:
        True when no rule is violated, otherwise False. The outcome (and
        every violation found) is also printed.

    Raises:
        KeyError: if ``metadata`` lacks one of the three rule columns.
    """
    # Normalise the rules once so both passes agree on what "required" means.
    rules = [
        (col, expected_type, _is_required(required))
        for col, expected_type, required in zip(
            metadata['column_name'], metadata['data_type'], metadata['required']
        )
    ]

    errors = []

    # Check required columns exist
    for col, _, required in rules:
        if required and col not in data.columns:
            errors.append(f"Missing required column: {col}")

    # Validate data types & completeness for existing columns
    for col, expected_type, required in rules:
        if col not in data.columns:
            continue

        dtype = data[col].dtype
        if not _dtype_matches(expected_type, dtype):
            errors.append(f"Column '{col}' expected type '{expected_type}' but found '{dtype}'")

        # Check completeness if required
        if required:
            missing_count = data[col].isnull().sum()
            if missing_count > 0:
                errors.append(f"Column '{col}' has {missing_count} missing values but is required")

    if errors:
        print("Data Quality Validation Errors:")
        for e in errors:
            print(f" - {e}")
        return False

    print("All data quality checks passed.")
    return True

def show_valid_data(data):
    """Print a heading followed by the first rows (``data.head()``) of ``data``."""
    print("\nValid Data Preview:", data.head(), sep="\n")

# Example usage
if __name__ == "__main__":
    # Input locations; relative names are resolved against the working directory.
    metadata_file, data_file = "metadata.csv", "data.csv"

    # Load the quality rules first, then the dataset they describe.
    metadata = load_metadata(metadata_file)
    data = load_data(data_file)

    # Only preview the dataset when every metadata rule passed.
    passed = validate_data(data, metadata)
    if passed:
        show_valid_data(data)


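Output of the example run (it failed because `metadata.csv` was not found; place `metadata.csv` and `data.csv` in the working directory before running):
FileNotFoundError: [Errno 2] No such file or directory: 'metadata.csv'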