## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the third-party `schedule` library (install it with `pip install schedule`) to run the script periodically.
2. Script to Calculate Metrics:
    - For simplicity, use a function calculate_quality_metrics() that calculates and logs metrics such as missing rate or mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
import logging
import schedule
import time
from datetime import datetime

# Root-logger configuration: INFO-and-above records are written to a log file,
# each one prefixed with its timestamp and severity level.
_LOG_SETTINGS = {
    'format': '%(asctime)s - %(levelname)s - %(message)s',
    'level': logging.INFO,
    'filename': 'data_quality.log',
}
logging.basicConfig(**_LOG_SETTINGS)

def load_data():
    """
    Build and return a small hard-coded sample dataset as a DataFrame.

    The sample has three columns (id, price, email) and contains a missing
    price, a negative price, a missing email and an email without a dot.
    This is a stand-in: swap the body for the real data loading logic.
    """
    ids = [1, 2, 3, 4, 5, 6]
    prices = [10.5, 23.0, None, 15.0, -5.0, 30.0]
    emails = ['a@example.com', 'b@example', 'c@example.com', None, 'e@example.com', 'f@example.com']

    return pd.DataFrame({'id': ids, 'price': prices, 'email': emails})

def calculate_quality_metrics(df: pd.DataFrame) -> dict:
    """
    Calculate data quality metrics for a DataFrame, as percentages.

    Metrics:
    - missing_rate_percent: per-column missing rate, averaged over all columns
    - negative_price_rate_percent: share of rows whose 'price' is below zero
    - invalid_email_rate_percent: share of rows whose 'email' is not a string
      containing both '@' and '.' (a simple substring check, so missing
      emails count as invalid too)

    Args:
        df: Data to assess; must contain 'price' and 'email' columns.

    Returns:
        Dict mapping the three metric names above to floats rounded to two
        decimals. An empty DataFrame yields 0.0 for every metric.

    Raises:
        KeyError: If the 'price' or 'email' column is absent (non-empty input).
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows means nothing can be wrong; without this guard every rate
        # would be 0/0, i.e. NaN plus a NumPy RuntimeWarning.
        return {
            'missing_rate_percent': 0.0,
            'negative_price_rate_percent': 0.0,
            'invalid_email_rate_percent': 0.0,
        }

    # Mean of the per-column missing fractions.
    missing_rate = df.isna().mean().mean() * 100

    # Missing prices compare as False here, so they are not counted as negative.
    negative_price_rate = (df['price'] < 0).sum() / total_rows * 100

    def _looks_like_email(value) -> bool:
        return isinstance(value, str) and '@' in value and '.' in value

    valid_email_mask = df['email'].apply(_looks_like_email)
    invalid_email_rate = (~valid_email_mask).sum() / total_rows * 100

    # Coerce NumPy scalars to plain floats so the result is easy to serialise.
    return {
        'missing_rate_percent': round(float(missing_rate), 2),
        'negative_price_rate_percent': round(float(negative_price_rate), 2),
        'invalid_email_rate_percent': round(float(invalid_email_rate), 2),
    }

def log_quality_metrics():
    """
    Load the dataset, compute its quality metrics and record them.

    The metrics are written as one INFO record through the logging module
    and echoed to stdout with the current timestamp.
    """
    metrics = calculate_quality_metrics(load_data())

    parts = [
        f"Missing Data Rate: {metrics['missing_rate_percent']}%",
        f"Negative Price Rate: {metrics['negative_price_rate_percent']}%",
        f"Invalid Email Rate: {metrics['invalid_email_rate_percent']}%",
    ]
    summary = ", ".join(parts)

    logging.info(summary)
    print(f"[{datetime.now()}] Logged metrics: {summary}")

def main():
    """
    Register the metrics job with the scheduler and poll it forever.

    The job interval is a short demo value; for real use switch to
    something like schedule.every().day.at("10:00"). This function does not
    return: it loops until the process is interrupted.
    """
    job_interval_seconds = 10
    poll_interval_seconds = 1

    schedule.every(job_interval_seconds).seconds.do(log_quality_metrics)

    print("Starting data quality monitoring...")
    while True:
        # Run whichever jobs are due, then wait before checking again.
        schedule.run_pending()
        time.sleep(poll_interval_seconds)

# Start the monitoring loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'schedule'