## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use schedule library to periodically run a script.
2. Script to Calculate Metrics:
    - For simplicity, use a function calculate_quality_metrics() that calculates and logs metrics such as missing rate or mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [5]:

import pandas as pd
import logging
import schedule
import time
import random

# -------------------------------------------
# 1. Setup logging
# -------------------------------------------
logging.basicConfig(
    filename='data_quality.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# -------------------------------------------
# 2. Simulated data source
# Replace with actual CSV in real use
# -------------------------------------------
def load_data():
    data = {
        "transaction_id": [1001, 1002, None, 1004, None],
        "amount": [200, None, 300, 400, 500],
        "date": ["2024-01-01", "2024-01-02", "2024-01-03", None, "2024-01-05"],
        "status": ["Completed", "Failed", "Completed", "Completed", random.choice(["Failed", None])]
    }
    return pd.DataFrame(data)

# -------------------------------------------
# 3. Metric calculation function
# -------------------------------------------
def calculate_quality_metrics(df):
    metrics = {}

    # Completeness: % missing in key fields
    fields = ['transaction_id', 'amount', 'date']
    for col in fields:
        metrics[f"{col}_missing_pct"] = round(df[col].isnull().mean() * 100, 2)

    # Accuracy: Dummy rule - "status" must be in known list
    known_status = {"Completed", "Failed"}
    invalid_status_pct = (~df["status"].isin(known_status)).mean() * 100
    metrics["status_accuracy_pct"] = round(100 - invalid_status_pct, 2)

    return metrics

# -------------------------------------------
# 4. Scheduler job
# -------------------------------------------
def job():
    df = load_data()
    metrics = calculate_quality_metrics(df)
    log_msg = " | ".join([f"{k}: {v}%" for k, v in metrics.items()])
    logging.info(f"Data Quality Metrics - {log_msg}")
    print("Metrics logged.")

# -------------------------------------------
# 5. Schedule the job to run every minute
# -------------------------------------------
schedule.every(1).minutes.do(job)

print("Starting Data Quality Monitor... (press Ctrl+C to stop)")
while True:
    schedule.run_pending()
    time.sleep(1)

Starting Data Quality Monitor... (press Ctrl+C to stop)
Metrics logged.
Metrics logged.
Metrics logged.


KeyboardInterrupt: 