## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the `schedule` library to run the script periodically.
2. Script to Calculate Metrics:
    - For simplicity, use a single function, `calculate_quality_metrics()`, that calculates and logs metrics such as the missing rate or the mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [4]:
import pandas as pd
import numpy as np
import logging
import time
from datetime import datetime
import sys

# Configure the root logger: INFO and above is written to a text file,
# each record prefixed with its timestamp and severity.
logging.basicConfig(
    filename="data_quality_log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

def _calculate_accuracy(df, trusted_file):
    """
    Compares the data against a trusted CSV file and logs per-column accuracy.

    Rows are compared by position. Accuracy is measured over the rows of ``df``:
    rows without a counterpart in the trusted file count as mismatches.

    Args:
        df (pandas.DataFrame): The data being checked.
        trusted_file (str): Path to the CSV file containing the trusted data.

    Returns:
        tuple or None: ``(accuracy, overall_accuracy)`` where ``accuracy`` maps each
            common column to a percentage (or None when the column was skipped because
            the data types differ). None when the accuracy check had to be skipped.
    """
    try:
        trusted_df = pd.read_csv(trusted_file)
    except FileNotFoundError:
        logging.error(f"Error: Trusted data file '{trusted_file}' not found. Skipping accuracy check.")
        return None
    except Exception as e:
        logging.error(f"Error reading trusted data file '{trusted_file}': {e}. Skipping accuracy check.")
        return None
    logging.info(f"Trusted data file '{trusted_file}' loaded successfully.")

    # Keep the data file's column order so the log output is deterministic
    # (a set intersection would yield an arbitrary order).
    common_columns = [col for col in df.columns if col in trusted_df.columns]
    if not common_columns:
        logging.error("Error: No common columns between data and trusted data files. Skipping accuracy check.")
        return None

    total_rows = len(df)
    trusted_rows = len(trusted_df)
    if total_rows != trusted_rows:
        logging.warning(
            f"Row count mismatch: data has {total_rows} rows, trusted data has {trusted_rows} rows. "
            "Comparing overlapping rows by position."
        )

    # Comparing Series of different lengths raises ValueError, so restrict both
    # frames to the rows they have in common and give them identical labels.
    overlap = min(total_rows, trusted_rows)
    data_part = df.iloc[:overlap].reset_index(drop=True)
    trusted_part = trusted_df.iloc[:overlap].reset_index(drop=True)

    accuracy = {}
    for col in common_columns:
        if df[col].dtype != trusted_df[col].dtype:  # Compare values only if types match
            logging.warning(f"Skipping accuracy check for column '{col}' as data types differ.")
            accuracy[col] = None  # Explicitly set to None for skipped columns
            continue

        data_col = data_part[col]
        trusted_col = trusted_part[col]
        # NaN never equals NaN, so a value missing in both files is counted
        # explicitly as agreeing with the trusted data.
        matches = (data_col == trusted_col) | (data_col.isna() & trusted_col.isna())
        accuracy[col] = float(matches.sum() / total_rows * 100) if total_rows else 0.0
        logging.info(f"Accuracy for column '{col}': {accuracy[col]:.2f}%")

    # Overall accuracy averages only the columns that were actually compared
    valid_accuracy_values = [a for a in accuracy.values() if a is not None]
    overall_accuracy = float(np.mean(valid_accuracy_values)) if valid_accuracy_values else 0.0
    logging.info(f"Overall Accuracy: {overall_accuracy:.2f}%")
    return accuracy, overall_accuracy


def calculate_quality_metrics(data_file, trusted_file=None):
    """
    Calculates and logs data quality metrics (completeness and accuracy) for a given dataset.

    Args:
        data_file (str): Path to the CSV file containing the data to be checked.
        trusted_file (str, optional): Path to the CSV file containing trusted data for accuracy checks.
            If provided, accuracy is calculated; otherwise, only completeness is calculated.

    Returns:
        dict or None: None when the data file could not be loaded. Otherwise a dict with keys
            ``total_rows`` (int), ``completeness`` (dict of column -> percent),
            ``overall_completeness`` (percent of rows without any missing value),
            ``accuracy`` (dict of column -> percent or None for skipped columns) and
            ``overall_accuracy`` (percent). The two accuracy entries are None when no
            trusted file was given or the accuracy check was skipped.
    """
    try:
        df = pd.read_csv(data_file)
    except FileNotFoundError:
        logging.error(f"Error: Data file '{data_file}' not found.")
        return None
    except Exception as e:
        logging.error(f"Error reading data file '{data_file}': {e}")
        return None
    logging.info(f"Data file '{data_file}' loaded successfully.")

    total_rows = len(df)

    # Calculate completeness for all columns; a file with no rows reports 0%
    # instead of dividing by zero.
    completeness = {}
    for col in df.columns:
        missing_count = int(df[col].isna().sum())
        completeness[col] = (total_rows - missing_count) / total_rows * 100 if total_rows else 0.0
        logging.info(f"Completeness for column '{col}': {completeness[col]:.2f}%")

    # Overall completeness: share of rows that have no missing value at all
    overall_completeness = len(df.dropna()) / total_rows * 100 if total_rows else 0.0
    logging.info(f"Overall Completeness: {overall_completeness:.2f}%")

    metrics = {
        "total_rows": total_rows,
        "completeness": completeness,
        "overall_completeness": overall_completeness,
        "accuracy": None,
        "overall_accuracy": None,
    }

    if trusted_file:
        # A skipped accuracy check must not discard the completeness results
        accuracy_result = _calculate_accuracy(df, trusted_file)
        if accuracy_result is not None:
            metrics["accuracy"], metrics["overall_accuracy"] = accuracy_result

    logging.info("Data quality check completed.")
    return metrics



def run_data_quality_check(data_file='sales_data.csv', trusted_file='trusted_sales_data.csv'):
    """
    Runs the data quality check and logs the metrics.  This function is called by the scheduler.

    Sample CSV files are written only for paths that do not exist yet, so real
    data already present at those paths is never overwritten.

    Args:
        data_file (str): Path to the CSV file to check. Defaults to 'sales_data.csv'.
        trusted_file (str, optional): Path to the trusted CSV file used for the accuracy
            check. Defaults to 'trusted_sales_data.csv'; pass None to check completeness only.
    """
    # (path, sample content) pairs; the data sample has one missing amount (order 3)
    sample_files = [
        (data_file, "order_id,customer_id,order_date,amount,status\n1,101,2024-01-15,100.0,Shipped\n2,102,2024-01-20,150.0,Shipped\n3,103,2024-02-10,,Pending\n4,104,2024-02-25,200.0,Shipped\n5,105,2024-03-05,250.0,Delivered"),
    ]
    if trusted_file:
        sample_files.append(
            (trusted_file, "order_id,customer_id,order_date,amount,status\n1,101,2024-01-15,100.0,Shipped\n2,102,2024-01-20,150.0,Shipped\n3,103,2024-02-10,120.0,Pending\n4,104,2024-02-25,200.0,Shipped\n5,105,2024-03-05,250.0,Delivered")
        )

    # Create dummy CSV files if they don't exist
    for path, content in sample_files:
        try:
            # Mode 'x' (exclusive creation) raises FileExistsError instead of
            # truncating the file the way mode 'w' would.
            with open(path, 'x') as f:
                f.write(content)
        except FileExistsError:
            # Deliberate: an existing file is the data to be checked, keep it.
            pass

    calculate_quality_metrics(data_file, trusted_file)



if __name__ == "__main__":
    # Optional scheduling, left disabled. Enabling it requires the third-party
    # `schedule` package and an `import schedule` line, which this script does
    # not currently have.
    #
    # Schedule the data quality check to run daily at 00:00
    # schedule.every().day.at("00:00").do(run_data_quality_check)
    # logging.info("Data quality monitoring scheduled to run daily at 00:00.")
    #
    # # Keep the script running to allow the scheduler to do its work
    # while True:
    #     schedule.run_pending()
    #     time.sleep(1)
    run_data_quality_check() # run a single check immediately, then exit
