## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use schedule library to periodically run a script.
2. Script to Calculate Metrics:
    - For simplicity, use a function calculate_quality_metrics() that calculates and logs metrics such as missing rate or mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [1]:
# Write your code from here
! pip install schedule


Defaulting to user installation because normal site-packages is not writeable
Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import pandas as pd

# Sample data to write to CSV
data = {
    'transaction_id': [1, 2, 3, 4, 5],
    'amount': [100.0, 200.5, None, 400.0, 500.5],
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('transaction_data.csv', index=False)
print("CSV file 'transaction_data.csv' created successfully.")
# Write your code from hereimport pandas as pd
from datetime import datetime
import unittest

# Function to calculate accuracy
def calculate_accuracy(df, correct_values_dict):
    """
    Calculate the accuracy of specific fields in a DataFrame based on correct values.
    """
    if df is None or df.empty:
        raise ValueError("The DataFrame is empty or None.")
    
    correct_count = 0
    total_count = 0
    for column, correct_value in correct_values_dict.items():
        if column in df.columns:
            correct_count += (df[column] == correct_value).sum()
            total_count += df[column].notnull().sum()
    
    if total_count == 0:
        raise ValueError("No valid (non-null) records found to calculate accuracy.")
    
    accuracy = correct_count / total_count
    return accuracy

# Function to calculate completeness
def calculate_completeness(df):
    """
    Calculate the completeness of the DataFrame (percentage of non-null values).
    """
    if df is None or df.empty:
        raise ValueError("The DataFrame is empty or None.")
    
    completeness = df.notnull().mean().mean()  # Proportion of non-null entries across the entire DataFrame
    return completeness

# Function to log metrics over time (batching log entries)
def batch_log_metrics(log_file, metrics_batch):
    """
    Batch log metrics to a CSV file after reaching a certain batch size.
    """
    if len(metrics_batch) >= 10:  # Example: Batch size of 10
        metrics_df = pd.DataFrame(metrics_batch)
        metrics_df.to_csv(log_file, mode='a', header=False, index=False)
        metrics_batch.clear()  # Clear the batch after writing

# Function to run the data quality monitoring task
def run_monitoring_task(df, correct_values, log_file, metrics_batch):
    """
    Run the monitoring task, calculating metrics and logging them.
    """
    try:
        accuracy = calculate_accuracy(df, correct_values)
        completeness = calculate_completeness(df)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        metrics_batch.append({'timestamp': timestamp, 'accuracy': accuracy, 'completeness': completeness})
        batch_log_metrics(log_file, metrics_batch)
        
        print(f"Logged data quality metrics: Accuracy = {accuracy}, Completeness = {completeness}")
    except Exception as e:
        print(f"Error in monitoring task: {e}")

# Unit tests for the functions
class TestDataQualityMetrics(unittest.TestCase):
    
    def test_calculate_accuracy(self):
        df = pd.DataFrame({
            'age': [25, 30, 35],
            'city': ['New York', 'New York', 'Chicago']
        })
        correct_values = {'age': 30, 'city': 'New York'}
        result = calculate_accuracy(df, correct_values)
        self.assertEqual(result, 1/3)  # Only 1 out of 3 is correct for 'age'

    def test_calculate_completeness(self):
        df = pd.DataFrame({
            'age': [25, None, 35],
            'city': ['New York', 'Los Angeles', 'Chicago']
        })
        result = calculate_completeness(df)
        self.assertEqual(result, 0.75)  # 75% of the values are non-null

    def test_empty_dataframe(self):
        df = pd.DataFrame()
        correct_values = {'age': 30, 'city': 'New York'}
        with self.assertRaises(ValueError):
            calculate_accuracy(df, correct_values)
        with self.assertRaises(ValueError):
            calculate_completeness(df)

# Example usage:
if __name__ == '__main__':
    # Sample DataFrame (you can replace this with actual data)
    data = {
        'name': ['Alice', 'Bob', 'Charlie', None],
        'age': [25, 30, 35, 40],
        'city': ['New York', 'Los Angeles', None, 'Chicago']
    }

    df = pd.DataFrame(data)

    # Correct values for accuracy check (you can define the correct values as per your requirement)
    correct_values = {
        'age': 30,  # Example: correct age is 30
        'city': 'New York'  # Example: correct city is 'New York'
    }

    # Log file location
    log_file = 'data_quality_metrics.csv'

    # Initialize an empty list for batch logging
    metrics_batch = []

    # Run monitoring task
    run_monitoring_task(df, correct_values, log_file, metrics_batch)

    # Run unit tests
    unittest.main(argv=[''], exit=False)  # To run the unit tests in the notebook or script

FF.
FAIL: test_calculate_accuracy (__main__.TestDataQualityMetrics)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_362/2257177950.py", line 89, in test_calculate_accuracy
    self.assertEqual(result, 1/3)  # Only 1 out of 3 is correct for 'age'
AssertionError: 0.5 != 0.3333333333333333

FAIL: test_calculate_completeness (__main__.TestDataQualityMetrics)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_362/2257177950.py", line 97, in test_calculate_completeness
    self.assertEqual(result, 0.75)  # 75% of the values are non-null
AssertionError: 0.8333333333333333 != 0.75

----------------------------------------------------------------------
Ran 3 tests in 0.005s

FAILED (failures=2)


CSV file 'transaction_data.csv' created successfully.
Logged data quality metrics: Accuracy = 0.2857142857142857, Completeness = 0.8333333333333334
