# In [2]:
# ============================================================
# DataGuard — Automated Data Quality & Monitoring Tool (Colab ready)
# ============================================================

import os
import csv
import json
import random
from datetime import datetime
import pandas as pd

# Settings: file locations and the limits applied by the quality checks
INPUT_CSV = "data/input_data.csv"
REPORT_JSON = "data/quality_report.json"
THRESHOLDS = dict(
    missing_value_ratio=0.1,
    duplicate_record_ratio=0.05,
    outlier_zscore=3.0,
)

# Make sure the parent folder of each file is present
os.makedirs(os.path.split(INPUT_CSV)[0], exist_ok=True)
os.makedirs(os.path.split(REPORT_JSON)[0], exist_ok=True)

# When no input file is present yet, synthesise a small sample dataset
if not os.path.exists(INPUT_CSV):
    # 100 rows, built column by column: ids 1..100, Gaussian values
    # (mean 50, sd 10) and a random status letter
    df = pd.DataFrame({"id": range(1, 101)})
    df["value"] = [random.gauss(50, 10) for _ in range(100)]
    df["status"] = [random.choice(["A", "B", "C", "D"]) for _ in range(100)]
    # blank out "value" in 10 randomly chosen rows
    df.loc[random.sample(range(100), 10), "value"] = None
    # append copies of 5 randomly chosen rows so the file contains duplicates
    rows_to_duplicate = df.iloc[random.sample(range(100), 5)]
    df = pd.concat([df, rows_to_duplicate], ignore_index=True)
    df.to_csv(INPUT_CSV, index=False)

def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame

def check_missing(df):
    """Return the fraction of cells in ``df`` that are missing.

    Args:
        df: DataFrame to inspect.

    Returns:
        A plain Python float: the number of cells flagged by ``isna()``
        divided by the total cell count (rows x columns). Returns 0.0 when
        the frame has no cells at all.
    """
    total = df.shape[0] * df.shape[1]
    if total == 0:
        # A frame without cells has nothing missing; this also avoids the
        # 0/0 division that would yield NaN or raise ZeroDivisionError.
        return 0.0
    missing = df.isna().sum().sum()
    # float() strips the NumPy scalar type so callers get a builtin float.
    return float(missing / total)

def check_duplicates(df):
    """Return the fraction of rows in ``df`` that repeat an earlier row.

    Args:
        df: DataFrame to inspect.

    Returns:
        A plain Python float: the number of rows flagged by
        ``df.duplicated()`` (every repeat after the first occurrence)
        divided by the row count. Returns 0.0 when the frame has no rows.
    """
    total = df.shape[0]
    if total == 0:
        # No rows means no duplicates; this also avoids dividing by zero.
        return 0.0
    dup = df.duplicated().sum()
    # float() strips the NumPy scalar type so callers get a builtin float.
    return float(dup / total)

def check_outliers(df, column, z_thresh):
    """Return the share of non-missing values in ``column`` that are outliers.

    A value counts as an outlier when its absolute distance from the column
    mean exceeds ``z_thresh`` times the column's standard deviation (as
    computed by ``Series.std()``). Missing values are dropped first.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to examine.
        z_thresh: Multiplier applied to the standard deviation.

    Returns:
        Outlier count divided by the number of non-missing values, or 0 when
        the column has no non-missing values.
    """
    values = df[column].dropna()
    center, spread = values.mean(), values.std()
    distance = (values - center).abs()
    flagged = values[distance > z_thresh * spread]
    if len(values) > 0:
        return len(flagged) / len(values)
    return 0

def run_quality_checks(path):
    """Run all quality checks on the CSV at ``path`` and write a JSON report.

    The report holds a UTC timestamp, the missing-value ratio, the
    duplicate-record ratio, the outlier ratio of the "value" column and an
    overall status. The status is "FAIL" when any ratio exceeds its limit
    and "PASS" otherwise. The report is written to ``REPORT_JSON``.

    Args:
        path: Location of the CSV file to check.

    Returns:
        The report as a dict (the same content that is written to disk).
    """
    # Function-scope import: the file-level import only brings in `datetime`.
    from datetime import timezone

    df = load_data(path)

    # datetime.utcnow() is deprecated. Take an aware UTC "now" and drop the
    # tzinfo so the string keeps the same naive ISO-8601 format as before.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # float() turns NumPy scalars into builtin floats so the report prints
    # and serialises as plain numbers.
    report = {
        "timestamp": timestamp,
        "missing_value_ratio": float(check_missing(df)),
        "duplicate_record_ratio": float(check_duplicates(df)),
        "outlier_ratio_value": float(
            check_outliers(df, "value", THRESHOLDS["outlier_zscore"])
        ),
    }

    # The outlier-ratio limit can be set through an optional "outlier_ratio"
    # entry in THRESHOLDS; without it the previous fixed limit of 0.10 applies.
    max_outlier_ratio = THRESHOLDS.get("outlier_ratio", 0.10)
    failed = (
        report["missing_value_ratio"] > THRESHOLDS["missing_value_ratio"]
        or report["duplicate_record_ratio"] > THRESHOLDS["duplicate_record_ratio"]
        or report["outlier_ratio_value"] > max_outlier_ratio
    )
    report["status"] = "FAIL" if failed else "PASS"

    with open(REPORT_JSON, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    return report

# Entry point: run the checks on the configured input file and print the
# resulting report dict.
if __name__ == "__main__":
    report = run_quality_checks(INPUT_CSV)
    print("Quality Report:", report)


# Output: Quality Report: {'timestamp': '2025-10-19T09:57:25.108559', 'missing_value_ratio': np.float64(0.031746031746031744), 'duplicate_record_ratio': np.float64(0.047619047619047616), 'outlier_ratio_value': 0.010526315789473684, 'status': 'PASS'}


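# Source line echoed below the output (presumably by a DeprecationWarning for datetime.utcnow(); confirm):
#   report["timestamp"] = datetime.utcnow().isoformat()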
