## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations
### Profile a CSV dataset containing customer information to inspect distribution patterns of 'Age' and 'Income' columns.
- Load the dataset using Great Expectations and create a data context.
- Generate a data asset to inspect the summary statistics.
- View the generated expectation suite to analyze data distributions.

In [1]:
import great_expectations as ge
from great_expectations.data_context import get_context
import os

# Use current project directory
project_dir = os.getcwd()
GE_ROOT_DIR = os.path.join(project_dir, "great_expectations")
CSV_PATH = os.path.join(project_dir, "data", "customer_data.csv")
DATASOURCE_NAME = "customer_data_csv"
DATA_ASSET_NAME = "customer_data"
SUITE_NAME = "customer_data_suite"

# Ensure the Great Expectations project is initialized
if not os.path.exists(GE_ROOT_DIR):
    print("Initializing Great Expectations...")
    os.system("great_expectations init")

# Get context
context = get_context(context_root_dir=GE_ROOT_DIR)

# Add datasource
datasource_config = {
    "name": DATASOURCE_NAME,
    "class_name": "Datasource",
    "execution_engine": {"class_name": "PandasExecutionEngine"},
    "data_connectors": {
        "default_runtime_data_connector_name": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["default_identifier_name"]
        }
    }
}

context.add_or_update_datasource(**datasource_config)

# Create expectation suite
try:
    context.create_expectation_suite(suite_name=SUITE_NAME, overwrite_existing=True)
except Exception:
    print("Suite exists. Loading...")

suite = context.get_expectation_suite(SUITE_NAME)

# Create batch request
batch_request = {
    "datasource_name": DATASOURCE_NAME,
    "data_connector_name": "default_runtime_data_connector_name",
    "data_asset_name": DATA_ASSET_NAME,
    "runtime_parameters": {"path": CSV_PATH},
    "batch_identifiers": {"default_identifier_name": "default"}
}

# Validate
validator = context.get_validator(batch_request=batch_request, expectation_suite=suite)
validator.expect_column_values_to_not_be_null("Age")
validator.expect_column_values_to_be_between("Age", min_value=18, max_value=99)
validator.expect_column_values_to_not_be_null("Income")
validator.expect_column_values_to_be_between("Income", min_value=10000, max_value=1000000)
validator.save_expectation_suite(discard_failed_expectations=False)

# Create and run checkpoint
checkpoint_name = "customer_data_checkpoint"
checkpoint_config = {
    "name": checkpoint_name,
    "config_version": 1.0,
    "class_name": "SimpleCheckpoint",
    "run_name_template": "%Y-%M-%D-run",
    "validations": [
        {
            "batch_request": batch_request,
            "expectation_suite_name": SUITE_NAME,
        }
    ],
}

context.add_or_update_checkpoint(**checkpoint_config)
results = context.run_checkpoint(checkpoint_name=checkpoint_name)

if results["success"]:
    print("✅ Data validation PASSED.")
else:
    print("❌ Data validation FAILED.")

# Optional: Show report path
html_path = os.path.join(GE_ROOT_DIR, "uncommitted", "data_docs", "local_site", "index.html")
print(f"\n📊 View results: file://{html_path}")

Initializing Great Expectations...


sh: 1: great_expectations: not found


ValueError: Either datasource or kwargs are required

2. Writing Validation Rules for Data Ingestion
### Write validation rules for a CSV file to ensure the 'Date' column follows a specific date format.
- Utilize expect_column_values_to_match_regex to enforce date format validation.
- Run the validation and interpret the output.

In [2]:
import os
import pandas as pd
from great_expectations.data_context import get_context
from great_expectations.core.batch import RuntimeBatchRequest

# Setup paths
project_dir = os.getcwd()
ge_root_dir = os.path.join(project_dir, "great_expectations")
csv_path = os.path.join(project_dir, "data", "customer_data.csv")

# Load CSV
df = pd.read_csv(csv_path)

# Get Great Expectations context
context = get_context(context_root_dir=ge_root_dir)

# Add Pandas datasource using Fluent API
datasource = context.sources.add_pandas(name="pandas_datasource")

# Create or load expectation suite
suite_name = "date_validation_suite"
context.add_or_update_expectation_suite(expectation_suite_name=suite_name)

# Create RuntimeBatchRequest
batch_request = RuntimeBatchRequest(
    datasource_name="pandas_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="customer_data",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default"},
)

# Create validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name
)

# Add date format expectation
# This regex matches YYYY-MM-DD format
validator.expect_column_values_to_match_regex(
    column="Date",
    regex=r"^\d{4}-\d{2}-\d{2}$"
)

# Save expectation suite
validator.save_expectation_suite()

# Run checkpoint
results = context.run_checkpoint(
    name="date_validation_checkpoint",
    validations=[{
        "batch_request": batch_request,
        "expectation_suite_name": suite_name
    }]
)

# Check result
if results.success:
    print("✅ Date format validation PASSED.")
else:
    print("❌ Date format validation FAILED.")

# Optional: Path to view report in browser
html_path = os.path.join(ge_root_dir, "uncommitted", "data_docs", "local_site", "index.html")
print(f"\n📊 View report: file://{html_path}")

FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/AI_DATA_ANALYSIS_/src/Module 10/Automating Data Quality Checks in Data Pipelines/data/customer_data.csv'