## Defining Data Quality SLAs
### Data Completeness
**Description**: Set an SLA that ensures that 95% of data fields in your dataset are filled (non-null values). Practice by checking a dataset of your choice and calculate its completeness.

In [1]:
import os
import pandas as pd
from great_expectations.data_context import DataContext
from great_expectations.core.batch import RuntimeBatchRequest

# Step 0: Create sample CSV data folder and file
os.makedirs("data", exist_ok=True)
sample_csv_path = "data/sample_data.csv"

sample_csv_content = """Name,Age,Income
Alice,30,70000
Bob,25,50000
Charlie,,60000
David,40,
Eve,35,55000
"""

with open(sample_csv_path, "w") as f:
    f.write(sample_csv_content)

# Step 1: Load dataset
df = pd.read_csv(sample_csv_path)

# Step 2: Initialize GE DataContext (FileDataContext)
project_dir = os.getcwd()
ge_root_dir = os.path.join(project_dir, "great_expectations")
context = DataContext(ge_root_dir)

# Step 3: Add pandas datasource via config dict (if not already added)
datasource_name = "pandas_datasource"
if datasource_name not in [ds["name"] for ds in context.list_datasources()]:
    datasource_config = {
        "name": datasource_name,
        "class_name": "Datasource",
        "execution_engine": {
            "class_name": "PandasExecutionEngine"
        },
        "data_connectors": {
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            }
        }
    }
    context.add_datasource(**datasource_config)

# Step 4: Create expectation suite if not exists
suite_name = "completeness_sla_suite"
if suite_name not in [suite.expectation_suite_name for suite in context.list_expectation_suites()]:
    context.create_expectation_suite(suite_name, overwrite_existing=True)

# Step 5: Create RuntimeBatchRequest
batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="sample_data",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default"},
)

# Step 6: Get validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Step 7: Add completeness expectations (mostly=0.95)
for col in df.columns:
    validator.expect_column_values_to_not_be_null(column=col, mostly=0.95)

# Step 8: Save expectation suite
validator.save_expectation_suite()

# Step 9: Run validation
results = validator.validate()

# Step 10: Print completeness per column and SLA result
print("\nCompleteness SLA Results:")
for result in results.results:
    col = result.expectation_config.kwargs.get("column")
    success = result.success
    unexpected_percent = result.result.get("unexpected_percent", 0)
    completeness = 100 - unexpected_percent
    print(f"- Column '{col}': Completeness = {completeness:.2f}%, Meets SLA (≥95%)? {'YES' if success else 'NO'}")

print("\nOverall SLA Status:", "PASSED ✅" if results.success else "FAILED ❌")

ImportError: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)

### Data Timeliness:
**Description**: Establish an SLA that specifies that data should be integrated and processed within 24 hours of acquisition. Monitor the data pipeline for timeliness.

In [2]:
import pandas as pd
from datetime import datetime, timedelta
from great_expectations.data_context import DataContext
from great_expectations.core.batch import RuntimeBatchRequest

# Sample data with 'acquisition_time' column (ISO format datetime strings)
data = {
    "id": [1, 2, 3],
    "acquisition_time": [
        "2025-05-21T10:00:00",  # 1 day ago (assuming today is 2025-05-22)
        "2025-05-22T06:00:00",  # 4 hours ago
        "2025-05-22T12:00:00",  # just now
    ],
    "value": [100, 200, 150],
}

df = pd.DataFrame(data)
df["acquisition_time"] = pd.to_datetime(df["acquisition_time"])

# SLA threshold
SLA_HOURS = 24

# Calculate delay in hours for each record
now = pd.Timestamp.now()
df["processing_delay_hours"] = (now - df["acquisition_time"]).dt.total_seconds() / 3600

# Check SLA condition
sla_pass = (df["processing_delay_hours"] <= SLA_HOURS).all()

print(f"Max processing delay (hours): {df['processing_delay_hours'].max():.2f}")
print(f"Data Timeliness SLA met? {'YES ✅' if sla_pass else 'NO ❌'}")

# Optional: integrate with Great Expectations to flag records violating SLA

from great_expectations.data_context import DataContext

# Initialize GE DataContext (adjust your GE root dir)
context = DataContext("./great_expectations")

suite_name = "timeliness_sla_suite"

if suite_name not in [suite.expectation_suite_name for suite in context.list_expectation_suites()]:
    context.create_expectation_suite(suite_name, overwrite_existing=True)

batch_request = RuntimeBatchRequest(
    datasource_name="pandas_datasource",  # Your datasource name here
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="timeliness_data",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default"},
)

validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Expect processing_delay_hours to be <= SLA_HOURS
validator.expect_column_values_to_be_between(
    column="processing_delay_hours", max_value=SLA_HOURS
)

validator.save_expectation_suite()
results = validator.validate()

print(f"Great Expectations validation success: {results.success}")

ImportError: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)

### Data Consistency:
**Description**: Define an SLA for maintaining consistency across various related datasets. Implement a check to ensure that 99% of data entries are consistent.

In [3]:
import pandas as pd
from great_expectations.data_context import DataContext
from great_expectations.core.batch import RuntimeBatchRequest

# Sample datasets
orders = pd.DataFrame({
    "order_id": [1, 2, 3, 4, 5],
    "customer_id": [101, 102, 103, 999, 105],  # 999 is invalid (not in customers)
    "amount": [250, 125, 300, 100, 450]
})

customers = pd.DataFrame({
    "customer_id": [101, 102, 103, 104, 105],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve"]
})

# Step 1: Check consistency
valid_customer_ids = set(customers["customer_id"])
orders["is_customer_valid"] = orders["customer_id"].apply(lambda x: x in valid_customer_ids)

# Calculate consistency percentage
consistency_pct = orders["is_customer_valid"].mean() * 100
print(f"Data consistency: {consistency_pct:.2f}%")

# SLA threshold
SLA_THRESHOLD = 99.0

print(f"Consistency SLA met? {'YES ✅' if consistency_pct >= SLA_THRESHOLD else 'NO ❌'}")

# Step 2: Use Great Expectations to enforce this SLA
# Initialize GE DataContext (adjust your path accordingly)
context = DataContext("./great_expectations")

suite_name = "consistency_sla_suite"

if suite_name not in [suite.expectation_suite_name for suite in context.list_expectation_suites()]:
    context.create_expectation_suite(suite_name, overwrite_existing=True)

# Create RuntimeBatchRequest with orders dataframe
batch_request = RuntimeBatchRequest(
    datasource_name="pandas_datasource",  # Adjust to your datasource name
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="orders_data",
    runtime_parameters={"batch_data": orders},
    batch_identifiers={"default_identifier_name": "default"},
)

validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Validate that 'is_customer_valid' column is mostly True (>= 0.99)
validator.expect_column_values_to_be_in_set(
    column="is_customer_valid",
    value_set=[True],
    mostly=0.99
)

validator.save_expectation_suite()

results = validator.validate()

print(f"Great Expectations validation success: {results.success}")

ImportError: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)