## Automate Data Quality Checks with Great Expectations
**Introduction**: In this activity, you will learn how to automate data quality checks using the Great Expectations framework. This includes setting up expectations and generating validation reports.

### Task 1: Setup and Initial Expectations

1. Objective: Set up Great Expectations and create initial expectations for a dataset.
2. Steps:
    - Install Great Expectations using pip.
    - Initialize a data context.
    - Create basic expectations on a sample dataset.
    - Eg., Implement a basic setup and expectation for column presence and type.

In [2]:
import great_expectations as ge
import pandas as pd
import os

def create_sample_data(file_path):
    data = {
        "CustomerID": [1, 2, 3, 4, 4],  # Note: duplicate CustomerID 4 for uniqueness test
        "Name": ["Alice", "Bob", "Charlie", "David", "David"],
        "Email": ["alice@example.com", "bob@example.com", "charlie@example.com", "david@example.com", None],
        "Age": [25, 30, 35, 40, None]
    }
    df = pd.DataFrame(data)
    df.to_csv(file_path, index=False)
    print(f"Sample data saved to {file_path}")

def setup_ge_context_and_suite(data_path):
    # Initialize GE context
    context = ge.get_context()

    # Load CSV using pandas
    df = pd.read_csv(data_path)

    # Add datasource if not exists
    datasources = context.list_datasources()
    if not any(ds["name"] == "my_pandas_datasource" for ds in datasources):
        context.add_datasource(
            name="my_pandas_datasource",
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors={
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"],
                }
            },
        )

    suite_name = "advanced_expectations_suite"
    try:
        context.delete_expectation_suite(suite_name)
    except Exception:
        pass

    batch_request = {
        "datasource_name": "my_pandas_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "runtime_data_asset",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"default_identifier_name": "default"},
    }

    # Create expectation suite
    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=suite_name,
        create_expectation_suite=True,
    )

    # Define expectations

    # Basic Completeness Expectations
    validator.expect_column_to_exist("CustomerID")
    validator.expect_column_values_to_not_be_null("CustomerID")
    validator.expect_column_to_exist("Name")
    validator.expect_column_values_to_not_be_null("Name")

    # Email format validity and completeness
    validator.expect_column_to_exist("Email")
    validator.expect_column_values_to_not_be_null("Email")
    validator.expect_column_values_to_match_regex("Email", r"[^@]+@[^@]+\.[^@]+")

    # Age range check
    validator.expect_column_to_exist("Age")
    validator.expect_column_values_to_be_between("Age", min_value=0, max_value=120, mostly=0.8)

    # Advanced expectation: CustomerID should be unique
    validator.expect_column_values_to_be_unique("CustomerID")

    validator.save_expectation_suite(discard_failed_expectations=False)

    print(f"Expectation suite '{suite_name}' created and saved.")
    return context, validator, suite_name

def validate_and_generate_report(context, validator, suite_name):
    results = validator.validate()
    print("\nValidation Results Summary:")
    print(results)

    # Build data docs and open report
    context.build_data_docs()
    context.open_data_docs()

def main():
    data_path = "sample_data.csv"
    create_sample_data(data_path)

    context, validator, suite_name = setup_ge_context_and_suite(data_path)

    validate_and_generate_report(context, validator, suite_name)

if __name__ == "__main__":
    main()

Sample data saved to sample_data.csv


DataContextError: Datasource is not a FluentDatasource

### Task 2: Validate Datasets and Generate Reports

1. Objective: Validate a dataset against defined expectations and generate a report.
2. Steps:
    - Execute the validation process on the dataset.
    - Review the validation results and generate a report.
    - Eg., Validate completeness and consistency expectations, and view the results.


In [6]:
!python validate_and_report.py
import great_expectations as ge
import pandas as pd
import os

def create_sample_data(file_path):
    data = {
        "ID": [1, 2, 3, 4, 5],
        "Name": ["Alice", "Bob", "Charlie", None, "Eve"],
        "Email": ["alice@example.com", "bob@example", "charlie@example.com", "david@example.com", None],
        "Age": [25, 30, 35, 40, 28]
    }
    df = pd.DataFrame(data)
    df.to_csv(file_path, index=False)
    print(f"Sample data saved to {file_path}")

def setup_ge_context_and_suite(data_path):
    # Initialize GE context
    context = ge.get_context()

    # Load data with pandas
    df = pd.read_csv(data_path)

    # Add pandas datasource if not exists
    datasources = context.list_datasources()
    if not any(ds["name"] == "pandas_datasource" for ds in datasources):
        context.add_datasource(
            name="pandas_datasource",
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors={
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"],
                }
            },
        )

    suite_name = "completeness_consistency_suite"
    # Delete if exists
    try:
        context.delete_expectation_suite(suite_name)
    except Exception:
        pass

    batch_request = {
        "datasource_name": "pandas_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "runtime_data_asset",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"default_identifier_name": "default"},
    }

    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=suite_name,
        create_expectation_suite=True,
    )

    # Define Completeness Expectations (non-null)
    validator.expect_column_values_to_not_be_null("Name")
    validator.expect_column_values_to_not_be_null("Email")

    # Define Consistency Expectation - Email format (basic regex)
    validator.expect_column_values_to_match_regex("Email", r"[^@]+@[^@]+\.[^@]+")

    validator.save_expectation_suite()
    print(f"Expectation suite '{suite_name}' created.")

    return context, validator, suite_name

def validate_and_generate_report(context, validator, suite_name):
    # Validate the data
    results = validator.validate()
    print("\nValidation Results Summary:")
    print(results)

    # Build and open data docs (HTML report)
    context.build_data_docs()
    context.open_data_docs()

def main():
    data_path = "sample_data.csv"
    create_sample_data(data_path)

    context, validator, suite_name = setup_ge_context_and_suite(data_path)

    validate_and_generate_report(context, validator, suite_name)

if __name__ == "__main__":
    main()

python: can't open file '/workspaces/AI_DATA_ANALYSIS_/src/Module 8/Hands-on - Data Quality Scoring & Automation/validate_and_report.py': [Errno 2] No such file or directory
Sample data saved to sample_data.csv


DataContextError: Datasource is not a FluentDatasource

### Task 3: Advanced Expectations and Scheduling

1. Objective: Create advanced expectations for conditional checks and automate the validation.
2. Steps:
    - Define advanced expectations based on complex conditions.
    - Use scheduling tools to automate periodic checks.
    - E.g., an expectation that customer IDs must be unique and schedule a daily check.

In [9]:
import pandas as pd
import great_expectations as ge

def create_sample_data(file_path):
    data = {
        "CustomerID": [101, 102, 103, 104, 102],  # Duplicate CustomerID to test uniqueness
        "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
        "Email": ["alice@example.com", "bob@example.com", None, "david@example.com", "eve@example.com"],
        "Age": [25, 30, 17, 40, 22]  # 17 is underage with Email, to trigger manual check
    }
    df = pd.DataFrame(data)
    df.to_csv(file_path, index=False)
    print(f"Sample data saved to {file_path}")
    return df

def create_validator_from_df(df):
    # Create Great Expectations validator directly from pandas DataFrame
    validator = ge.from_pandas(df)
    return validator

def add_expectations(validator):
    # Expect CustomerID to be unique
    validator.expect_column_values_to_be_unique("CustomerID")

    # Expect no nulls in CustomerID and Name
    validator.expect_column_values_to_not_be_null("CustomerID")
    validator.expect_column_values_to_not_be_null("Name")

    # Expect Email column to contain '@' (validity check)
    validator.expect_column_values_to_match_regex("Email", r".+@.+\..+")

    # Manually check conditional expectation: If Email is not null, Age must be > 18
    invalid_rows = validator.df[(validator.df["Email"].notna()) & (validator.df["Age"] <= 18)]
    if not invalid_rows.empty:
        print("\nWarning: Rows where Email is present but Age <= 18 (failed conditional check):")
        print(invalid_rows)
    else:
        print("\nConditional check passed: All Email holders are older than 18.")

    return validator

def run_validation(validator):
    # Run validation and get results
    results = validator.validate()
    print("\nValidation Results Summary:")
    print(f"Success: {results['success']}")
    for res in results['results']:
        exp_type = res['expectation_config']['expectation_type']
        success = res['success']
        print(f" - Expectation {exp_type} passed: {success}")

def main():
    data_path = "sample_data.csv"
    df = create_sample_data(data_path)
    validator = create_validator_from_df(df)
    validator = add_expectations(validator)
    run_validation(validator)

if __name__ == "__main__":
    main()

Sample data saved to sample_data.csv


AttributeError: module 'great_expectations' has no attribute 'from_pandas'