In [2]:
import pandas as pd
import great_expectations as gx
from great_expectations.core.batch import BatchRequest # Corrected import for BatchRequest
# For older style Datasource, if needed:
# from great_expectations.datasource.fluent import Datasource
# from great_expectations.execution_engine import PandasExecutionEngine
# from great_expectations.core.batch import BatchKwargsGenerator # Path might vary based on GX version

# --- 1. Sample Pandas DataFrame ---
# This is the data we want to validate.
# It includes a numeric column with some missing (None) values.
data = {'id': [1, 2, 3, 4, 5],
        'numeric_column': [10.5, 20.0, None, 35.2, None],
        'category_column': ['A', 'B', 'A', 'C', 'B']}
df = pd.DataFrame(data)
print("--- Sample DataFrame Created ---")
print(df.head())
print("-" * 30)

# --- 2. Great Expectations DataContext ---
# The DataContext is the primary entry point for the Great Expectations API.
# It manages configurations, Datasources, Expectation Suites, and validation results.
# Using try-except for robust context loading/creation.
try:
    context = gx.get_context()
    print("Great Expectations DataContext loaded successfully.")
except Exception as e:
    print(f"Could not load DataContext: {e}")
    print("Attempting to create a new DataContext (ephemeral, in-memory by default if no project exists).")
    # For a persistent setup, you would typically initialize a project first:
    # `great_expectations init` in your terminal
    # Then load with: `context = gx.get_context(project_root_dir='path/to/your/great_expectations')`
    context = gx.data_context.DataContext() # Creates an ephemeral context if no GX project is found
    print("Ephemeral DataContext created.")
print("-" * 30)

# --- 3. Define Datasource and Add DataAsset ---
# A Datasource tells Great Expectations where your data lives.
# An Asset is a specific table, file, or DataFrame within that Datasource.
# For modern Great Expectations (Fluent Datasources API, typically GX 0.13.x and later):
datasource_name = "pandas_datasource_example"
data_asset_name = "my_pandas_dataframe_asset"

try:
    # Try to get the datasource if it already exists
    datasource = context.get_datasource(datasource_name)
    print(f"Datasource '{datasource_name}' already exists and was retrieved.")
except gx.exceptions.DatasourceNotFoundError:
    # If it doesn't exist, add a new Pandas Datasource
    datasource = context.sources.add_pandas(name=datasource_name)
    print(f"Fluent Pandas Datasource '{datasource_name}' added.")
except Exception as e:
    print(f"An unexpected error occurred while trying to get or add datasource '{datasource_name}': {e}")
    # As a fallback, re-assign context if it was an ephemeral one and got lost.
    if not isinstance(context, gx.data_context.DataContext):
         context = gx.data_context.DataContext()
    datasource = context.sources.add_pandas(name=datasource_name) # Try adding again
    print(f"Fluent Pandas Datasource '{datasource_name}' added after fallback.")


# Add a DataFrame Asset to the Datasource
# This makes the in-memory DataFrame `df` available to Great Expectations.
try:
    data_asset = datasource.add_dataframe_asset(name=data_asset_name, dataframe=df)
    print(f"DataFrame asset '{data_asset_name}' added/updated in datasource '{datasource_name}'.")
except Exception as e:
    print(f"Error adding DataFrame asset '{data_asset_name}': {e}")
    print("Attempting to retrieve the asset if it already exists.")
    try:
        data_asset = datasource.get_asset(data_asset_name)
        # If we get an existing asset, we might want to ensure it's configured
        # with the *current* df. Building a batch request with batch_spec_passthrough
        # (as done later) is a robust way to handle this for in-memory data.
        print(f"DataFrame asset '{data_asset_name}' retrieved.")
    except Exception as e_asset:
        print(f"Could not get or reconfigure DataAsset '{data_asset_name}': {e_asset}")
        # If asset creation/retrieval fails, the batch_request might still work if datasource_name
        # and data_asset_name are correctly specified and batch_spec_passthrough is used.
print("-" * 30)

# --- 4. Build a BatchRequest ---
# A BatchRequest specifies the data you want to validate.
# `batch_spec_passthrough` is particularly useful for in-memory DataFrames,
# ensuring the exact DataFrame instance is used.
try:
    batch_request = data_asset.build_batch_request()
    # Or, for more explicit control with in-memory data, especially if asset setup was tricky:
    # batch_request = BatchRequest(
    #     datasource_name=datasource_name,
    #     data_asset_name=data_asset_name,
    #     options={}, # No specific partitioning options needed for a single in-memory df
    #     batch_spec_passthrough={"dataframe": df} # Directly pass the DataFrame
    # )
    print("BatchRequest built successfully.")
    # print(f"Batch Request details: {batch_request}")
except Exception as e:
    print(f"Error building BatchRequest: {e}")
    print("Falling back to manual BatchRequest creation with batch_spec_passthrough.")
    batch_request = BatchRequest(
        datasource_name=datasource_name,
        data_asset_name=data_asset_name, # This asset name should match what was attempted above
        options={},
        batch_spec_passthrough={"dataframe": df}
    )
    print("Manual BatchRequest created.")
print("-" * 30)

# --- 5. Create or Load an Expectation Suite ---
# An Expectation Suite is a collection of Expectations (data quality rules).
expectation_suite_name = "check_numeric_column_quality_suite"
try:
    suite = context.get_expectation_suite(expectation_suite_name)
    print(f"Found existing Expectation Suite: '{expectation_suite_name}'")
except gx.exceptions.ExpectationSuiteNotFoundError:
    suite = context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    print(f"Created new Expectation Suite: '{expectation_suite_name}'")
except Exception as e:
    print(f"Error getting or creating Expectation Suite: {e}")
    # Fallback: use add_or_update_expectation_suite for robustness
    suite = context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f"Ensured Expectation Suite exists using add_or_update: '{expectation_suite_name}'")
print("-" * 30)

# --- 6. Get a Validator ---
# A Validator is used to evaluate Expectations against a batch of data.
validator = None # Initialize validator to None
try:
    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=expectation_suite_name,
    )
    print("Validator obtained successfully.")
    print("First 3 rows of data being validated by the Validator:")
    # validator.head() returns a dictionary, access the 'data' key for the DataFrame
    print(validator.head(n_rows=3).get('data', pd.DataFrame()).head(3))
except Exception as e:
    print(f"Error obtaining validator: {e}")
    print("Ensure your BatchRequest is correctly configured and the DataAsset is accessible.")
print("-" * 30)

# --- 7. Add Expectations to the Suite (via Validator) ---
if validator:
    # Expectation 1: Column values should not be null
    # This expectation will FAIL for 'numeric_column' because it has None values.
    print(f"Adding Expectation: 'numeric_column' values should not be null.")
    validator.expect_column_values_to_not_be_null(column="numeric_column")

    # Expectation 2: Column values should be of a certain type
    # This expectation should PASS for 'id' if it's integer-like.
    print(f"Adding Expectation: 'id' column values should be of type 'int' (or compatible).")
    validator.expect_column_values_to_be_of_type(column="id", type_="int") # Pandas dtype for int is often 'int64'

    # Expectation 3: Column values should be in a given set
    # This expectation should PASS for 'category_column'.
    print(f"Adding Expectation: 'category_column' values should be in set ['A', 'B', 'C', 'D'].")
    validator.expect_column_values_to_be_in_set(column="category_column", value_set=['A', 'B', 'C', 'D'])
    print("-" * 30)

    # --- 8. Run Validation ---
    # The validate() method executes all Expectations in the suite against the data batch.
    print("Running validation...")
    validation_result = validator.validate()
    print("Validation complete.")
    print("-" * 30)

    # --- 9. Review Validation Results ---
    print("\n--- Validation Result Summary ---")
    if validation_result["success"]:
        print("Overall Validation SUCCEEDED!")
    else:
        print("Overall Validation FAILED!")

    print(f"\nNumber of expectations evaluated: {len(validation_result['results'])}")
    print(f"Successful expectations: {validation_result['statistics']['successful_expectations']}")
    print(f"Unsuccessful expectations: {validation_result['statistics']['unsuccessful_expectations']}")
    print(f"Success percent: {validation_result['statistics']['success_percent']:.2f}%")

    print("\n--- Detailed Results for Each Expectation ---")
    for result in validation_result["results"]:
        print(f"  Expectation Type: {result['expectation_config']['expectation_type']}")
        print(f"    Column: {result['expectation_config']['kwargs'].get('column', 'N/A')}")
        print(f"    Success: {result['success']}")
        if not result['success']:
            print(f"    Observed Value: {result['result'].get('observed_value', 'N/A')}")
            print(f"    Unexpected Count: {result['result'].get('unexpected_count', 'N/A')}")
            print(f"    Unexpected Percent: {result['result'].get('unexpected_percent', 'N/A')}")
            # Limit the display of partial_unexpected_list for brevity
            partial_list = result['result'].get('partial_unexpected_list', [])
            print(f"    Partial Unexpected List (first 5): {partial_list[:5]}")
        print("-" * 20) # Separator for each expectation result
    print("-" * 30)

    # --- 10. Save the Expectation Suite (Optional but Recommended) ---
    # This saves the suite (with all added expectations) to your context's configured store
    # (usually as a JSON file in the `great_expectations/expectations/` directory if you have a project).
    try:
        validator.save_expectation_suite()
        print(f"Expectation Suite '{expectation_suite_name}' saved successfully.")
    except Exception as e:
        print(f"Error saving Expectation Suite '{expectation_suite_name}': {e}")

else:
    print("Validator could not be initialized. Skipping addition of expectations, validation, and saving of suite.")

print("-" * 30)
print("Script finished.")

# --- To view Data Docs (if you have a GX project initialized and build them) ---
# After running, if you have a GX project, you can build Data Docs:
# `great_expectations docs build`
# And then open `great_expectations/uncommitted/data_docs/local_site/index.html` in your browser.


--- Sample DataFrame Created ---
   id  numeric_column category_column
0   1            10.5               A
1   2            20.0               B
2   3             NaN               A
3   4            35.2               C
4   5             NaN               B
------------------------------
Great Expectations DataContext loaded successfully.
------------------------------
An unexpected error occurred while trying to get or add datasource 'pandas_datasource_example': "Could not find a datasource named 'pandas_datasource_example'"




AttributeError: module 'great_expectations.data_context' has no attribute 'DataContext'