### Task 1: Validate Data with a Custom Expectation in Great Expectations
**Description**: Create a custom expectation and validate data with Great Expectations.

**Load a sample DataFrame**

```python
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}
```

In [2]:
import pandas as pd
import great_expectations as ge
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.data_context import BaseDataContext

# Sample data: five rows; 'income' contains one missing value (None).
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}
df = pd.DataFrame(data)

# --- Setup minimal in-memory DataContext ---
# Declares a single datasource ("my_pandas_datasource") that names a
# PandasExecutionEngine and a RuntimeDataConnector, plus three stores that
# all name InMemoryStoreBackend. No checkpoint store is configured.
# NOTE(review): each store entry names a backend class directly under
# "class_name" — confirm the installed Great Expectations release accepts
# this shape for a store definition.
data_context_config = DataContextConfig(
    datasources={
        "my_pandas_datasource": {
            "class_name": "Datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "default_runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    # Must match the key used in batch_identifiers of the
                    # RuntimeBatchRequest built further down.
                    "batch_identifiers": ["default_identifier_name"],
                }
            },
        }
    },
    stores={
        "expectations_store": {
            "class_name": "InMemoryStoreBackend"
        },
        "validations_store": {
            "class_name": "InMemoryStoreBackend"
        },
        "evaluation_parameter_store": {
            "class_name": "InMemoryStoreBackend"
        }
    },
    expectations_store_name="expectations_store",
    validations_store_name="validations_store",
    evaluation_parameter_store_name="evaluation_parameter_store",
    checkpoint_store_name=None,
)

# NOTE(review): the recorded run of this cell stopped with
# "ImportError: cannot import name 'BaseDataContext'", so nothing from this
# point on has been observed to execute — confirm which Great Expectations
# version this cell is meant to target.
context = BaseDataContext(project_config=data_context_config)

# --- Create an expectation suite ---
# overwrite_existing=True lets the cell be re-run without a name clash.
expectation_suite_name = "my_suite"
context.create_expectation_suite(expectation_suite_name, overwrite_existing=True)

# --- Create a runtime batch request with the DataFrame ---
# The in-memory DataFrame is handed over via runtime_parameters["batch_data"];
# datasource and connector names refer to the configuration above.
batch_request = RuntimeBatchRequest(
    datasource_name="my_pandas_datasource",
    data_connector_name="default_runtime_data_connector",
    data_asset_name="my_runtime_data_asset",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default_identifier"},
)

# --- Get a Validator to validate the data ---
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# --- Add expectations ---
# NOTE(review): the task description asks for a *custom* expectation; the two
# calls below look like stock validator methods — confirm whether a
# user-defined expectation is still to be added.

# Check age lies between 0 and 120
validator.expect_column_values_to_be_between(column="age", min_value=0, max_value=120)

# Check income is not null (the sample data has one None in 'income', so
# this check is presumably expected to fail)
validator.expect_column_values_to_not_be_null(column="income")

# --- Validate and get results ---
results = validator.validate()

# --- Print results ---
print("Validation Results:")
print(results)

# Optionally, show summary: one line pair per expectation with its type and
# pass/fail flag
print("\nSummary of Expectation Results:")
for result in results["results"]:
    print(f"- Expectation: {result['expectation_config']['expectation_type']}")
    print(f"  Success: {result['success']}")

ImportError: cannot import name 'BaseDataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)

### Task 2: Implement a Basic Alert System for Data Quality Drops
**Description**: Set up a basic alert system that triggers when data quality drops.

In [3]:
import pandas as pd

def check_data_quality(df, threshold=0.05):
    """
    Report the share of missing cells in ``df`` and alert when it is too high.

    The ratio is the number of null cells divided by the total number of
    cells (``df.size``). It is printed, and ``trigger_alert`` is called when
    the ratio is strictly greater than ``threshold``.

    Args:
        df: pandas DataFrame to inspect.
        threshold: Highest tolerated missing ratio, as a fraction
            (0.05 means 5%).

    Returns:
        None. The outcome is reported through ``print`` / ``trigger_alert``.
    """
    total_values = df.size
    if total_values == 0:
        # An empty frame has no cells, so the ratio would be the undefined
        # 0/0. Say so explicitly instead of printing "nan%" and then
        # declaring the data acceptable.
        print("DataFrame is empty; no values to check.")
        return

    # int() turns the NumPy scalar into a plain Python int before dividing.
    missing_values = int(df.isnull().sum().sum())
    missing_ratio = missing_values / total_values

    print(f"Missing Data Ratio: {missing_ratio:.2%}")

    if missing_ratio > threshold:
        alert_message = f"ALERT: Data quality has dropped! Missing data ratio is {missing_ratio:.2%} which exceeds the threshold of {threshold:.2%}."
        trigger_alert(alert_message)
    else:
        print("Data quality is within acceptable limits.")
        
def trigger_alert(message):
    """
    Deliver an alert.

    Delivery currently means writing ``message`` to standard output. This
    function is the single place to plug in e-mail, SMS, or push
    notifications later on.

    Args:
        message: Alert text to deliver.
    """
    # Extension point: hook e-mail/SMS APIs or other notification systems in
    # here, next to (or instead of) the console output.
    print(message)

# Example usage:
if __name__ == "__main__":
    # Demo frame: 2 of its 12 cells are missing (one in 'A', one in 'B'),
    # which is above the 5% threshold passed below.
    data = dict(
        A=[1, 2, None, 4],
        B=[None, 2, 3, 4],
        C=[1, 2, 3, 4],
    )
    df = pd.DataFrame(data)

    check_data_quality(df, threshold=0.05)

Missing Data Ratio: 16.67%
ALERT: Data quality has dropped! Missing data ratio is 16.67% which exceeds the threshold of 5.00%.


### Task 3: Real-time Data Quality Monitoring with Python and Great Expectations
**Description**: Implement a system that monitors data quality in real-time.

In [6]:
import great_expectations as ge
import pandas as pd
import time

def create_expectation_suite():
    """
    Build the in-memory "my_data_suite" expectation suite.

    Expectations are added in this order:
      1. col1 exists
      2. col1 has no nulls
      3. col1 values lie between 0 and 10
      4. col2 exists
      5. col2 has no nulls

    Returns:
        The populated expectation suite.

    NOTE(review): the run recorded with this cell stopped with a TypeError
    on the ``expectation_suite_name`` keyword, so the installed Great
    Expectations release apparently expects a different constructor
    signature — confirm which version this code is meant to target.
    """
    # (expectation type, kwargs) pairs, in the order they are registered.
    expectation_specs = (
        ("expect_column_to_exist", {"column": "col1"}),
        ("expect_column_values_to_not_be_null", {"column": "col1"}),
        (
            "expect_column_values_to_be_between",
            {"column": "col1", "min_value": 0, "max_value": 10},
        ),
        ("expect_column_to_exist", {"column": "col2"}),
        ("expect_column_values_to_not_be_null", {"column": "col2"}),
    )

    # Start from an empty suite, then register each expectation in turn.
    suite = ge.core.ExpectationSuite(expectation_suite_name="my_data_suite")
    for spec_type, spec_kwargs in expectation_specs:
        suite.add_expectation(
            ge.expectations.expectation.ExpectationConfiguration(
                expectation_type=spec_type,
                kwargs=spec_kwargs,
            )
        )
    return suite

def validate_data_batch(df, suite):
    """
    Validate one batch against ``suite`` and print the outcome.

    Prints a single PASSED line when the overall result is successful.
    Otherwise prints a FAILED line followed by one line per failed
    expectation, naming its type and target column ("unknown column" when
    the expectation's kwargs carry no "column" entry).

    Args:
        df: pandas DataFrame holding the batch to check.
        suite: Expectation suite to validate the batch against.

    NOTE(review): this function was not reached in the recorded run (suite
    creation failed first) — confirm ``ge.from_pandas`` is available in the
    installed Great Expectations release.
    """
    outcome = ge.from_pandas(df).validate(expectation_suite=suite)

    if outcome["success"]:
        print("✅ Data quality check PASSED.")
        return

    print("❌ Data quality check FAILED!")
    # Only the expectations that did not pass are listed.
    for item in outcome["results"]:
        if item["success"]:
            continue
        exp_type = item["expectation_config"]["expectation_type"]
        column = item["expectation_config"]["kwargs"].get("column", "unknown column")
        print(f" - Failed expectation: {exp_type} on column '{column}'")

def simulate_real_time_stream(delay=3):
    """
    Yield three hard-coded sample batches, pausing between them.

    The pause happens only *between* batches; there is no wait before the
    first batch or after the last one, so the stream ends as soon as the
    final batch has been consumed.

    Args:
        delay: Seconds to wait between consecutive batches (default 3).
            A value of 0 or less disables the pause.

    Yields:
        pandas.DataFrame: a batch with columns "col1" and "col2".
    """
    batches = [
        pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}),          # clean data
        pd.DataFrame({"col1": [None, 2, 3], "col2": [4, 5, None]}),    # nulls in col1 and col2
        pd.DataFrame({"col1": [11, 2, 3], "col2": [4, None, 6]}),      # out of range in col1, null in col2
    ]
    for index, batch in enumerate(batches):
        # Sleep before every batch except the first, which gives the same
        # spacing between batches without a trailing wait.
        if index and delay > 0:
            time.sleep(delay)
        yield batch

def main():
    """
    Run the monitoring demo end to end.

    Builds the expectation suite once, then pulls batches from
    ``simulate_real_time_stream`` and, for each one, prints a numbered
    header (starting at 1), the batch itself, and the validation outcome,
    followed by a blank line.
    """
    print("Creating expectation suite in-memory...")
    suite = create_expectation_suite()

    print("Starting real-time data quality monitoring...\n")
    batch_number = 0
    for batch_df in simulate_real_time_stream():
        batch_number += 1
        print(f"--- Batch #{batch_number} ---")
        print(batch_df)
        validate_data_batch(batch_df, suite)
        print()


# Run the demo only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

Creating expectation suite in-memory...


TypeError: ExpectationSuite.__init__() got an unexpected keyword argument 'expectation_suite_name'