### Task 1: Validate Data with a Custom Expectation in Great Expectations
**Description**: Create a custom expectation and validate data with Great Expectations.

**Load a sample DataFrame**

data = {
'age': [25, 30, 35, 40, 45],
'income': [50000, 60000, 75000, None, 100000]
}

In [9]:
import pandas as pd
import great_expectations as ge
from great_expectations.core import ExpectationConfiguration, ExpectationSuite
from great_expectations.execution_engine import PandasExecutionEngine

# 1. Load the Sample Data
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}
df = pd.DataFrame(data)

# 2. Create a Great Expectations Context
context = ge.get_context()

# 3. Define a Custom Expectation (if needed)
#    In this case, we'll create a custom expectation to check if income is above a threshold.
#    If you want to use only built-in expectations, you can skip this part
class ExpectIncomeGreaterThan(ge.expectation.ColumnMapExpectation):  # Changed base class
    """
    Expect income values to be greater than a specified threshold.
    """
    # These six class variables define the domain of the expectation.
    # It applies to a specific column.
    map_metric_provider_class = PandasExecutionEngine.map_series  # Use Pandas series map
    success_keys = ("threshold",)
    default_kwarg_values = {"threshold": 0}  # Set a default threshold
    args_keys = ("column", "threshold")  # Corrected args_keys
    # This is the main logic for the custom expectation
    @PandasExecutionEngine.map_series(schema_type=None)  # Add schema_type
    def _map_series(cls, series: pd.Series, threshold: int) -> pd.Series:
        """
        Determines if income is greater than the threshold.

        Args:
            series: The Pandas Series representing the income column.
            threshold: The threshold value.

        Returns:
            A Pandas Series of boolean values (True if income > threshold, False otherwise).
        """
        return series > threshold

    # This method describes the expectation in human-readable terms.
    def get_validation_dependencies(
        cls,
        column: str,
        threshold: int = None,
        **kwargs,
    ) -> list[ge.core.ExpectationConfiguration]:
        """
        Returns a list of dependencies for this Expectation
        """
        return [ExpectationConfiguration("expect_column_values_to_exist", {"column": column})]

    def describe(
        cls,
        column: str,
        threshold: int = None,
        **kwargs,
    ) -> str:
        """
        Returns a human-readable string representation of this expectation.
        """
        return f"Values in column '{column}' should be greater than {threshold:.2f}."



# 4. Create an Expectation Suite
expectation_suite_name = "income_validation_suite"
suite = context.get_expectation_suite(expectation_suite_name)
if suite is None:
    suite = context.create_expectation_suite(expectation_suite_name)

# 5. Add Expectations to the Suite
#    Here, we add both a built-in expectation and our custom expectation.
suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "income"},
    )
)
suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between", #Using built-in
        kwargs={
            "column": "age",
            "min_value": 18,
            "max_value": 65,
        },
    )
)

suite.add_expectation(
    ExpectationConfiguration(  #Using custom
        expectation_type="expect_income_greater_than",
        kwargs={
            "column": "income",
            "threshold": 40000,
        },
    )
)
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)

# 6. Create a BatchRequest
batch_request = ge.datasource.Datasource.create_batch_request(
    datasource_name="pandas",  # Use the name 'pandas'
    data_asset_name="my_data",  # You can choose any name
    pandas_df=df,  # Pass the DataFrame here
)

# 7. Create a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
# 8. Validate the Data
results = validator.validate()

# 9. Print the Validation Results
print(results)

# 10. Check if the validation was successful
if results["success"]:
    print("Data validation successful!")
else:
    print("Data validation failed.")



ImportError: cannot import name 'ExpectationConfiguration' from 'great_expectations.core' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/core/__init__.py)

### Task 2: Implement a Basic Alert System for Data Quality Drops
**Description**: Set up a basic alert system that triggers when data quality drops.

In [None]:
import pandas as pd
import great_expectations as ge
from great_expectations.core import ExpectationConfiguration, ExpectationSuite
from great_expectations.execution_engine import PandasExecutionEngine
import smtplib  # For sending emails
from email.mime.text import MIMEText  # For creating email messages
import datetime

# 1. Load the Sample Data
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}
df = pd.DataFrame(data)

# 2. Create a Great Expectations Context
context = ge.get_context()

# 3. Define a Custom Expectation (if needed)
class ExpectIncomeGreaterThan(ge.expectation.ColumnMapExpectation):
    """
    Expect income values to be greater than a specified threshold.
    """
    map_metric_provider_class = PandasExecutionEngine.map_series
    success_keys = ("threshold",)
    default_kwarg_values = {"threshold": 0}
    args_keys = ("column", "threshold")

    @PandasExecutionEngine.map_series(schema_type=None)
    def _map_series(cls, series: pd.Series, threshold: int) -> pd.Series:
        return series > threshold

    def get_validation_dependencies(
        cls,
        column: str,
        threshold: int = None,
        **kwargs,
    ) -> list[ge.core.ExpectationConfiguration]:
        return [ExpectationConfiguration("expect_column_values_to_exist", {"column": column})]

    def describe(
        cls,
        column: str,
        threshold: int = None,
        **kwargs,
    ) -> str:
        return f"Values in column '{column}' should be greater than {threshold:.2f}."

# 4. Create an Expectation Suite
expectation_suite_name = "income_validation_suite"
suite = context.get_expectation_suite(expectation_suite_name)
if suite is None:
    suite = context.create_expectation_suite(expectation_suite_name)

# 5. Add Expectations to the Suite
suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "income"},
    )
)
suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "age",
            "min_value": 18,
            "max_value": 65,
        },
    )
)
suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_income_greater_than",
        kwargs={
            "column": "income",
            "threshold": 40000,
        },
    )
)
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)

# 6. Create a BatchRequest
batch_request = ge.datasource.Datasource.create_batch_request(
    datasource_name="pandas",
    data_asset_name="my_data",
    pandas_df=df,
)

# 7. Create a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# 8. Validate the Data
results = validator.validate()

# 9. Print the Validation Results
print(results)

# 10. Check if the validation was successful and send an alert if not
if not results["success"]:
    print("Data validation failed!")
    send_data_quality_alert(results)  # Call the alerting function
else:
    print("Data validation successful!")



def send_data_quality_alert(validation_results):
    """
    Sends an email alert with details about the data quality issues.

    Args:
        validation_results (dict): The validation results from Great Expectations.
    """
    # Email configuration
    sender_email = "your_email@example.com"  # Replace with your email address
    receiver_email = "recipient_email@example.com"  # Replace with the recipient's email address
    smtp_server = "smtp.example.com"  # Replace with your SMTP server address
    smtp_port = 587  # Replace with your SMTP server port (e.g., 587 for TLS)
    smtp_username = "your_email@example.com"  # Replace with your email username
    smtp_password = "your_email_password"  # Replace with your email password or an app password

    # Create the email message
    subject = "Data Quality Alert"
    now = datetime.datetime.now()
    date_time_str = now.strftime("%d/%m/%Y, %H:%M:%S")
    body = f"Data quality check failed on {date_time_str} UTC.\n\n"
    body += "Details:\n"
    for result in validation_results["results"]:
        if not result["success"]:
            body += f"- Expectation: {result['expectation_config']['expectation_type']}\n"
            body += f"  Column: {result['expectation_config']['kwargs'].get('column', 'N/A')}\n"
            body += f"  Status: Failed\n"
            if result.get("result", {}).get("unexpected_count") is not None:
                body += f"  Unexpected Count: {result['result']['unexpected_count']}\n"
            body += f"  Details: {result['result']}\n"
            body += "\n"
    body += "\n"
    body += "Please investigate the data quality issues."

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = sender_email
    msg["To"] = receiver_email

    # Send the email
    try:
        server = smtplib.SMTP(smtp_server, smtp_port)
        server.starttls()  # Use TLS encryption
        server.login(smtp_username, smtp_password)
        server.sendmail(sender_email, [receiver_email], msg.as_string())
        server.quit()
        print("Data quality alert email sent successfully.")
    except Exception as e:
        print(f"Error sending email: {e}")
        print("Please check your email configuration and network connection.")
        print(
            "Note: Email sending is a critical component of an alerting system.  "
            "Ensure your SMTP server details are correct and that your email provider allows sending emails from your script."
        )



ImportError: cannot import name 'ExpectationConfiguration' from 'great_expectations.core' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/core/__init__.py)

### Task 3: Real-time Data Quality Monitoring with Python and Great Expectations
**Description**: Implement a system that monitors data quality in real-time.

In [None]:
import pandas as pd
import great_expectations as ge
from great_expectations.core import ExpectationSuite
from great_expectations.execution_engine import PandasExecutionEngine
import time
import random
import smtplib
from email.mime.text import MIMEText
import datetime
import threading  # Import the threading module

# 1. Load the Sample Data (as a function for re-use)
def generate_sample_data():
    """Generates a sample DataFrame with 'age' and 'income'."""
    data = {
        'age': [random.randint(20, 60) for _ in range(5)],  # Simulate age between 20 and 60
        'income': [random.randint(40000, 120000) if random.random() > 0.1 else None for _ in range(5)]  # Simulate income, with 10% missing
    }
    return pd.DataFrame(data)

# 2. Create a Great Expectations Context
context = ge.get_context()

# 3. Define a Custom Expectation (if needed)
class ExpectIncomeGreaterThan(ge.expectation.ColumnMapExpectation):
    """
    Expect income values to be greater than a specified threshold.
    """
    map_metric_provider_class = PandasExecutionEngine.map_series
    success_keys = ("threshold",)
    default_kwarg_values = {"threshold": 0}
    args_keys = ("column", "threshold")

    @PandasExecutionEngine.map_series(schema_type=None)
    def _map_series(cls, series: pd.Series, threshold: int) -> pd.Series:
        """
        Determines if income is greater than the threshold.

        Args:
            series: The Pandas Series representing the income column.
            threshold: The threshold value.

        Returns:
            A Pandas Series of boolean values (True if income > threshold, False otherwise).
        """
        return series > threshold

    # This method describes the expectation in human-readable terms.
    def get_validation_dependencies(
        cls,
        column: str,
        threshold: int = None,
        **kwargs,
    ) -> list[ge.core.ExpectationConfiguration]: # Changed from ge.core.ExpectationConfiguration
        """
        Returns a list of dependencies for this Expectation
        """
        return [ge.core.ExpectationConfiguration("expect_column_values_to_exist", {"column": column})]

    def describe(
        cls,
        column: str,
        threshold: int = None,
        **kwargs,
    ) -> str:
        """
        Returns a human-readable string representation of this expectation.
        """
        return f"Values in column '{column}' should be greater than {threshold:.2f}."



# 4. Create an Expectation Suite
expectation_suite_name = "income_validation_suite"
suite = context.get_expectation_suite(expectation_suite_name)
if suite is None:
    suite = context.create_expectation_suite(expectation_suite_name)

# 5. Add Expectations to the Suite
suite.add_expectation(
    ge.core.ExpectationConfiguration( # Changed from ExpectationConfiguration
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "income"},
    )
)
suite.add_expectation(
    ge.core.ExpectationConfiguration( # Changed from ExpectationConfiguration
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "age",
            "min_value": 18,
            "max_value": 65,
        },
    )
)
suite.add_expectation(
    ge.core.ExpectationConfiguration(  # Changed from ExpectationConfiguration
        expectation_type="expect_income_greater_than",
        kwargs={
            "column": "income",
            "threshold": 40000,
        },
    )
)
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)



def send_data_quality_alert(validation_results):
    """
    Sends an email alert with details about the data quality issues.

    Args:
        validation_results (dict): The validation results from Great Expectations.
    """
    # Email configuration
    sender_email = "your_email@example.com"  # Replace with your email address
    receiver_email = "recipient_email@example.com"  # Replace with the recipient's email address
    smtp_server = "smtp.example.com"  # Replace with your SMTP server address
    smtp_port = 587  # Replace with your SMTP server port (e.g., 587 for TLS)
    smtp_username = "your_email@example.com"  # Replace with your email username
    smtp_password = "your_email_password"  # Replace with your email password or an app password

    # Create the email message
    subject = "Data Quality Alert - Real-time Monitoring"
    now = datetime.datetime.now()
    date_time_str = now.strftime("%d/%m/%Y, %H:%M:%S")
    body = f"Data quality check failed on {date_time_str} UTC.\n\n"
    body += "Details:\n"
    for result in validation_results["results"]:
        if not result["success"]:
            body += f"- Expectation: {result['expectation_config']['expectation_type']}\n"
            body += f"  Column: {result['expectation_config']['kwargs'].get('column', 'N/A')}\n"
            body += f"  Status: Failed\n"
            if result.get("result", {}).get("unexpected_count") is not None:
                body += f"  Unexpected Count: {result['result']['unexpected_count']}\n"
            body += f"  Details: {result['result']}\n"
            body += "\n"
    body += "\n"
    body += "Please investigate the data quality issues immediately."  # Emphasize urgency

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = sender_email
    msg["To"] = receiver_email

    # Send the email
    try:
        server = smtplib.SMTP(smtp_server, smtp_port)
        server.starttls()  # Use TLS encryption
        server.login(smtp_username, smtp_password)
        server.sendmail(sender_email, [receiver_email], msg.as_string())
        server.quit()
        print("Data quality alert email sent successfully.")
    except Exception as e:
        print(f"Error sending email: {e}")
        print("Please check your email configuration and network connection.")



def monitor_data_quality():
    """
    Monitors data quality in real-time (simulated) by repeatedly generating sample data,
    validating it with Great Expectations, and sending alerts if necessary.
    """
    while True:
        # 1. Generate (or receive) new data
        new_data_df = generate_sample_data()

        # 2. Create a BatchRequest for the new data
        batch_request = ge.datasource.Datasource.create_batch_request(
            datasource_name="pandas",
            data_asset_name="realtime_data",  # Important: Use a consistent data_asset_name
            pandas_df=new_data_df,
        )

        # 3. Create a Validator
        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=expectation_suite_name,
        )

        # 4. Validate the data
        results = validator.validate()

        # 5. Print and check the validation results
        print(f"Data quality check at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:")
        print(results)

        if not results["success"]:
            print("Data validation failed!")
            send_data_quality_alert(results)
        else:
            print("Data validation successful.")

        time.sleep(5)  # Check every 5 seconds (adjust as needed)



if __name__ == "__main__":
    # Start the monitoring in a separate thread
    monitoring_thread = threading.Thread(target=monitor_data_quality)
    monitoring_thread.daemon = True  # Allow the main thread to exit even if this thread is running
    monitoring_thread.start()

    # Keep the main thread alive (optional, for demonstration)
    while True:
        time.sleep(60)  # Print a message every 60 seconds to show the main thread is alive
        print("Main thread is still running...")


AttributeError: module 'great_expectations' has no attribute 'expectation'