### Task 1: Validate Data with a Custom Expectation in Great Expectations
**Description**: Create a custom expectation and validate data with Great Expectations.

**Load a sample DataFrame**

```python
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}
```

In [1]:
# Write your code from here
import great_expectations as ge
from great_expectations.dataset import PandasDataset

# pandas is needed for pd.DataFrame below; this cell did not import it,
# which raises NameError when the cell is run on its own.
import pandas as pd

# Sample data: 'income' contains one missing value (None)
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000],
}

# Convert the pandas DataFrame to a Great Expectations dataset object
df = ge.from_pandas(pd.DataFrame(data))

# Define a custom expectation method by subclassing PandasDataset
class CustomDataset(PandasDataset):
    """PandasDataset subclass that adds an "all values are even" check.

    NOTE(review): ``PandasDataset`` is imported from
    ``great_expectations.dataset``; the recorded output of this cell shows
    that import failing with ModuleNotFoundError, so this class presumably
    requires a Great Expectations release that still ships the legacy
    Dataset API -- confirm the installed/pinned version.
    """

    def expect_column_values_to_be_even(self, column):
        """Check that every non-null value in ``column`` is an even number.

        Null values are dropped before the check, so they never cause a
        failure; a column with no non-null values passes.

        Args:
            column: Name of the column to inspect.

        Returns:
            dict with two keys:
              - "success": plain ``bool``, True when all non-null values
                are even.
              - "result": dict holding "observed_value" (one boolean per
                non-null value, True meaning even) and "unexpected_list"
                (the non-null values that are not even).
        """
        non_null = self[column].dropna()
        is_even = non_null % 2 == 0

        return {
            # Series.all() may yield numpy.bool_; cast to a plain bool so the
            # result is JSON-serializable and `is True` checks behave.
            "success": bool(is_even.all()),
            "result": {
                "observed_value": is_even.tolist(),
                "unexpected_list": non_null[~is_even].tolist(),
            },
        }

# Wrap the dataset in the subclass so the custom expectation is available
custom_df = CustomDataset(df)

# Run the even-number check against the 'age' column and show the outcome
result = custom_df.expect_column_values_to_be_even(column='age')
print(result)


ModuleNotFoundError: No module named 'great_expectations.dataset'

### Task 2: Implement a Basic Alert System for Data Quality Drops
**Description**: Set up a basic alert system that triggers when data quality drops.

In [None]:
# Write your code from here
import pandas as pd

# Toy dataset: each column carries exactly one missing entry (None)
data = dict(
    age=[25, 30, None, 40, 45],
    income=[50000, 60000, 75000, None, 100000],
)
df = pd.DataFrame(data)

def calculate_missing_rate(df):
    """Return the fraction of missing values in each column of ``df``.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        pandas Series indexed by column name; each value is the mean of
        that column's null mask (0.0 = no missing values, 1.0 = all missing).
    """
    null_mask = df.isnull()
    return null_mask.mean()

def data_quality_alert(df, threshold=0.1):
    """Print an alert when any column's missing-value rate exceeds ``threshold``.

    Args:
        df: pandas DataFrame to inspect.
        threshold: Maximum tolerated fraction of missing values per column
            (0.1 means 10%). Only columns strictly above it are flagged.

    Returns:
        None. The outcome is reported via ``print``.
    """
    missing_rate = calculate_missing_rate(df)
    alert_columns = missing_rate[missing_rate > threshold]

    if not alert_columns.empty:
        print("ALERT: Data quality has dropped below threshold!")
        # Let the format spec do the percentage conversion: threshold*100
        # leaks float noise (e.g. 0.07 -> 7.000000000000001).
        print(f"Columns with missing data above {threshold:.1%}:")
        print(alert_columns)
    else:
        print("Data quality is within acceptable limits.")

# Run the alert check on the sample DataFrame with the default threshold (0.1)
data_quality_alert(df)


### Task 3: Real-time Data Quality Monitoring with Python and Great Expectations
**Description**: Implement a system that monitors data quality in real-time.

In [None]:
# Write your code from here
import great_expectations as ge
import pandas as pd

def get_new_data():
    """Return a hard-coded sample batch standing in for newly arrived data.

    The batch has one missing value per column and one out-of-range age,
    so that downstream validation has something to flag.

    Returns:
        pandas DataFrame with columns 'age' and 'income' (5 rows).
    """
    ages = [25, 30, None, 40, 150]  # 150 is deliberately an invalid age
    incomes = [50000, 60000, 75000, None, 100000]
    return pd.DataFrame({'age': ages, 'income': incomes})

def validate_data(df):
    """Validate ``df`` against the age/income expectations and report the outcome.

    The DataFrame is wrapped with ``ge.from_pandas``; for each of 'age' and
    'income' a not-null expectation and a range expectation are registered
    (age in [0, 120], income >= 0), then ``validate()`` is run. A pass/fail
    line is printed, followed by the type and details of every failed
    expectation.

    Args:
        df: pandas DataFrame containing 'age' and 'income' columns.

    Returns:
        The ``'success'`` entry of the validation result.
    """
    ge_df = ge.from_pandas(df)

    # Per-column range bounds; registration order matches column order here.
    range_bounds = (
        ('age', {'min_value': 0, 'max_value': 120}),
        ('income', {'min_value': 0}),
    )
    for column_name, bounds in range_bounds:
        ge_df.expect_column_values_to_not_be_null(column_name)
        ge_df.expect_column_values_to_be_between(column_name, **bounds)

    result = ge_df.validate()
    passed = result['success']

    if passed:
        print("Data quality check PASSED.")
        return passed

    print("Data quality check FAILED!")
    failures = (res for res in result['results'] if not res['success'])
    for failure in failures:
        print(f"Failed Expectation: {failure['expectation_config']['expectation_type']}")
        print(f"Details: {failure['result']}")

    return passed

def monitor_real_time():
    """Fetch one sample batch, validate it, and print the verdict.

    Performs a single pass: the batch from ``get_new_data`` is printed,
    handed to ``validate_data``, and a final status line is printed based
    on the returned success flag. A real deployment would call this in a
    loop or from a data-arrival trigger.
    """
    batch = get_new_data()
    print("Validating new data batch:")
    print(batch)

    if validate_data(batch):
        print("Data is clean.")
    else:
        # Hook point for real alerting, logging, or corrective action.
        print("Alert: Data quality issues detected!")

# Run a single monitoring pass only when executed as a script, not on import
if __name__ == "__main__":
    monitor_real_time()
