## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations

### Profile a JSON dataset with product sales data to check for null values in the 'ProductID' and 'Price' fields.
- Create an expectation suite and connect it to the data context.
- Use the `expect_column_values_to_not_be_null` expectation to profile these fields.
- Review the summary to identify any unexpected null values.

In [1]:
# write your code from here
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import DataContext

# Path to your Great Expectations project directory
GE_PROJECT_PATH = "/path/to/great_expectations"

# Path to your JSON dataset
JSON_PATH = "/path/to/product_sales.json"

def profile_json_for_nulls():
    # Load JSON data into a pandas DataFrame
    df = pd.read_json(JSON_PATH)

    # Wrap the DataFrame as a Great Expectations Dataset
    df_ge = ge.from_pandas(df)

    # Initialize DataContext
    context = DataContext(GE_PROJECT_PATH)

    # Create a new expectation suite (overwrite if it exists)
    suite_name = "product_sales_null_check"
    context.create_expectation_suite(suite_name, overwrite_existing=True)

    # Get a Validator for the data and expectation suite
    validator = context.get_validator(
        batch_data=df_ge,
        expectation_suite_name=suite_name,
    )

    # Add expectations to check that 'ProductID' and 'Price' columns have no nulls
    validator.expect_column_values_to_not_be_null("ProductID")
    validator.expect_column_values_to_not_be_null("Price")

    # Save the expectation suite
    validator.save_expectation_suite(discard_failed_expectations=False)

    # Run validation on the dataset
    results = validator.validate()

    # Print validation summary
    print(f"Validation success: {results['success']}")
    for res in results["results"]:
        exp_type = res["expectation_config"]["expectation_type"]
        col = res["expectation_config"]["kwargs"]["column"]
        success = res["success"]
        unexpected_count = res["result"].get("unexpected_count", 0)
        print(f"- Expectation: {exp_type} on '{col}' — Passed? {success}, Unexpected nulls: {unexpected_count}")

if __name__ == "__main__":
    profile_json_for_nulls()


ImportError: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)

2. Writing Validation Rules for Data Ingestion

### Define validation rules for an API data source to confirm that 'Status' field contains only predefined statuses ('Active', 'Inactive').

- Apply `expect_column_values_to_be_in_set` to check field values during data ingestion.
- Execute the validation and review any mismatches.

In [2]:
# write your code from here
import great_expectations as ge
from great_expectations.data_context import DataContext
import requests
import pandas as pd

# Great Expectations project path
GE_PROJECT_PATH = "/path/to/great_expectations"

# API endpoint returning JSON data
API_URL = "https://api.example.com/data"  # Replace with your API URL

def validate_api_status_field():
    # Fetch data from API
    response = requests.get(API_URL)
    response.raise_for_status()  # Raise error if request failed

    # Load JSON data into pandas DataFrame
    data = response.json()
    df = pd.DataFrame(data)

    # Wrap as Great Expectations dataset
    df_ge = ge.from_pandas(df)

    # Initialize DataContext
    context = DataContext(GE_PROJECT_PATH)

    # Create or overwrite an expectation suite
    suite_name = "api_status_validation"
    context.create_expectation_suite(suite_name, overwrite_existing=True)

    # Get validator for the batch data and suite
    validator = context.get_validator(
        batch_data=df_ge,
        expectation_suite_name=suite_name,
    )

    # Define allowed statuses
    allowed_statuses = ["Active", "Inactive"]

    # Add expectation for 'Status' column values
    validator.expect_column_values_to_be_in_set(
        column="Status",
        value_set=allowed_statuses,
    )

    # Save the expectation suite
    validator.save_expectation_suite(discard_failed_expectations=False)

    # Run validation
    results = validator.validate()

    # Print validation summary
    print(f"Validation success: {results['success']}")
    for res in results["results"]:
        exp_type = res["expectation_config"]["expectation_type"]
        col = res["expectation_config"]["kwargs"]["column"]
        success = res["success"]
        unexpected_values = res["result"].get("unexpected_list", [])
        unexpected_count = res["result"].get("unexpected_count", 0)
        print(f"- Expectation: {exp_type} on '{col}' — Passed? {success}")
        if not success:
            print(f"  Unexpected values ({unexpected_count}): {unexpected_values}")

if __name__ == "__main__":
    validate_api_status_field()


ImportError: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)