In [None]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.






# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”
import pandas as pd
import great_expectations as gx
import os

# --- Sample data -----------------------------------------------------------
# Small in-memory DataFrame used as the validation target
# (replace with your own data loading).
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 28, 42],
    'Salary': [50000, 60000, 55000, 70000, 62000]
}
df = pd.DataFrame(data)

# Inclusive bounds for the "values must fall within a range" expectation on 'Age'.
AGE_MIN = 18
AGE_MAX = 65

# --- Great Expectations setup ----------------------------------------------
# Obtain the Great Expectations Data Context.
context = gx.get_context()

# Register a pandas datasource. add_or_update_pandas keeps this cell re-runnable:
# plain add_pandas() raises when a datasource with the same name already exists.
datasource_name = "pandas_example"
pandas_datasource = context.sources.add_or_update_pandas(name=datasource_name)

# Wrap the DataFrame as a data asset on that datasource.
# NOTE(review): newer 0.17+/0.18 releases prefer passing the frame to
# build_batch_request(dataframe=df) instead -- confirm against the installed version.
data_asset_name = "my_data"
data_asset = pandas_datasource.add_dataframe_asset(name=data_asset_name, dataframe=df)

# The batch request identifies which data the validator should load.
batch_request = data_asset.build_batch_request()

# --- Expectation suite -----------------------------------------------------
# Load the suite if it already exists, otherwise create it.
# get_expectation_suite signals a missing suite with DataContextError.
expectation_suite_name = "my_expectation_suite"
try:
    suite = context.get_expectation_suite(expectation_suite_name)
    print(f"Loaded existing Expectation Suite: {expectation_suite_name}")
except gx.exceptions.DataContextError:
    suite = context.add_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f"Created new Expectation Suite: {expectation_suite_name}")

# --- Define expectations ---------------------------------------------------
# expect_* methods live on the Validator (not on the ExpectationSuite object),
# so bind the suite to the batch first and declare expectations through it.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)

# Task 19: a basic structural expectation -- the 'ID' column must exist.
validator.expect_column_to_exist("ID")

# Task 20: a range expectation -- every 'Age' value must lie in [AGE_MIN, AGE_MAX].
validator.expect_column_values_to_be_between("Age", min_value=AGE_MIN, max_value=AGE_MAX)

# Persist the suite; keep expectations even if they failed on this batch so the
# stored suite reflects everything declared above.
validator.save_expectation_suite(discard_failed_expectations=False)
suite = validator.get_expectation_suite(discard_failed_expectations=False)

# --- Run the validation ----------------------------------------------------
validation_result = validator.validate()

print("\nValidation Result:")
print(validation_result)

# --- List unmet expectations -----------------------------------------------
unmet_expectations = validation_result.statistics["unsuccessful_expectations"]
if unmet_expectations > 0:
    print("\nUnmet Expectations:")
    for result in validation_result.results:
        if not result.success:
            config = result.expectation_config
            print(f"  {config.expectation_type}: {config.kwargs}")
else:
    print("\nAll expectations were met.")





# 21. Generating Data Docs:
# - Automatically generate data quality documentation.








In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.








