In [1]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.






# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.








In [2]:
import great_expectations as ge
from great_expectations.dataset import PandasDataset
import pandas as pd

# Sample data with some numeric column to validate
data = {
    'age': [25, 30, 45, 22, 99, 150, -5],  # Note: 150 and -5 are invalid ages
    'salary': [50000, 60000, 55000, 48000, 70000, 65000, 52000]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Convert pandas DataFrame to Great Expectations dataset
ge_df = ge.from_pandas(df)

# --- 19. Setting Up Basic Expectation Suite ---

# Create expectation suite if not exists (here done programmatically)
suite_name = "age_salary_suite"

# --- 20. Testing for Expectations ---

# Age should be between 0 and 120
ge_df.expect_column_values_to_be_between('age', min_value=0, max_value=120)

# Salary should be positive
ge_df.expect_column_values_to_be_between('salary', min_value=0)

# Validate dataset and collect results
results = ge_df.validate()

print("\nValidation Results:")
print(results)

# List unmet expectations (those that failed)
failed_expectations = [res for res in results['results'] if not res['success']]
print("\nUnmet Expectations:")
for fail in failed_expectations:
    print(f"Expectation: {fail['expectation_config']['expectation_type']}")
    print(f"Details: {fail['result']}")
    print()

# --- 21. Generating Data Docs ---

# Initialize Data Context (for a quick run, use in-memory context)
from great_expectations.data_context import DataContext

# Usually, you create a directory with great_expectations init but here we'll create a minimal context
context = DataContext.create("./great_expectations/")

# Save the expectation suite
context.save_expectation_suite(
    ge_df.get_expectation_suite(discard_failed_expectations=False),
    suite_name
)

# Create an in-memory batch from the dataframe
batch = context.create_pandas_batch(df)

# Validate and save validation results
validation_result_identifier = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_name="validation_run"
)

# Build and open data docs (this generates HTML files you can open in your browser)
context.build_data_docs()
context.open_data_docs()

print("\nData docs generated and opened in your browser.")


ModuleNotFoundError: No module named 'great_expectations.dataset'

In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.








