### Task 1: Understanding and Defining Data Quality Metrics
**Description**: Learn how to define basic data quality metrics such as completeness, validity, and uniqueness for a simple dataset.

**Steps**:
1. Dataset: Use a CSV with columns like `Name`, `Email`, `Age`.
2. Metric Definitions:
    - Completeness: Percentage of non-null values.
    - Validity: Percentage of email fields containing `@`.
    - Uniqueness: Count distinct entries in the Email column.

In [1]:
import pandas as pd
import numpy as np

# Step 1: Build the sample dataset.
# The blank and missing names, the missing age and the repeated email are
# deliberate, so that every metric below has something to report.
data = dict(
    Name=['Alice', 'Bob', '', 'David', 'Eve', None],
    Email=['alice@example.com', 'bob@example', 'charlie@domain.com', 'david@example.com', 'eve@example.com', 'eve@example.com'],
    Age=[25, None, 30, 28, 22, 35],
)

# Persist the dataset as a CSV file (without the index column) ...
pd.DataFrame(data).to_csv('sample_data.csv', index=False)

# Step 2: ... and read it back, so the metrics describe the file contents.
df = pd.read_csv('sample_data.csv')

# Step 3: Calculate Data Quality Metrics
emails = df['Email']

# Completeness: per-column share of non-null cells, as a percentage.
completeness = df.notna().mean().mul(100)
print("Completeness (% of non-null values):", completeness, "\n", sep="\n")

# Validity: share of email cells that contain '@' (missing cells count as invalid).
valid_emails = emails.str.contains('@', na=False).mean() * 100
print(f"Validity (% of emails containing '@'): {valid_emails:.2f}%", "\n", sep="\n")

# Uniqueness: distinct emails relative to the number of non-null emails.
unique_emails = emails.nunique()
total_emails = emails.count()
uniqueness_percentage = unique_emails / total_emails * 100
print(
    f"Uniqueness (distinct emails): {unique_emails} out of {total_emails}",
    f"Uniqueness (% of distinct emails): {uniqueness_percentage:.2f}%",
    sep="\n",
)

Completeness (% of non-null values):
Name      66.666667
Email    100.000000
Age       83.333333
dtype: float64


Validity (% of emails containing '@'): 100.00%


Uniqueness (distinct emails): 5 out of 6
Uniqueness (% of distinct emails): 83.33%


### Task 2: Calculating Data Quality Score
**Description**: Aggregate multiple metrics to calculate an overall data quality score.

**Steps**:
1. Formula: Simple average of all metrics defined in Task 1.

In [3]:
import pandas as pd
import numpy as np

# Step 1: Load the CSV file written by Task 1.
# If Task 1 has not been run in this working directory the file is missing;
# rebuild the same sample dataset instead of failing, as Tasks 3-6 do.
try:
    df = pd.read_csv('sample_data.csv')
except FileNotFoundError:
    pd.DataFrame({
        'Name': ['Alice', 'Bob', '', 'David', 'Eve', None],
        'Email': ['alice@example.com', 'bob@example', 'charlie@domain.com', 'david@example.com', 'eve@example.com', 'eve@example.com'],
        'Age': [25, None, 30, 28, 22, 35],
    }).to_csv('sample_data.csv', index=False)
    print("Created sample_data.csv")
    df = pd.read_csv('sample_data.csv')

# Step 2: Calculate Data Quality Metrics (from Task 1)

# Completeness: Average percentage of non-null values across all columns
completeness = df.notnull().mean().mean() * 100

# Validity: Percentage of email fields containing '@'
valid_emails = df['Email'].str.contains('@', na=False).mean() * 100

# Uniqueness: Percentage of distinct entries in the Email column.
# With no non-null emails the ratio is undefined, so report 0 rather than
# dividing by zero.
unique_emails = df['Email'].nunique()
total_emails = df['Email'].count()
uniqueness_percentage = (unique_emails / total_emails) * 100 if total_emails else 0.0

# Step 3: Calculate Overall Data Quality Score
# Simple average of completeness, validity, and uniqueness
data_quality_score = (completeness + valid_emails + uniqueness_percentage) / 3

# Step 4: Display Results
print("Individual Metrics:")
print(f"Completeness (avg % non-null): {completeness:.2f}%")
print(f"Validity (% emails with '@'): {valid_emails:.2f}%")
print(f"Uniqueness (% distinct emails): {uniqueness_percentage:.2f}%")
print("\n")
print(f"Overall Data Quality Score: {data_quality_score:.2f}%")

Individual Metrics:
Completeness (avg % non-null): 83.33%
Validity (% emails with '@'): 100.00%
Uniqueness (% distinct emails): 83.33%


Overall Data Quality Score: 88.89%


### Task 3: Creating Expectations for a CSV
**Description**: Develop basic data quality expectations using Great Expectations.

**Steps**:
1. Create an Expectation Suite.
2. Define expectations for completeness.

In [5]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext
import os

# Step 1: Make sure sample_data.csv is available.
# An existing file is left untouched; only a missing file is generated.
csv_is_missing = not os.path.exists('sample_data.csv')
if csv_is_missing:
    data = dict(
        Name=['Alice', 'Bob', '', 'David', 'Eve', None],
        Email=['alice@example.com', 'bob@example', 'charlie@domain.com', 'david@example.com', 'eve@example.com', 'eve@example.com'],
        Age=[25, None, 30, 28, 22, 35],
    )
    df = pd.DataFrame(data)
    df.to_csv('sample_data.csv', index=False)
    print("Created sample_data.csv")

# Steps 2-9 set up a Great Expectations project in the current directory,
# register sample_data.csv as a CSV asset, define non-null expectations for
# Email, Name and Age, save the suite and validate it through a checkpoint.
# Every step prints a diagnostic message and re-raises on failure.
#
# NOTE(review): the output recorded for this cell shows Step 2 failing with
# "type object 'FileDataContext' has no attribute 'create'", so Steps 3-9 were
# never exercised. The installed Great Expectations release presumably exposes
# a different API from the one this cell targets — confirm the version before
# relying on any call below.

# Step 2: Initialize a FileDataContext
try:
    context = FileDataContext.create(project_root_dir=".")
    print("Initialized Great Expectations DataContext")
except Exception as e:
    print(f"Error initializing DataContext: {e}")
    raise

# Step 3: Set up Datasource and Data Asset
datasource_name = "pandas_datasource"
data_asset_name = "sample_data"

try:
    # Reuse the datasource when it is already registered, otherwise add it.
    if datasource_name not in context.datasources:
        datasource = context.sources.add_pandas(name=datasource_name)
    else:
        datasource = context.datasources[datasource_name]
    
    # NOTE(review): unlike Tasks 4-6, the asset is added unconditionally here;
    # re-running against an existing project may fail on a duplicate asset
    # name — verify.
    data_asset = datasource.add_csv_asset(name=data_asset_name, filepath_or_buffer="sample_data.csv")
    print(f"Added datasource '{datasource_name}' and asset '{data_asset_name}'")
except Exception as e:
    print(f"Error setting up datasource/asset: {e}")
    raise

# Step 4: Create or load Expectation Suite
suite_name = "sample_data_suite"
try:
    # Create the suite on first use; load the stored one on later runs.
    if suite_name not in context.list_expectation_suite_names():
        suite = context.add_expectation_suite(expectation_suite_name=suite_name)
    else:
        suite = context.get_expectation_suite(expectation_suite_name=suite_name)
    print(f"Expectation suite '{suite_name}' ready")
except Exception as e:
    print(f"Error creating/loading expectation suite: {e}")
    raise

# Step 5: Get a Validator
try:
    # The batch request is kept in a variable because Step 8 reuses it.
    batch_request = data_asset.build_batch_request()
    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)
    print("Validator initialized")
except Exception as e:
    print(f"Error initializing validator: {e}")
    raise

# Step 6: Define Completeness Expectations
try:
    # `mostly` is presumably the minimum fraction of rows that must be
    # non-null for the expectation to pass (0.95 for Email, 0.80 for Name and
    # Age) — confirm against the Great Expectations documentation.
    validator.expect_column_values_to_not_be_null(column="Email", mostly=0.95, result_format="SUMMARY")
    validator.expect_column_values_to_not_be_null(column="Name", mostly=0.80, result_format="SUMMARY")
    validator.expect_column_values_to_not_be_null(column="Age", mostly=0.80, result_format="SUMMARY")
    print("Completeness expectations defined")
except Exception as e:
    print(f"Error defining expectations: {e}")
    raise

# Step 7: Save the Expectation Suite
try:
    # discard_failed_expectations=False is passed so that expectations which
    # failed on this batch are presumably kept in the saved suite — confirm.
    validator.save_expectation_suite(discard_failed_expectations=False)
    print(f"Expectation suite '{suite_name}' saved to Great Expectations configuration")
except Exception as e:
    print(f"Error saving expectation suite: {e}")
    raise

# Step 8: Validate the Dataset
try:
    # The checkpoint pairs the Step 5 batch request with the suite by name.
    checkpoint = context.add_or_update_checkpoint(
        name="sample_checkpoint",
        validations=[
            {
                "batch_request": batch_request,
                "expectation_suite_name": suite_name,
            }
        ]
    )
    checkpoint_result = checkpoint.run()
    print("\nValidation Results:")
    print(checkpoint_result)
except Exception as e:
    print(f"Error running validation: {e}")
    raise

# Step 9: Save the Expectation Suite to a File
# NOTE(review): the suite was already saved in Step 7; this second save looks
# redundant — confirm whether it is needed.
try:
    context.save_expectation_suite(expectation_suite=validator.expectation_suite, expectation_suite_name=suite_name)
    print(f"\nExpectation suite saved to Great Expectations configuration")
except Exception as e:
    print(f"Error saving expectation suite to file: {e}")
    raise

Error initializing DataContext: type object 'FileDataContext' has no attribute 'create'


AttributeError: type object 'FileDataContext' has no attribute 'create'

### Task 4: Running and Validating Expectations
**Description**: Run the created expectations and generate an output report.

**Steps**:
1. Validate the dataset against the expectation suite.
2. Generate an HTML report.

In [6]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext
import os

# Step 1: Verify sample_data.csv exists, create if not
# The columns are spelled out one by one before being assembled, so the
# deliberately dirty values (blank/missing name, missing age, repeated email)
# are easy to spot.
if not os.path.exists('sample_data.csv'):
    names = ['Alice', 'Bob', '', 'David', 'Eve', None]
    emails = ['alice@example.com', 'bob@example', 'charlie@domain.com', 'david@example.com', 'eve@example.com', 'eve@example.com']
    ages = [25, None, 30, 28, 22, 35]
    data = {'Name': names, 'Email': emails, 'Age': ages}
    df = pd.DataFrame(data)
    df.to_csv('sample_data.csv', index=False)
    print("Created sample_data.csv")

# Steps 2-7 reopen the Great Expectations project in the current directory,
# look up (or register) the datasource and CSV asset, load the suite created
# in Task 3, run it through a checkpoint and try to produce an HTML report.
# Every step prints a diagnostic message and re-raises on failure.
#
# NOTE(review): the output recorded for this cell shows Step 2 succeeding and
# Step 3 failing with "'FileDataContext' object has no attribute
# 'datasources'", so Steps 4-7 were never exercised. The installed Great
# Expectations release presumably differs from the API targeted here —
# confirm the version.

# Step 2: Initialize FileDataContext
try:
    context = FileDataContext(project_root_dir=".")
    print("Initialized Great Expectations DataContext")
except Exception as e:
    print(f"Error initializing DataContext: {e}")
    raise

# Step 3: Verify Datasource and Data Asset
datasource_name = "pandas_datasource"
data_asset_name = "sample_data"

try:
    # Reuse the datasource when it is already registered, otherwise add it.
    if datasource_name not in context.datasources:
        datasource = context.sources.add_pandas(name=datasource_name)
    else:
        datasource = context.datasources[datasource_name]
    
    # NOTE(review): get_asset_names() presumably returns plain name strings;
    # if so, `asset.name` would raise AttributeError and the membership test
    # should compare against the names directly — verify.
    if data_asset_name not in [asset.name for asset in datasource.get_asset_names()]:
        data_asset = datasource.add_csv_asset(name=data_asset_name, filepath_or_buffer="sample_data.csv")
    else:
        data_asset = datasource.get_asset(data_asset_name)
    print(f"Verified datasource '{datasource_name}' and asset '{data_asset_name}'")
except Exception as e:
    print(f"Error setting up datasource/asset: {e}")
    raise

# Step 4: Load Expectation Suite
suite_name = "sample_data_suite"
try:
    # The suite is only loaded here, never created; it must come from Task 3.
    suite = context.get_expectation_suite(expectation_suite_name=suite_name)
    print(f"Loaded expectation suite '{suite_name}'")
except Exception as e:
    print(f"Error loading expectation suite: {e}")
    print("Please ensure Task 3 was completed to create the expectation suite.")
    raise

# Step 5: Get a Validator and Validate
try:
    # The batch request is kept in a variable because Step 6 reuses it.
    # NOTE(review): `validator` is not used again in this cell.
    batch_request = data_asset.build_batch_request()
    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)
    print("Validator initialized")
except Exception as e:
    print(f"Error initializing validator: {e}")
    raise

# Step 6: Run Validation
try:
    # The checkpoint pairs the Step 5 batch request with the suite by name.
    checkpoint = context.add_or_update_checkpoint(
        name="sample_checkpoint",
        validations=[
            {
                "batch_request": batch_request,
                "expectation_suite_name": suite_name,
            }
        ]
    )
    checkpoint_result = checkpoint.run()
    print("\nValidation completed")
except Exception as e:
    print(f"Error running validation: {e}")
    raise

# Step 7: Generate HTML Report
# NOTE(review): get_validation_result_page_url() and get_data_docs_page() were
# never reached in the recorded run; confirm that both exist on the installed
# context class and accept a checkpoint result.
try:
    # Build and save Data Docs (HTML report)
    context.build_data_docs()
    print("\nHTML report generated in the Great Expectations Data Docs directory")
    
    # Get the URL of the validation results
    validation_results_url = context.get_validation_result_page_url(checkpoint_result)
    print(f"Validation results available at: {validation_results_url}")
    
    # Save the HTML report to a specific file
    report_path = "validation_report.html"
    with open(report_path, "w") as f:
        f.write(context.get_data_docs_page(checkpoint_result))
    print(f"HTML report saved to '{report_path}'")
except Exception as e:
    print(f"Error generating HTML report: {e}")
    raise

Initialized Great Expectations DataContext
Error setting up datasource/asset: 'FileDataContext' object has no attribute 'datasources'


AttributeError: 'FileDataContext' object has no attribute 'datasources'

### Task 5: Automating Data Quality Score Calculation
**Description**: Automate the data quality score via a script that integrates with Great
Expectations.

In [7]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext
import os

# Step 1: Verify sample_data.csv exists, create if not
# An existing file is never overwritten.
sample_csv_present = os.path.exists('sample_data.csv')
if not sample_csv_present:
    data = {
        'Name': ['Alice', 'Bob', '', 'David', 'Eve', None],
        'Email': ['alice@example.com', 'bob@example', 'charlie@domain.com', 'david@example.com', 'eve@example.com', 'eve@example.com'],
        'Age': [25, None, 30, 28, 22, 35],
    }
    df = pd.DataFrame(data)
    df.to_csv('sample_data.csv', index=False)
    print("Created sample_data.csv")

# Step 2: Load the CSV file
# A failed read is reported and re-raised so the rest of the cell never runs
# against a missing DataFrame.
try:
    df = pd.read_csv('sample_data.csv')
except Exception as load_error:
    print(f"Error loading CSV: {load_error}")
    raise
else:
    print("Loaded sample_data.csv")

# Steps 3-6 reopen the Great Expectations project in the current directory,
# look up (or register) the datasource and CSV asset, load the suite created
# in Task 3 and run it through a checkpoint. Every step prints a diagnostic
# message and re-raises on failure.
#
# NOTE(review): the output recorded for this cell shows Step 4 failing with
# "'FileDataContext' object has no attribute 'datasources'", so Steps 5-6 were
# never exercised. The installed Great Expectations release presumably differs
# from the API targeted here — confirm the version.

# Step 3: Initialize FileDataContext
try:
    context = FileDataContext(project_root_dir=".")
    print("Initialized Great Expectations DataContext")
except Exception as e:
    print(f"Error initializing DataContext: {e}")
    raise

# Step 4: Verify Datasource and Data Asset
datasource_name = "pandas_datasource"
data_asset_name = "sample_data"

try:
    # Reuse the datasource when it is already registered, otherwise add it.
    if datasource_name not in context.datasources:
        datasource = context.sources.add_pandas(name=datasource_name)
    else:
        datasource = context.datasources[datasource_name]
    
    # NOTE(review): get_asset_names() presumably returns plain name strings;
    # if so, `asset.name` would raise AttributeError — verify.
    if data_asset_name not in [asset.name for asset in datasource.get_asset_names()]:
        data_asset = datasource.add_csv_asset(name=data_asset_name, filepath_or_buffer="sample_data.csv")
    else:
        data_asset = datasource.get_asset(data_asset_name)
    print(f"Verified datasource '{datasource_name}' and asset '{data_asset_name}'")
except Exception as e:
    print(f"Error setting up datasource/asset: {e}")
    raise

# Step 5: Load Expectation Suite
suite_name = "sample_data_suite"
try:
    # The suite is only loaded here, never created; it must come from Task 3.
    suite = context.get_expectation_suite(expectation_suite_name=suite_name)
    print(f"Loaded expectation suite '{suite_name}'")
except Exception as e:
    print(f"Error loading expectation suite: {e}")
    print("Please ensure Task 3 was completed to create the expectation suite.")
    raise

# Step 6: Run Validation with Great Expectations
try:
    batch_request = data_asset.build_batch_request()
    # NOTE(review): `validator` is not used again in this cell.
    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)
    # The checkpoint pairs the batch request with the suite by name.
    checkpoint = context.add_or_update_checkpoint(
        name="sample_checkpoint",
        validations=[
            {
                "batch_request": batch_request,
                "expectation_suite_name": suite_name,
            }
        ]
    )
    checkpoint_result = checkpoint.run()
    print("\nGreat Expectations validation completed")
except Exception as e:
    print(f"Error running validation: {e}")
    raise

# Step 7: Calculate Data Quality Metrics
# Works on the DataFrame loaded in Step 2. The three metrics are percentages
# and the overall score is their plain average. Any failure is reported and
# re-raised.
try:
    # Completeness: per-column share of non-null cells, averaged over columns
    completeness = df.notna().mean().mean() * 100

    email_values = df['Email']

    # Validity: share of email cells containing '@' (missing cells count as invalid)
    valid_emails = email_values.str.contains('@', na=False).mean() * 100

    # Uniqueness: distinct emails relative to the number of non-null emails
    unique_emails = email_values.nunique()
    total_emails = email_values.count()
    uniqueness_percentage = unique_emails / total_emails * 100

    # Overall Data Quality Score: simple average of the three metrics
    data_quality_score = (completeness + valid_emails + uniqueness_percentage) / 3

    print(
        "\nIndividual Metrics:",
        f"Completeness (avg % non-null): {completeness:.2f}%",
        f"Validity (% emails with '@'): {valid_emails:.2f}%",
        f"Uniqueness (% distinct emails): {uniqueness_percentage:.2f}%",
        f"\nOverall Data Quality Score: {data_quality_score:.2f}%",
        sep="\n",
    )
except Exception as metrics_error:
    print(f"Error calculating metrics: {metrics_error}")
    raise

# Step 8: Generate HTML Report
# Attaches the computed metrics to the context, rebuilds Data Docs and writes
# a standalone HTML file. Failures are reported and re-raised.
#
# NOTE(review): the recorded run of this cell stopped at the datasource step,
# so nothing here was exercised. Confirm that `context.variables.metadata`,
# `context.save_context()` and `context.get_data_docs_page()` exist in the
# installed Great Expectations release.
try:
    # Add data quality score to the context for inclusion in Data Docs
    # All four values are stored as strings with two decimals and a trailing '%'.
    context.variables.metadata = {
        "data_quality_score": f"{data_quality_score:.2f}%",
        "completeness": f"{completeness:.2f}%",
        "validity": f"{valid_emails:.2f}%",
        "uniqueness": f"{uniqueness_percentage:.2f}%"
    }
    context.save_context()

    # Build and save Data Docs (HTML report)
    context.build_data_docs()
    print("\nHTML report generated in the Great Expectations Data Docs directory")
    
    # Save a standalone HTML report
    report_path = "data_quality_report.html"
    with open(report_path, "w") as f:
        f.write(context.get_data_docs_page(checkpoint_result))
    print(f"HTML report saved to '{report_path}'")
except Exception as e:
    print(f"Error generating HTML report: {e}")
    raise

Loaded sample_data.csv
Initialized Great Expectations DataContext
Error setting up datasource/asset: 'FileDataContext' object has no attribute 'datasources'


AttributeError: 'FileDataContext' object has no attribute 'datasources'

### Task 6: Leveraging Data Quality Metrics for Automated Data Cleaning
**Description**: Implement a system where if data quality metrics fall below a threshold,
automated data cleaning scripts are triggered.

**Steps**:
1. Define Cleaning Logic
2. Integrate with Great Expectations:
    - Use an action within the Great Expectations action list that only triggers if quality score is below a threshold, automating the cleaning.

In [8]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.checkpoint.actions import ValidationAction
import os
import json

# Custom Great Expectations Action to Trigger Cleaning
def _parse_quality_score(raw_score):
    """Return *raw_score* as a float, accepting strings with a trailing '%'.

    Task 5 stores the score as e.g. "88.89%" while Task 6 stores "88.89";
    both spellings are accepted. Values that still cannot be converted raise
    the usual ``ValueError``/``TypeError`` from ``float``.
    """
    if isinstance(raw_score, str):
        raw_score = raw_score.strip().rstrip('%')
    return float(raw_score)


def _clean_sample_frame(df):
    """Return a cleaned copy of *df*; the input frame is not modified."""
    cleaned = df.copy()

    # Completeness: Fill missing Name with 'Unknown', Age with median
    cleaned['Name'] = cleaned['Name'].fillna('Unknown')
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].median())

    # Validity: Remove rows where Email does not contain '@'
    cleaned = cleaned[cleaned['Email'].str.contains('@', na=False)]

    # Uniqueness: Drop duplicate Emails, keeping first occurrence
    return cleaned.drop_duplicates(subset='Email', keep='first')


class CustomCleaningAction(ValidationAction):
    """Checkpoint action that cleans sample_data.csv when the quality score is low.

    NOTE(review): the recorded run of this cell failed before the checkpoint
    was built, so this hook was never invoked. Confirm that the ``run``
    signature matches what the installed ``ValidationAction`` base class calls.
    """

    def run(self, validation_result, context, expectation_suite_name, batch_request):
        """Clean the dataset if the stored data quality score is below 90.

        The score is read from ``context.variables.metadata["data_quality_score"]``.
        A missing metadata mapping or key counts as a score of 0, which
        triggers cleaning. The other three parameters are not used.

        Returns:
            ``{"cleaned": True, "cleaned_file": <path>}`` when the cleaned copy
            was written, otherwise ``{"cleaned": False}``.
        """
        # Extract data quality score from context metadata
        metadata = getattr(context.variables, "metadata", None) or {}
        data_quality_score = _parse_quality_score(metadata.get("data_quality_score", 0))
        threshold = 90.0  # Threshold for triggering cleaning

        if data_quality_score < threshold:
            print(f"Data quality score ({data_quality_score:.2f}%) is below threshold ({threshold}%). Triggering cleaning...")

            # Load the dataset and apply the cleaning rules
            df = _clean_sample_frame(pd.read_csv('sample_data.csv'))

            # Save cleaned dataset next to the original; the source file is kept
            cleaned_path = 'sample_data_cleaned.csv'
            df.to_csv(cleaned_path, index=False)
            print(f"Cleaned dataset saved to '{cleaned_path}'")

            return {"cleaned": True, "cleaned_file": cleaned_path}
        else:
            print(f"Data quality score ({data_quality_score:.2f}%) meets threshold ({threshold}%). No cleaning needed.")
            return {"cleaned": False}

# Step 1: Verify sample_data.csv exists, create if not
# The sample columns are listed separately and then combined; an existing
# file is left as it is.
if not os.path.exists('sample_data.csv'):
    sample_names = ['Alice', 'Bob', '', 'David', 'Eve', None]
    sample_emails = ['alice@example.com', 'bob@example', 'charlie@domain.com', 'david@example.com', 'eve@example.com', 'eve@example.com']
    sample_ages = [25, None, 30, 28, 22, 35]
    data = {'Name': sample_names, 'Email': sample_emails, 'Age': sample_ages}
    df = pd.DataFrame(data)
    df.to_csv('sample_data.csv', index=False)
    print("Created sample_data.csv")

# Step 2: Load the CSV file
# Read errors are reported and re-raised.
try:
    df = pd.read_csv('sample_data.csv')
except Exception as read_error:
    print(f"Error loading CSV: {read_error}")
    raise
else:
    print("Loaded sample_data.csv")

# Steps 3-8 reopen the Great Expectations project in the current directory,
# look up (or register) the datasource and CSV asset, load the suite created
# in Task 3, compute the quality metrics and store them on the context, run a
# checkpoint that includes CustomCleaningAction, and try to produce an HTML
# report. Every step prints a diagnostic message and re-raises on failure.
#
# NOTE(review): the output recorded for this cell shows Step 4 failing with
# "'FileDataContext' object has no attribute 'datasources'", so Steps 5-8 and
# the custom action were never exercised. The installed Great Expectations
# release presumably differs from the API targeted here — confirm the version.

# Step 3: Initialize FileDataContext
try:
    context = FileDataContext(project_root_dir=".")
    print("Initialized Great Expectations DataContext")
except Exception as e:
    print(f"Error initializing DataContext: {e}")
    raise

# Step 4: Verify Datasource and Data Asset
datasource_name = "pandas_datasource"
data_asset_name = "sample_data"

try:
    # Reuse the datasource when it is already registered, otherwise add it.
    if datasource_name not in context.datasources:
        datasource = context.sources.add_pandas(name=datasource_name)
    else:
        datasource = context.datasources[datasource_name]
    
    # NOTE(review): get_asset_names() presumably returns plain name strings;
    # if so, `asset.name` would raise AttributeError — verify.
    if data_asset_name not in [asset.name for asset in datasource.get_asset_names()]:
        data_asset = datasource.add_csv_asset(name=data_asset_name, filepath_or_buffer="sample_data.csv")
    else:
        data_asset = datasource.get_asset(data_asset_name)
    print(f"Verified datasource '{datasource_name}' and asset '{data_asset_name}'")
except Exception as e:
    print(f"Error setting up datasource/asset: {e}")
    raise

# Step 5: Load Expectation Suite
suite_name = "sample_data_suite"
try:
    # The suite is only loaded here, never created; it must come from Task 3.
    suite = context.get_expectation_suite(expectation_suite_name=suite_name)
    print(f"Loaded expectation suite '{suite_name}'")
except Exception as e:
    print(f"Error loading expectation suite: {e}")
    print("Please ensure Task 3 was completed to create the expectation suite.")
    raise

# Step 6: Calculate Data Quality Metrics
try:
    # Completeness: Average percentage of non-null values across all columns
    completeness = df.notnull().mean().mean() * 100

    # Validity: Percentage of email fields containing '@'
    valid_emails = df['Email'].str.contains('@', na=False).mean() * 100

    # Uniqueness: Percentage of distinct entries in the Email column
    unique_emails = df['Email'].nunique()
    total_emails = df['Email'].count()
    uniqueness_percentage = (unique_emails / total_emails) * 100

    # Overall Data Quality Score: Simple average of metrics
    data_quality_score = (completeness + valid_emails + uniqueness_percentage) / 3

    # Store metrics in context metadata
    # "data_quality_score" is stored without a '%' suffix (the other three keep
    # it); CustomCleaningAction converts this value to a number.
    # NOTE(review): confirm that `context.variables.metadata` and
    # `context.save_context()` exist in the installed release.
    context.variables.metadata = {
        "data_quality_score": f"{data_quality_score:.2f}",
        "completeness": f"{completeness:.2f}%",
        "validity": f"{valid_emails:.2f}%",
        "uniqueness": f"{uniqueness_percentage:.2f}%"
    }
    context.save_context()

    print("\nIndividual Metrics:")
    print(f"Completeness (avg % non-null): {completeness:.2f}%")
    print(f"Validity (% emails with '@'): {valid_emails:.2f}%")
    print(f"Uniqueness (% distinct emails): {uniqueness_percentage:.2f}%")
    print(f"\nOverall Data Quality Score: {data_quality_score:.2f}%")
except Exception as e:
    print(f"Error calculating metrics: {e}")
    raise

# Step 7: Run Validation with Custom Action
try:
    batch_request = data_asset.build_batch_request()
    # The action entry names CustomCleaningAction and passes this module's
    # __name__, presumably so Great Expectations can import the class from
    # here — confirm how the installed release resolves custom actions.
    checkpoint = context.add_or_update_checkpoint(
        name="sample_checkpoint",
        validations=[
            {
                "batch_request": batch_request,
                "expectation_suite_name": suite_name,
                "action_list": [
                    {
                        "name": "custom_cleaning_action",
                        "action": {
                            "class_name": "CustomCleaningAction",
                            "module_name": __name__
                        }
                    }
                ]
            }
        ]
    )
    checkpoint_result = checkpoint.run()
    print("\nValidation and custom action completed")
except Exception as e:
    print(f"Error running validation/action: {e}")
    raise

# Step 8: Generate HTML Report
# NOTE(review): get_data_docs_page() was never reached in the recorded run;
# confirm it exists on the installed context class.
try:
    context.build_data_docs()
    print("\nHTML report generated in the Great Expectations Data Docs directory")
    
    report_path = "data_quality_cleaning_report.html"
    with open(report_path, "w") as f:
        f.write(context.get_data_docs_page(checkpoint_result))
    print(f"HTML report saved to '{report_path}'")
except Exception as e:
    print(f"Error generating HTML report: {e}")
    raise

Loaded sample_data.csv
Initialized Great Expectations DataContext
Error setting up datasource/asset: 'FileDataContext' object has no attribute 'datasources'


AttributeError: 'FileDataContext' object has no attribute 'datasources'