### Task 1: Understanding and Defining Data Quality Metrics
**Description**: Learn how to define basic data quality metrics such as completeness, validity, and uniqueness for a simple dataset.

**Steps**:
1. Dataset: Use a CSV with columns like Name , Email , Age .
2. Metric Definitions:
    - Completeness: Percentage of non-null values.
    - Validity: % of email fields containing @ .
    - Uniqueness: Count distinct entries in the Email column.

In [12]:
import pandas as pd

# Sample dataset (replace this with pd.read_csv('your_file.csv') for real data)
data = {
    "Name": ["Alice", "Bob", "Charlie", None, "Eve"],
    "Email": ["alice@example.com", "bob@example.com", "invalid_email", "eve@example.com", None],
    "Age": [25, 30, None, 40, 22]
}

df = pd.DataFrame(data)

# 1. Completeness: Percentage of non-null values per column
def calculate_completeness(df):
    completeness = df.notnull().mean() * 100
    return completeness

# 2. Validity: % of emails containing '@'
def calculate_email_validity(df):
    valid_emails = df["Email"].dropna().apply(lambda x: "@" in x)
    validity_percentage = valid_emails.mean() * 100
    return validity_percentage

# 3. Uniqueness: Count of distinct emails
def calculate_email_uniqueness(df):
    unique_emails = df["Email"].nunique(dropna=True)
    return unique_emails

# Running all metrics
completeness = calculate_completeness(df)
validity = calculate_email_validity(df)
uniqueness = calculate_email_uniqueness(df)

# Display Results
print("📊 Data Quality Metrics")
print("------------------------")
print("Completeness (% per column):\n", completeness)
print(f"\nValidity (% of valid emails): {validity:.2f}%")
print(f"Uniqueness (distinct email count): {uniqueness}")

📊 Data Quality Metrics
------------------------
Completeness (% per column):
 Name     80.0
Email    80.0
Age      80.0
dtype: float64

Validity (% of valid emails): 75.00%
Uniqueness (distinct email count): 4


### Task 2: Calculating Data Quality Score
**Description**: Aggregate multiple metrics to calculate an overall data quality score.

**Steps**:
1. Formula: Simple average of all metrics defined in Task 1.

In [13]:
import pandas as pd

# Sample dataset (replace with pd.read_csv('your_data.csv') for real-world use)
data = {
    "Name": ["Alice", "Bob", "Charlie", None, "Eve"],
    "Email": ["alice@example.com", "bob@example.com", "invalid_email", "eve@example.com", None],
    "Age": [25, 30, None, 40, 22]
}
df = pd.DataFrame(data)

# --- Task 1 Metrics ---

# 1. Completeness: % of non-null values per column
def calculate_completeness(df):
    return df.notnull().mean().mean() * 100  # Average completeness across all columns

# 2. Validity: % of email fields containing '@'
def calculate_email_validity(df):
    valid_emails = df["Email"].dropna().apply(lambda x: "@" in x)
    return valid_emails.mean() * 100

# 3. Uniqueness: % of unique email values
def calculate_email_uniqueness(df):
    total = df["Email"].dropna().shape[0]
    unique = df["Email"].nunique(dropna=True)
    return (unique / total * 100) if total else 0

# --- Task 2: Aggregate all metrics into a Data Quality Score ---
def calculate_data_quality_score(df):
    completeness = calculate_completeness(df)
    validity = calculate_email_validity(df)
    uniqueness = calculate_email_uniqueness(df)
    
    # Simple average
    score = (completeness + validity + uniqueness) / 3
    return {
        "Completeness (%)": round(completeness, 2),
        "Validity (%)": round(validity, 2),
        "Uniqueness (%)": round(uniqueness, 2),
        "Data Quality Score (%)": round(score, 2)
    }

# Run
result = calculate_data_quality_score(df)

# Display results
print("📊 Data Quality Summary")
print("------------------------")
for metric, value in result.items():
    print(f"{metric}: {value}%")

📊 Data Quality Summary
------------------------
Completeness (%): 80.0%
Validity (%): 75.0%
Uniqueness (%): 100.0%
Data Quality Score (%): 85.0%


### Task 3: Creating Expectations for a CSV
**Description**: Develop basic data quality expectations using Great Expectations.

**Steps**:
1. Expectation Suite
2. Define Expectations for Completeness

In [14]:
import os
import pandas as pd
import great_expectations as gx
from sklearn.ensemble import IsolationForest

# Step 0: Create sample CSV data
def create_sample_csv():
    df = pd.DataFrame({
        "Name": ["Alice", "Bob", "Charlie", None, "Eve"],
        "Email": ["alice@example.com", "bob@example.com", "invalid_email", "carol@example.com", None],
        "Age": [25, 30, None, 40, 22]
    })
    df.to_csv("data.csv", index=False)
    print("✅ Created sample 'data.csv' file.")

# Step 1: Setup a minimal GE context
def create_context():
    context = gx.get_context()
    return context

# Step 2: Load the data
def load_data():
    df = pd.read_csv("data.csv")
    return df

# Step 3: Create expectation suite
def create_expectations(context, df):
    suite_name = "dq_suite"
    try:
        context.create_expectation_suite(suite_name)
    except:
        pass  # already exists

    datasource_name = "my_pandas_datasource"
    if datasource_name not in context.datasources:
        context.add_datasource(
            name=datasource_name,
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors={
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"]
                }
            }
        )

    batch_request = {
        "datasource_name": datasource_name,
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"default_identifier_name": "default_id"}
    }

    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

    # Define basic expectations
    validator.expect_column_values_to_not_be_null("Name")
    validator.expect_column_values_to_not_be_null("Email")
    validator.expect_column_values_to_not_be_null("Age")
    validator.expect_column_values_to_match_regex("Email", regex=".+@.+\\..+")
    validator.expect_column_values_to_be_unique("Email")

    validator.save_expectation_suite(discard_failed_expectations=False)

    return batch_request, suite_name

# Step 4: Validate and generate HTML report
def validate_and_generate_report(context, batch_request, suite_name):
    checkpoint = context.add_or_update_checkpoint(
        name="dq_checkpoint",
        validations=[{"batch_request": batch_request, "expectation_suite_name": suite_name}]
    )
    results = checkpoint.run()
    context.build_data_docs()
    print("✅ Validation complete. View the HTML report here:")
    print(context.get_docs_sites_urls()[0]["site_url"])

# Step 5: Calculate Data Quality Score
def calculate_dq_score(df):
    completeness = df.notnull().mean().mean() * 100
    validity = df["Email"].dropna().apply(lambda x: "@" in x).mean() * 100
    unique_email_count = df["Email"].nunique(dropna=True)
    total_email_rows = df["Email"].dropna().shape[0]
    uniqueness = (unique_email_count / total_email_rows * 100) if total_email_rows else 0
    dq_score = (completeness + validity + uniqueness) / 3

    print("\n📊 Data Quality Metrics")
    print(f"Completeness: {completeness:.2f}%")
    print(f"Validity (Email): {validity:.2f}%")
    print(f"Uniqueness (Email): {uniqueness:.2f}%")
    print(f"🔢 Final Data Quality Score: {dq_score:.2f}%")

    return dq_score

# Step 6: Trigger data cleaning if quality score is low
def clean_data(df):
    print("\n⚠️ Data Quality is low. Running cleaning script...")
    df_cleaned = df.copy()
    df_cleaned = df_cleaned.dropna(subset=["Name", "Email", "Age"])
    df_cleaned = df_cleaned[df_cleaned["Email"].str.contains("@", na=False)]
    print("✅ Cleaned data:")
    print(df_cleaned)
    df_cleaned.to_csv("cleaned_data.csv", index=False)
    print("📁 Saved cleaned data as 'cleaned_data.csv'")

# === MAIN EXECUTION ===
if __name__ == "__main__":
    create_sample_csv()
    context = create_context()
    df = load_data()
    batch_request, suite_name = create_expectations(context, df)
    validate_and_generate_report(context, batch_request, suite_name)
    dq_score = calculate_dq_score(df)

    # Threshold: 85%
    if dq_score < 85:
        clean_data(df)
    else:
        print("✅ Data quality is acceptable. No cleaning needed.")

✅ Created sample 'data.csv' file.


AttributeError: 'EphemeralDataContext' object has no attribute 'datasources'

### Task 4: Running and Validating Expectations
**Description**: Run the created expectations and generate an output report.

**Steps**:
1. Validate
2. Generate HTML Report

In [None]:
%pip install great_expectations pandas
import pandas as pd
import great_expectations as gx

# Step 1: Create sample data
def create_sample_data():
    data = {
        "Name": ["Alice", "Bob", "Charlie", None, "Eve"],
        "Email": ["alice@example.com", "bob@example.com", "invalid_email", "carol@example.com", None],
        "Age": [25, 30, None, 40, 22]
    }
    df = pd.DataFrame(data)
    df.to_csv("data.csv", index=False)
    print("✅ Sample CSV created.")

# Step 2: Load GE context
def create_ge_context():
    return gx.get_context()

# Step 3: Create Expectations
def define_expectations(context, df):
    suite_name = "basic_suite"
    try:
        context.create_expectation_suite(suite_name)
    except Exception:
        pass  # If already created

    datasource_name = "pandas_datasource"
    if datasource_name not in context.datasources:
        context.add_datasource(
            name=datasource_name,
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors={
                "runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_id"]
                }
            }
        )

    batch_request = {
        "datasource_name": datasource_name,
        "data_connector_name": "runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"default_id": "batch_001"}
    }

    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=suite_name
    )

    # Add some basic expectations
    validator.expect_column_values_to_not_be_null("Name")
    validator.expect_column_values_to_match_regex("Email", ".+@.+\\..+")
    validator.expect_column_values_to_not_be_null("Age")
    validator.expect_column_values_to_be_between("Age", min_value=0, max_value=120)

    validator.save_expectation_suite(discard_failed_expectations=False)
    return batch_request, suite_name

# Step 4: Run validation and generate report
def validate_and_report(context, batch_request, suite_name):
    checkpoint = context.add_or_update_checkpoint(
        name="my_checkpoint",
        validations=[
            {
                "batch_request": batch_request,
                "expectation_suite_name": suite_name
            }
        ]
    )
    result = checkpoint.run()
    context.build_data_docs()
    print("📄 Validation complete. Report generated.")
    print("🔗 Report location:", context.get_docs_sites_urls()[0]["site_url"])

# === MAIN ===
if __name__ == "__main__":
    create_sample_data()
    context = create_ge_context()
    df = pd.read_csv("data.csv")
    batch_request, suite_name = define_expectations(context, df)
    validate_and_report(context, batch_request, suite_name)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
✅ Sample CSV created.


AttributeError: 'EphemeralDataContext' object has no attribute 'datasources'

### Task 5: Automating Data Quality Score Calculation
**Description**: Automate the data quality score via a script that integrates with Great
Expectations.

In [None]:
import pandas as pd
import great_expectations as gx

# Step 1: Create sample data and load
def create_sample_csv():
    df = pd.DataFrame({
        "Name": ["Alice", "Bob", None, "David", "Eve"],
        "Email": ["alice@example.com", "bob@example.com", "invalid_email", None, "eve@example.com"],
        "Age": [25, 30, None, 22, 45]
    })
    df.to_csv("data_quality_sample.csv", index=False)
    return df

# Step 2: Setup Great Expectations context
def get_ge_context():
    return gx.get_context()

# Step 3: Define Expectations
def create_expectations(context, df):
    suite_name = "dq_score_suite"
    try:
        context.create_expectation_suite(suite_name)
    except Exception:
        pass  # already exists

    datasource_name = "dq_datasource"
    if datasource_name not in context.datasources:
        context.add_datasource(
            name=datasource_name,
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors={
                "runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_id"]
                }
            }
        )

    batch_request = {
        "datasource_name": datasource_name,
        "data_connector_name": "runtime_data_connector",
        "data_asset_name": "dq_data_asset",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"default_id": "dq_batch_01"}
    }

    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=suite_name
    )

    # Basic expectations
    validator.expect_column_values_to_not_be_null("Name")
    validator.expect_column_values_to_not_be_null("Email")
    validator.expect_column_values_to_match_regex("Email", ".+@.+\\..+")
    validator.expect_column_values_to_not_be_null("Age")
    validator.expect_column_values_to_be_between("Age", 0, 120)
    validator.expect_column_values_to_be_unique("Email")

    validator.save_expectation_suite()
    return batch_request, suite_name

# Step 4: Validate & Calculate Score
def calculate_data_quality_score(context, batch_request, suite_name):
    checkpoint = context.add_or_update_checkpoint(
        name="dq_score_checkpoint",
        validations=[{
            "batch_request": batch_request,
            "expectation_suite_name": suite_name
        }]
    )

    results = checkpoint.run()
    validation_result = results.list_validation_results()[0]

    total_expectations = len(validation_result["results"])
    passed = sum(1 for r in validation_result["results"] if r["success"])

    dq_score = (passed / total_expectations) * 100
    print(f"\n📊 Data Quality Score: {dq_score:.2f}% ({passed}/{total_expectations} expectations passed)")

    # Optionally: open the HTML report
    context.build_data_docs()
    print("📄 HTML Report:", context.get_docs_sites_urls()[0]["site_url"])

# === MAIN ===
if __name__ == "__main__":
    df = create_sample_csv()
    context = get_ge_context()
    batch_request, suite_name = create_expectations(context, df)
    calculate_data_quality_score(context, batch_request, suite_name)


AttributeError: 'EphemeralDataContext' object has no attribute 'datasources'

### Task 6: Leveraging Data Quality Metrics for Automated Data Cleaning
**Description**: Implement a system where if data quality metrics fall below a threshold,
automated data cleaning scripts are triggered.

**Steps**:
1. Define Cleaning Logic
2. Integrate with Great Expectations:
    - Use an action within the Great Expectations action list that only triggers if quality score is below a threshold, automating the cleaning.

In [None]:

import pandas as pd
import great_expectations as gx
import re

# Step 1: Create sample data with issues
def create_sample_csv():
    data = {
        "Name": ["Alice", None, "Charlie", "David", "Eve"],
        "Email": ["alice@example.com", "bob@example.com", "invalid_email", None, "eve@example.com"],
        "Age": [25, 30, None, 22, 45]
    }
    df = pd.DataFrame(data)
    df.to_csv("data_quality_sample.csv", index=False)
    return df

# Step 2: Setup GE context and expectations
def setup_ge_and_expectations(context, df):
    suite_name = "dq_cleaning_suite"
    try:
        context.create_expectation_suite(suite_name)
    except Exception:
        pass  # Suite exists

    ds_name = "dq_datasource"
    if ds_name not in context.datasources:
        context.add_datasource(
            name=ds_name,
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors={
                "runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_id"]
                }
            }
        )

    batch_request = {
        "datasource_name": ds_name,
        "data_connector_name": "runtime_data_connector",
        "data_asset_name": "dq_data_asset",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"default_id": "batch_001"}
    }

    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=suite_name
    )

    # Expectations
    validator.expect_column_values_to_not_be_null("Name")
    validator.expect_column_values_to_not_be_null("Email")
    validator.expect_column_values_to_match_regex("Email", r".+@.+\..+")
    validator.expect_column_values_to_not_be_null("Age")
    validator.expect_column_values_to_be_between("Age", 0, 120)
    validator.expect_column_values_to_be_unique("Email")

    validator.save_expectation_suite()

    return batch_request, suite_name

# Step 3: Calculate DQ score from validation results
def calculate_dq_score(validation_results):
    results = validation_results["results"]
    total = len(results)
    passed = sum(1 for r in results if r["success"])
    score = (passed / total) * 100 if total > 0 else 0
    return score

# Step 4: Automated cleaning logic
def clean_data(df):
    print("⚙️ Cleaning data...")

    # Drop rows where Name or Email or Age is null
    df_cleaned = df.dropna(subset=["Name", "Email", "Age"])

    # Fix emails: remove invalid emails by filtering regex
    df_cleaned = df_cleaned[df_cleaned["Email"].str.match(r".+@.+\..+")]

    # Remove duplicates based on Email
    df_cleaned = df_cleaned.drop_duplicates(subset=["Email"])

    print(f"✅ Cleaned data: {len(df)} -> {len(df_cleaned)} rows")
    return df_cleaned

# Step 5: Run validation and optionally clean & re-validate
def run_validation_and_clean(context, batch_request, suite_name, df, threshold=85):
    checkpoint = context.add_or_update_checkpoint(
        name="dq_checkpoint",
        validations=[{
            "batch_request": batch_request,
            "expectation_suite_name": suite_name
        }]
    )
    result = checkpoint.run()
    validation_result = result.list_validation_results()[0]
    score = calculate_dq_score(validation_result)

    print(f"\n📊 Data Quality Score: {score:.2f}%")

    if score < threshold:
        print(f"⚠️ Score below {threshold}%. Triggering cleaning...")
        df_cleaned = clean_data(df)

        # Recreate batch_request with cleaned data
        batch_request_cleaned = {
            "datasource_name": batch_request["datasource_name"],
            "data_connector_name": batch_request["data_connector_name"],
            "data_asset_name": batch_request["data_asset_name"],
            "runtime_parameters": {"batch_data": df_cleaned},
            "batch_identifiers": batch_request["batch_identifiers"],
        }

        # Re-run validation on cleaned data
        checkpoint_validated = context.add_or_update_checkpoint(
            name="dq_checkpoint_cleaned",
            validations=[{
                "batch_request": batch_request_cleaned,
                "expectation_suite_name": suite_name
            }]
        )
        cleaned_result = checkpoint_validated.run()
        cleaned_validation_result = cleaned_result.list_validation_results()[0]
        cleaned_score = calculate_dq_score(cleaned_validation_result)

        print(f"\n✅ Post-cleaning Data Quality Score: {cleaned_score:.2f}%")
        return df_cleaned, cleaned_score

    else:
        print("👍 Data quality is sufficient. No cleaning needed.")
        return df, score

# === MAIN ===
if __name__ == "__main__":
    df = create_sample_csv()
    context = gx.get_context()
    batch_request, suite_name = setup_ge_and_expectations(context, df)

    cleaned_df, final_score = run_validation_and_clean(context, batch_request, suite_name, df, threshold=85)

    # Optional: Save cleaned data
    cleaned_df.to_csv("cleaned_data.csv", index=False)
    print("\n✅ Cleaned data saved to cleaned_data.csv")

AttributeError: 'EphemeralDataContext' object has no attribute 'datasources'