In [1]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext
import os
import re

# Step 1: Create sample_data.csv and reference_data.csv if they don't exist
if not os.path.exists('sample_data.csv'):
    data = {
        'Name': ['Alice', 'Bob', '', 'David', 'Eve', None],
        'Email': ['alice@example.com', 'bob@example', 'charlie@domain.com', 'david@example.com', 'eve@example.com', 'eve@example.com'],
        'Age': [25, None, 30, 28, 22, 35]
    }
    df = pd.DataFrame(data)
    df.to_csv('sample_data.csv', index=False)
    print("Created sample_data.csv")

if not os.path.exists('reference_data.csv'):
    ref_data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Email': ['alice@example.com', 'bob@domain.com', 'charlie@domain.com', 'david@example.com', 'eve@example.com'],
        'Age': [25, 27, 30, 28, 22]
    }
    ref_df = pd.DataFrame(ref_data)
    ref_df.to_csv('reference_data.csv', index=False)
    print("Created reference_data.csv")

# Step 2: Load datasets
try:
    df = pd.read_csv('sample_data.csv')
    ref_df = pd.read_csv('reference_data.csv')
    print("Loaded sample_data.csv and reference_data.csv")
except Exception as e:
    print(f"Error loading datasets: {e}")
    raise

# Step 3: Task 1 - Completeness Score
try:
    # Calculate percentage of non-missing values for each column
    completeness_per_column = df.notnull().mean() * 100
    completeness_score = completeness_per_column.mean()
    print("\nTask 1: Completeness Score")
    print("Completeness per column (% non-missing):")
    print(completeness_per_column)
    print(f"Overall Completeness Score: {completeness_score:.2f}%")
except Exception as e:
    print(f"Error calculating completeness score: {e}")
    raise

# Step 4: Task 2 - Accuracy Score
try:
    # Merge datasets on Name for comparison
    merged_df = df.merge(ref_df, on='Name', how='inner', suffixes=('_main', '_ref'))
    
    # Compare Email and Age columns
    email_matches = (merged_df['Email_main'] == merged_df['Email_ref']).sum()
    age_matches = (merged_df['Age_main'] == merged_df['Age_ref']).sum()
    
    total_comparisons = len(merged_df) * 2  # Two columns compared
    matches = email_matches + age_matches
    accuracy_score = (matches / total_comparisons) * 100 if total_comparisons > 0 else 0
    
    print("\nTask 2: Accuracy Score")
    print(f"Email matches: {email_matches}/{len(merged_df)}")
    print(f"Age matches: {age_matches}/{len(merged_df)}")
    print(f"Accuracy Score: {accuracy_score:.2f}%")
except Exception as e:
    print(f"Error calculating accuracy score: {e}")
    raise

# Step 5: Task 3 - Consistency Score
try:
    # Check if Email follows consistent format (contains '@' and '.')
    def is_consistent_email(email):
        if pd.isna(email):
            return False
        return bool(re.match(r'^.+@.+\..+$', str(email)))
    
    consistent_emails = df['Email'].apply(is_consistent_email).sum()
    total_emails = df['Email'].count()
    consistency_score = (consistent_emails / total_emails) * 100 if total_emails > 0 else 0
    
    print("\nTask 3: Consistency Score")
    print(f"Consistent Emails: {consistent_emails}/{total_emails}")
    print(f"Consistency Score: {consistency_score:.2f}%")
except Exception as e:
    print(f"Error calculating consistency score: {e}")
    raise

# Step 6: Initialize Great Expectations DataContext
try:
    context = FileDataContext(project_root_dir=".")
    print("\nInitialized Great Expectations DataContext")
except Exception as e:
    print(f"Error initializing DataContext: {e}")
    raise

# Step 7: Set up Datasource and Data Asset
datasource_name = "pandas_datasource"
data_asset_name = "sample_data"

try:
    if datasource_name not in context.datasources:
        datasource = context.sources.add_pandas(name=datasource_name)
    else:
        datasource = context.datasources[datasource_name]
    
    if data_asset_name not in [asset.name for asset in datasource.get_asset_names()]:
        data_asset = datasource.add_csv_asset(name=data_asset_name, filepath_or_buffer="sample_data.csv")
    else:
        data_asset = datasource.get_asset(data_asset_name)
    print(f"Verified datasource '{datasource_name}' and asset '{data_asset_name}'")
except Exception as e:
    print(f"Error setting up datasource/asset: {e}")
    raise

# Step 8: Create or Load Expectation Suite
suite_name = "sample_data_suite"
try:
    if suite_name not in context.list_expectation_suite_names():
        suite = context.add_expectation_suite(expectation_suite_name=suite_name)
    else:
        suite = context.get_expectation_suite(expectation_suite_name=suite_name)
    print(f"Expectation suite '{suite_name}' ready")
except Exception as e:
    print(f"Error creating/loading expectation suite: {e}")
    raise

# Step 9: Define Expectations (update suite with completeness and consistency)
try:
    batch_request = data_asset.build_batch_request()
    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)
    
    # Completeness expectations
    validator.expect_column_values_to_not_be_null(column="Email", mostly=0.95, result_format="SUMMARY")
    validator.expect_column_values_to_not_be_null(column="Name", mostly=0.80, result_format="SUMMARY")
    validator.expect_column_values_to_not_be_null(column="Age", mostly=0.80, result_format="SUMMARY")
    
    # Consistency expectation for Email format
    validator.expect_column_values_to_match_regex(
        column="Email",
        regex=r'^.+@.+\..+$',
        mostly=0.95,
        result_format="SUMMARY"
    )
    
    validator.save_expectation_suite(discard_failed_expectations=False)
    print("Updated expectation suite with completeness and consistency expectations")
except Exception as e:
    print(f"Error defining expectations: {e}")
    raise

# Step 10: Run Validation
try:
    checkpoint = context.add_or_update_checkpoint(
        name="sample_checkpoint",
        validations=[
            {
                "batch_request": batch_request,
                "expectation_suite_name": suite_name,
            }
        ]
    )
    checkpoint_result = checkpoint.run()
    print("\nValidation completed")
except Exception as e:
    print(f"Error running validation: {e}")
    raise

# Step 11: Store Metrics in Context
try:
    context.variables.metadata = {
        "completeness_score": f"{completeness_score:.2f}%",
        "accuracy_score": f"{accuracy_score:.2f}%",
        "consistency_score": f"{consistency_score:.2f}%"
    }
    context.save_context()
    print("Stored metrics in DataContext")
except Exception as e:
    print(f"Error storing metrics: {e}")
    raise

# Step 12: Generate HTML Report
try:
    context.build_data_docs()
    print("\nHTML report generated in the Great Expectations Data Docs directory")
    
    report_path = "data_quality_scores_report.html"
    with open(report_path, "w") as f:
        f.write(context.get_data_docs_page(checkpoint_result))
    print(f"HTML report saved to '{report_path}'")
except Exception as e:
    print(f"Error generating HTML report: {e}")
    raise

Created reference_data.csv
Loaded sample_data.csv and reference_data.csv

Task 1: Completeness Score
Completeness per column (% non-missing):
Name      66.666667
Email    100.000000
Age       83.333333
dtype: float64
Overall Completeness Score: 83.33%

Task 2: Accuracy Score
Email matches: 3/4
Age matches: 3/4
Accuracy Score: 75.00%

Task 3: Consistency Score
Consistent Emails: 5/6
Consistency Score: 83.33%

Initialized Great Expectations DataContext
Error setting up datasource/asset: 'FileDataContext' object has no attribute 'datasources'


AttributeError: 'FileDataContext' object has no attribute 'datasources'