In [17]:
import great_expectations as gx
from great_expectations.data_context import FileDataContext
import pandas as pd
import os

# Load the Great Expectations data context rooted at ../services.
# When ../services/gx does not exist yet, a FileDataContext is instantiated
# first (presumably this scaffolds the project folder -- confirm against the
# installed GX version).
gx_dir_missing = not os.path.exists('../services/gx')
if gx_dir_missing:
    FileDataContext(project_root_dir="../services")
context = gx.get_context(project_root_dir="../services")

# Name under which the expectation suite is registered and later looked up.
suite_name = "feature_validation"

# Register (or replace) the pandas datasource and attach the preprocessed
# feature CSV to it as an asset.
ds = context.sources.add_or_update_pandas(name="pandas_datasource")
da3 = ds.add_csv_asset(name="csv_file3", filepath_or_buffer="../data/preprocessed/X.csv")

# Batch request describing the data of that asset.
batch_request = da3.build_batch_request()

# Register (or replace) the suite under suite_name.
context.add_or_update_expectation_suite(suite_name)

# Validator bound to both the batch and the suite; expectations declared on it
# are collected for that suite.
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

# Step 3: Create expectations to validate all features
# Read the preprocessed features with pandas so the column names can be inspected.
data = pd.read_csv('../data/preprocessed/X.csv')

# One-hot encoded columns are recognised by these name fragments (substring
# match, not prefix match) and may only contain 0 or 1.
one_hot_markers = ('flat_type_', 'flat_model_', 'storey_range_')
one_hot_columns = [col for col in data.columns if any(marker in col for marker in one_hot_markers)]
for col in one_hot_columns:
    validator.expect_column_values_to_be_in_set(column=col, value_set=[0, 1])

# Range checks as (column, min_value, max_value), applied in this order.
# NOTE(review): floor_area_sqm and remaining_lease are bounded to [-10, 10],
# which allows negative values -- presumably these features are scaled during
# preprocessing; confirm before tightening the bounds.
range_checks = [
    ("floor_area_sqm", -10, 10),
    ("latitude", -90, 90),      # valid latitude range in degrees
    ("longitude", -180, 180),   # valid longitude range in degrees
    ("remaining_lease", -10, 10),
]
for column_name, lower, upper in range_checks:
    validator.expect_column_values_to_be_between(column=column_name, min_value=lower, max_value=upper)

# Persist the suite, keeping expectations even if they failed on this batch.
validator.save_expectation_suite(discard_failed_expectations=False)

# Checkpoint meant for validating the features from src/data.py.
# Its CSV asset uses "data/preprocessed/X.csv" (no "../" prefix), unlike the
# asset used inside this notebook.
# NOTE(review): presumably src/data.py runs from the repository root -- confirm.
da4 = ds.add_csv_asset(name="csv_file4", filepath_or_buffer="data/preprocessed/X.csv")
batch_request4 = da4.build_batch_request()

# Pair the batch with the feature suite and register the checkpoint.
checkpoint = context.add_or_update_checkpoint(
    name="preprocessed_data_validation_checkpoint_data",
    validations=[dict(batch_request=batch_request4, expectation_suite_name=suite_name)],
)

# Register the notebook-local checkpoint (it validates the "../data/..." batch
# against the feature suite) and run it by name. This rebinds `checkpoint`.
checkpoint = context.add_or_update_checkpoint(
    name="preprocessed_data_validation_checkpoint",
    validations=[dict(batch_request=batch_request, expectation_suite_name=suite_name)],
)

results = context.run_checkpoint(checkpoint_name="preprocessed_data_validation_checkpoint")

# Report the overall outcome, then one line per evaluated expectation.
# Failed expectations are followed by their raw result payload.
print("Validation success:", results.success)
for result in results["run_results"].values():
    validation_result = result["validation_result"]
    for res in validation_result["results"]:
        expectation = res["expectation_config"]["expectation_type"]
        success = res["success"]
        if success:
            print(f"Expectation {expectation}: SUCCESS")
            continue
        print(f"Expectation {expectation}: FAILURE")
        print(f"Details: {res['result']}")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/346 [00:00<?, ?it/s]

Validation success: True
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_column_values_to_be_i