In [1]:
# Parameters
# NOTE(review): this looks like a papermill-style injected parameters cell — confirm.
# `execution_date` is an ISO-like timestamp string with a UTC offset; it is not
# referenced by any other cell visible in this notebook.
execution_date = "2025-05-24 09:37:42.251289+00:00"


In [2]:
# Install package if not installed
# pip install great-expectations -q

In [3]:
import os

import great_expectations as gx
import great_expectations.expectations as gxe
import pandas as pd

## Define Data Context

In [4]:
# Create the Data Context that the later cells attach their configuration to.
# gx.get_context accepts one of the modes: ephemeral, file, cloud
# (mode="file" is the alternative that was tried here previously).
context = gx.get_context(mode="ephemeral")

# Sanity check on the returned flavour; the class is expected to be one of
# FileDataContext, EphemeralDataContext, CloudDataContext.
context_class_name = type(context).__name__
print(context_class_name)
assert context_class_name == "EphemeralDataContext"

EphemeralDataContext


## Define Data Source

In [5]:
# Register a pandas Data Source on the Data Context.
# `gx` is already imported in the notebook's import cell and is not used in
# this cell, so it is not re-imported here.

# Name under which the Data Source is registered and later looked up.
data_source_name = "canada_data_source"

# Add the Data Source to the Data Context and verify it was stored under
# the requested name.
data_source = context.data_sources.add_pandas(name=data_source_name)
assert data_source.name == data_source_name

## Define Data Asset

In [6]:
# Attach a dataframe Data Asset to the pandas Data Source.
# `gx` is already imported in the notebook's import cell and is not used in
# this cell, so it is not re-imported here.

# Look the Data Source up by name from the Data Context.
data_source_name = "canada_data_source"
data_source = context.data_sources.get(data_source_name)

# Name under which the Data Asset is registered and later looked up.
data_asset_name = "canada_data_asset"

# Add the dataframe asset and verify it was stored under the requested name.
data_asset = data_source.add_dataframe_asset(name=data_asset_name)
assert data_asset.name == data_asset_name

## Define Batch Definition

In [7]:
# Create a whole-dataframe Batch Definition on the Data Asset.
# `gx` is already imported in the notebook's import cell and is not used in
# this cell, so it is not re-imported here.

# Look the Data Asset up by Data Source name and asset name.
data_source_name = "canada_data_source"
data_asset_name = "canada_data_asset"
data_asset = context.data_sources.get(data_source_name).get_asset(data_asset_name)

# Name under which the Batch Definition is registered and later looked up.
batch_definition_name = "canada_batch_definition"

# Add the Batch Definition and verify it was stored under the requested name.
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    batch_definition_name
)
assert batch_definition.name == batch_definition_name

In [8]:
# Retrieve the dataframe Batch Definition by name, walking
# Data Context -> Data Source -> Data Asset -> Batch Definition.
# `gx` is already imported in the notebook's import cell and is not used in
# this cell, so it is not re-imported here.
data_source_name = "canada_data_source"
data_asset_name = "canada_data_asset"
batch_definition_name = "canada_batch_definition"
batch_definition = (
    context.data_sources.get(data_source_name)
    .get_asset(data_asset_name)
    .get_batch_definition(batch_definition_name)
)

## Define Expectations

In [9]:
# Exact set of column names the table is expected to expose.
expected_columns = [
    "streetAddress",
    "addressLocality",
    "addressRegion",
    "postalCode",
    "latitude",
    "longitude",
    "description",
    "price",
    "priceCurrency",
    "property-beds",
]

# Column name -> type name handed to ExpectColumnValuesToBeOfType as `type_`.
# Insertion order is the order in which the expectations are validated.
expected_column_types = {
    "latitude": "float64",
    "longitude": "float64",
    "streetAddress": "object",
    "price": "float64",
    "property-beds": "float64",
}

# NOTE(review): a column count of 396 and an exact match against the 10-name
# set above cannot both hold; the recorded run reports both as failing
# (397 columns observed). Confirm which of the two is the intended contract.
expectations = [
    gxe.ExpectTableColumnCountToEqual(value=396),
    gxe.ExpectTableColumnsToMatchSet(
        column_set=expected_columns,
        exact_match=True,
    ),
    *(
        gxe.ExpectColumnValuesToBeOfType(column=column_name, type_=type_name)
        for column_name, type_name in expected_column_types.items()
    ),
]

## Define Batch Parameters

In [10]:
# Load the CSV to validate and wrap it as the runtime batch parameters.

# Directory holding the input data, supplied via the DATA_PATH environment
# variable. Fail fast when it is missing: os.environ.get() returns None for an
# unset variable, which would otherwise be formatted into the bogus path
# "None/small_canada.csv" and surface later as a confusing file-not-found error.
DATA_DIR = os.environ.get('DATA_PATH')
if not DATA_DIR:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; "
        "point it at the directory containing small_canada.csv"
    )

# os.path.join avoids a doubled separator when DATA_PATH ends with a slash.
DATA_PATH = os.path.join(DATA_DIR, 'small_canada.csv')

# NOTE(review): the recorded output echoes this read_csv line as a warning —
# presumably a pandas DtypeWarning about mixed-type columns; confirm and
# consider passing an explicit `dtype=` mapping.
small_dataframe = pd.read_csv(DATA_PATH, index_col=False)

# Batch parameters for a whole-dataframe Batch Definition: the dataframe is
# passed under the "dataframe" key.
small_batch_parameters = {"dataframe": small_dataframe}

  small_dataframe = pd.read_csv(DATA_PATH, index_col=False)


In [11]:
# Bind the loaded dataframe to the Batch Definition to obtain the Batch that
# the expectations are validated against.
small_batch = batch_definition.get_batch(
    batch_parameters=small_batch_parameters,
)

## Validate expectations

In [12]:
# Validate every expectation against the batch, one at a time, and report
# only the ones that do not succeed. Passing expectations print nothing.
# NOTE(review): failures are only printed — the cell itself never raises, so a
# scheduled run will still complete; confirm that this is intended.
for exp in expectations:
    result = small_batch.validate(exp)
    if result["success"]:
        continue
    print("❌ Validation failed:")
    print(result)

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

Calculating Metrics:  33%|███▎      | 1/3 [00:00<00:00, 1700.85it/s]

Calculating Metrics:  33%|███▎      | 1/3 [00:00<00:00, 1259.17it/s]

Calculating Metrics:  67%|██████▋   | 2/3 [00:00<00:00, 2028.19it/s]

Calculating Metrics:  67%|██████▋   | 2/3 [00:00<00:00, 1795.51it/s]

Calculating Metrics: 100%|██████████| 3/3 [00:00<00:00, 2422.12it/s]

Calculating Metrics: 100%|██████████| 3/3 [00:00<00:00, 2231.01it/s]

Calculating Metrics: 100%|██████████| 3/3 [00:00<00:00, 2070.92it/s]

Calculating Metrics: 100%|██████████| 3/3 [00:00<00:00, 1902.47it/s]




❌ Validation failed:
{
  "success": false,
  "expectation_config": {
    "type": "expect_table_column_count_to_equal",
    "kwargs": {
      "batch_id": "canada_data_source-canada_data_asset",
      "value": 396
    },
    "meta": {}
  },
  "result": {
    "observed_value": 397
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:  50%|█████     | 1/2 [00:00<00:00, 3004.52it/s]

Calculating Metrics:  50%|█████     | 1/2 [00:00<00:00, 1754.94it/s]

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 2743.17it/s]

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 2367.66it/s]

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 2101.35it/s]

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 1835.18it/s]




❌ Validation failed:
{
  "success": false,
  "expectation_config": {
    "type": "expect_table_columns_to_match_set",
    "kwargs": {
      "batch_id": "canada_data_source-canada_data_asset",
      "column_set": [
        "streetAddress",
        "addressLocality",
        "addressRegion",
        "postalCode",
        "latitude",
        "longitude",
        "description",
        "price",
        "priceCurrency",
        "property-beds"
      ]
    },
    "meta": {}
  },
  "result": {
    "observed_value": [
      "2nd Br Description",
      "3rd Br Description",
      "4th Br Description",
      "5th Br Description",
      "Access",
      "Access Road",
      "Acreage",
      "Acreage Range",
      "Age Description",
      "Air Conditioning",
      "Amenities",
      "Appliances",
      "Approx Age",
      "Architectural Style",
      "Architecture",
      "Area",
      "Assessed Fees",
      "Assoc",
      "Assoc/Maint Fee Per",
      "Association Amenities",
      "Association Fee

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 2375.03it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1512.55it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1249.05it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1035.37it/s]




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 2007.80it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1336.62it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1013.36it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 804.43it/s] 




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 3622.02it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 2448.51it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1871.62it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1509.29it/s]




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 3495.25it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 2349.75it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1824.40it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1236.53it/s]




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 2307.10it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1486.29it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1251.66it/s]

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1079.06it/s]




In [13]:
# for exp in expectations:
#     result = yt_batch.validate(exp)
#     if not result["success"]:
#         print("❌ Validation failed:")
#         print(result)

In [14]:

# for exp in expectations:
#     result = modified_yt_batch.validate(exp)
#     if not result["success"]:
#         print("❌ Validation failed:")
#         print(result)