In [65]:
# Install the package if it is not already installed
# pip install great-expectations -q

In [None]:
import great_expectations as gx
import great_expectations.expectations as gxe
import os

## Define Data Context

In [67]:
# Create the Data Context. This notebook uses the "ephemeral" mode;
# gx.get_context also accepts mode="file" and mode="cloud".
context = gx.get_context(mode="ephemeral")

# Show which context class was created, then stop right here if it is not the
# ephemeral one (the other modes give FileDataContext / CloudDataContext).
print(type(context).__name__)
assert type(context).__name__ == "EphemeralDataContext"

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpw7lxsgyi' for ephemeral docs site


EphemeralDataContext


## Define Data Source

In [68]:
# Name under which the pandas Data Source is registered on the Data Context.
data_source_name = "canada_data_source"

# Register the pandas Data Source and check that it came back under that name.
data_source = context.data_sources.add_pandas(name=data_source_name)
assert data_source.name == data_source_name

## Define Data Asset

In [None]:
# Look the Data Source up by name on the Data Context.
data_source_name = "canada_data_source"
data_source = context.data_sources.get(data_source_name)

# Register a dataframe Data Asset on that Data Source.
data_asset_name = "canada_raw_data_asset"
data_asset = data_source.add_dataframe_asset(name=data_asset_name)

# Sanity check: the asset was created under the requested name.
assert data_asset.name == data_asset_name

## Define Batch Definition

In [None]:
# Look the Data Asset up through its Data Source.
data_source_name = "canada_data_source"
data_asset_name = "canada_raw_data_asset"
data_asset = context.data_sources.get(data_source_name).get_asset(data_asset_name)

# Add a whole-dataframe Batch Definition to the Data Asset.
batch_definition_name = "canada_raw_batch_definition"
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    batch_definition_name
)

# Sanity check: the Batch Definition was created under the requested name.
assert batch_definition.name == batch_definition_name

In [None]:
# Retrieve the dataframe Batch Definition by walking
# Data Source -> Data Asset -> Batch Definition, each looked up by name.
data_source_name = "canada_data_source"
data_asset_name = "canada_raw_data_asset"
batch_definition_name = "canada_raw_batch_definition"
batch_definition = (
    context.data_sources.get(data_source_name)
    .get_asset(data_asset_name)
    .get_batch_definition(batch_definition_name)
)

## Define Expectations

In [72]:
# Columns the table must contain -- exactly these (exact_match=True below).
EXPECTED_COLUMNS = [
    "streetAddress", "addressLocality", "addressRegion", "postalCode",
    "latitude", "longitude", "description", "price", "priceCurrency", "property-beds",
]

# Expected type name for each type-checked column, in validation order.
EXPECTED_COLUMN_TYPES = {
    "latitude": "float64",
    "longitude": "float64",
    "streetAddress": "object",
    "price": "float64",
    "property-beds": "float64",
}

expectations = [
    # The column count is derived from EXPECTED_COLUMNS so it can never
    # contradict the exact-match column set below. The previously hard-coded
    # value 396 could not hold together with an exact 10-column set, so one of
    # the two expectations always failed.
    # NOTE(review): if 396 was meant to be the number of rows, add
    # gx.expectations.ExpectTableRowCountToEqual(value=396) -- confirm against the data.
    gx.expectations.ExpectTableColumnCountToEqual(value=len(EXPECTED_COLUMNS)),
    gx.expectations.ExpectTableColumnsToMatchSet(
        column_set=EXPECTED_COLUMNS,
        exact_match=True,
    ),
    # One type expectation per entry of EXPECTED_COLUMN_TYPES.
    *(
        gx.expectations.ExpectColumnValuesToBeOfType(column=column, type_=type_)
        for column, type_ in EXPECTED_COLUMN_TYPES.items()
    ),
]

## Define Batch Parameters

In [None]:
import pandas as pd

# Directory that holds the input CSV, taken from the DATA_PATH environment variable.
DATA_DIR = os.environ.get("DATA_PATH")
if not DATA_DIR:
    # Without this guard an unset variable would be formatted into the path as
    # the literal text "None" and only surface later as a confusing
    # file-not-found error from pandas.
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; "
        "point it at the directory that contains canada_house.csv."
    )
DATA_PATH = os.path.join(DATA_DIR, "canada_house.csv")

# index_col=False keeps pandas from using the first column as the index.
dataframe = pd.read_csv(DATA_PATH, index_col=False)

# Batch parameters handed to the Batch Definition: the dataframe to validate.
batch_parameters = {"dataframe": dataframe}

In [None]:
# Fetch the Batch from the Batch Definition, passing the batch parameters
# (which carry the loaded dataframe) so there is data to validate.
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

## Validate expectations

In [76]:
# Validate the batch against each expectation in turn; only failures are reported.
for exp in expectations:
    result = batch.validate(exp)
    if result["success"]:
        # Passing expectations produce no output.
        continue
    print("❌ Validation failed:")
    print(result)