In [None]:
# Install the notebook's dependencies into the kernel's environment.
# Great Expectations is pinned to 0.13.7; pandas and kaggle are left unpinned.
%pip install great-expectations==0.13.7 pandas kaggle

To use the Kaggle API, sign up for a Kaggle account at https://www.kaggle.com. Then open the 'Account' tab of your user profile (`https://www.kaggle.com/<username>/account`) and select 'Create API Token'. This triggers the download of `kaggle.json`, a file containing your API credentials. Place this file at `~/.kaggle/kaggle.json` (on Windows, at `C:\Users\<Windows-username>\.kaggle\kaggle.json`; you can check the exact location, minus the drive letter, with `echo %HOMEPATH%`).

In [None]:
import pandas as pd
import great_expectations as ge

In [None]:
# Initialize a Great Expectations project with the `great_expectations init` CLI command (run in the next cell).

In [None]:
# Run the Great Expectations CLI initializer from the shell. "y" is piped to its
# stdin so the command receives an answer without waiting for keyboard input
# (presumably confirming the CLI's prompt -- verify against the CLI output).
!echo "y" | great_expectations init

In [None]:
# Switch the kernel's working directory to the great_expectations folder and
# create a data/ subfolder there for the downloaded dataset.
# NOTE(review): both commands are relative to the current working directory, so
# re-running this cell in the same kernel will not repeat cleanly.
%cd great_expectations
%mkdir data

In [None]:
# Download only test.csv from the expedia-hotel-recommendations competition into
# data/ using the Kaggle CLI (requires the kaggle.json API credentials).
# The notebook later opens the result as data/test.csv.zip.
!kaggle competitions download -c expedia-hotel-recommendations -f test.csv -p data

In [None]:
import zipfile

# Unpack the downloaded archive into data/ so the CSV can be read directly.
# ZipFile opens in read mode by default; the context manager closes the file.
with zipfile.ZipFile("data/test.csv.zip") as archive:
    archive.extractall(path="data")

In [None]:
# Load the extracted test split into a DataFrame; `df` is the dataset that the
# validation batch is built from later in the notebook.
df = pd.read_csv("data/test.csv")
# Preview the first rows as this cell's displayed output.
df.head()

In [None]:
import os

from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    DatasourceConfig,
    FilesystemStoreBackendDefaults,
)

# Root the filesystem stores in the directory the notebook is working in
# (the project folder entered earlier with `%cd great_expectations`) instead of
# a hard-coded "/great_expectations" at the filesystem root, which lies outside
# the project and typically is not writable. os.getcwd() returns an absolute path.
project_root = os.getcwd()

# In-code Data Context configuration: a single pandas datasource plus the
# default filesystem-backed stores under `project_root`.
data_context_config = DataContextConfig(
    datasources={
        "pandas_datasource": DatasourceConfig(
            class_name="PandasDatasource",
        )
    },
    store_backend_defaults=FilesystemStoreBackendDefaults(root_directory=project_root),
)

# Build the context directly from the config object above.
context = BaseDataContext(project_config=data_context_config)

In [None]:
from great_expectations.core.expectation_configuration import ExpectationConfiguration

# Expectation configurations for the hotels dataset. They are added to the
# expectation suite, in this order, by a later cell.
expectations = [
    # Schema: the table must contain exactly these columns in this order.
    ExpectationConfiguration(
        expectation_type="expect_table_columns_to_match_ordered_list",
        kwargs={
            "column_list": [
                "id", "date_time", "site_name", "posa_continent",
                "user_location_country", "user_location_region", "user_location_city",
                "orig_destination_distance", "user_id", "is_mobile", "is_package",
                "channel", "srch_ci", "srch_co", "srch_adults_cnt", "srch_children_cnt",
                "srch_rm_cnt", "srch_destination_id", "srch_destination_type_id",
                "hotel_continent", "hotel_country", "hotel_market"]
        },
        meta={}
    ),
    # Uniqueness: `id` alone, and the (date_time, user_id) pair.
    ExpectationConfiguration(
        expectation_type="expect_compound_columns_to_be_unique",
        kwargs={
            "column_list": [
                "id"
            ]
        },
        meta={}
    ),
    ExpectationConfiguration(
        expectation_type="expect_compound_columns_to_be_unique",
        kwargs={
            "column_list": [
                "date_time", "user_id"
            ]
        },
        meta={}
    ),
    # Completeness: identifiers must always be present ...
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "id",
        },
        meta={}
    ),
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "user_id",
        },
        meta={}
    ),
    # ... while the distance only has to be populated in at least 90% of rows.
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "orig_destination_distance",
            "mostly": 0.9
        },
        meta={}
    ),
    # Value range for the country code column.
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "user_location_country",
            "min_value": 0,
            "max_value": 246
        },
        meta={}
    ),
    # Date range 2015 through 2018 inclusive. The CSV is read without date
    # parsing, so the column is presumably timestamp strings (confirm against
    # the loaded frame); integer bounds cannot be compared with those. The
    # bounds are therefore datetime strings, and both bounds and column values
    # are parsed as datetimes before comparison.
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "date_time",
            "min_value": "2015-01-01 00:00:00",
            "max_value": "2018-12-31 23:59:59",
            "parse_strings_as_datetimes": True
        },
        meta={}
    ),
]

In [None]:
# Name the suite once so creating, saving and later loading it all agree.
suite_name = "hotels_suite"

# Create the suite, replacing any existing suite with the same name.
suite = context.create_expectation_suite(suite_name, overwrite_existing=True)

# Register every expectation configuration defined earlier in the notebook.
for expectation_config in expectations:
    suite.add_expectation(expectation_configuration=expectation_config)

# Persist the populated suite through the Data Context.
context.save_expectation_suite(suite, suite_name)

# Batch definition used in the next cell: the in-memory DataFrame, served by
# the pandas datasource configured on the context.
batch_kwargs = {"dataset": df, "datasource": "pandas_datasource"}

In [None]:
# Pair the DataFrame batch with the saved suite so they are validated together.
batch = context.get_batch(batch_kwargs, suite_name)

# Validate the batch through the "action_list_operator" validation operator;
# the returned results are inspected in the following cell.
results = context.run_validation_operator(
    "action_list_operator", assets_to_validate=[batch], run_id="my_run_id"
)

In [None]:
# Per-validation results of the operator run, keyed by validation result id.
run_results = results["run_results"]

# Take the first key (a single batch was submitted for validation).
validation_result_id = list(run_results.keys())[0]

# Drill down to the "local_site" entry reported by the "update_data_docs" action.
update_data_docs_result = run_results[validation_result_id]["actions_results"]["update_data_docs"]
local_site = update_data_docs_result["local_site"]
local_site