In [1]:
!pip install -U great_expectations pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting great_expectations
  Downloading great_expectations-0.15.27-py3-none-any.whl (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 35.1 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 76.7 MB/s 
Collecting cryptography>=3.2
  Downloading cryptography-38.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 54.2 MB/s 
Collecting Ipython>=7.16.3
  Downloading ipython-7.34.0-py3-none-any.whl (793 kB)
[K     |████████████████████████████████| 793 kB 71.8 MB/s 
Collecting notebook>=6.4.10
  Downloading notebook-6.5.1-py3-none-any.whl (439 kB)
[K     |████████████████████████████████| 439 kB 73.8 MB/s 
Collecting makefun<2,>=1.7.0
  Downloading makefun-1.15.0-py2.py3-none-any.whl (22 kB)
Collecting jsonpatch>=1.22
  Downloading jsonp

# New Section

In [1]:
# import the required packages
import great_expectations as ge
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)
from great_expectations.checkpoint import SimpleCheckpoint
from ruamel import yaml
import pandas as pd

In [3]:
!echo y | great_expectations init

Using v3 (Batch Request) API[0m
[36m
  ___              _     ___                  _        _   _
 / __|_ _ ___ __ _| |_  | __|_ ___ __  ___ __| |_ __ _| |_(_)___ _ _  ___
| (_ | '_/ -_) _` |  _| | _|\ \ / '_ \/ -_) _|  _/ _` |  _| / _ \ ' \(_-<
 \___|_| \___\__,_|\__| |___/_\_\ .__/\___\__|\__\__,_|\__|_\___/_||_/__/
                                |_|
             ~ Always know what to expect from your data ~
[0m[0m
Let's create a new Data Context to hold your project configuration.

Great Expectations will create a new directory with the following structure:

    great_expectations
    |-- great_expectations.yml
    |-- expectations
    |-- checkpoints
    |-- plugins
    |-- .gitignore
    |-- uncommitted
        |-- config_variables.yml
        |-- data_docs
        |-- validations

OK to proceed? [Y/n]: 
[0m
[36mCongratulations! You are now ready to customize your Great Expectations configuration.[0m[0m

[36mYou can customize your configuration in many ways. Here are som

In [6]:
context = ge.get_context()

In [5]:
datasource_name = "house_prices"
execution_engine = "PandasExecutionEngine" 
data_directory = "data"
data_asset_name = f"{datasource_name}_survey_2006"
runtime_data_connector_name = "runtime_batch_files_connector"
batch_identifier_name = "pipeline_step"

In [8]:
datasource_config = {
    "name": datasource_name,
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": execution_engine,
    },
    "data_connectors": {
        runtime_data_connector_name: {
            "class_name": "RuntimeDataConnector",
            "module_name": "great_expectations.datasource.data_connector",
            "assets": {
              data_asset_name: {
                "class_name": "Asset",
                "batch_identifiers": [batch_identifier_name],
                "module_name": "great_expectations.datasource.data_connector.asset"}}
        },
    },
}

In [9]:
context.test_yaml_config(yaml.dump(datasource_config))

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	runtime_batch_files_connector:RuntimeDataConnector

	runtime_batch_files_connector : RuntimeDataConnector

	Available data_asset_names (1 of 1):
		house_prices_survey_2006 (0 of 0): []

	Unmatched data_references (0 of 0):[]



<great_expectations.datasource.new_datasource.Datasource at 0x7fdd766e13d0>

In [10]:
context.add_datasource(**datasource_config)

<great_expectations.datasource.new_datasource.Datasource at 0x7fdd75df2bd0>

In [11]:
context.list_datasources()

[{'execution_engine': {'module_name': 'great_expectations.execution_engine',
   'class_name': 'PandasExecutionEngine'},
  'class_name': 'Datasource',
  'data_connectors': {'runtime_batch_files_connector': {'class_name': 'RuntimeDataConnector',
    'assets': {'house_prices_survey_2006': {'batch_identifiers': ['pipeline_step'],
      'class_name': 'Asset',
      'module_name': 'great_expectations.datasource.data_connector.asset'}},
    'module_name': 'great_expectations.datasource.data_connector'}},
  'name': 'house_prices',
  'module_name': 'great_expectations.datasource'}]

In [12]:
house_data = ge.read_csv("https://github.com/NatanMish/data_validation/blob/a77b247b25c6622ce0c8f8cbc505228161c31a3c/data/train.csv?raw=true")

In [13]:
house_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [14]:
dir(house_data)

['Alley',
 'BedroomAbvGr',
 'BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtQual',
 'BsmtUnfSF',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'EnclosedPorch',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Fireplaces',
 'Foundation',
 'FullBath',
 'Functional',
 'GarageArea',
 'GarageCars',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'GarageYrBlt',
 'GrLivArea',
 'HalfBath',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'Id',
 'KitchenAbvGr',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotArea',
 'LotConfig',
 'LotFrontage',
 'LotShape',
 'LowQualFinSF',
 'MSSubClass',
 'MSZoning',
 'MasVnrArea',
 'MasVnrType',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'Neighborhood',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'PavedDrive',
 'PoolArea',
 'PoolQC',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SalePrice',
 'Sal

In [15]:
house_data.expect_column_to_exist("Id")

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "Id",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  }
}

In [16]:
house_data.expect_column_values_to_be_unique("Id")

{
  "success": true,
  "result": {
    "element_count": 1460,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "Id",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_unique"
  }
}

In [17]:
house_data.expect_column_max_to_be_between("SalePrice", 0, 100000)

{
  "success": false,
  "result": {
    "observed_value": 755000,
    "element_count": 1460,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "SalePrice",
      "min_value": 0,
      "max_value": 100000,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_max_to_be_between"
  }
}

In [18]:
house_data.expect_column_distinct_values_to_be_in_set("MSZoning", ["C (all)", "FV", "RH", "RL", "RM"])

{
  "success": true,
  "result": {
    "observed_value": [
      "C (all)",
      "FV",
      "RH",
      "RL",
      "RM"
    ],
    "element_count": 1460,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "MSZoning",
      "value_set": [
        "C (all)",
        "FV",
        "RH",
        "RL",
        "RM"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_distinct_values_to_be_in_set"
  }
}

In [19]:
house_data.expect_column_mean_to_be_between("GrLivArea", 0, 10000)

{
  "success": true,
  "result": {
    "observed_value": 1515.463698630137,
    "element_count": 1460,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "GrLivArea",
      "min_value": 0,
      "max_value": 10000,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_mean_to_be_between"
  }
}

In [20]:
house_data.get_expectation_suite()

{
  "data_asset_type": "Dataset",
  "expectations": [
    {
      "kwargs": {
        "column": "Id"
      },
      "meta": {},
      "expectation_type": "expect_column_to_exist"
    },
    {
      "kwargs": {
        "column": "Id"
      },
      "meta": {},
      "expectation_type": "expect_column_values_to_be_unique"
    },
    {
      "kwargs": {
        "column": "MSZoning",
        "value_set": [
          "C (all)",
          "FV",
          "RH",
          "RL",
          "RM"
        ]
      },
      "meta": {},
      "expectation_type": "expect_column_distinct_values_to_be_in_set"
    },
    {
      "kwargs": {
        "column": "GrLivArea",
        "min_value": 0,
        "max_value": 10000
      },
      "meta": {},
      "expectation_type": "expect_column_mean_to_be_between"
    }
  ],
  "meta": {
    "great_expectations_version": "0.15.27"
  },
  "ge_cloud_id": null,
  "expectation_suite_name": "default"
}

In [21]:
expectation_suite_name = "my_expectations"
context.save_expectation_suite(house_data.get_expectation_suite(), expectation_suite_name)

In [22]:
house_data.expect_column_values_to_be_of_type("Street", 'str')
house_data.expect_column_values_to_not_be_null("LandContour")
house_data.expect_column_min_to_be_between("YearBuilt", 1700, 1900)
house_data.expect_column_median_to_be_between("LotArea", 5000, 15000)
house_data.expect_column_most_common_value_to_be_in_set("SaleType", ["WD", "New"])

{
  "success": true,
  "result": {
    "observed_value": [
      "WD"
    ],
    "element_count": 1460,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "SaleType",
      "value_set": [
        "WD",
        "New"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_most_common_value_to_be_in_set"
  }
}

In [23]:
checkpoint_name = "data_batch_appended"

In [24]:
checkpoint_config = {
    "name": checkpoint_name,
    "config_version": 1,
    "class_name": "SimpleCheckpoint",
    "validations": [
        {
            "batch_request": {
                "datasource_name": datasource_name,
                "data_connector_name": runtime_data_connector_name,
                "data_asset_name": data_asset_name,
            },
            "expectation_suite_name": expectation_suite_name,
        }
    ],
}
context.add_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "data_batch_appended",
  "profilers": [],
  "runtime_configuration": {},
  "validations": [
    {
      "batch_request": {
        "datasource_name": "house_prices",
        "data_connector_name": "runtime_batch_files_connector",
        "data_asset_name": "house_prices_survey_2006"
      },
      "expectation_suite_name": "my_expectations"
    }
  ]
}

In [25]:
house_data_test = ge.read_csv("https://github.com/NatanMish/data_validation/blob/a77b247b25c6622ce0c8f8cbc505228161c31a3c/data/test.csv?raw=true")

In [26]:
results = context.run_checkpoint(
    checkpoint_name=checkpoint_name,
    batch_request={
        "runtime_parameters": {"batch_data": house_data_test},
        "batch_identifiers": {
            batch_identifier_name: "step_1"
        },
    },
)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [27]:
run_identifier = next(iter(results['run_results']))
results['run_results'][run_identifier]['validation_result']['statistics']

{'evaluated_expectations': 4,
 'successful_expectations': 4,
 'unsuccessful_expectations': 0,
 'success_percent': 100.0}

In [28]:
results['run_results'][run_identifier]['validation_result']['results'][2]

{
  "success": true,
  "result": {
    "observed_value": [
      "C (all)",
      "FV",
      "RH",
      "RL",
      "RM"
    ],
    "details": {
      "value_counts": [
        {
          "value": "C (all)",
          "count": 15
        },
        {
          "value": "FV",
          "count": 74
        },
        {
          "value": "RH",
          "count": 10
        },
        {
          "value": "RL",
          "count": 1114
        },
        {
          "value": "RM",
          "count": 242
        }
      ]
    }
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "MSZoning",
      "value_set": [
        "C (all)",
        "FV",
        "RH",
        "RL",
        "RM"
      ],
      "batch_id": "3722fd4c2e47cabe7527b03f86860621"
    },
    "meta": {},
    "expectation_type": "expect_column_distinct_values_to_be_in_set"
  }
}

In [29]:
house_data_test.at[0, 'Id'] = 1462

In [30]:
validator = {
            "batch_request": {
                "datasource_name": datasource_name,
                "data_connector_name": runtime_data_connector_name,
                "data_asset_name": data_asset_name,
            },
            "expectation_suite_name": expectation_suite_name,
        }

In [31]:
bad_data_checkpoint_name = "my_bad_data_checkpoint"
bad_data_checkpoint_config = {
    "name": bad_data_checkpoint_name,
    "config_version": 1,
    "class_name": "SimpleCheckpoint",
    "runtime_configuration": {
        "result_format": {
            "result_format": "COMPLETE", # BASIC
            "include_unexpected_rows": True
        }
    },
    "validations": [
      validator 
    ],
}
context.add_checkpoint(**bad_data_checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "my_bad_data_checkpoint",
  "profilers": [],
  "runtime_configuration": {
    "result_format": {
      "result_format": "COMPLETE",
      "include_unexpected_rows": true
    }
  },
  "validations": [
    {
      "batch_request": {
        "datasource_name": "house_prices",
        "data_connector_name": "runtime_batch_files_connector",
        "data_asset_name": "house_prices_survey_2006"
  

In [32]:
results_bad_data_checkpoint = context.run_checkpoint(
    checkpoint_name=bad_data_checkpoint_name,
    batch_request={
        "runtime_parameters": {"batch_data": house_data_test},
        "batch_identifiers": {
            batch_identifier_name: "step_2"
        },
    },
)

Calculating Metrics:   0%|          | 0/12 [00:00<?, ?it/s]

In [33]:
bad_data_run_identifier = next(iter(results_bad_data_checkpoint['run_results']))
results_bad_data_checkpoint['run_results'][bad_data_run_identifier]['validation_result']['statistics']

{'evaluated_expectations': 4,
 'successful_expectations': 3,
 'unsuccessful_expectations': 1,
 'success_percent': 75.0}

In [34]:
results_bad_data_checkpoint['run_results'][bad_data_run_identifier]['validation_result']['results'][1]

{
  "success": false,
  "result": {
    "element_count": 1459,
    "unexpected_count": 2,
    "unexpected_percent": 0.1370801919122687,
    "partial_unexpected_list": [
      1462,
      1462
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.1370801919122687,
    "unexpected_percent_nonmissing": 0.1370801919122687,
    "unexpected_rows": [
      {
        "Id": 1462,
        "MSSubClass": 20,
        "MSZoning": "RH",
        "LotFrontage": 80.0,
        "LotArea": 11622,
        "Street": "Pave",
        "Alley": null,
        "LotShape": "Reg",
        "LandContour": "Lvl",
        "Utilities": "AllPub",
        "LotConfig": "Inside",
        "LandSlope": "Gtl",
        "Neighborhood": "NAmes",
        "Condition1": "Feedr",
        "Condition2": "Norm",
        "BldgType": "1Fam",
        "HouseStyle": "1Story",
        "OverallQual": 5,
        "OverallCond": 6,
        "YearBuilt": 1961,
        "YearRemodAdd": 1961,
        "RoofStyle": 

In [35]:
unexpected_data_indices = results_bad_data_checkpoint['run_results'][bad_data_run_identifier]['validation_result']['results'][1]['result']['unexpected_index_list']


In [36]:
filtered_house_data_test = house_data_test[~house_data_test.index.isin(unexpected_data_indices)]

In [37]:
# Build the data docs, in Jupyter a new tab will open up with the data docs page
!echo y | great_expectations docs build --site-name local_site

Using v3 (Batch Request) API[0m

The following Data Docs sites will be built:

 - [36mlocal_site:[0m file:///content/great_expectations/uncommitted/data_docs/local_site/index.html
[0m
Would you like to proceed?[0m [Y/n]: 
Building Data Docs...
[0m
Done building Data Docs[0m
[0m

In [39]:
exclude_column_names = ['LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "PoolArea", "PoolQC", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "Fence", "MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition"]


In [40]:
house_data.drop(exclude_column_names, axis=1).columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', '1stFlrSF', '2ndFlrSF',
       'SalePrice'],
      dtype='object')

In [41]:
runtime_batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name=runtime_data_connector_name,
    data_asset_name=data_asset_name,
    runtime_parameters={"batch_data": house_data.drop(exclude_column_names, axis=1)},
    batch_identifiers={
        batch_identifier_name: "step_3",
    },
)

In [42]:
validator = context.get_validator(batch_request=runtime_batch_request)

In [43]:
profiler = UserConfigurableProfiler(
    profile_dataset=validator,
    excluded_expectations=None,
    ignored_columns=exclude_column_names,
    not_null_only=False,
    primary_or_compound_key=None,
    semantic_types_dict=None,
    table_expectations_only=False,
    value_set_threshold="MANY",
)
suite = profiler.build_suite()
validator.expectation_suite = suite

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling:   0%|          | 0/7 [00:00<?, ?it/s, Column=Id]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Creating an expectation suite with the following expectations:

Table-Level Expectations
expect_table_columns_to_match_ordered_list
expect_table_row_count_to_be_between

Expectations by Column
Column Name: 1stFlrSF | Column Data Type: INT | Cardinality: VERY_MANY
expect_column_max_to_be_between
expect_column_mean_to_be_between
expect_column_median_to_be_between
expect_column_min_to_be_between
expect_column_proportion_of_unique_values_to_be_between
expect_column_quantile_values_to_be_between
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: 2ndFlrSF | Column Data Type: INT | Cardinality: VERY_MANY
expect_column_max_to_be_between
expect_column_mean_to_be_between
expect_column_median_to_be_between
expect_column_min_to_be_between
expect_column_proportion_of_unique_values_to_be_between
expect_column_quantile_values_to_be_between
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: Id | Column Data Type: INT | Car

In [44]:
validator.save_expectation_suite(discard_failed_expectations=False)

In [45]:
profiled_validator_checkpoint = "profiled_validator"
checkpoint_config = {
    "name": profiled_validator_checkpoint,
    "config_version": 1,
    "class_name": "SimpleCheckpoint",
    "validations": [
        {
            "batch_request": {
                "datasource_name": datasource_name,
                "data_connector_name": runtime_data_connector_name,
                "data_asset_name": data_asset_name,
            },
            "expectation_suite_name": "default",
        }
    ],
}
context.add_checkpoint(**checkpoint_config)
results_profiled_checkpoint = context.run_checkpoint(
    checkpoint_name=profiled_validator_checkpoint,
    batch_request={
        "runtime_parameters": {"batch_data": house_data_test.drop(exclude_column_names, axis=1)},
        "batch_identifiers": {
            batch_identifier_name: "step_4"
        },
    },
)

Calculating Metrics:   0%|          | 0/80 [00:00<?, ?it/s]

In [46]:
context.build_data_docs()

validation_result_identifier = results_profiled_checkpoint.list_validation_result_identifiers()[0]
context.open_data_docs(resource_identifier=validation_result_identifier)

In [47]:
%%writefile expect_column_z_score_lower_than_3.py
"""
This is a template for creating custom ColumnExpectations.
For detailed instructions on how to use it, please see:
    https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations
"""

import json
from typing import Callable, Dict, Optional

from numpy import array

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.execution_engine import (
    PandasExecutionEngine,
    SparkDFExecutionEngine,
    SqlAlchemyExecutionEngine,
)
from great_expectations.execution_engine.execution_engine import (
    ExecutionEngine,
    MetricDomainTypes,
    MetricPartialFunctionTypes,
)
from great_expectations.expectations.expectation import (
    ColumnMapExpectation,
    ExpectationValidationResult,
)
from great_expectations.expectations.metrics import (
    ColumnMapMetricProvider,
    column_condition_partial,
    metric_partial,
)
from great_expectations.expectations.metrics.import_manager import F, sa
from great_expectations.expectations.util import render_evaluation_parameter_string
from great_expectations.render.renderer.renderer import renderer
from great_expectations.render.types import (
    CollapseContent,
    RenderedStringTemplateContent,
)
from great_expectations.render.util import (
    handle_strict_min_max,
    parse_row_condition_string_pandas_engine,
    substitute_none_for_missing,
)
from great_expectations.validator.metric_configuration import MetricConfiguration

    
# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesLowerThanZScoreOf3(ColumnMapMetricProvider):

    # This is the id string that will be used to reference your metric.
    condition_metric_name = "column_values.lower_than_z_score_of_3"

    # This method implements the core logic for the PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        return abs((abs(column.mean()) - abs(column))/column.std()) < 3


# This class defines the Expectation itself
class ExpectColumnZScoreLowerThan3(ColumnMapExpectation):
    """This expectation takes the input column, calculates the standarad deviation, mean for the entire column and then calculates the 
    Z-score for each value in the column. Any value with a Z-score larger than 3 is considered an outlier. Z-score is defined as: 
    (value-column_mean)/standard_deviation"""

    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.
    examples = [
        {
            "data": {"x": [1, 2, 3, 4, 5], "y": [-15, 2, 3, 4, 5]},
            "tests": [
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "x",
                    },
                    "out": {"success": True},
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "y",
                    },
                    "out": {"success": False},
                },
            ],
            "test_backends": [
                {
                    "backend": "pandas",
                    "dialects": None,
                },
                # {
                #     "backend": "sqlalchemy",
                #     "dialects": ["sqlite", "postgresql"],
                # },
                # {
                #     "backend": "spark",
                #     "dialects": None,
                # },
            ],
        }
    ]

    # This is the id string of the Metric used by this Expectation.
    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
    map_metric = "column_values.lower_than_z_score_of_3"

    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
    # Please see https://docs.greatexpectations.io/en/latest/reference/core_concepts/expectations/expectations.html#expectation-concepts-domain-and-success-keys
    # for more information about domain and success keys, and other arguments to Expectations
    success_keys = ("mostly",)

    # This dictionary contains default values for any parameters that should have default values
    default_kwarg_values = {}

    @renderer(renderer_type="renderer.diagnostic.observed_value")
    @render_evaluation_parameter_string
    def _diagnostic_observed_value_renderer(
        cls,
        configuration: ExpectationConfiguration = None,
        result: ExpectationValidationResult = None,
        language: str = None,
        runtime_configuration: dict = None,
        **kwargs,
    ):
        assert result, "Must provide a result object."

        result_dict = result.result
        if result_dict is None:
            return "--"

        if result_dict.get("observed_value"):
            observed_value = result_dict.get("observed_value")
            if isinstance(observed_value, (int, float)) and not isinstance(
                observed_value, bool
            ):
                return num_to_str(observed_value, precision=10, use_locale=True)
            return str(observed_value)
        elif result_dict.get("unexpected_percent") is not None:
            return (
                num_to_str(result_dict.get("unexpected_percent"), precision=5)
                + "% unexpected"
            )
        else:
            return "--"

    @renderer(renderer_type="renderer.diagnostic.unexpected_statement")
    @render_evaluation_parameter_string
    def _diagnostic_unexpected_statement_renderer(
        cls,
        configuration: ExpectationConfiguration = None,
        result: ExpectationValidationResult = None,
        language: str = None,
        runtime_configuration: dict = None,
        **kwargs,
    ):
        assert result, "Must provide a result object."

        success = result.success
        result = result.result

        if result.exception_info["raised_exception"]:
            exception_message_template_str = (
                "\n\n$expectation_type raised an exception:\n$exception_message"
            )

            exception_message = RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": exception_message_template_str,
                        "params": {
                            "expectation_type": result.expectation_config.expectation_type,
                            "exception_message": result.exception_info[
                                "exception_message"
                            ],
                        },
                        "tag": "strong",
                        "styling": {
                            "classes": ["text-danger"],
                            "params": {
                                "exception_message": {"tag": "code"},
                                "expectation_type": {
                                    "classes": ["badge", "badge-danger", "mb-2"]
                                },
                            },
                        },
                    },
                }
            )

            exception_traceback_collapse = CollapseContent(
                **{
                    "collapse_toggle_link": "Show exception traceback...",
                    "collapse": [
                        RenderedStringTemplateContent(
                            **{
                                "content_block_type": "string_template",
                                "string_template": {
                                    "template": result.exception_info[
                                        "exception_traceback"
                                    ],
                                    "tag": "code",
                                },
                            }
                        )
                    ],
                }
            )

            return [exception_message, exception_traceback_collapse]

        if success or not result_dict.get("unexpected_count"):
            return []
        else:
            unexpected_count = num_to_str(
                result_dict["unexpected_count"], use_locale=True, precision=20
            )
            unexpected_percent = (
                num_to_str(result_dict["unexpected_percent"], precision=4) + "%"
            )
            element_count = num_to_str(
                result_dict["element_count"], use_locale=True, precision=20
            )

            template_str = (
                "\n\n$unexpected_count unexpected values found. "
                "$unexpected_percent of $element_count total rows."
            )

            return [
                RenderedStringTemplateContent(
                    **{
                        "content_block_type": "string_template",
                        "string_template": {
                            "template": template_str,
                            "params": {
                                "unexpected_count": unexpected_count,
                                "unexpected_percent": unexpected_percent,
                                "element_count": element_count,
                            },
                            "tag": "strong",
                            "styling": {"classes": ["text-danger"]},
                        },
                    }
                )
            ]

    @renderer(renderer_type="renderer.diagnostic.unexpected_table")
    @render_evaluation_parameter_string
    def _diagnostic_unexpected_table_renderer(
        cls,
        configuration: ExpectationConfiguration = None,
        result: ExpectationValidationResult = None,
        language: str = None,
        runtime_configuration: dict = None,
        **kwargs,
    ):
        try:
            result_dict = result.result
        except KeyError:
            return None

        if result_dict is None:
            return None

        if not result_dict.get("partial_unexpected_list") and not result_dict.get(
            "partial_unexpected_counts"
        ):
            return None

        table_rows = []

        if result_dict.get("partial_unexpected_counts"):
            total_count = 0
            for unexpected_count_dict in result_dict.get("partial_unexpected_counts"):
                value = unexpected_count_dict.get("value")
                count = unexpected_count_dict.get("count")
                total_count += count
                if value is not None and value != "":
                    table_rows.append([value, count])
                elif value == "":
                    table_rows.append(["EMPTY", count])
                else:
                    table_rows.append(["null", count])

            if total_count == result_dict.get("unexpected_count"):
                header_row = ["Unexpected Value", "Count"]
            else:
                header_row = ["Sampled Unexpected Values"]
                table_rows = [[row[0]] for row in table_rows]

        else:
            header_row = ["Sampled Unexpected Values"]
            sampled_values_set = set()
            for unexpected_value in result_dict.get("partial_unexpected_list"):
                if unexpected_value:
                    string_unexpected_value = str(unexpected_value)
                elif unexpected_value == "":
                    string_unexpected_value = "EMPTY"
                else:
                    string_unexpected_value = "null"
                if string_unexpected_value not in sampled_values_set:
                    table_rows.append([unexpected_value])
                    sampled_values_set.add(string_unexpected_value)

        unexpected_table_content_block = RenderedTableContent(
            **{
                "content_block_type": "table",
                "table": table_rows,
                "header_row": header_row,
                "styling": {
                    "body": {"classes": ["table-bordered", "table-sm", "mt-3"]}
                },
            }
        )

        return unexpected_table_content_block

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "tags": [],
        "contributors": ["@NatanMish"],
    }


if __name__ == "__main__":
    ExpectColumnZScoreLowerThan3().print_diagnostic_checklist()

Writing expect_column_z_score_lower_than_3.py


In [48]:
!python expect_column_z_score_lower_than_3.py

Calculating Metrics:   0% 0/9 [00:00<?, ?it/s]Calculating Metrics:   0% 0/9 [00:00<?, ?it/s]Calculating Metrics:  22% 2/9 [00:00<00:00, 4122.17it/s]Calculating Metrics:  22% 2/9 [00:00<00:00, 1493.17it/s]Calculating Metrics:  33% 3/9 [00:00<00:00, 2037.06it/s]Calculating Metrics:  33% 3/9 [00:00<00:00, 1339.32it/s]Calculating Metrics:  56% 5/9 [00:00<00:00, 1157.94it/s]Calculating Metrics:  56% 5/9 [00:00<00:00, 1010.19it/s]Calculating Metrics: 100% 9/9 [00:00<00:00, 1376.84it/s]Calculating Metrics: 100% 9/9 [00:00<00:00, 1312.31it/s]Calculating Metrics: 100% 9/9 [00:00<00:00, 1300.56it/s]Calculating Metrics: 100% 9/9 [00:00<00:00, 1284.49it/s]
Calculating Metrics:   0% 0/9 [00:00<?, ?it/s]Calculating Metrics:   0% 0/9 [00:00<?, ?it/s]Calculating Metrics:  22% 2/9 [00:00<00:00, 6369.48it/s]Calculating Metrics:  22% 2/9 [00:00<00:00, 1389.30it/s]Calculating Metrics:  33% 3/9 [00:00<00:00, 1836.12it/s]Calculating Metrics:  33% 3/9 [00:00<00:00, 1214.68it/s]Calculating 

In [49]:
from expect_column_z_score_lower_than_3 import ExpectColumnZScoreLowerThan3

In [50]:
validator.expect_column_z_score_lower_than3(column="SalePrice")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 1460,
    "unexpected_count": 22,
    "unexpected_percent": 1.5068493150684932,
    "partial_unexpected_list": [
      438780,
      501837,
      475000,
      437154,
      426000,
      555000,
      440000,
      430000,
      446261,
      451950,
      423000,
      755000,
      538000,
      485000,
      582933,
      611657,
      556581,
      424870,
      625000,
      745000
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 1.5068493150684932,
    "unexpected_percent_nonmissing": 1.5068493150684932
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "SalePrice",
      "batch_id": "43db0a017fd961a59ce1c44196ce63ad"
    },
    "meta": {},
    "expectation_type": "expect_column_z_score_lower_than3"
  }
}

In [51]:
validator.expect_column_value_z_scores_to_be_less_than("SalePrice", 3, double_sided=True)

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 1460,
    "unexpected_count": 22,
    "unexpected_percent": 1.5068493150684932,
    "partial_unexpected_list": [
      438780,
      501837,
      475000,
      437154,
      426000,
      555000,
      440000,
      430000,
      446261,
      451950,
      423000,
      755000,
      538000,
      485000,
      582933,
      611657,
      556581,
      424870,
      625000,
      745000
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 1.5068493150684932,
    "unexpected_percent_nonmissing": 1.5068493150684932
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "double_sided": true,
      "column": "SalePrice",
      "threshold": 3,
      "batch_id": "43db0a017fd961a59ce1c44196ce63ad"
    },
    "meta": {},
    "expectation_type": "expect_column_value_z_

In [52]:
from typing import Dict, Optional

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.exceptions.exceptions import (
    InvalidExpectationConfigurationError,
)
from great_expectations.execution_engine import (
    ExecutionEngine,
    PandasExecutionEngine,
    SparkDFExecutionEngine,
    SqlAlchemyExecutionEngine,
)
from great_expectations.expectations.expectation import (
    ColumnPairMapExpectation,
    ExpectationValidationResult,
)
from great_expectations.expectations.metrics.import_manager import F, sa
from great_expectations.expectations.metrics.map_metric_provider import (
    ColumnPairMapMetricProvider,
    column_pair_condition_partial,
)
from great_expectations.validator.metric_configuration import MetricConfiguration


class ColumnFloorsSquareFeetComparison(ColumnPairMapMetricProvider):
    """MetricProvider Class for columns floors square feet comparison"""
    condition_metric_name = "column_pair_values.floors_square_feet_ratio"
    condition_domain_keys = (
        "column_A",
        "column_B",
    )
    condition_value_keys = ()
    @column_pair_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column_A, column_B, **kwargs):
        # This methold should return a Pandas series of booleans
        return column_B/column_A <= 2


class ExpectProportionalFloorDifference(ColumnPairMapExpectation):
    """Expect house 2nd floor to be no more than twice larger than the 1st floor"""
    map_metric = "column_pair_values.floors_square_feet_ratio"
    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.
    examples = [
        {
            "data": {
                "col_a": [1000, 500, 2000, 4000, 300, 100],
                "col_b": [500, 1000, 1000, 2500, 0, 2000],
            },
            "tests": [
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column_A": "col_a", "column_B": "col_b", "mostly": 0.6},
                    "out": {
                        "success": True,
                    },
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column_A": "col_a", "column_B": "col_b", "mostly": 1},
                    "out": {
                        "success": False,
                    },
                },
            ],
        }
    ]
    # Setting necessary computation metric dependencies and defining kwargs, as well as assigning kwargs default values
    success_keys = (
        "column_A",
        "column_B",
        "mostly",
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,  # we expect this to be explicitly set whenever a row_condition is passed
        "mostly": 1.0,
        "result_format": "COMPLETE",
        "include_config": True,
        "catch_exceptions": False,
    }
    args_keys = (
        "column_A",
        "column_B",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        super().validate_configuration(configuration)
        if configuration is None:
            configuration = self.configuration
        try:
            assert (
                "column_A" in configuration.kwargs
                and "column_B" in configuration.kwargs
            ), "both columns must be provided"
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "tags": [],
        "contributors": ["<YOUR GITHUB USERNAME HERE>"],
    }


if __name__ == "__main__":
    ExpectProportionalFloorDifference().print_diagnostic_checklist()
# Note to users: code below this line is only for integration testing -- ignore!

diagnostics = ExpectProportionalFloorDifference().run_diagnostics()

for check in diagnostics["tests"]:
    assert check["test_passed"] is True
    assert check["error_diagnostics"] is None

for check in diagnostics["errors"]:
    assert check is None

for check in diagnostics["maturity_checklist"]["experimental"]:
    if check["message"] == "Passes all linting checks":
        continue

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

1 1 0 True
Completeness checklist for ExpectProportionalFloorDifference (EXPERIMENTAL):
 ✔ Has a valid library_metadata object
 ✔ Has a docstring, including a one-line short description
    ✔ "Expect house 2nd floor to be no more than twice larger than the 1st floor"
 ✔ Has at least one positive and negative example case, and all test cases pass
 ✔ Has core logic and passes tests on at least one Execution Engine
    ✔ All 2 tests for pandas are passing
   Passes all linting checks
      inspect.getfile(impl) raised a TypeError (impl is a built-in class)
 ✔ Has basic input validation and type checking
    ✔ Custom 'assert' statements in validate_configuration
 ✔ Has both statement Renderers: prescriptive and diagnostic
 ✔ Has core logic that passes tests for all applicable Execution Engines and SQL dialects
    ✔ All 2 tests for pandas are passing
   Has a full suite of tests, as determined by a code owner
   Has passed a manual review by a code owner for code standards and style guides

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

1 1 0 True


In [53]:
!python expect_proportional_floor_sf_ratio.py

python3: can't open file 'expect_proportional_floor_sf_ratio.py': [Errno 2] No such file or directory


In [54]:
from expect_proportional_floor_sf_ratio import ExpectProportionalFloorDifference

ModuleNotFoundError: ignored

In [55]:
validator.expect_proportional_floor_difference("1stFlrSF", "2ndFlrSF")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 1460,
    "unexpected_count": 1,
    "unexpected_percent": 0.06854009595613435,
    "partial_unexpected_list": [
      [
        862,
        1788
      ]
    ],
    "missing_count": 1,
    "missing_percent": 0.0684931506849315,
    "unexpected_percent_total": 0.0684931506849315,
    "unexpected_percent_nonmissing": 0.06854009595613435
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "kwargs": {
      "column_A": "1stFlrSF",
      "column_B": "2ndFlrSF",
      "batch_id": "43db0a017fd961a59ce1c44196ce63ad"
    },
    "meta": {},
    "expectation_type": "expect_proportional_floor_difference"
  }
}