# Imports

In [0]:
import great_expectations as gx
from great_expectations.checkpoint import Checkpoint

Configure the Great Expectations (GX) data context:

In [0]:
# Directory on DBFS that is handed to GX as the context root.
context_root_dir = "/dbfs/great_expectations/"

# Obtain the data context that the later cells use.
context = gx.get_context(
    context_root_dir=context_root_dir,
)

    - No action was taken.

    - No action was taken.

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpi_0lq63b' for ephemeral docs site


We'll use an in-memory Spark datasource to load data from a Spark DataFrame:

In [0]:
# Add-or-update the named Spark datasource on the context.
dataframe_datasource = context.sources.add_or_update_spark(name="my_spark_in_memory_datasource")

### Read the DataFrame to validate using Spark:

In [0]:
# Load the table to validate as a Spark DataFrame.
# NOTE(review): `spark` is not defined in this notebook — presumably the session
# injected by the notebook runtime; confirm before running elsewhere.
dfBronze = spark.table("bronze_retails.sales")

#### Register the loaded DataFrame with GX:

In [0]:

# Attach the loaded DataFrame to the datasource as an asset named "bronze_sales".
dataframe_asset = dataframe_datasource.add_dataframe_asset(name="bronze_sales", dataframe=dfBronze)

In [0]:
# Batch request for the asset; reused by the validator and the checkpoint cells.
batch_request = dataframe_asset.build_batch_request()

### Data Validation : 

#### Create our validation suite: 

In [0]:
# Name under which the expectation suite is registered in the context.
expectation_suite_name = "Simple_data_validation"

# Add-or-update the suite so this cell can be re-run without a name clash.
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Validator bound to the batch request and the suite.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows. A bare last expression uses the notebook's rich table
# display; print() would render the frame as wrapped plain text.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

    invoice_id branch       city customer_type  gender  \
0  750-67-8428      A     Yangon        Member  Female   
1  226-31-3081      C  Naypyitaw        Normal  Female   
2  631-41-3108      A     Yangon        Normal    Male   
3  123-19-1176      A     Yangon        Member    Male   
4  373-73-7910      A     Yangon        Normal    Male   

             product_line unit_price quantity    tax_5     total       date  \
0       Health and beauty      74.69        7  26.1415  548.9715   1/5/2019   
1  Electronic accessories      15.28        5     3.82     80.22   3/8/2019   
2      Home and lifestyle      46.33        7  16.2155  340.5255   3/3/2019   
3       Health and beauty      58.22        8   23.288   489.048  1/27/2019   
4       Sports and travel      86.31        7  30.2085  634.3785   2/8/2019   

    time      payment    cogs gross_margin_percentage gross_income rating  
0  13:08      Ewallet  522.83             4.761904762      26.1415    9.1  
1  10:29         Cash   

#### Specifying our checks : 

In [0]:
# invoice_id must never be null.
validator.expect_column_values_to_not_be_null(column="invoice_id")

# rating must lie between 0 and 7. This call stays last so its result is the
# one the cell displays.
# NOTE(review): the recorded run flags 32.9% of rows (values up to 10) —
# confirm that 7 is the intended upper bound.
validator.expect_column_values_to_be_between(
    column="rating",
    min_value=0,
    max_value=7,
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Out[52]: {
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "rating",
      "min_value": 0,
      "max_value": 7,
      "batch_id": "my_spark_in_memory_datasource-bronze_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 329,
    "unexpected_percent": 32.9,
    "partial_unexpected_list": [
      "9.1",
      "9.6",
      "8.4",
      "8",
      "8.2",
      "8.6",
      "9.9",
      "8.5",
      "9.6",
      "9.5",
      "8.4",
      "8.1",
      "9.5",
      "8.5",
      "8.2",
      "9.3",
      "10",
      "10",
      "8.6",
      "9.9"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 32.9,
    "unexpected_percent_nonmissing": 32.9
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [0]:
# Persist the suite; discard_failed_expectations=False asks GX to keep
# expectations even if they failed on this batch.
validator.save_expectation_suite(discard_failed_expectations=False)

### Running Validation : 

#### Creating Checkpoints : 

In [0]:
# Name under which the checkpoint is registered in the context.
my_checkpoint_name = "my_databricks_checkpoint"

# The checkpoint pairs the batch request with the suite and lists two actions:
# store the validation result, then update the Data Docs.
checkpoint = Checkpoint(
    name=my_checkpoint_name,
    run_name_template="%Y%m%d-%H%M%S-my-run-name-template",
    data_context=context,
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
    action_list=[
        {"name": action_name, "action": {"class_name": action_class}}
        for action_name, action_class in (
            ("store_validation_result", "StoreValidationResultAction"),
            ("update_data_docs", "UpdateDataDocsAction"),
        )
    ],
)

# Register the checkpoint; kept as the last expression so the stored
# configuration is displayed.
context.add_or_update_checkpoint(checkpoint=checkpoint)


Out[54]: {
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction"
      }
    }
  ],
  "batch_request": {
    "datasource_name": "my_spark_in_memory_datasource",
    "data_asset_name": "bronze_sales",
    "options": {}
  },
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "expectation_suite_name": "Simple_data_validation",
  "module_name": "great_expectations.checkpoint",
  "name": "my_databricks_checkpoint",
  "profilers": [],
  "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template",
  "runtime_configuration": {},
  "validations": []
}

#### Executing the checks :

In [0]:
# Run the checkpoint. The result is kept in checkpoint_result but is not
# inspected in the cells shown here.
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/19 [00:00<?, ?it/s]

#### Building the HTML Data Docs in a human-readable format:

In [0]:
# Build the Data Docs; the returned mapping gives the index URL of each site.
context.build_data_docs()

Out[56]: {'local_site': 'file:///tmp/tmpi_0lq63b/index.html'}