# Milestone 3
## Great Expectations

Name : Niko Amrullah Hakam

Batch : FTDS - RMT - 038

In [1]:
# import necessary libraries
import great_expectations as ge
from great_expectations.data_context import FileDataContext

In [2]:
# Set up a file-backed Great Expectations project under the chosen root folder
project_root = "./great_expectations"
context = FileDataContext.create(project_root_dir=project_root)
print("Great Expectations project initialized!")

# Report the directory the context resolved to, as a sanity check
print(f"Project directory: {context.root_directory}")

Great Expectations project initialized!
Project directory: d:\Study\RMT038\Phase 2\p2-ftds038-rmt-m3-Radenaz\great_expectations\gx


In [3]:
# Register the Pandas datasource.
# add_or_update_pandas (instead of add_pandas) keeps this cell re-runnable:
# the context is file-backed, so the datasource persists between runs and a
# second add_pandas call with the same name would be rejected. It also matches
# the add_or_update_* calls used for the expectation suite and the checkpoint.
datasource_name = 'csv-data-preprocessed'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Add a CSV asset that points at the preprocessed data file
asset_name = 'data-preprocessed'
path_to_data = './P2M3_nikoahakam_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build a batch request covering the asset; the validator cell consumes it
batch_request = asset.build_batch_request()
print(f"Batch request built successfully for asset: {asset_name}")

Batch request built successfully for asset: data-preprocessed


In [4]:
# Create the expectation suite, or update it if a suite with this name already
# exists in the context. The returned suite is the cell's displayed output.
suite_name = 'data_preprocessed_suite'
context.add_or_update_expectation_suite(expectation_suite_name=suite_name)

{
  "ge_cloud_id": null,
  "expectation_suite_name": "data_preprocessed_suite",
  "expectations": [],
  "meta": {
    "great_expectations_version": "0.18.19"
  },
  "data_asset_type": null
}

In [5]:
# Bind the batch of data to the expectation suite; every expectation declared
# through this validator is evaluated against that batch.
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

In [6]:
# 1. Uniqueness: every value in row_id must be unique
validator.expect_column_values_to_be_unique(column="row_id")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true
}

In [7]:
# 2. Range: discount must lie between 0 and 1
validator.expect_column_values_to_be_between(column="discount", min_value=0, max_value=1)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true
}

In [8]:
# 3. Membership: segment must be one of the known customer segments
allowed_segments = ["Consumer", "Corporate", "Home Office"]
validator.expect_column_values_to_be_in_set(column="segment", value_set=allowed_segments)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true
}

In [9]:
# 4. Type: sales must be stored as a float column
validator.expect_column_values_to_be_of_type(column="sales", type_="float")

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "success": true
}

In [10]:
# 5. Forbidden characters: customer_name must not contain any of ! @ # $ % ^ & *
validator.expect_column_values_to_not_match_regex(column="customer_name", regex=r"[!@#$%^&*]")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true
}

In [11]:
# 6. Length: customer_id must be between 8 and 10 characters long
validator.expect_column_value_lengths_to_be_between(column="customer_id", min_value=8, max_value=10)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true
}

In [12]:
# 7. Format: customer_id must be two letters, a hyphen, then five digits
validator.expect_column_values_to_match_regex(column="customer_id", regex=r"^[A-Za-z]{2}-\d{5}$")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true
}

In [13]:
# Save the expectations declared on the validator back to the suite.
# discard_failed_expectations=False keeps every expectation, including any that did not pass.
validator.save_expectation_suite(discard_failed_expectations=False)

In [14]:
# Register the checkpoint (created or updated by name) and tie it to the validator
checkpoint_1 = context.add_or_update_checkpoint(name="checkpoint_1", validator=validator)

In [15]:
# Run the checkpoint and keep its result object
# NOTE(review): checkpoint_result is never inspected in the visible cells, so a
# failed validation would only be noticed in the generated data docs.

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/44 [00:00<?, ?it/s]

In [16]:
# Build data docs
# The call's return value is displayed as the cell output; in the recorded run
# it maps the site name 'local_site' to the generated index.html location.

context.build_data_docs()

{'local_site': 'file://d:\\Study\\RMT038\\Phase 2\\p2-ftds038-rmt-m3-Radenaz\\great_expectations\\gx\\uncommitted/data_docs/local_site/index.html'}