# 1. Introduction

Nama : Stephanus Adinata Susanto

Batch : Sby - 001

Objective : Project ini dilakukan untuk memvalidasi dataset yang telah di cleaning sebelummnya


# 2. Instantiate Data Context

In [2]:
# Create a data context

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

# 3. Connect to A `Datasource`

In [3]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'csv_clean'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'bank_customer_churn'
path_to_data = 'P2M3_Stephanus_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

# 4. Create an Expectation Suite

In [4]:
# Creat an expectation suite
expectation_suite_name = 'expectation_bank_customer_churn_dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,row_number,customer_id,surname,credit_score,geography,gender,age,tenure,balance,num_of_products,has_cr_card,is_active_member,estimated_salary,exited
0,1,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## 4.1 Expectation

In [10]:
# Expectation 1 : Column `customer_id` must be unique

validator.expect_column_values_to_be_unique('customer_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Expectation  2: Column `balance` must be not less than $0

validator.expect_column_values_to_be_between(
    column='balance', min_value=0, max_value=10000000000
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Expectation 3 : Column `gender` must contain one of the following 2 things :
# 1 = Male
# 2 = Female

validator.expect_column_values_to_be_in_set('gender', ['Male', 'Female'])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [31]:
# Expectation 4 : Column `estimated_salary` must in form of integer or float

validator.expect_column_values_to_be_in_type_list('estimated_salary',['integer', 'float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [32]:
# Expectation 5 : Column `exited` must be valued between 0 - 1

validator.expect_column_value_lengths_to_be_between('exited', 0, 1)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [27]:
# Expectation 6 : Column `credit_score` must be not less than 0 and over 1000

validator.expect_column_min_to_be_between('credit_score', 0, 1000)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 350
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [39]:
# Expectation 7 : the unique value count of customer_id column must be not over 10000 count

validator.expect_column_unique_value_count_to_be_between('customer_id', 0, 10000)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 10000
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [40]:
# Save into Expectation Suite

validator.save_expectation_suite(discard_failed_expectations=False)

## 4.2 Checkpoint

In [41]:
# Create a checkpoint

checkpoint_1 = context.add_or_update_checkpoint(
    name = 'checkpoint_1',
    validator = validator,
)

In [42]:
# Run a checkpoint

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/68 [00:00<?, ?it/s]

## 4.3 Data Docs

In [43]:
# Build data docs

context.build_data_docs()

{'local_site': 'file://d:\\Hacktiv8\\Private\\p2-ftds001-sby-m3-StephanusAdinata\\gx\\uncommitted/data_docs/local_site/index.html'}