=======================================================================

Program ini dibuat untuk uji validasi menggunakan Great Expectations.

=======================================================================

In [1]:
from great_expectations.data_context import FileDataContext

In [2]:
# Initialise a file-based Great Expectations data context rooted at the current directory.
context = FileDataContext.create(
    project_root_dir='./',
)

In [4]:
# Register the pandas Datasource. The name must be unique among Datasources.
# `add_or_update_pandas` (rather than `add_pandas`) keeps this cell re-runnable:
# a Datasource with this name left over from a previous run is replaced
# instead of causing the call to fail.
datasource_name = 'csv-ecommerce'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the cleaned CSV file as a data asset of that Datasource.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists — consider a path relative to the project root.
asset_name = 'us-ecommerce-2020'
path_to_data = '/Users/permatahajjarianti/github-classroom/FTDS-assignment-bay/p2-ftds011-hck-m3-Rian273/dags/P2M3_permata_hajjarianti_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator will consume
batch_request = asset.build_batch_request()

In [5]:
# Create (or update) the expectation suite
expectation_suite_name = 'expectation-us-ecommerce-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Pair the batch request with the suite above to obtain a validator
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=expectation_suite_name)

# Preview the first rows of the batch as a sanity check
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,order_date,row_id,order_id,ship_mode,customer_id,segment,country,city,state,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
0,2020-01-01,849,CA-2017-107503,Standard Class,GA-14725,Consumer,United States,Lorain,Ohio,44052,East,FUR-FU-10003878,Furniture,Furnishings,"Linden 10"" Round Wall Clock, Black",48.896,4,0.2,8.5568
1,2020-01-01,4010,CA-2017-144463,Standard Class,SC-20725,Consumer,United States,Los Angeles,California,90036,West,FUR-FU-10001215,Furniture,Furnishings,"Howard Miller 11-1/2"" Diameter Brentwood Wall ...",474.43,11,0.0,199.2606
2,2020-01-01,6683,CA-2017-154466,First Class,DP-13390,Home Office,United States,Franklin,Wisconsin,53132,Central,OFF-BI-10002012,Office Supplies,Binders,Wilson Jones Easy Flow II Sheet Lifters,3.6,2,0.0,1.728
3,2020-01-01,8070,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,OFF-ST-10002743,Office Supplies,Storage,SAFCO Boltless Steel Shelving,454.56,5,0.2,-107.958
4,2020-01-01,8071,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,FUR-FU-10002116,Furniture,Furnishings,"Tenex Carpeted, Granite-Look or Clear Contempo...",141.42,5,0.6,-187.3815


In [7]:
# Expectation 1: every value in column `row_id` must be unique (no duplicates).

validator.expect_column_values_to_be_unique(column='row_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3312,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 2: every value in column `discount` must lie between 0.0 and 0.8
# (both bounds are accepted, since no strict_min/strict_max is passed).

validator.expect_column_values_to_be_between(
    column='discount',
    min_value=0.0,
    max_value=0.8,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3312,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 3: column `ship_mode` may only hold one of these four values:
#   - First Class
#   - Second Class
#   - Standard Class
#   - Same Day

validator.expect_column_values_to_be_in_set(
    column='ship_mode',
    value_set=[
        'First Class',
        'Second Class',
        'Standard Class',
        'Same Day',
    ],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3312,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 4: column `sales` must have an integer or float type.
# NOTE(review): the original heading named `profit`, but the call validates 'sales' —
# confirm which column is intended.

validator.expect_column_values_to_be_in_type_list(
    column='sales',
    type_list=['integer', 'float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Expectation 5: every `customer_id` must be exactly two uppercase letters, a hyphen,
# and five digits (e.g. "GA-14725").
# The pattern is restricted to A-Z and anchored with ^...$ so that the whole value has
# to conform; the previous pattern also accepted lowercase letters and was unanchored.

validator.expect_column_values_to_match_regex(column='customer_id', regex=r'^[A-Z]{2}-[0-9]{5}$')


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3312,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [16]:
# Expectation 6: column `city` must contain only letters and whitespace — a value fails
# if it contains any character outside a-z, A-Z and whitespace (digits, punctuation, ...).
# The pattern is a raw string so that `\s` reaches the regex engine without relying on
# Python's deprecated handling of invalid string escape sequences.

validator.expect_column_values_to_not_match_regex(column='city', regex=r'[^a-zA-Z\s]')


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3312,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [22]:
# Expectation 7: the length of each `product_name` must be between 1 and 130 characters.

validator.expect_column_value_lengths_to_be_between(
    column='product_name',
    min_value=1,
    max_value=130,
)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3312,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [23]:
# Create (or update) the checkpoint 'checkpoint_1', bound to the validator built above

checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [24]:
# Run the checkpoint and keep its result object for later inspection

checkpoint_result = checkpoint_1.run()

Calculating Metrics: 0it [00:00, ?it/s]

In [25]:
# Build the data docs; the call returns a mapping of site name to the
# URL of the generated index page (shown as the cell output)

context.build_data_docs()

{'local_site': 'file:///Users/permatahajjarianti/github-classroom/FTDS-assignment-bay/p2-ftds011-hck-m3-Rian273/gx/gx/uncommitted/data_docs/local_site/index.html'}