Milestone 3: Great Expectations

Nama  : Syahrul Budi Rahmadan
Batch : FTDS-002-SBY

My primary focus in this project is to establish robust data-validation rules that strengthen the data's credibility and make the later stages easier to carry out. By defining thorough validation standards, the goal is to ensure the data's reliability and integrity, so that the subsequent processing stages can proceed with more confidence and efficiency.

# I Libraries

In [1]:
import pandas as pd
from great_expectations.data_context import FileDataContext

# II Create Root

In [2]:
# Create the file-backed Great Expectations context with the current directory as project root.
context = FileDataContext.create(project_root_dir='./')

# III Data load

In [3]:
# Register a pandas Datasource on the context.
# The Datasource name must be unique among all Datasources.
datasource_name = 'csv-data-jan'
datasource = context.sources.add_pandas(name=datasource_name)

# IV Make Expectation sets

In [4]:
# Register the CSV file as a named data asset on the pandas Datasource.
asset_name = 'selling-january'
path_to_data = 'laptop_sales_from_posgres_cleaned.csv'
asset = datasource.add_csv_asset(
    name=asset_name,
    filepath_or_buffer=path_to_data,
)

# Build the batch request that is later handed to the validator.
batch_request = asset.build_batch_request()

In [5]:
# Create (or update) the expectation suite that collects the expectations.
expectation_suite_name = 'expectation-sell-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Build a validator from the batch request and the suite created above.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows to confirm that the data was loaded.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,id,brand,processor_brand,processor_name,processor_generation,ram_gb,ram_type,ssd,hdd,...,os_bit,graphic_card_gb,weight,warranty,touchscreen,msoffice,price,rating,number_of_ratings,number_of_reviews
0,0,0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,...,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,1,1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,...,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,2,2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,...,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1
3,3,3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,...,32-bit,2 GB,Casual,No warranty,No,No,69990,3 stars,0,0
4,4,4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,...,64-bit,0 GB,Casual,No warranty,No,No,26990,3 stars,0,0


# V Expectations

In [6]:
# Expectation 1: column `brand` must not contain missing values.

validator.expect_column_values_to_not_be_null(column='brand')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 823,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 2: every value in column `processor_brand` must be unique.
# NOTE(review): the recorded result fails on all 823 rows because values such
# as "Intel" and "AMD" repeat - confirm that uniqueness is really intended.

validator.expect_column_values_to_be_unique(column='processor_brand')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 823,
    "unexpected_count": 823,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "AMD",
      "AMD",
      "AMD",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel",
      "Intel"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 3: column `price` must be between 0 and 7000.
# NOTE(review): every recorded price (e.g. 34649, 93700) is above 7000, so the
# check fails on all rows; the limit may assume a different currency unit than
# the data - confirm.

validator.expect_column_values_to_be_between(
    'price',
    min_value=0,
    max_value=7000,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 823,
    "unexpected_count": 823,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      34649,
      38999,
      39999,
      69990,
      26990,
      22990,
      21990,
      58799,
      49999,
      59990,
      93700,
      72990,
      17490,
      22990,
      35990,
      56490,
      65390,
      31999,
      32490,
      31799
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 4: column `warranty` must exist, so the warranty a vendor offers is known.

validator.expect_column_to_exist('warranty')

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Expectation 5: the median of column `number_of_reviews` must be at least 0.
# A review count cannot be negative, so 0 is a safe lower bound. Without any
# bound the expectation accepts every median and therefore validates nothing.
# max_value is left unset on purpose: no upper limit is defined for this data.

validator.expect_column_median_to_be_between(
    column='number_of_reviews',
    min_value=0,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 2.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# Expectation 6: values in column `number_of_ratings` must be in decreasing order.
# NOTE(review): the recorded result fails for 345 of 823 rows - confirm that
# the data is actually meant to be sorted by this column.

validator.expect_column_values_to_be_decreasing(
    column='number_of_ratings',
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 823,
    "unexpected_count": 345,
    "unexpected_percent": 41.919805589307416,
    "partial_unexpected_list": [
      65,
      31,
      1946,
      425,
      1120,
      286,
      7,
      15279,
      2080,
      273,
      1267,
      284,
      286,
      816,
      1178,
      758,
      816,
      17,
      24,
      1753
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 41.919805589307416,
    "unexpected_percent_nonmissing": 41.919805589307416
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# Expectation 7: every value in column `processor_generation` must be null.
# NOTE(review): the recorded result fails on all 823 rows (values such as
# "10th", "11th" and "Not Available" are present) - confirm the intent.

validator.expect_column_values_to_be_null(
    column='processor_generation',
)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 823,
    "unexpected_count": 823,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "10th",
      "10th",
      "10th",
      "10th",
      "Not Available",
      "Not Available",
      "Not Available",
      "10th",
      "10th",
      "10th",
      "10th",
      "10th",
      "Not Available",
      "Not Available",
      "11th",
      "11th",
      "11th",
      "Not Available",
      "Not Available",
      "Not Available"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [16]:
# Save the suite; failed expectations are kept rather than discarded.
validator.save_expectation_suite(discard_failed_expectations=False)

# VI Create Checkpoint and Build Data Docs

In [17]:
# Register (or update) a checkpoint named 'checkpoint_1' built from the validator.

checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [18]:
# Run the checkpoint and keep its result for later inspection.

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/35 [00:00<?, ?it/s]

In [19]:
# Build the Data Docs site; the call returns the location of the generated index page.

context.build_data_docs()

{'local_site': 'file://c:\\milestone_3\\GX\\gx\\uncommitted/data_docs/local_site/index.html'}