# Great Expectations for dataset validation

In [None]:
!pip install great_expectations

Collecting great_expectations
  Downloading great_expectations-0.18.12-py3-none-any.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting colorama>=0.4.3 (from great_expectations)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting jsonpatch>=1.22 (from great_expectations)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting makefun<2,>=1.7.0 (from great_expectations)
  Downloading makefun-1.15.2-py2.py3-none-any.whl (22 kB)
Collecting marshmallow<4.0.0,>=3.7.1 (from great_expectations)
  Downloading marshmallow-3.21.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml<0.17.18,>=0.16 (from great_expectations)
  Downloading ruamel.yaml-0.17.17-py3-none-any.whl (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 kB[0m 

In [3]:
from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

# Loading Employee Performance Data

In [4]:
# Define the name of the data source and convert it to lowercase
datasource_name = 'employee-performance-dataset'

# Add a pandas data source with the specified name to the context
datasource = context.sources.add_pandas(datasource_name)

# Define the name of the dataset asset and convert it to lowercase
asset_name = 'employee-performance-dataset'

# Specify the path to the raw CSV file containing the dataset
path_to_data = 'P2M3_reynaldi_evans_adam_data_cleaned.csv'

# Add a CSV asset with the specified name and file path to the data source
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build a batch request to load the dataset into memory
batch_request = asset.build_batch_request()


# Creating Expectation Suite and Validator

In [5]:
# Creating Expectation suite
expectation_suite_name = 'expectation-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Creating validator
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Displaying Validator
validator.head()




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,empid,firstname,lastname,startdate,title,supervisor,ademail,businessunit,employeestatus,employeetype,...,division,dob,state,jobfunctiondescription,gendercode,locationcode,racedesc,maritaldesc,performance_score,current_employee_rating
0,3427,Uriah,Bridges,2019-09-20,Production Technician I,Peter Oneill,uriah.bridges@bilearner.com,CCDR,Active,Contract,...,Finance & Accounting,1969-10-07,MA,Accounting,Female,34904,White,Widowed,Fully Meets,4
1,3428,Paula,Small,2023-02-11,Production Technician I,Renee Mccormick,paula.small@bilearner.com,EW,Active,Contract,...,Aerial,1965-08-30,MA,Labor,Male,6593,Hispanic,Widowed,Fully Meets,3
2,3429,Edward,Buck,2018-12-10,Area Sales Manager,Crystal Walker,edward.buck@bilearner.com,PL,Active,Full-Time,...,General - Sga,1991-10-06,MA,Assistant,Male,2330,Hispanic,Widowed,Fully Meets,4
3,3430,Michael,Riordan,2021-06-21,Area Sales Manager,Rebekah Wright,michael.riordan@bilearner.com,CCDR,Active,Contract,...,Finance & Accounting,1998-04-04,ND,Clerk,Male,58782,Other,Single,Fully Meets,2
4,3431,Jasmine,Onque,2019-06-29,Area Sales Manager,Jason Kim,jasmine.onque@bilearner.com,TNS,Active,Contract,...,General - Con,1969-08-29,FL,Laborer,Female,33174,Other,Married,Fully Meets,3


# Expectation 1: Column To be Unique

In [9]:
# Expectation 1: Employee ID must be unique

validator.expect_column_distinct_values_to_be_in_set(column='empid', value_set=None)

  and should_run_async(code)




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_distinct_values_to_be_in_set",
    "kwargs": {
      "column": "empid",
      "value_set": null,
      "batch_id": "employee-performance-dataset-employee-performance-dataset"
    },
    "meta": {}
  },
  "result": {
    "observed_value": [
      1001,
      1005,
      1013,
      1015,
      1022,
      1029,
      1030,
      1031,
      1033,
      1034,
      1035,
      1038,
      1042,
      1044,
      1047,
      1048,
      1050,
      1054,
      1055,
      1056,
      1057,
      1058,
      1059,
      1061,
      1063,
      1065,
      1067,
      1068,
      1072,
      1073,
      1077,
      1080,
      1083,
      1084,
      1085,
      1090,
      1092,
      1093,
      1094,
      1095,
      1100,
      1101,
      1102,
      1104,
      1106,
      1107,
      1108,
      1110,
      1112,
      1113,
      1114,
      1115,
      1120,
      1121,
      1123,
      1125,
  

# Expectation 2: Column Values to Be in Set

In [10]:
# Expectation 2: Performance Score must be in the column
validator.expect_column_values_to_be_in_set(
    "performance_score",
    value_set=["Fully Meets", "Exceeds", "Needs Improvement", "PIP"]
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])



{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "value_set": [
        "Fully Meets",
        "Exceeds",
        "Needs Improvement",
        "PIP"
      ],
      "column": "performance_score",
      "batch_id": "employee-performance-dataset-employee-performance-dataset"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1467,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 3: Column Values to be Between

In [11]:
# Expectation 3: Employee Rating in the column must be between min_value and max_value
validator.expect_column_values_to_be_between(
    column="current_employee_rating",
    min_value=1,
    max_value=5
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "current_employee_rating",
      "min_value": 1,
      "max_value": 5,
      "batch_id": "employee-performance-dataset-employee-performance-dataset"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1467,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 4: Column Values to be of Type

In [13]:
# Expectation 4: State in the column must be of type specified
validator.expect_column_values_to_be_of_type(
    column="state",
    type_="object"
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "column": "state",
      "type_": "object",
      "batch_id": "employee-performance-dataset-employee-performance-dataset"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "object_"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [14]:
# Expectation 5: StartDate must not be null
validator.expect_column_values_to_not_be_null(column="startdate")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_not_be_null",
    "kwargs": {
      "column": "startdate",
      "batch_id": "employee-performance-dataset-employee-performance-dataset"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1467,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 6: Column Values to be in type List

In [16]:
#Expectation 6: Locationcode must be either int or float
validator.expect_column_values_to_be_in_type_list('locationcode', ['int64', 'float64'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_type_list",
    "kwargs": {
      "column": "locationcode",
      "type_list": [
        "int64",
        "float64"
      ],
      "batch_id": "employee-performance-dataset-employee-performance-dataset"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 7: Column Values count to Equal

In [17]:
# Expectation 7: The number of column must be equal
validator.expect_table_column_count_to_equal(value=24)

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_table_column_count_to_equal",
    "kwargs": {
      "value": 24,
      "batch_id": "employee-performance-dataset-employee-performance-dataset"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 23
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}