# Great Expectations Task

In [None]:
!pip install great_expectations


Collecting great_expectations
  Downloading great_expectations-1.3.12-py3-none-any.whl.metadata (8.6 kB)
Collecting altair<5.0.0,>=4.2.1 (from great_expectations)
  Downloading altair-4.2.2-py3-none-any.whl.metadata (13 kB)
Collecting marshmallow<4.0.0,>=3.7.1 (from great_expectations)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting posthog<4,>3 (from great_expectations)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting ruamel.yaml>=0.16 (from great_expectations)
  Downloading ruamel.yaml-0.18.10-py3-none-any.whl.metadata (23 kB)
Collecting pandas<2.2,>=1.3.0 (from great_expectations)
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy>=1.22.4 (from great_expectations)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.0 M

## 1. Install Great Expectations Library


In [None]:
!pip install great_expectations



## 2. Import Necessary Libraries

In [None]:
import pandas as pd
import great_expectations as gx

## 3. Load Labels.csv

Download and upload the [Labels.csv](https://github.com/zubxxr/SOFE3980U-Lab5/blob/main/Labels.csv) into this notebook, and then load the file.

In [None]:
# Source file for the UCI "Adult" census data. It has no header row, so the
# column names are supplied explicitly below.
ADULT_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
ADULT_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income",
]

df = pd.read_csv(
    ADULT_DATA_URL,
    names=ADULT_COLUMNS,
    header=None,
    # Fields are separated by ", "; without this every string value keeps a
    # leading space (e.g. " Private" instead of "Private").
    skipinitialspace=True,
    # The dataset marks missing values with "?"; parse them as NaN so that
    # null-related expectations actually see them.
    na_values="?",
)

## 4. Preview the Dataset

In [None]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0,Timestamp,Car1_Location_X,Car1_Location_Y,Car1_Location_Z,Car2_Location_X,Car2_Location_Y,Car2_Location_Z,Occluded_Image_view,Occluding_Car_view,Ground_Truth_View,pedestrianLocationX_TopLeft,pedestrianLocationY_TopLeft,pedestrianLocationX_BottomRight,pedestrianLocationY_BottomRight
0,1736796157,-51.402977,143,0.596902,-59.32027,140,0.596902,A_001.png,B_001.png,C_001.png,593,361,610,410
1,1736796167,-53.819637,143,0.596902,-59.196568,140,0.596902,A_002.png,B_002.png,C_002.png,579,368,594,415
2,1736796178,-50.239144,143,0.596902,-56.744479,140,0.596902,A_003.png,B_003.png,C_003.png,854,720,854,720
3,1736796188,-53.70722,143,0.596902,-57.30938,140,0.596902,A_004.png,B_004.png,C_004.png,549,368,567,425
4,1736796198,-52.053721,143,0.596902,-59.545897,140,0.596902,A_005.png,B_005.png,C_005.png,524,368,537,413


## 5. Set Up Great Expectations Context and Data Source

In [None]:
# Names under which the pandas data source and its dataframe asset are registered.
DATA_SOURCE_NAME = "pandas"
DATA_ASSET_NAME = "pd dataframe asset"

# Obtain a Great Expectations context, register a pandas data source on it,
# then add a dataframe asset to that data source.
context = gx.get_context()
data_source = context.data_sources.add_pandas(DATA_SOURCE_NAME)
data_asset = data_source.add_dataframe_asset(name=DATA_ASSET_NAME)

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpib9kr1fp' for ephemeral docs site


## 6. Define and Create a Data Batch

In [None]:
# Register the whole-dataframe batch definition only once. Re-running this cell
# against the same `data_asset` would otherwise fail with
#   ValueError: "batch definition" already exists
# so an existing definition of that name is reused when present.
BATCH_DEFINITION_NAME = "batch definition"

batch_definition = next(
    (bd for bd in data_asset.batch_definitions if bd.name == BATCH_DEFINITION_NAME),
    None,
)
if batch_definition is None:
    batch_definition = data_asset.add_batch_definition_whole_dataframe(BATCH_DEFINITION_NAME)

# The dataframe itself is supplied when the batch is retrieved.
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

ValueError: "batch definition" already exists (all existing batch_definition names are batch definition)

## 7. Define Three Expectations for Column Values

Using this [link](https://greatexpectations.io/expectations/), choose three expectation functions and apply them to the labels dataset in a relevant manner.

You should replace the 'ExpectColumnValuesToBeBetween' function with other functions you select from the link.

You can also check the format/parameters required of each function when you click "See more" on the function.

In [None]:
# Original template expectation: "education-num" values bounded by 0 and 20.
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="education-num",
    min_value=0,
    max_value=20,
)

# Example replacement: a uniqueness check is configured with a column name only,
# with no min/max bounds. "column" is a placeholder name here.
# NOTE: this assignment overwrites the `expectation` created just above.
expectation = gx.expectations.ExpectColumnValuesToBeUnique(column="column")

### Expectation 1

In [None]:
# Expectation 1: the "age" column must not contain null values.
expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="age")

### Validate Data Against Expectation 1

In [None]:
# Run the current `expectation` against the batch and print the returned result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_not_be_null",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "age"
    },
    "meta": {}
  },
  "result": {
    "element_count": 32561,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


### Expectation 2

In [None]:
# Expectation 2: the maximum of "capital-gain" must lie between 0 and 100000.
expectation = gx.expectations.ExpectColumnMaxToBeBetween(
    column="capital-gain",
    min_value=0,
    max_value=100000,
)


### Validate Data Against Expectation 2

In [None]:
# Run the current `expectation` against the batch and print the returned result.
validation_result = batch.validate(expectation)
print(validation_result)


Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_max_to_be_between",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "capital-gain",
      "min_value": 0.0,
      "max_value": 100000.0
    },
    "meta": {}
  },
  "result": {
    "observed_value": 99999
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


### Expectation 3

In [None]:
# Expectation 3: "capital-loss" should have low cardinality relative to the row count.
# The metric is (number of unique values) / (number of values), which is a proportion
# and therefore never exceeds 1. The previous upper bound of 5 could not fail;
# a bound of 0.01 (at most 1% distinct values) makes the check meaningful.
expectation = gx.expectations.ExpectColumnProportionOfUniqueValuesToBeBetween(
    column="capital-loss",
    min_value=0,
    max_value=0.01,
)



### Validate Data Against Expectation 3

In [None]:
# Run the current `expectation` against the batch and print the returned result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_proportion_of_unique_values_to_be_between",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "capital-loss",
      "min_value": 0.0,
      "max_value": 5.0
    },
    "meta": {}
  },
  "result": {
    "observed_value": 0.0028254660483400386
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}
