In [1]:
import synapseclient

import great_expectations as gx
import pandas as pd
import json

# Build the Great Expectations data context rooted at the project's GX directory.
# NOTE(review): this call sits between the import groups, and the `expectations.*`
# imports below presumably only resolve once the context exists (e.g. a plugins
# directory being put on sys.path) — confirm before hoisting them above this line.
context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')

from agoradatatools.gx import GreatExpectationsRunner
# Custom list-oriented expectations. The classes are not referenced by name in the
# cells shown here; presumably importing them is what makes the matching
# `validator.expect_column_values_to_have_list_*` methods available — TODO confirm.
from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength
from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers
from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType


# Create Expectation Suite for Biomarkers Data

In [2]:
# Create a Synapse client and log in; no credentials are passed explicitly here.
syn = synapseclient.Synapse()
syn.login()



UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.6.0 release notes

https://python-docs.synapse.org/news/

Welcome, beatrizsaldana!

INFO: 2024-10-23 10:53:13 | synapseclient_default | Welcome, beatrizsaldana!



In [3]:
# Fetch Synapse entity syn63540269 (the biomarkers data file) and keep its local path.
biomarkers_entity = syn.get("syn63540269")
biomarkers_data_file = biomarkers_entity.path


## Create Validator Object on Data File

In [4]:
# Read the biomarkers JSON into a DataFrame, pass the nested 'points' column
# through the runner's JSON conversion helper, then wrap the resulting frame in
# a validator bound to the "biomarkers" expectation suite.
nested_columns = ['points']
df = GreatExpectationsRunner.convert_nested_columns_to_json(
    pd.read_json(biomarkers_data_file),
    nested_columns,
)
validator = context.sources.pandas_default.read_dataframe(df)
validator.expectation_suite_name = "biomarkers"

## Add Expectations to Validator Object For Each Column

In [5]:
# points: non-null list values whose members are dicts and which conform to the
# immunohisto points JSON schema.
# NOTE(review): 'points' is passed through convert_nested_columns_to_json when the
# validator's DataFrame is built — confirm the column still holds lists (rather
# than JSON strings) at this point, otherwise the "list" checks cannot pass.
points_column = "points"
validator.expect_column_values_to_be_of_type(points_column, "list")
validator.expect_column_values_to_not_be_null(points_column)
validator.expect_column_values_to_have_list_members_of_type(column=points_column, member_type="dict")

# Load the JSON schema that each 'points' value is validated against.
points_schema_path = "../src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/points.json"
with open(points_schema_path, "r") as schema_file:
    points_schema = json.load(schema_file)
validator.expect_column_values_to_match_json_schema(points_column, json_schema=points_schema)






Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]





Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

NameError: name 'json' is not defined

In [1]:
# model: required string column.
validator.expect_column_values_to_be_of_type("model", "str")
validator.expect_column_values_to_not_be_null("model")
# Allowed characters: alphanumerics, whitespace, asterisks, underscores, periods, and dashes.
# The pattern is applied to "model" (it previously pointed at "type", which left
# "model" without any pattern check). The raw string keeps `\s` and `\*` from being
# parsed as invalid string escapes; the regex itself is unchanged.
validator.expect_column_values_to_match_regex("model", r"^[A-Za-z0-9\s\*_.-]+$")


NameError: name 'validator' is not defined

In [6]:
# type: required string column.
validator.expect_column_values_to_be_of_type("type", "str")
validator.expect_column_values_to_not_be_null("type")
# Allowed characters: alphanumerics, whitespace, underscores, periods, and dashes.
# Raw string so `\s` reaches the regex engine without an invalid-escape warning;
# the pattern text is unchanged.
validator.expect_column_values_to_match_regex("type", r"^[A-Za-z0-9\s_.-]+$")



  validator.expect_column_values_to_match_regex("type", "^[A-Za-z0-9\s_.-]+$")




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 88,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# units: required string column.
validator.expect_column_values_to_be_of_type("units", "str")
validator.expect_column_values_to_not_be_null("units")
# Allowed characters: alphanumerics, forward slashes, whitespace, asterisks,
# underscores, periods, and dashes.
# Raw string so `\/`, `\s` and `\*` are not parsed as invalid string escapes;
# the pattern text is unchanged.
validator.expect_column_values_to_match_regex("units", r"^[A-Za-z0-9\/\s\*_.-]+$")



  validator.expect_column_values_to_match_regex("units", "^[A-Za-z0-9\/\s\*_.-]+$")




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 88,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# age_death: required integer column constrained to 0 < age_death <= 100.
validator.expect_column_values_to_be_of_type("age_death", "int")
validator.expect_column_values_to_not_be_null("age_death")
# `strict_min_value` is not a parameter of this expectation, so the lower bound
# was never applied. An exclusive lower bound is expressed as `min_value` plus
# `strict_min=True`; the inclusive upper bound of 100 is unchanged.
validator.expect_column_values_to_be_between("age_death", min_value=0, strict_min=True, max_value=100)




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 88,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# tissue: required string column.
validator.expect_column_values_to_be_of_type("tissue", "str")
validator.expect_column_values_to_not_be_null("tissue")
# Allowed characters: alphanumerics, forward slashes, whitespace, asterisks,
# underscores, periods, and dashes.
# Raw string so `\/`, `\s` and `\*` are not parsed as invalid string escapes;
# the pattern text is unchanged.
validator.expect_column_values_to_match_regex("tissue", r"^[A-Za-z0-9\/\s\*_.-]+$")



  validator.expect_column_values_to_match_regex("tissue", "^[A-Za-z0-9\/\s\*_.-]+$")




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 88,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# unique entries: each (model, type, age_death, tissue, units) combination should
# appear in at most one row.
# expect_select_column_values_to_be_unique_within_record only compares the listed
# columns against each other inside a single row and lets duplicated rows pass,
# so the compound-uniqueness expectation is used to enforce row-level uniqueness.
validator.expect_compound_columns_to_be_unique(column_list=["model", "type", "age_death", "tissue", "units"])




Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 88,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# Persist the expectations collected on the validator as the "biomarkers" suite.
# discard_failed_expectations=False: expectations that did not pass on this data are saved as well.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [14]:
# Register (or refresh) the test checkpoint for this validator, run it, and
# open the resulting validation report.
checkpoint = context.add_or_update_checkpoint(name="agora-test-checkpoint", validator=validator)
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

## Build Data Docs - Click on Expectation Suite to View All Expectations

In [15]:
# Build the Data Docs for this context, then open them for viewing.
context.build_data_docs()
context.open_data_docs()
