In [1]:
# Synapse client, used below to download the example data file.
import synapseclient

import great_expectations as gx

# Load the Great Expectations project stored under the repository's source tree.
# The path is relative, so it depends on where the notebook is run from.
context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')

# Custom expectations for list-valued columns.
# NOTE(review): these imports sit after get_context(); presumably the context is what makes
# the project's `expectations` package importable -- confirm before moving them to the top.
# NOTE(review): only the "...ListLengthInRange" and "...ListMembersOfType" expectations are
# called in the cells visible here; the other two are presumably imported so they are
# registered/available -- confirm whether they are still needed.
from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength
from expectations.expect_column_values_to_have_list_length_in_range import ExpectColumnValuesToHaveListLengthInRange
from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers
from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType


# Create Expectation Suite for Genes Biodomains Data

## Get Example Data File

In [2]:
# Create a Synapse client and authenticate it.
# NOTE(review): login() is called without arguments, so credentials presumably come from a
# local Synapse config or cached session -- confirm for anyone re-running this notebook.
syn = synapseclient.Synapse()
syn.login()


Welcome, Brad Macdonald!




UPGRADE AVAILABLE

A more recent version of the Synapse Client (3.2.0) is available. Your version (3.1.1) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 3.2.0 release notes

https://python-docs.synapse.org/build/html/news.html



In [3]:
# Fetch Synapse entity syn51062085 and keep the path of the retrieved file.
# NOTE(review): no entity version is specified, so this presumably resolves to the latest
# version at run time -- confirm whether the example file should be pinned.
genes_biodomains_data_file = syn.get("syn51062085").path


## Create Validator Object on Data File

In [4]:
# Read the JSON data file through the context's default pandas datasource; the returned
# object is the validator that every expectation below is attached to.
validator = context.sources.pandas_default.read_json(genes_biodomains_data_file)

# Name of the expectation suite that the validator's expectations are collected under.
validator.expectation_suite_name = "genes_biodomains"


## Add Expectations to Validator Object For Each Column

In [5]:
# ensembl_gene_id
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# checks format and allowed characters: "ENSG" followed by exactly 11 digits (15 characters).
# The pattern is a raw string so that \d is passed to the regex engine as written; in a
# normal string literal \d is an invalid escape sequence and Python emits a warning for it.
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")
# Kept as the last statement so its validation result is what the cell displays.
validator.expect_column_values_to_be_unique("ensembl_gene_id")




  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 20974,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [14]:
# NOTE(review): pandas is imported here rather than in the first cell, and this cell's
# execution count is out of sequence with its neighbours -- consider moving the import to
# the top and re-running the notebook from a fresh kernel.
import pandas as pd

# Load the same JSON data file into a plain DataFrame.
# NOTE(review): `df` is not referenced by any cell visible here; presumably it was used for
# ad-hoc inspection of the data -- confirm whether this cell is still needed.
df = pd.read_json(genes_biodomains_data_file)


In [20]:
# Biodomain names, one per line (19 entries).
# NOTE(review): this list is not referenced by any cell visible here; its length matches the
# upper bound passed as list_length_range for the gene_biodomains column.
biodomain_list = [
    'Apoptosis',
    'Vasculature',
    'Lipid Metabolism',
    'Proteostasis',
    'Immune Response',
    'Autophagy',
    'Mitochondrial Metabolism',
    'Structural Stabilization',
    'Synapse',
    'Endolysosome',
    'Metal Binding and Homeostasis',
    'Oxidative Stress',
    'Epigenetic',
    'APP Metabolism',
    'Cell Cycle',
    'DNA Repair',
    'RNA Spliceosome',
    'Tau Homeostasis',
    'Myelination',
]


['Apoptosis', 'Vasculature', 'Lipid Metabolism', 'Proteostasis', 'Immune Response', 'Autophagy', 'Mitochondrial Metabolism', 'Structural Stabilization', 'Synapse', 'Endolysosome', 'Metal Binding and Homeostasis', 'Oxidative Stress', 'Epigenetic', 'APP Metabolism', 'Cell Cycle', 'DNA Repair', 'RNA Spliceosome', 'Tau Homeostasis', 'Myelination']


In [6]:
# gene_biodomains
validator.expect_column_values_to_be_of_type("gene_biodomains", "list")
validator.expect_column_values_to_not_be_null("gene_biodomains")
validator.expect_column_values_to_have_list_length_in_range(column="gene_biodomains", list_length_range=[0, 19])
validator.expect_column_values_to_have_list_members_of_type(column="gene_biodomains", member_type="dict")





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 20974,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Save Expectation Suite

In [7]:
# Persist the suite built up on the validator. discard_failed_expectations=False asks for
# expectations to be kept even if they did not pass against the example data file.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [8]:
# Register the checkpoint under a fixed name (created if absent, updated otherwise) and bind
# it to the validator configured above.
checkpoint = context.add_or_update_checkpoint(
    name="agora-test-checkpoint",
    validator=validator,
)
# Run the checkpoint against the validator and keep the result for viewing.
checkpoint_result = checkpoint.run()
# NOTE(review): presumably opens the rendered validation result in a browser -- confirm
# this behaves as expected in headless/CI environments.
context.view_validation_result(checkpoint_result)


Calculating Metrics:   0%|          | 0/49 [00:00<?, ?it/s]

## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
# Build the Data Docs for this context, then open them for browsing.
# NOTE(review): this cell has no execution count in the saved notebook, so it was presumably
# not run in the recorded session -- confirm it works from a fresh kernel.
context.build_data_docs()
context.open_data_docs()
