In [None]:
import synapseclient

import great_expectations as gx

# Load the Great Expectations data context rooted at the project's
# great_expectations directory (path is relative to this notebook's location).
# NOTE(review): the `expectations.*` imports below presumably rely on this call
# having run first (e.g. to make the custom `expectations` package importable) --
# confirm before moving it out of the import block.
context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')

from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength
from expectations.expect_column_values_to_have_list_length_in_range import ExpectColumnValuesToHaveListLengthInRange
from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers
from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType
from expectations.expect_column_values_to_have_list_of_dict_with_expected_values import ExpectColumnValuesToHaveListOfDictWithExpectedValues


# Create Expectation Suite for Genes Biodomains Data

## Get Example Data File

In [None]:
# Create a Synapse client and log in. login() is called without arguments, so
# no credentials are stored in this notebook.
# NOTE(review): presumably credentials come from a local Synapse config or
# cached session -- confirm for the environment this runs in.
syn = synapseclient.Synapse()
syn.login()


In [None]:
# Fetch the example genes_biodomains entity from Synapse and keep the path
# of the file it refers to; the validator below reads from this path.
genes_biodomains_entity = syn.get("syn51062085")
genes_biodomains_data_file = genes_biodomains_entity.path


## Create Validator Object on Data File

In [None]:
# Read the example JSON file through the context's default pandas datasource.
pandas_source = context.sources.pandas_default
validator = pandas_source.read_json(genes_biodomains_data_file)

# Name of the expectation suite that the expectations added below belong to.
validator.expectation_suite_name = "genes_biodomains"


## Add Expectations to Validator Object For Each Column

In [None]:
# ensembl_gene_id
# Each value must be a non-null, unique string of exactly 15 characters.
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# checks format and allowed characters: "ENSG" followed by exactly 11 digits.
# Raw string so that "\d" is passed to the regex engine verbatim instead of
# being parsed as an (invalid) Python string escape sequence.
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")
validator.expect_column_values_to_be_unique("ensembl_gene_id")


In [None]:
# gene_biodomains
validator.expect_column_values_to_be_of_type("gene_biodomains", "list")
validator.expect_column_values_to_not_be_null("gene_biodomains")
validator.expect_column_values_to_have_list_length_in_range(column="gene_biodomains", list_length_range=[0, 19])
validator.expect_column_values_to_have_list_members_of_type(column="gene_biodomains", member_type="dict")
biodomain_list = ['Apoptosis', 'Vasculature', 'Lipid Metabolism', 'Proteostasis', 'Immune Response', 'Autophagy', 'Mitochondrial Metabolism', 'Structural Stabilization', 'Synapse', 'Endolysosome', 'Metal Binding and Homeostasis', 'Oxidative Stress', 'Epigenetic', 'APP Metabolism', 'Cell Cycle', 'DNA Repair', 'RNA Spliceosome', 'Tau Homeostasis', 'Myelination']
validator.expect_column_values_to_have_list_of_dict_with_expected_values(column="gene_biodomains", list_dict_values={"key": "biodomain", "values": biodomain_list})


## Save Expectation Suite

In [None]:
# Persist the suite built above. discard_failed_expectations=False asks the
# validator not to drop expectations that failed against the example file.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [None]:
# Register (or overwrite) the test checkpoint bound to the validator above.
checkpoint = context.add_or_update_checkpoint(name="agora-test-checkpoint", validator=validator)

# Run the checkpoint and hand its result to the context for viewing.
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)


## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
# Build the Data Docs for this context, then open them for viewing.
# NOTE(review): open_data_docs() presumably launches a local browser, so it
# may do nothing useful on a headless machine -- confirm.
context.build_data_docs()
context.open_data_docs()
