In [None]:
import synapseclient

import great_expectations as gx

# Load the Great Expectations data context from the project directory at
# ../great_expectations (a path relative to the current working directory).
context = gx.get_context(project_root_dir='../great_expectations')

# Project-local custom expectations. The validator cells below invoke them by
# their snake_case names (expect_column_values_to_have_list_length, etc.).
# NOTE(review): presumably importing these classes is what makes them available
# on the validator — confirm before reordering or removing these imports.
from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength
from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers
from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType


# Create Expectation Suite for Metabolomics Data

## Get Example Data File

In [None]:
# Create a Synapse client and log in; login() is called without arguments.
# NOTE(review): presumably credentials come from a local Synapse config or a
# cached session — confirm for the environment this notebook runs in.
syn = synapseclient.Synapse()
syn.login()


In [None]:
# Fetch Synapse entity syn19276330 and keep its `.path`, which the next section
# passes to read_json as the example metabolomics data file.
metabolomics_data_file = syn.get("syn19276330").path


## Create Validator Object on Data File

In [None]:
# Read the metabolomics JSON file through the context's default pandas data
# source; the returned object is used as the validator in the cells below.
validator = context.sources.pandas_default.read_json(
    metabolomics_data_file
)
# Name of the expectation suite the validator's expectations are stored under.
validator.expectation_suite_name = "metabolomics"


## Add Expectations to Validator Object For Each Column

In [None]:
# ad_diagnosis_p_value: a non-null list holding exactly one float
validator.expect_column_values_to_be_of_type(column="ad_diagnosis_p_value", type_="list")
validator.expect_column_values_to_not_be_null(column="ad_diagnosis_p_value")
# custom and experimental expectations only accept their arguments as keywords
validator.expect_column_values_to_have_list_length(
    column="ad_diagnosis_p_value",
    list_length=1,
)
validator.expect_column_values_to_have_list_members_of_type(
    column="ad_diagnosis_p_value",
    member_type="float",
)


In [None]:
# associated_gene_name: a non-null string of 2 to 100 characters
validator.expect_column_values_to_be_of_type(column="associated_gene_name", type_="str")
validator.expect_column_values_to_not_be_null(column="associated_gene_name")
validator.expect_column_value_lengths_to_be_between(
    column="associated_gene_name",
    min_value=2,
    max_value=100,
)
# permitted characters: letters, digits, underscores, periods, and hyphens
validator.expect_column_values_to_match_regex(
    column="associated_gene_name",
    regex="^[A-Za-z0-9_.-]+$",
)


In [None]:
# association_p: a non-null float within the closed interval [0, 1]
validator.expect_column_values_to_be_of_type(column="association_p", type_="float")
validator.expect_column_values_to_not_be_null(column="association_p")
validator.expect_column_values_to_be_between(
    column="association_p",
    min_value=0,
    max_value=1,
)


In [None]:
# boxplot_group_names: a non-null, two-element list of strings checked against {"AD", "CN"}
validator.expect_column_values_to_be_of_type(column="boxplot_group_names", type_="list")
validator.expect_column_values_to_not_be_null(column="boxplot_group_names")
validator.expect_column_values_to_have_list_length(
    column="boxplot_group_names",
    list_length=2,
)
validator.expect_column_values_to_have_list_members(
    column="boxplot_group_names",
    list_members={"AD", "CN"},
)
validator.expect_column_values_to_have_list_members_of_type(
    column="boxplot_group_names",
    member_type="str",
)


In [None]:
# ensembl_gene_id: a non-null, unique, 15-character string
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# checks format and allowed characters: the literal prefix "ENSG" followed by
# exactly 11 digits. A raw string is used so that `\d` reaches the regex engine
# without relying on Python's deprecated handling of unknown string escapes.
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")
validator.expect_column_values_to_be_unique("ensembl_gene_id")


In [None]:
# gene_wide_p_threshold_1kgp: a non-null float within the closed interval [0, 0.05]
validator.expect_column_values_to_be_of_type(column="gene_wide_p_threshold_1kgp", type_="float")
validator.expect_column_values_to_not_be_null(column="gene_wide_p_threshold_1kgp")
validator.expect_column_values_to_be_between(
    column="gene_wide_p_threshold_1kgp",
    min_value=0,
    max_value=0.05,
)


In [None]:
# metabolite_full_name: a non-null string of 2 to 100 characters
validator.expect_column_values_to_be_of_type("metabolite_full_name", "str")
validator.expect_column_values_to_not_be_null("metabolite_full_name")
validator.expect_column_value_lengths_to_be_between("metabolite_full_name", min_value=2, max_value=100)
# allows alphanumeric characters, whitespace, hyphens, colons, periods,
# parentheses, and plus signs. A raw string is used so that `\s` and `\-` reach
# the regex engine without relying on Python's deprecated handling of unknown
# string escapes.
validator.expect_column_values_to_match_regex("metabolite_full_name", r"^[A-Za-z0-9\s\-:.()+]+$")


In [None]:
# metabolite_id: a non-null string of 2 to 100 characters
validator.expect_column_values_to_be_of_type(column="metabolite_id", type_="str")
validator.expect_column_values_to_not_be_null(column="metabolite_id")
validator.expect_column_value_lengths_to_be_between(
    column="metabolite_id",
    min_value=2,
    max_value=100,
)
# permitted characters: letters, digits, and periods
validator.expect_column_values_to_match_regex(
    column="metabolite_id",
    regex="^[A-Za-z0-9.]+$",
)


In [None]:
# n_per_group: a non-null, two-element list of ints
validator.expect_column_values_to_be_of_type(column="n_per_group", type_="list")
validator.expect_column_values_to_not_be_null(column="n_per_group")
validator.expect_column_values_to_have_list_length(
    column="n_per_group",
    list_length=2,
)
validator.expect_column_values_to_have_list_members_of_type(
    column="n_per_group",
    member_type="int",
)


In [None]:
# transposed_boxplot_stats: a non-null, two-element list whose members are lists
validator.expect_column_values_to_be_of_type(column="transposed_boxplot_stats", type_="list")
validator.expect_column_values_to_not_be_null(column="transposed_boxplot_stats")
validator.expect_column_values_to_have_list_length(
    column="transposed_boxplot_stats",
    list_length=2,
)
validator.expect_column_values_to_have_list_members_of_type(
    column="transposed_boxplot_stats",
    member_type="list",
)


## Save Expectation Suite

In [None]:
# Save the validator's expectation suite. discard_failed_expectations=False is
# passed so that expectations are not dropped for having failed against the
# example data file.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [None]:
# Create (or update, if it already exists) the "agora-test-checkpoint"
# checkpoint for this validator.
checkpoint = context.add_or_update_checkpoint(
    name="agora-test-checkpoint",
    validator=validator,
)
# Run the checkpoint and open its validation result for viewing.
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)
