In [None]:
import synapseclient

import great_expectations as gx

# Location of the Great Expectations project, relative to this notebook.
GX_PROJECT_ROOT_DIR = "../src/agoradatatools/great_expectations"

# Data context used by every later cell to read data and store the suite.
context = gx.get_context(project_root_dir=GX_PROJECT_ROOT_DIR)

# Create Expectation Suite for Target Exp Validation Harmonized Data

## Get Example Data File

In [None]:
# Create a Synapse client and authenticate. No credentials are passed here,
# so login() must resolve them on its own
# (NOTE(review): presumably from a local Synapse config or cached token - confirm).
syn = synapseclient.Synapse()
syn.login()


In [None]:
# Fetch the example entity from Synapse and keep the path reported for it.
harmonized_data_entity = syn.get("syn25740978")
target_exp_validation_harmonized_data_file = harmonized_data_entity.path


## Create Validator Object on Data File

In [None]:
# Read the JSON data file through the context's default pandas datasource;
# the returned validator is what all column expectations are attached to.
pandas_source = context.sources.pandas_default
validator = pandas_source.read_json(target_exp_validation_harmonized_data_file)

# Name under which the expectation suite will be stored.
validator.expectation_suite_name = "target_exp_validation_harmonized"


## Add Expectations to Validator Object For Each Column

In [None]:
# ensembl_gene_id
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
# "ENSG" (4 characters) followed by 11 digits is 15 characters in total
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# checks format and allowed characters; the raw string keeps `\d` as a regex
# escape instead of an invalid Python string escape (a warning on modern Python)
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")


In [None]:
# hgnc_symbol: required string of 2-100 characters, restricted to letters,
# digits, underscores, parentheses and spaces
validator.expect_column_values_to_be_of_type(column="hgnc_symbol", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="hgnc_symbol", min_value=2, max_value=100
)
validator.expect_column_values_to_match_regex(
    column="hgnc_symbol", regex="^[a-zA-Z0-9_() ]*$"
)
validator.expect_column_values_to_not_be_null(column="hgnc_symbol")

In [None]:
# hypothesis_tested: string of 2-1000 characters that must not contain the
# Unicode replacement character (U+FFFD)
validator.expect_column_values_to_be_of_type(column="hypothesis_tested", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="hypothesis_tested", min_value=2, max_value=1000
)
validator.expect_column_values_to_not_match_regex(
    column="hypothesis_tested", regex="\ufffd"
)

In [None]:
# summary_findings: string of 2-1000 characters that must not contain the
# Unicode replacement character (U+FFFD)
validator.expect_column_values_to_be_of_type(column="summary_findings", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="summary_findings", min_value=2, max_value=1000
)
validator.expect_column_values_to_not_match_regex(
    column="summary_findings", regex="\ufffd"
)

In [None]:
# published: required yes/no flag; both capitalisations are accepted
validator.expect_column_values_to_be_of_type(column="published", type_="str")
validator.expect_column_values_to_be_in_set(
    column="published", value_set=["yes", "Yes", "no", "No"]
)
validator.expect_column_values_to_not_be_null(column="published")

In [None]:
# reference: string of 2-1000 characters that must not contain the
# Unicode replacement character (U+FFFD)
validator.expect_column_values_to_be_of_type(column="reference", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="reference", min_value=2, max_value=1000
)
validator.expect_column_values_to_not_match_regex(column="reference", regex="\ufffd")

In [None]:
# species: string drawn from a fixed vocabulary, free of U+FFFD
# NOTE(review): "Human " (with a trailing space) is an accepted value -
# presumably it occurs in the source data; confirm before removing it.
allowed_species = [
    "Mouse",
    "Human",
    "Human, Mouse",
    "Human, Drosophila",
    "Drosophila",
    "Drosophila, Human",
    "Human ",
    "Zebrafish",
]
validator.expect_column_values_to_be_of_type(column="species", type_="str")
validator.expect_column_values_to_be_in_set(column="species", value_set=allowed_species)
validator.expect_column_values_to_not_match_regex(column="species", regex="\ufffd")

In [None]:
# model_system: string of 2-1000 characters that must not contain the
# Unicode replacement character (U+FFFD)
validator.expect_column_values_to_be_of_type(column="model_system", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="model_system", min_value=2, max_value=1000
)
validator.expect_column_values_to_not_match_regex(column="model_system", regex="\ufffd")


In [None]:
# outcome_measure: string drawn from a fixed vocabulary, free of U+FFFD
allowed_outcome_measures = [
    "Behavioral",
    "Electrophysiology",
    "Biochemical",
    "Biomarker",
    "Cell Biology",
    "Other",
    "Histopathology",
    "Imaging",
    "Omics",
    "Immunochemistry",
    "Immunohistochemistry",
]
validator.expect_column_values_to_be_of_type(column="outcome_measure", type_="str")
validator.expect_column_values_to_be_in_set(
    column="outcome_measure", value_set=allowed_outcome_measures
)
validator.expect_column_values_to_not_match_regex(
    column="outcome_measure", regex="\ufffd"
)

In [None]:
# outcome_measure_details: string of 2-2000 characters that must not contain
# the Unicode replacement character (U+FFFD)
validator.expect_column_values_to_be_of_type(
    column="outcome_measure_details", type_="str"
)
validator.expect_column_value_lengths_to_be_between(
    column="outcome_measure_details", min_value=2, max_value=2000
)
validator.expect_column_values_to_not_match_regex(
    column="outcome_measure_details", regex="\ufffd"
)

In [None]:
# balanced_for_sex: yes/no flag; both capitalisations are accepted
validator.expect_column_values_to_be_of_type(column="balanced_for_sex", type_="str")
validator.expect_column_values_to_be_in_set(
    column="balanced_for_sex", value_set=["yes", "Yes", "no", "No"]
)

In [None]:
# contributors: string of 2-1000 characters
validator.expect_column_values_to_be_of_type(column="contributors", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="contributors", min_value=2, max_value=1000
)

In [None]:
# team: string drawn from the fixed list of contributing teams
allowed_teams = [
    "Duke",
    "Columbia-Rush",
    "MSSM - Zhang Lab",
    "Emory",
    "Mayo-UFL-ISB",
    "Mayo",
]
validator.expect_column_values_to_be_of_type(column="team", type_="str")
validator.expect_column_values_to_be_in_set(column="team", value_set=allowed_teams)

In [None]:
# reference_doi
validator.expect_column_values_to_be_of_type("reference_doi", "str")
validator.expect_column_value_lengths_to_be_between("reference_doi", 2, 1000)
# Raw string so that `\.` reaches the regex engine as an escaped literal dot
# instead of being an invalid Python string escape (a warning on modern Python).
# NOTE(review): the pattern has no ^/$ anchors and its character class omits
# '.', '-' and '_', which are common in DOIs - confirm whether a stricter,
# anchored pattern is intended before tightening it.
validator.expect_column_values_to_match_regex("reference_doi", r"https://doi\.org/[A-Za-z0-9/]+")

In [None]:
# date_report: string formatted as month/day/two-digit year (e.g. 01/31/24)
validator.expect_column_values_to_be_of_type(column="date_report", type_="str")
validator.expect_column_values_to_match_strftime_format(
    column="date_report", strftime_format="%m/%d/%y"
)

## Save Expectation Suite

In [None]:
# Persist the suite built above. discard_failed_expectations=False keeps every
# expectation in the saved suite, including any that did not pass against the
# example data file.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [None]:
# Register (or overwrite) the checkpoint bound to the validator built above.
checkpoint = context.add_or_update_checkpoint(name="agora-test-checkpoint", validator=validator)

# Execute the checkpoint, then hand its result to the context for viewing.
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)


## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
# Build the Data Docs for this context, then open them for viewing.
context.build_data_docs()
context.open_data_docs()
