In [None]:
import synapseclient

import great_expectations as gx

# Load the Great Expectations data context from the project directory inside
# the package source tree.
# NOTE(review): the path is relative, so this presumably expects the kernel's
# working directory to be this notebook's folder — confirm.
context = gx.get_context(
    project_root_dir='../src/agoradatatools/great_expectations'
)

# Create Expectation Suite for RNASEQ Differential Expression Dataset

## Get Example Data File

In [None]:
# Create a Synapse client and authenticate it; `syn` is used in the next cell
# to fetch the example data file.
# NOTE(review): login() is called without arguments, so it presumably relies on
# locally configured credentials — confirm before running on a fresh machine.
syn = synapseclient.Synapse()
syn.login()


In [None]:
# Fetch the example entity from Synapse, then keep the `.path` it exposes so
# the file can be handed to Great Expectations below.
rnaseq_de_entity = syn.get("syn17015360")
rnaseq_de_data_file = rnaseq_de_entity.path

## Create Validator Object on Data File

In [None]:
# Read the JSON data file through the default pandas data source; the object
# returned here is the validator every expectation below is attached to.
validator = context.sources.pandas_default.read_json(rnaseq_de_data_file)

# Name of the expectation suite that this validator will save.
validator.expectation_suite_name = "rnaseq_differential_expression"

## Add Expectations to Validator Object For Each Column

In [None]:
# ensembl_gene_id: required string identifier, exactly 15 characters long,
# made of the literal prefix "ENSG" followed by 11 digits.
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# Raw string so that `\d` reaches the regex engine untouched; in a normal
# string literal it is an invalid escape sequence and triggers a warning.
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")

In [None]:
# hgnc_symbol: string of 1-25 characters drawn from letters, digits, "_", "."
# and "-". No not-null expectation is declared for this column.
validator.expect_column_values_to_be_of_type(column="hgnc_symbol", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="hgnc_symbol",
    min_value=1,
    max_value=25,
)
validator.expect_column_values_to_match_regex(
    column="hgnc_symbol",
    regex="^[a-zA-Z0-9_.-]*$",
)

In [None]:
# logfc: required float, expected to fall within [-5, 5].
validator.expect_column_values_to_be_of_type(column="logfc", type_="float")
validator.expect_column_values_to_not_be_null(column="logfc")
validator.expect_column_values_to_be_between(
    column="logfc",
    min_value=-5,
    max_value=5,
)

In [None]:
# fc: required float, expected to fall within [-1, 7].
validator.expect_column_values_to_be_of_type(column="fc", type_="float")
validator.expect_column_values_to_not_be_null(column="fc")
validator.expect_column_values_to_be_between(
    column="fc",
    min_value=-1,
    max_value=7,
)

In [None]:
# ci_l: required float, expected to fall within [-5, 5].
# NOTE(review): presumably the lower confidence-interval bound — confirm.
validator.expect_column_values_to_be_of_type(column="ci_l", type_="float")
validator.expect_column_values_to_not_be_null(column="ci_l")
validator.expect_column_values_to_be_between(
    column="ci_l",
    min_value=-5,
    max_value=5,
)

In [None]:
# ci_r: required float, expected to fall within [-5, 5].
# NOTE(review): presumably the upper confidence-interval bound — confirm.
validator.expect_column_values_to_be_of_type(column="ci_r", type_="float")
validator.expect_column_values_to_not_be_null(column="ci_r")
validator.expect_column_values_to_be_between(
    column="ci_r",
    min_value=-5,
    max_value=5,
)

In [None]:
# adj_p_val: required float, expected to fall within [0, 1].
validator.expect_column_values_to_be_of_type(column="adj_p_val", type_="float")
validator.expect_column_values_to_not_be_null(column="adj_p_val")
validator.expect_column_values_to_be_between(
    column="adj_p_val",
    min_value=0,
    max_value=1,
)

In [None]:
# tissue: required string restricted to a fixed list of tissue codes.
validator.expect_column_values_to_be_of_type(column="tissue", type_="str")
validator.expect_column_values_to_not_be_null(column="tissue")
validator.expect_column_values_to_be_in_set(
    column="tissue",
    value_set=[
        "CBE",
        "TCX",
        "FP",
        "IFG",
        "PHG",
        "STG",
        "DLPFC",
        "ACC",
        "PCC",
    ],
)

In [None]:
# study: required string restricted to the three listed study names.
validator.expect_column_values_to_be_of_type(column="study", type_="str")
validator.expect_column_values_to_not_be_null(column="study")
validator.expect_column_values_to_be_in_set(
    column="study",
    value_set=["MayoRNAseq", "MSBB", "ROSMAP"],
)

In [None]:
# model: required string restricted to the four listed model descriptions.
validator.expect_column_values_to_be_of_type("model", "str")
# The type check used to be declared twice here; the second copy is replaced by
# the not-null expectation that the other categorical columns (tissue, study)
# declare in the same position.
validator.expect_column_values_to_not_be_null("model")
validator.expect_column_values_to_be_in_set(
    "model",
    [
        "AD Diagnosis (males and females)",
        "AD Diagnosis x AOD (males and females)",
        "AD Diagnosis x Sex (females only)",
        "AD Diagnosis x Sex (males only)",
    ],
)

In [None]:
# multi-field
validator.expect_compound_columns_to_be_unique(["ensembl_gene_id", "tissue", "model"])

## Save Expectation Suite

In [None]:
# Persist the expectation suite built on the validator above.
# discard_failed_expectations=False keeps every declared expectation in the
# saved suite, including any that did not pass against the example data file.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [None]:
# Register (or overwrite) a checkpoint bound to the validator, run it, and
# then open the resulting validation report.
checkpoint = context.add_or_update_checkpoint(name="agora-test-checkpoint", validator=validator)
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)


## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
# Build the Data Docs for this Great Expectations project first, then open
# them; the order matters because the docs must exist before they are opened.
context.build_data_docs()
context.open_data_docs()
