In [1]:
import synapseclient

import great_expectations as gx
from great_expectations.data_context import FileDataContext

# File-backed GX Data Context rooted at ../great_expectations; later cells read data,
# save the suite and register checkpoints through this `context` object.
# NOTE(review): presumably `create` initialises the project directory when it is missing — confirm.
context = FileDataContext.create('../great_expectations')

from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength
from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers
from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType


# Create Expectation Suite for Metabolomics Data

## Get Example Data File

In [2]:
# Open a Synapse client session. login() is called without arguments, so no
# credentials are embedded in the notebook
# (presumably they are resolved from local Synapse configuration — confirm).
syn = synapseclient.Synapse()
syn.login()


Welcome, Brad Macdonald!




UPGRADE AVAILABLE

A more recent version of the Synapse Client (3.1.1) is available. Your version (2.7.2) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 3.1.1 release notes

https://python-docs.synapse.org/build/html/news.html



In [3]:
# Fetch Synapse entity syn19276330 and keep the `.path` of the returned object;
# this value is what read_json() receives when the validator is created.
# NOTE(review): assumes `.path` is the local path of the downloaded JSON file — confirm.
metabolomics_data_file = syn.get("syn19276330").path


## Create Validator Object on Data File

In [4]:
# Read the example JSON file through the context's default pandas datasource.
# The returned object is the validator that every expectation below is called on.
validator = context.sources.pandas_default.read_json(
    metabolomics_data_file
)
# Name the suite "metabolomics"; the same name is used later to load it back
# with context.get_expectation_suite("metabolomics").
validator.expectation_suite_name = "metabolomics"


## Add Expectations to Validator Object For Each Column

In [6]:
# ad_diagnosis_p_value: every value must be a non-null list holding exactly one item.
column = "ad_diagnosis_p_value"
validator.expect_column_values_to_be_of_type(column, "list")
validator.expect_column_values_to_not_be_null(column)
# Custom and experimental expectations must receive their arguments as keywords.
validator.expect_column_values_to_have_list_length(column=column, list_length=1)





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# associated_gene_name: non-null string of 2 to 100 characters.
column = "associated_gene_name"
validator.expect_column_values_to_be_of_type(column, "str")
validator.expect_column_values_to_not_be_null(column)
validator.expect_column_value_lengths_to_be_between(column, min_value=2, max_value=100)
# Permitted characters: ASCII letters, digits, underscore, period and dash.
validator.expect_column_values_to_match_regex(column, "^[A-Za-z0-9_.-]+$")





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# association_p: non-null float within the closed interval [0, 1].
column = "association_p"
validator.expect_column_values_to_be_of_type(column, "float")
validator.expect_column_values_to_not_be_null(column)
validator.expect_column_values_to_be_between(column, min_value=0, max_value=1)





Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# boxplot_group_names: non-null list of two string members drawn from {"AD", "CN"}.
# NOTE(review): the output recorded for this cell (the result of the last call below)
# reports success=false with 100% unexpected values such as ["CN", "AD"].
# Confirm whether the member-type expectation or the recorded output is at fault
# before relying on this suite.
column = "boxplot_group_names"
validator.expect_column_values_to_be_of_type(column, "list")
validator.expect_column_values_to_not_be_null(column)
# Custom expectations take keyword arguments only.
validator.expect_column_values_to_have_list_length(column=column, list_length=2)
validator.expect_column_values_to_have_list_members(column=column, list_members={"AD", "CN"})
validator.expect_column_values_to_have_list_members_of_type(column=column, member_type="str")





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 15401,
    "unexpected_count": 15401,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
      [
        "CN",
        "AD"
      ],
   

In [12]:
# ensembl gene id: non-null, unique string of exactly 15 characters.
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# Checks format and allowed characters: the literal prefix "ENSG" followed by 11 digits.
# A raw string is used so that "\d" reaches the regex engine without Python
# emitting an invalid-escape-sequence warning; the pattern text itself is unchanged.
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")
validator.expect_column_values_to_be_unique("ensembl_gene_id")




  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# gene_wide_p_threshold_1kgp: non-null float within the closed interval [0, 0.05].
column = "gene_wide_p_threshold_1kgp"
validator.expect_column_values_to_be_of_type(column, "float")
validator.expect_column_values_to_not_be_null(column)
validator.expect_column_values_to_be_between(column, min_value=0, max_value=0.05)





Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [14]:
# metabolite full name: non-null string of 2 to 100 characters.
validator.expect_column_values_to_be_of_type("metabolite_full_name", "str")
validator.expect_column_values_to_not_be_null("metabolite_full_name")
validator.expect_column_value_lengths_to_be_between("metabolite_full_name", min_value=2, max_value=100)
# Allows ASCII letters, digits, whitespace, hyphens, colons, periods, parentheses and plus signs.
# A raw string is used so that "\s" and "\-" reach the regex engine without Python
# emitting an invalid-escape-sequence warning; the pattern text itself is unchanged.
validator.expect_column_values_to_match_regex("metabolite_full_name", r"^[A-Za-z0-9\s\-:.()+]+$")




  validator.expect_column_values_to_match_regex("metabolite_full_name", "^[A-Za-z0-9\s\-:.()+]+$")




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# metabolite_id: non-null string of 2 to 100 characters.
column = "metabolite_id"
validator.expect_column_values_to_be_of_type(column, "str")
validator.expect_column_values_to_not_be_null(column)
validator.expect_column_value_lengths_to_be_between(column, min_value=2, max_value=100)
# Permitted characters: ASCII letters, digits and periods.
validator.expect_column_values_to_match_regex(column, "^[A-Za-z0-9.]+$")





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [16]:
# n_per_group: every value must be a non-null list holding exactly two items.
column = "n_per_group"
validator.expect_column_values_to_be_of_type(column, "list")
validator.expect_column_values_to_not_be_null(column)
# Custom expectations take keyword arguments only.
validator.expect_column_values_to_have_list_length(column=column, list_length=2)





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [17]:
# transposed_boxplot_stats: every value must be a non-null list holding exactly two items.
column = "transposed_boxplot_stats"
validator.expect_column_values_to_be_of_type(column, "list")
validator.expect_column_values_to_not_be_null(column)
# Custom expectations take keyword arguments only.
validator.expect_column_values_to_have_list_length(column=column, list_length=2)





Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 15401,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Save Expectation Suite

In [18]:
# Persist the suite built above. discard_failed_expectations=False keeps every
# expectation, including any that did not pass on the example data.
# NOTE(review): the boxplot_group_names cell's recorded output shows a failing
# expectation, so it would be saved as-is — confirm this is intended.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [19]:
# Register (or overwrite) a checkpoint named "agora-test-checkpoint" bound to the
# validator above, run it, and hand the result to the context for viewing.
checkpoint = context.add_or_update_checkpoint(
    name="agora-test-checkpoint",
    validator=validator,
)
checkpoint_result = checkpoint.run()
# NOTE(review): presumably opens the rendered validation report — confirm.
context.view_validation_result(checkpoint_result)


Calculating Metrics:   0%|          | 0/172 [00:00<?, ?it/s]

## Run Expectation Suite from JSON

In [21]:
# Core logic for running the saved expectations against a new dataset.
# NOTE: this rebinds `validator`, `checkpoint` and `checkpoint_result` from the cells above.
test_dataset = "./staging/metabolomics.json"
validator = context.sources.pandas_default.read_json(
    test_dataset,
)
# Load the previously saved "metabolomics" suite and attach it to the new validator.
expectation_suite = context.get_expectation_suite("metabolomics")
validator.expectation_suite = expectation_suite
# NOTE(review): the return value of validate() is discarded and checkpoint.run()
# below triggers a second metrics pass (see the two progress bars in the output);
# this call looks redundant — confirm before removing.
validator.validate()
checkpoint = context.add_or_update_checkpoint(
    name="metabolomics",
    validator=validator,
)
checkpoint_result = checkpoint.run()


Calculating Metrics:   0%|          | 0/106 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/172 [00:00<?, ?it/s]

In [22]:
# Display the validation result the context holds for the "metabolomics" suite
# (rendered as the cell output).
context.get_validation_result("metabolomics")


{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "batch_id": "default_pandas_datasource-#ephemeral_pandas_asset",
          "column": "ad_diagnosis_p_value",
          "type_": "list"
        },
        "meta": {}
      },
      "result": {
        "element_count": 15401,
        "missing_count": 0,
        "missing_percent": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": [],
        "partial_unexpected_list": [],
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "unexpected_percent_total": 0.0
      },
      "meta": {},
      "exception_info": {
        "exception_message": null,
        "exception_traceback": null,
        "raised_exception": false
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expect

## Get path to latest report file

In [50]:
import os

# Path of the HTML report for the first validation result of the last checkpoint run.
# TODO: turn this into a class method and probably upload the file to Synapse.
# NOTE(review): the base directory is relative to the working directory and differs
# from the '../great_expectations' root given to FileDataContext.create — confirm.
results_path_items = [*checkpoint_result.list_validation_result_identifiers()[0].to_tuple()]
results_path_items[-1] += ".html"  # the last component is the report file stem
results_path = os.path.join(
    "./great_expectations/gx/uncommitted/data_docs/local_site/validations", *results_path_items
)


## GX Runner Class

In [None]:
import synapseclient

import great_expectations as gx
from great_expectations.data_context import FileDataContext

# Data Context for the runner section, rooted at the same ../great_expectations
# directory as the context created at the top of the notebook.
context = FileDataContext.create('../great_expectations')

from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength


In [None]:
class GreatExpectationsRunner:
    """Run a saved Great Expectations suite on a JSON dataset and upload the report to Synapse.

    Typical use: construct, call :meth:`run`, then
    :meth:`upload_results_file_to_synapse` (or :meth:`get_results_path`).

    Args:
        syn: Synapse client; only its ``store`` method is used.
        context: Great Expectations data context used to read the dataset,
            load the suite and register the checkpoint.
        expectation_suite_name: Name of the saved suite to validate against.

    Attributes:
        checkpoint_result: Result of the most recent :meth:`run`, or ``None``
            when no run has happened yet.

    NOTE(review): suites that use custom expectations presumably need those
    expectation modules imported before :meth:`run` is called — confirm.
    """

    # Directory the HTML validation reports are looked up in (relative to the
    # working directory). Override on a subclass or instance if the project lives elsewhere.
    # NOTE(review): differs from the '../great_expectations' root used when the
    # notebook creates its context — confirm which one is correct.
    VALIDATIONS_DIR = "./great_expectations/gx/uncommitted/data_docs/local_site/validations"

    def __init__(self, syn, context, expectation_suite_name):
        self.syn = syn
        self.context = context
        self.expectation_suite_name = expectation_suite_name
        # Set by run(); kept as None until then so misuse can be reported clearly.
        self.checkpoint_result = None

    def run(self, dataset_path, expectation_suite_name=None):
        """Validate ``dataset_path`` against the configured suite via a checkpoint.

        Args:
            dataset_path: Path of the JSON file to validate.
            expectation_suite_name: Name given to the checkpoint. The suite that
                is loaded is always ``self.expectation_suite_name``; this argument
                only names the checkpoint and defaults to that same suite name.

        The checkpoint result is stored on ``self.checkpoint_result``.
        """
        checkpoint_name = (
            self.expectation_suite_name
            if expectation_suite_name is None
            else expectation_suite_name
        )
        validator = self.context.sources.pandas_default.read_json(dataset_path)
        validator.expectation_suite = self.context.get_expectation_suite(
            self.expectation_suite_name
        )
        # Kept from the original flow; its return value is not used here.
        validator.validate()
        checkpoint = self.context.add_or_update_checkpoint(
            name=checkpoint_name,
            validator=validator,
        )
        self.checkpoint_result = checkpoint.run()

    def get_results_path(self):
        """Return the path of the HTML report for the first validation result.

        The path is ``VALIDATIONS_DIR`` joined with the components of the first
        validation result identifier, with ``.html`` appended to the last one.

        Raises:
            RuntimeError: If :meth:`run` has not been called yet.
        """
        # Imported locally so the class does not depend on an earlier notebook cell.
        import os

        if self.checkpoint_result is None:
            raise RuntimeError(
                "No checkpoint result available; call run() before get_results_path()."
            )
        identifier = self.checkpoint_result.list_validation_result_identifiers()[0]
        results_path_items = list(identifier.to_tuple())
        results_path_items[-1] = results_path_items[-1] + ".html"
        return os.path.join(self.VALIDATIONS_DIR, *results_path_items)

    def upload_results_file_to_synapse(self, parent_id="parent_folder_here"):
        """Store the HTML report from the last run in Synapse.

        Args:
            parent_id: Value passed as ``parentId`` to ``synapseclient.File``.
                The default is the original placeholder and must be replaced
                with a real Synapse id for the upload to succeed.

        Raises:
            RuntimeError: If :meth:`run` has not been called yet.
        """
        results_path = self.get_results_path()
        self.syn.store(synapseclient.File(results_path, parentId=parent_id))
