In [15]:
import synapseclient

import pandas as pd
import great_expectations as gx

from agoradatatools.gx import GreatExpectationsRunner

# Load the Great Expectations Data Context that lives inside the package
# sources. The path is relative, so it presumably resolves against the
# directory the notebook kernel was started from -- verify when moving the notebook.
context = gx.get_context(
    project_root_dir="../src/agoradatatools/great_expectations"
)

# Create Expectation Suite for Network Data

## Get Example Data File

In [16]:
# Create a Synapse client and authenticate it. No credentials are passed here,
# so login() depends on whatever is configured for the local user
# (NOTE(review): presumably a cached token or Synapse config file -- confirm).
# NOTE(review): the saved output of this cell contains the account holder's
# name; consider clearing the output before committing the notebook.
syn = synapseclient.Synapse()
syn.login()


Welcome, Brad Macdonald!




UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.3.0) is available. Your version (4.0.0) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.3.0 release notes

https://python-docs.synapse.org/news/



INFO: 2024-07-17 10:49:02 | synapseclient_default | Welcome, Brad Macdonald!



In [17]:
# Fetch the example network data entity (syn17015361) from Synapse and keep
# the path of the retrieved file for loading below.
network_file = syn.get("syn17015361").path

## Create Validator Object on Data File

In [18]:
# The list of nested columns is empty for the network data; the frame is still
# passed through the runner's JSON-conversion helper before validation.
nested_columns = []
df = GreatExpectationsRunner.convert_nested_columns_to_json(
    pd.read_json(network_file),
    nested_columns,
)

# Wrap the frame in a Validator and name the suite the expectations belong to.
validator = context.sources.pandas_default.read_dataframe(df)
validator.expectation_suite_name = "network"

In [28]:
# Exploratory check: keep only the rows whose geneA_ensembl_gene_id is null.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom before relying on it.
df_missing = df.loc[df["geneA_ensembl_gene_id"].isna()]

In [29]:
# Preview the first rows of the null-ID subset. In the saved run the output
# below shows only the column header, i.e. no rows were selected.
df_missing.head()

Unnamed: 0,geneA_ensembl_gene_id,geneB_ensembl_gene_id,geneA_external_gene_name,geneB_external_gene_name,brainRegion


## Add Expectations to Validator Object For Each Column

In [19]:
# geneA_ensembl_gene_id: string typed, never null, and shaped like an Ensembl
# gene ID ("ENSG" followed by exactly 11 digits).
validator.expect_column_values_to_be_of_type("geneA_ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("geneA_ensembl_gene_id")
# Raw string literal: "\d" inside a normal string is an invalid escape sequence
# that Python warns about (the warning was echoed in this cell's output).
# r"..." hands the identical pattern to the expectation without the warning.
validator.expect_column_values_to_match_regex("geneA_ensembl_gene_id", r"^ENSG\d{11}$")













  validator.expect_column_values_to_match_regex("geneA_ensembl_gene_id", "^ENSG\d{11}$")



  validator.expect_column_values_to_match_regex("geneA_ensembl_gene_id", "^ENSG\d{11}$")









Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 347419,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [20]:
# geneB_ensembl_gene_id: string typed, never null, and shaped like an Ensembl
# gene ID ("ENSG" followed by exactly 11 digits).
validator.expect_column_values_to_be_of_type("geneB_ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("geneB_ensembl_gene_id")
# Raw string literal: "\d" inside a normal string is an invalid escape sequence
# that Python warns about (the warning was echoed in this cell's output).
# r"..." hands the identical pattern to the expectation without the warning.
validator.expect_column_values_to_match_regex("geneB_ensembl_gene_id", r"^ENSG\d{11}$")













  validator.expect_column_values_to_match_regex("geneB_ensembl_gene_id", "^ENSG\d{11}$")



  validator.expect_column_values_to_match_regex("geneB_ensembl_gene_id", "^ENSG\d{11}$")









Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 347419,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [30]:
# geneA_external_gene_name
# String typed; nulls are tolerated as long as at least 99% of rows are filled.
validator.expect_column_values_to_be_of_type(
    column="geneA_external_gene_name", type_="str"
)
validator.expect_column_values_to_not_be_null(
    column="geneA_external_gene_name", mostly=0.99
)
# Name length must fall between 2 and 100 characters.
validator.expect_column_value_lengths_to_be_between(
    column="geneA_external_gene_name", min_value=2, max_value=100
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 347419,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 318,
    "missing_percent": 0.09153212691303585,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [22]:
# geneB_external_gene_name
# String typed; nulls are tolerated as long as at least 99% of rows are filled.
validator.expect_column_values_to_be_of_type(
    column="geneB_external_gene_name", type_="str"
)
validator.expect_column_values_to_not_be_null(
    column="geneB_external_gene_name", mostly=0.99
)
# Name length must fall between 2 and 100 characters.
validator.expect_column_value_lengths_to_be_between(
    column="geneB_external_gene_name", min_value=2, max_value=100
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 347419,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 216,
    "missing_percent": 0.06217276545036397,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [23]:
# brainRegion
# String typed, never null, and restricted to the known region codes.
validator.expect_column_values_to_be_of_type(column="brainRegion", type_="str")
validator.expect_column_values_to_not_be_null(column="brainRegion")
validator.expect_column_values_to_be_in_set(
    column="brainRegion",
    value_set=[
        "CBE",
        "DLPFC",
        "FP",
        "IFG",
        "PHG",
        "STG",
        "TCX",
    ],
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 347419,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [24]:
# multi-field
# Each (brainRegion, geneA, geneB) combination may appear at most once.
validator.expect_compound_columns_to_be_unique(
    column_list=[
        "brainRegion",
        "geneA_ensembl_gene_id",
        "geneB_ensembl_gene_id",
    ]
)







Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 347419,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Save Expectation Suite

In [25]:
# Persist the "network" suite built above. discard_failed_expectations=False
# asks the validator to keep every expectation, including any that did not
# pass against this example file.
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [26]:
# Register (or overwrite) a checkpoint named "agora-test-checkpoint" that is
# bound to the validator configured above.
checkpoint = context.add_or_update_checkpoint(
    name="agora-test-checkpoint",
    validator=validator,
)
# Run the full suite against the example data, then hand the result to the
# context for viewing.
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)


Calculating Metrics:   0%|          | 0/86 [00:00<?, ?it/s]

## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
# Build the Data Docs for this context and open them for viewing.
# NOTE(review): this cell has no execution count, so it was not run in the
# saved session.
context.build_data_docs()
context.open_data_docs()
