In [69]:
import json

import pandas as pd

import great_expectations as gx
import synapseclient

from agoradatatools.gx import GreatExpectationsRunner

# Load the Great Expectations project whose root is the repo's great_expectations directory.
# NOTE(review): the path is relative, so it presumably only resolves when the notebook is
# run from its own directory — confirm.
context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')

# Custom list-oriented expectation classes.
# NOTE(review): these imports deliberately follow get_context(); presumably creating the
# context is what makes the project's `expectations` package importable — confirm before
# moving them up with the other imports.
# NOTE(review): the classes are not referenced by name in this notebook; presumably importing
# them is what makes the matching `validator.expect_column_values_to_have_list_*` methods
# available — confirm.
from expectations.expect_column_values_to_have_list_length import \
    ExpectColumnValuesToHaveListLength
from expectations.expect_column_values_to_have_list_members import \
    ExpectColumnValuesToHaveListMembers
from expectations.expect_column_values_to_have_list_members_of_type import \
    ExpectColumnValuesToHaveListMembersOfType

# Create Expectation Suite for Gene Info Data

## Get Example Data File

In [70]:
# Create a Synapse client and log in with it.
# login() is called without arguments, so no credentials are stored in this notebook.
# NOTE(review): presumably they are picked up from a local Synapse config or the
# environment — confirm.
syn = synapseclient.Synapse()
syn.login()


Welcome, Brad Macdonald!

INFO: 2024-06-27 11:52:39 | synapseclient_default | Welcome, Brad Macdonald!




UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.3.0) is available. Your version (4.0.0) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.3.0 release notes

https://python-docs.synapse.org/news/



In [71]:
# Fetch the Synapse entity syn17015359 and keep the `path` attribute of the result.
# NOTE(review): presumably `path` is the local location of the downloaded gene_info file,
# and no entity version is pinned here — confirm both.
gene_info_data_file = syn.get("syn17015359").path

## Create Validator Object on Data File

In [72]:
# Read the gene_info JSON file into a DataFrame.
gene_info_raw = pd.read_json(gene_info_data_file)

# Columns handed to the runner's conversion helper.
# NOTE(review): presumably the helper serializes these nested columns to JSON strings,
# which is why they are type-checked as "str" further down — confirm.
nested_columns = [
    "target_nominations",
    "median_expression",
    "druggability",
    "ensembl_info",
]
df = GreatExpectationsRunner.convert_nested_columns_to_json(
    gene_info_raw, nested_columns
)

# Wrap the converted frame in a validator and name the suite it will build.
validator = context.sources.pandas_default.read_dataframe(df)
validator.expectation_suite_name = "gene_info"

## Add Expectations to Validator Object For Each Column

In [74]:
# ensembl_gene_id
# Every row must carry a unique, non-null string of exactly 15 characters.
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# Checks format and allowed characters: "ENSG" followed by exactly 11 digits.
# The pattern is a raw string so that \d is passed to the regex engine as written
# rather than being parsed as an (invalid) Python string escape, which triggers a warning.
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")
# Last expression in the cell, so this is the result the notebook displays.
validator.expect_column_values_to_be_unique("ensembl_gene_id")














  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")



  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")









Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [75]:
# name
# Strings of 1 to 200 characters; mostly=0.70 lets the null check pass as long as
# at least 70% of rows have a value.
validator.expect_column_values_to_be_of_type(column="name", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="name",
    min_value=1,
    max_value=200,
)
validator.expect_column_values_to_not_be_null(column="name", mostly=0.70)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 7639,
    "unexpected_percent": 20.396774538075405,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [76]:
# summary
# Free-text string column; mostly=0.50 lets the null check pass as long as at
# least half of the rows have a value.
validator.expect_column_values_to_be_of_type(column="summary", type_="str")
validator.expect_column_values_to_not_be_null(column="summary", mostly=0.50)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 17323,
    "unexpected_percent": 46.25387162234327,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [77]:
# hgnc_symbol
# Strings of 1 to 25 characters made up only of letters, digits, "_", "." and "-";
# mostly=0.70 lets the null check pass as long as at least 70% of rows have a value.
validator.expect_column_values_to_be_of_type(column="hgnc_symbol", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="hgnc_symbol",
    min_value=1,
    max_value=25,
)
validator.expect_column_values_to_match_regex(
    column="hgnc_symbol",
    regex="^[a-zA-Z0-9_.-]*$",
)
validator.expect_column_values_to_not_be_null(column="hgnc_symbol", mostly=0.70)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 7639,
    "unexpected_percent": 20.396774538075405,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [78]:
# alias
# A non-null list column; the custom expectation is given member_type="str".
# NOTE(review): presumably that requires every list member to be a string — confirm
# against the custom expectation's implementation.
validator.expect_column_values_to_be_of_type(column="alias", type_="list")
validator.expect_column_values_to_not_be_null(column="alias")
validator.expect_column_values_to_have_list_members_of_type(
    column="alias",
    member_type="str",
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [79]:
# is_igap
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(column="is_igap", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_igap")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [80]:
# is_eqtl
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(column="is_eqtl", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_eqtl")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [81]:
# is_any_rna_changed_in_ad_brain
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(
    column="is_any_rna_changed_in_ad_brain",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(
    column="is_any_rna_changed_in_ad_brain",
)







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [82]:
# rna_brain_change_studied
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(
    column="rna_brain_change_studied",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(column="rna_brain_change_studied")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [83]:
# is_any_protein_changed_in_ad_brain
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(
    column="is_any_protein_changed_in_ad_brain",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(
    column="is_any_protein_changed_in_ad_brain",
)







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [84]:
# protein_brain_change_studied
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(
    column="protein_brain_change_studied",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(
    column="protein_brain_change_studied",
)







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [85]:
# target_nominations
# This column is one of the nested columns passed through
# convert_nested_columns_to_json, and is checked here as a string.
validator.expect_column_values_to_be_of_type(column="target_nominations", type_="str")

# Read the column's JSON schema from the project's schema directory.
with open(
    "../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/target_nominations.json",
    "r",
) as schema_file:
    target_nominations_schema = json.loads(schema_file.read())

# mostly=0.98 lets the check pass when at least 98% of rows conform to the schema.
validator.expect_column_values_to_match_json_schema(
    column="target_nominations",
    json_schema=target_nominations_schema,
    mostly=0.98,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 481,
    "unexpected_percent": 1.284310584214461,
    "partial_unexpected_list": [
      "[{\"source\": \"Community\", \"team\": \"Chang Lab\", \"rank\": \"PS1-7\", \"hgnc_symbol\": \"ICA1\", \"target_choice_justification\": \"ICA1 was identified as an important key driver by predictive network analysis in microglial cells of dorsolateral prefrontal cortex in ROSMAP cohort, and microglial cells of temporal cortex in MAYO cohort, and neuron cells of dorsolateral prefrontal cortex in ROSMAP cohort, and temporal cortex in MAYO cohort, and dorsolateral prefrontal cortex in ROSMAP cohort.\", \"predicted_therapeutic_direction\": \"Agonism predicted to reduce disease progression\", \"data_used_to_support_target_selection\": \"Predictive network analysis derived from RNAseq and GWAS genotypes\", \"data_synapseid\": \"syn18358612\", \"study\": \"Mayo, ROSMAP\", \"input_data\": \"Genetics, RNA\", \"validation_

In [86]:
# median_expression
# This column is one of the nested columns passed through
# convert_nested_columns_to_json, and is checked here as a string.
validator.expect_column_values_to_be_of_type(column="median_expression", type_="str")

# Read the column's JSON schema from the project's schema directory.
with open(
    "../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/median_expression.json",
    "r",
) as schema_file:
    median_expression_schema = json.loads(schema_file.read())

# No `mostly` is given for this schema check.
validator.expect_column_values_to_match_json_schema(
    column="median_expression",
    json_schema=median_expression_schema,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [87]:
# druggability
# This column is one of the nested columns passed through
# convert_nested_columns_to_json, and is checked here as a string.
validator.expect_column_values_to_be_of_type(column="druggability", type_="str")

# Read the column's JSON schema from the project's schema directory.
with open(
    "../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json",
    "r",
) as schema_file:
    druggability_schema = json.loads(schema_file.read())

# No `mostly` is given for this schema check.
validator.expect_column_values_to_match_json_schema(
    column="druggability",
    json_schema=druggability_schema,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [88]:
# total_nominations
# Float column whose values are expected to lie between 0 and 1000.
# No not-null expectation is added for this column.
validator.expect_column_values_to_be_of_type("total_nominations", "float")
# The expectation name is all lower-case; a mis-cased name does not match the
# real `expect_column_values_to_be_between` expectation.
validator.expect_column_values_to_be_between("total_nominations", 0, 1000)







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 36505,
    "missing_percent": 97.47143009719107,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [89]:
# biodomains
# Biodomain names handed to the custom list-members expectation.
# NOTE(review): presumably list members must all come from this set — confirm
# against the custom expectation's implementation.
allowed_biodomains = {
    "Apoptosis",
    "Vasculature",
    "Lipid Metabolism",
    "Metal Binding and Homeostasis",
    "Proteostasis",
    "Immune Response",
    "Cell Cycle",
    "DNA Repair",
    "Autophagy",
    "Mitochondrial Metabolism",
    "Structural Stabilization",
    "Synapse",
    "Endolysosome",
    "Oxidative Stress",
    "Epigenetic",
    "APP Metabolism",
    "RNA Spliceosome",
    "Tau Homeostasis",
    "Myelination",
}

# List column; no not-null expectation is added for it.
validator.expect_column_values_to_be_of_type(column="biodomains", type_="list")
validator.expect_column_values_to_have_list_members_of_type(
    column="biodomains",
    member_type="str",
)
validator.expect_column_values_to_have_list_members(
    column="biodomains",
    list_members=allowed_biodomains,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 21462,
    "missing_percent": 57.305350849086835,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [90]:
# is_adi
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(column="is_adi", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_adi")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [91]:
# is_tep
# Boolean flag that must be present on every row.
validator.expect_column_values_to_be_of_type(column="is_tep", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_tep")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [92]:
# resource_url
# String column whose values must start with the AD Knowledge Portal target-explorer URL.
# No not-null expectation is added for this column.
validator.expect_column_values_to_be_of_type("resource_url", "str")
# The dots in the host name are escaped so they match a literal "." only; an
# unescaped "." is a regex wildcard and would accept any character in that position.
# Raw string keeps the backslashes intact for the regex engine.
validator.expect_column_values_to_match_regex(
    "resource_url",
    r"^https://adknowledgeportal\.synapse\.org/Explore/Target",
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 37308,
    "missing_percent": 99.61550785004806,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [93]:
# ensembl_info
# This column is one of the nested columns passed through
# convert_nested_columns_to_json; it is checked here as a non-null string.
validator.expect_column_values_to_be_of_type(column="ensembl_info", type_="str")
validator.expect_column_values_to_not_be_null(column="ensembl_info")

# Read the column's JSON schema from the project's schema directory.
with open(
    "../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/ensembl_info.json",
    "r",
) as schema_file:
    ensembl_info_schema = json.loads(schema_file.read())

# No `mostly` is given for this schema check.
validator.expect_column_values_to_match_json_schema(
    column="ensembl_info",
    json_schema=ensembl_info_schema,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [94]:
# multi-field
# The (hgnc_symbol, ensembl_gene_id) pair must not repeat across rows.
validator.expect_compound_columns_to_be_unique(
    column_list=["hgnc_symbol", "ensembl_gene_id"],
)







Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Save Expectation Suite

In [95]:
# Persist the suite accumulated on the validator.
# discard_failed_expectations=False keeps every expectation in the saved suite,
# including any that did not pass against this example file.
validator.save_expectation_suite(discard_failed_expectations=False)

## Create Checkpoint and View Results

In [96]:
# Register (or replace) a checkpoint named "agora-test-checkpoint" bound to this
# validator, then run it to validate the data against the suite.
checkpoint = context.add_or_update_checkpoint(
    name="agora-test-checkpoint",
    validator=validator,
)
checkpoint_result = checkpoint.run()
# NOTE(review): presumably this opens the rendered validation result in a local
# browser, so it may not work in a headless environment — confirm.
context.view_validation_result(checkpoint_result)

Calculating Metrics:   0%|          | 0/224 [00:00<?, ?it/s]

## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
# Build the project's Data Docs, then open them.
# NOTE(review): open_data_docs() presumably launches a local browser, so this cell
# may need to be skipped in a headless or automated run — confirm.
context.build_data_docs()
context.open_data_docs()
