In [1]:
import json

import pandas as pd

import great_expectations as gx
import synapseclient

from agoradatatools.gx import GreatExpectationsRunner

# Load the Great Expectations data context from the project directory inside the package.
context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')

# NOTE(review): these imports sit after get_context() rather than with the other imports —
# presumably the `expectations` package only becomes importable once the context exists;
# confirm before moving them to the top of the cell.
# The imported classes are not referenced by name in the cells below; presumably importing
# them is what makes the custom expect_column_values_to_have_list_* methods available on
# the validator — TODO confirm.
from expectations.expect_column_values_to_have_list_length import \
    ExpectColumnValuesToHaveListLength
from expectations.expect_column_values_to_have_list_members import \
    ExpectColumnValuesToHaveListMembers
from expectations.expect_column_values_to_have_list_members_of_type import \
    ExpectColumnValuesToHaveListMembersOfType

# Create Expectation Suite for Gene Info Data

## Get Example Data File

In [2]:
# Create the Synapse client used to fetch the example data file below.
syn = synapseclient.Synapse()
# login() is called with no arguments — presumably it picks up locally configured
# credentials; confirm for your environment.
syn.login()



UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.6.0 release notes

https://python-docs.synapse.org/news/



Welcome, beatrizsaldana!

INFO: 2024-11-18 14:55:42 | synapseclient_default | Welcome, beatrizsaldana!



In [3]:
# Fetch Synapse entity syn17015359 and keep the file path it reports.
gene_info_entity = syn.get("syn17015359")
gene_info_data_file = gene_info_entity.path

## Create Validator Object on Data File

In [4]:
# Columns handed to convert_nested_columns_to_json before validation.
nested_columns = [
    'target_nominations',
    'median_expression',
    'druggability',
    'ensembl_info',
]
gene_info_raw = pd.read_json(gene_info_data_file)
df = GreatExpectationsRunner.convert_nested_columns_to_json(gene_info_raw, nested_columns)

# Wrap the converted frame in a validator and name the suite it will build.
validator = context.sources.pandas_default.read_dataframe(df)
validator.expectation_suite_name = "gene_info"

## Add Expectations to Validator Object For Each Column

In [5]:
# ensembl_gene_id: required, unique string identifier
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
# Checks format and allowed characters: "ENSG" followed by exactly 11 digits.
# The pattern is a raw string so that \d reaches the regex engine untouched; in a
# plain string literal "\d" is an invalid escape sequence and Python warns about it.
validator.expect_column_values_to_match_regex("ensembl_gene_id", r"^ENSG\d{11}$")
validator.expect_column_values_to_be_unique("ensembl_gene_id")














  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")



  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")









Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# name: string of 1 to 200 characters; nulls are tolerated while at least 70% of rows are populated
validator.expect_column_values_to_be_of_type(column="name", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="name",
    min_value=1,
    max_value=200,
)
validator.expect_column_values_to_not_be_null(column="name", mostly=0.70)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 7639,
    "unexpected_percent": 20.396774538075405,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# summary: string; nulls are tolerated while at least 50% of rows are populated
validator.expect_column_values_to_be_of_type(column="summary", type_="str")
validator.expect_column_values_to_not_be_null(column="summary", mostly=0.50)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 17323,
    "unexpected_percent": 46.25387162234327,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# hgnc_symbol: string of 1 to 25 characters limited to letters, digits, "_", "." and "-";
# nulls are tolerated while at least 70% of rows are populated
validator.expect_column_values_to_be_of_type(column="hgnc_symbol", type_="str")
validator.expect_column_value_lengths_to_be_between(
    column="hgnc_symbol",
    min_value=1,
    max_value=25,
)
validator.expect_column_values_to_match_regex(
    column="hgnc_symbol",
    regex="^[a-zA-Z0-9_.-]*$",
)
validator.expect_column_values_to_not_be_null(column="hgnc_symbol", mostly=0.70)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 7639,
    "unexpected_percent": 20.396774538075405,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# alias: required list column whose members are checked to be strings
validator.expect_column_values_to_be_of_type(column="alias", type_="list")
validator.expect_column_values_to_not_be_null(column="alias")
validator.expect_column_values_to_have_list_members_of_type(
    column="alias",
    member_type="str",
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# is_igap: required boolean flag
validator.expect_column_values_to_be_of_type(column="is_igap", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_igap")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# is_eqtl: required boolean flag
validator.expect_column_values_to_be_of_type(column="is_eqtl", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_eqtl")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# is_any_rna_changed_in_ad_brain: required boolean flag
validator.expect_column_values_to_be_of_type(
    column="is_any_rna_changed_in_ad_brain",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(column="is_any_rna_changed_in_ad_brain")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# rna_brain_change_studied: required boolean flag
validator.expect_column_values_to_be_of_type(
    column="rna_brain_change_studied",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(column="rna_brain_change_studied")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [14]:
# is_any_protein_changed_in_ad_brain: required boolean flag
validator.expect_column_values_to_be_of_type(
    column="is_any_protein_changed_in_ad_brain",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(column="is_any_protein_changed_in_ad_brain")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# protein_brain_change_studied: required boolean flag
validator.expect_column_values_to_be_of_type(
    column="protein_brain_change_studied",
    type_="bool",
)
validator.expect_column_values_to_not_be_null(column="protein_brain_change_studied")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [16]:
# target_nominations: string column (one of the nested columns converted to JSON earlier)
# validated against the JSON schema stored alongside the suite
validator.expect_column_values_to_be_of_type(column="target_nominations", type_="str")
with open("../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/target_nominations.json") as file:
    target_nominations_schema = json.load(file)
validator.expect_column_values_to_match_json_schema(
    column="target_nominations",
    json_schema=target_nominations_schema,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [17]:
# median_expression: string column (one of the nested columns converted to JSON earlier)
# validated against the JSON schema stored alongside the suite
validator.expect_column_values_to_be_of_type(column="median_expression", type_="str")
with open("../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/median_expression.json") as file:
    median_expression_schema = json.load(file)
validator.expect_column_values_to_match_json_schema(
    column="median_expression",
    json_schema=median_expression_schema,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [18]:
# druggability: string column (one of the nested columns converted to JSON earlier)
# validated against the JSON schema stored alongside the suite
validator.expect_column_values_to_be_of_type(column="druggability", type_="str")
with open("../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json") as file:
    druggability_schema = json.load(file)
validator.expect_column_values_to_match_json_schema(
    column="druggability",
    json_schema=druggability_schema,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [19]:
# total_nominations: float values between 0 and 1000.
# No not-null expectation is added for this column.
validator.expect_column_values_to_be_of_type("total_nominations", "float")
# Method name corrected: it was spelled "expect_column_vaLues_to_be_between"
# (capital L), which does not match the expectation's name.
validator.expect_column_values_to_be_between("total_nominations", 0, 1000)







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 36504,
    "missing_percent": 97.4687600128164,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [20]:
# biodomains: list column with string members, checked against the biodomain names below
validator.expect_column_values_to_be_of_type(column="biodomains", type_="list")
validator.expect_column_values_to_have_list_members_of_type(
    column="biodomains",
    member_type="str",
)
validator.expect_column_values_to_have_list_members(
    column="biodomains",
    list_members={
        "Apoptosis",
        "Vasculature",
        "Lipid Metabolism",
        "Metal Binding and Homeostasis",
        "Proteostasis",
        "Immune Response",
        "Cell Cycle",
        "DNA Repair",
        "Autophagy",
        "Mitochondrial Metabolism",
        "Structural Stabilization",
        "Synapse",
        "Endolysosome",
        "Oxidative Stress",
        "Epigenetic",
        "APP Metabolism",
        "RNA Spliceosome",
        "Tau Homeostasis",
        "Myelination",
    },
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 21462,
    "missing_percent": 57.305350849086835,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [21]:
# is_adi: required boolean flag
validator.expect_column_values_to_be_of_type(column="is_adi", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_adi")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [22]:
# is_tep: required boolean flag
validator.expect_column_values_to_be_of_type(column="is_tep", type_="bool")
validator.expect_column_values_to_not_be_null(column="is_tep")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [23]:
# resource_url: string that must start with the AD Knowledge Portal target-explorer URL.
# No not-null expectation is added for this column.
validator.expect_column_values_to_be_of_type("resource_url", "str")
# The dots in the host name are escaped so they match a literal "." only; unescaped,
# each "." would accept any character in that position. Raw string keeps the
# backslashes intact for the regex engine.
validator.expect_column_values_to_match_regex(
    "resource_url",
    r"^https://adknowledgeportal\.synapse\.org/Explore/Target",
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 37307,
    "missing_percent": 99.61283776567339,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [24]:
# ensembl_info: required string column (one of the nested columns converted to JSON earlier)
# validated against the JSON schema stored alongside the suite
validator.expect_column_values_to_be_of_type(column="ensembl_info", type_="str")
validator.expect_column_values_to_not_be_null(column="ensembl_info")
with open("../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/ensembl_info.json") as file:
    ensembl_info_schema = json.load(file)
validator.expect_column_values_to_match_json_schema(
    column="ensembl_info",
    json_schema=ensembl_info_schema,
)







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 37452,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [25]:
# uniprotkb_accessions
# NOTE(review): every expectation in this cell is commented out, so the cell adds nothing
# to the suite. The IndexError shown in the saved output presumably comes from an earlier
# run with these lines enabled — confirm and either fix or remove them.
#validator.expect_column_values_to_be_of_type("uniprotkb_accessions", "list")
#validator.expect_column_values_to_have_list_members_of_type(column="uniprotkb_accessions", member_type="str")

# Regex from https://www.uniprot.org/help/accession_numbers
# NOTE(review): the line below names the column "uniprotkb_accession" (singular) while the
# lines above use "uniprotkb_accessions" — confirm the column name before re-enabling.
# validator.expect_column_values_to_match_regex("uniprotkb_accession", "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

IndexError: list index out of range

## Save Expectation Suite

In [None]:
# Save the suite built above; discard_failed_expectations=False keeps expectations
# even if they did not pass against the example data.
validator.save_expectation_suite(discard_failed_expectations=False)

## Create Checkpoint and View Results

In [None]:
# Register (or refresh) a checkpoint bound to this validator, run it, and view the result.
checkpoint = context.add_or_update_checkpoint(name="agora-test-checkpoint", validator=validator)
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)

## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
# Build the data docs for this context, then open them for viewing.
context.build_data_docs()
context.open_data_docs()
