In [1]:
import synapseclient
import json

import pandas as pd
import great_expectations as gx

from agoradatatools.gx import GreatExpectationsRunner

context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')

# Create Expectation Suite for Overall Scores Data

## Get Example Data File

In [2]:
syn = synapseclient.Synapse()
syn.login()



UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.0.0) is available. Your version (3.1.1) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 4.0.0 release notes

https://python-docs.synapse.org/news/



Welcome, Brad Macdonald!

INFO: 2024-01-31 13:45:58 | synapseclient_default | Welcome, Brad Macdonald!



In [3]:
overall_scores_data_file = syn.get("syn25740976").path


In [4]:
import pandas as pd

df = pd.read_json(overall_scores_data_file)

In [10]:
df.head()

Unnamed: 0,ensembl_gene_id,hgnc_symbol,target_risk_score,genetics_score,multi_omics_score
0,ENSG00000204525,HLA-C,4.743823,2.774846,1.968977
1,ENSG00000142192,APP,4.711319,2.731975,1.979344
2,ENSG00000130203,APOE,4.679266,2.850445,1.828822
3,ENSG00000234745,HLA-B,4.679037,2.723583,1.955454
4,ENSG00000197535,MYO5A,4.669046,2.675176,1.993871


In [27]:
print(max(df["multi_omics_score"]))
print(min(df["multi_omics_score"]))

2.0
0.0


In [14]:
print(len(set(df["hgnc_symbol"])))

19678


In [15]:
# get minimum string length in a list
print(min([len(x) for x in df["hgnc_symbol"]]))

# get maximum string length in a list
print(max([len(x) for x in df["hgnc_symbol"]]))

TypeError: object of type 'NoneType' has no len()

## Create Validator Object on Data File

In [12]:
df = pd.read_json(overall_scores_data_file)
nested_columns = []
df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)
validator = context.sources.pandas_default.read_dataframe(df)
validator.expectation_suite_name = "overall_scores"

## Add Expectations to Validator Object For Each Column

In [13]:
# ensembl_gene_id
validator.expect_column_values_to_be_of_type("ensembl_gene_id", "str")
validator.expect_column_values_to_not_be_null("ensembl_gene_id")
validator.expect_column_value_lengths_to_equal("ensembl_gene_id", 15)
# checks format and allowed chatacters
validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")
validator.expect_column_values_to_be_unique("ensembl_gene_id")














  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")



  validator.expect_column_values_to_match_regex("ensembl_gene_id", "^ENSG\d{11}$")









Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 24786,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [None]:
# hgnc_symbol
validator.expect_column_values_to_be_of_type("hgnc_symbol", "str")
# validator.expect_column_values_to_not_be_null("hgnc_symbol")


In [20]:
# target_risk_score
validator.expect_column_values_to_be_of_type("target_risk_score", "float")
validator.expect_column_values_to_be_between("target_risk_score", 0, 5)
validator.expect_column_values_to_not_be_null("target_risk_score")







Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 24786,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [26]:
# genetics_score
validator.expect_column_values_to_be_of_type("genetics_score", "float")
validator.expect_column_values_to_be_between("genetics_score", 0, 3)


Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 24786,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 508,
    "missing_percent": 2.0495440974743806,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [29]:
#  multi_omics_score
validator.expect_column_values_to_be_of_type("multi_omics_score", "float")
validator.expect_column_values_to_be_between("multi_omics_score", 0, 2)








Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]







Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 24786,
    "unexpected_count": 910,
    "unexpected_percent": 3.671427418704107,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Save Expectation Suite

In [None]:
validator.save_expectation_suite(discard_failed_expectations=False)


## Create Checkpoint and View Results

In [None]:
checkpoint = context.add_or_update_checkpoint(
    name="agora-test-checkpoint",
    validator=validator,
)
checkpoint_result = checkpoint.run()
context.view_validation_result(checkpoint_result)


## Build Data Docs - Click on Expectation Suite to View All Expectations

In [None]:
context.build_data_docs()
context.open_data_docs()
