In [1]:
# Have to specify TMPDIR and target in pip install command to work around the kernel crash issue due to 
# the small ephemeral local storage quota allocated to /tmp which is used by default by pip install
!TMPDIR=/home/hyi/temp pip install git+https://github.com/vaclab/BiasAnalyzer.git --target /home/hyi/temp --upgrade

Collecting git+https://github.com/vaclab/BiasAnalyzer.git
  Cloning https://github.com/vaclab/BiasAnalyzer.git to ./temp/pip-req-build-9u5eohe1
  Running command git clone --filter=blob:none --quiet https://github.com/vaclab/BiasAnalyzer.git /home/hyi/temp/pip-req-build-9u5eohe1
  Resolved https://github.com/vaclab/BiasAnalyzer.git to commit a3d43525ddd2b934d8a094901f7ad62c52f2e724
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting duckdb<2.0.0,>=1.1.1 (from BiasAnalyzer==0.1.0)
  Obtaining dependency information for duckdb<2.0.0,>=1.1.1 from https://files.pythonhosted.org/packages/bf/56/f627b6fcd4aa34015a15449d852ccb78d7cc6eda654aa20c1d378e99fa76/duckdb-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached duckdb-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Collecting duckdb-engine

In [2]:
# Make the BiasAnalyzer package importable: it was installed with `pip --target`
# into a folder that is not on the interpreter's module search path.
import sys

biasanalyzer_target_dir = '/home/hyi/temp'
# Only add the folder once, so re-running this cell does not pile up duplicate entries.
if biasanalyzer_target_dir not in sys.path:
    sys.path.append(biasanalyzer_target_dir)

In [3]:
from biasanalyzer.api import BIAS

In [4]:
# Create the BIAS object that the rest of the notebook works through.
# No configuration is loaded at this point; it is supplied by the set_config() call
# in the next step.
bias = BIAS()

no configuration file specified. Call set_config(config_file_path) next to specify configurations
Cohort Definition table created.
Cohort table created.


In [5]:
# Load the YAML configuration file for this BIAS object.
# NOTE(review): the path is specific to this user's home directory; adjust it before re-running elsewhere.
bias.set_config('/home/hyi/bias/config/config.yaml')

configuration specified in /home/hyi/bias/config/config.yaml loaded successfully


In [6]:
# Connect to the root OMOP CDM database.
# set_root_omop() takes no arguments here: the connection details come from the
# root_omop_cdm_database section of the configuration file. An example configuration is shown
# in https://github.com/hyi/HealthDataBias/blob/main/tests/assets/test_config.yaml
bias.set_root_omop()

Connected to the OMOP CDM database (read-only).


In [7]:
# Baseline cohort: every condition_occurrence row recorded with the COVID-19
# condition concept (37311061). The query exposes the condition start/end dates
# under the cohort_start_date / cohort_end_date column names.
baseline_cohort_query = (
    'SELECT person_id, '
    'condition_start_date as cohort_start_date, '
    'condition_end_date as cohort_end_date '
    'FROM condition_occurrence '
    'WHERE condition_concept_id = 37311061'
)

# Arguments: cohort name, description, defining SQL query, creator.
baseline_cohort = bias.create_cohort(
    'COVID-19 patients',
    'Patients with COVID-19 condition',
    baseline_cohort_query,
    'system',
)

# Show the stored cohort definition.
baseline_cohort_def = baseline_cohort.metadata
print(f'all COVID-19 patient cohort definition: {baseline_cohort_def}')

# Show only a small sample of the cohort rows rather than the whole cohort.
baseline_cohort_data = baseline_cohort.data
print(f'The first five patients in the COVID-19 patient cohort: {baseline_cohort_data[:5]}')

Cohort definition inserted successfully.
Cohort COVID-19 patients successfully created.
cohort created successfully
all COVID-19 patient cohort definition: {'id': 1, 'name': 'COVID-19 patients', 'description': 'Patients with COVID-19 condition', 'created_date': datetime.date(2024, 11, 7), 'creation_info': 'SELECT person_id, condition_start_date as cohort_start_date, condition_end_date as cohort_end_date FROM condition_occurrence WHERE condition_concept_id = 37311061', 'created_by': 'system'}
The first five patients in the COVID-19 patient cohort: [{'subject_id': 20342, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 3)}, {'subject_id': 20343, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 9), 'cohort_end_date': datetime.date(2020, 4, 7)}, {'subject_id': 20344, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id

In [8]:
# Summary statistics of the baseline cohort: the overall summary first, then one
# summary for each demographic variable.
cohort_stats = baseline_cohort.get_stats()
print(f'the baseline cohort stats: {cohort_stats}')

# The generator is consumed left to right, so get_stats() is called in the order
# age, gender, race, ethnicity.
(cohort_age_stats,
 cohort_gender_stats,
 cohort_race_stats,
 cohort_ethnicity_stats) = (
    baseline_cohort.get_stats(variable)
    for variable in ("age", "gender", "race", "ethnicity")
)

print(f'the baseline cohort age stats: {cohort_age_stats}')
print(f'the baseline cohort gender stats: {cohort_gender_stats}')
print(f'the baseline cohort race stats: {cohort_race_stats}')
print(f'the baseline cohort ethnicity stats: {cohort_ethnicity_stats}')

the baseline cohort stats: [{'total_count': 88166, 'earliest_start_date': datetime.date(2020, 1, 14), 'latest_start_date': datetime.date(2020, 3, 31), 'earliest_end_date': datetime.date(2020, 1, 30), 'latest_end_date': datetime.date(2020, 5, 3), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 22.63, 'median_duration': 23, 'stddev_duration': 8.06}]
the baseline cohort age stats: [{'total_count': 88166, 'min_age': 0, 'max_age': 111, 'avg_age': 41.6, 'median_age': 41, 'stddev_age': 23.71}]
the baseline cohort gender stats: [{'gender': 'male', 'gender_count': 42961, 'probability': 0.49}, {'gender': 'female', 'gender_count': 45205, 'probability': 0.51}]
the baseline cohort race stats: [{'race': 'Asian', 'race_count': 6165, 'probability': 0.07}, {'race': 'Other', 'race_count': 511, 'probability': 0.01}, {'race': 'White', 'race_count': 74065, 'probability': 0.84}, {'race': 'Black or African American', 'race_count': 7425, 'probability': 0.08}]
the baseline cohort ethnicit

In [9]:
# Get the discrete probability distribution of the age variable in the baseline cohort.
# In the recorded output each entry carries an age bin label, the bin count and its probability.
cohort_age_distr = baseline_cohort.get_distributions('age')
print(f'the baseline cohort age discrete probability distribution: {cohort_age_distr}')

the baseline cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 9231, 'probability': 0.1047}, {'age_bin': '11-20', 'bin_count': 10746, 'probability': 0.1219}, {'age_bin': '21-30', 'bin_count': 12377, 'probability': 0.1404}, {'age_bin': '31-40', 'bin_count': 10896, 'probability': 0.1236}, {'age_bin': '41-50', 'bin_count': 11450, 'probability': 0.1299}, {'age_bin': '51-60', 'bin_count': 13081, 'probability': 0.1484}, {'age_bin': '61-70', 'bin_count': 9985, 'probability': 0.1133}, {'age_bin': '71-80', 'bin_count': 5865, 'probability': 0.0665}, {'age_bin': '81-90', 'bin_count': 2810, 'probability': 0.0319}, {'age_bin': '91+', 'bin_count': 1725, 'probability': 0.0196}]


In [10]:
# User study cohort: the COVID-19 condition rows (concept 37311061) restricted to
# older patients by joining condition_occurrence to person.
# NOTE(review): "older than 65" is expressed as a fixed year-of-birth cutoff (< 1955),
# which presumably matches the 2020 condition dates in this dataset - confirm before
# reusing the query on data from other years.
study_cohort_query = (
    'SELECT c.person_id, '
    'c.condition_start_date as cohort_start_date, '
    'c.condition_end_date as cohort_end_date '
    'FROM condition_occurrence c '
    'JOIN person p ON c.person_id = p.person_id '
    'WHERE c.condition_concept_id = 37311061 '
    'AND p.year_of_birth < 1955'
)

# Arguments: cohort name, description, defining SQL query, creator.
study_cohort = bias.create_cohort(
    'Older COVID-19 patients',
    'Patients with COVID-19 condition who are older than 65',
    study_cohort_query,
    'system',
)

# Show the stored cohort definition.
study_cohort_def = study_cohort.metadata
print(f'Older COVID-19 patient cohort definition: {study_cohort_def}')

# Show only a small sample of the cohort rows rather than the whole cohort.
study_cohort_data = study_cohort.data
print(f'The first five patients in the older COVID-19 patient cohort: {study_cohort_data[:5]}')

Cohort definition inserted successfully.
Cohort Older COVID-19 patients successfully created.
cohort created successfully
Older COVID-19 patient cohort definition: {'id': 2, 'name': 'Older COVID-19 patients', 'description': 'Patients with COVID-19 condition who are older than 65', 'created_date': datetime.date(2024, 11, 7), 'creation_info': 'SELECT c.person_id, c.condition_start_date as cohort_start_date, c.condition_end_date as cohort_end_date FROM condition_occurrence c JOIN person p ON c.person_id = p.person_id WHERE c.condition_concept_id = 37311061 AND p.year_of_birth < 1955', 'created_by': 'system'}
The first five patients in the older COVID-19 patient cohort: [{'subject_id': 20344, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id': 20352, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 5), 'cohort_end_date': datetime.date(2020, 3, 31)}, {'subject_id': 20361, 'cohort_defi

In [11]:
# Summary statistics of the user study cohort: the overall summary first, then one
# summary for each demographic variable.
study_cohort_stats = study_cohort.get_stats()
print(f'the user study cohort stats: {study_cohort_stats}')

# The generator is consumed left to right, so get_stats() is called in the order
# age, gender, race, ethnicity.
(study_cohort_age_stats,
 study_cohort_gender_stats,
 study_cohort_race_stats,
 study_cohort_ethnicity_stats) = (
    study_cohort.get_stats(variable)
    for variable in ("age", "gender", "race", "ethnicity")
)

print(f'the user study cohort age stats: {study_cohort_age_stats}')
print(f'the user study gender stats: {study_cohort_gender_stats}')
print(f'the user study cohort race stats: {study_cohort_race_stats}')
print(f'the user study ethnicity stats: {study_cohort_ethnicity_stats}')

the user study cohort stats: [{'total_count': 14786, 'earliest_start_date': datetime.date(2020, 1, 20), 'latest_start_date': datetime.date(2020, 3, 29), 'earliest_end_date': datetime.date(2020, 2, 5), 'latest_end_date': datetime.date(2020, 4, 28), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 22.05, 'median_duration': 22, 'stddev_duration': 8.36}]
the user study cohort age stats: [{'total_count': 14786, 'min_age': 66, 'max_age': 111, 'avg_age': 77.64, 'median_age': 75, 'stddev_age': 10.4}]
the user study gender stats: [{'gender': 'male', 'gender_count': 7321, 'probability': 0.5}, {'gender': 'female', 'gender_count': 7465, 'probability': 0.5}]
the user study cohort race stats: [{'race': 'Other', 'race_count': 115, 'probability': 0.01}, {'race': 'Asian', 'race_count': 992, 'probability': 0.07}, {'race': 'White', 'race_count': 12474, 'probability': 0.84}, {'race': 'Black or African American', 'race_count': 1205, 'probability': 0.08}]
the user study ethnicity stats:

In [12]:
# Get the discrete probability distribution of the age variable in the user study cohort.
# In the recorded output each entry carries an age bin label, the bin count and its probability.
study_cohort_age_distr = study_cohort.get_distributions('age')
print(f'the user study cohort age discrete probability distribution: {study_cohort_age_distr}')

the user study cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '11-20', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '21-30', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '31-40', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '41-50', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '51-60', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '61-70', 'bin_count': 4386, 'probability': 0.2966}, {'age_bin': '71-80', 'bin_count': 5865, 'probability': 0.3967}, {'age_bin': '81-90', 'bin_count': 2810, 'probability': 0.19}, {'age_bin': '91+', 'bin_count': 1725, 'probability': 0.1167}]


In [13]:
# Compare the baseline and user study cohorts.
# The cohort ids are read from the cohort objects instead of being hard-coded as 1 and 2:
# the literals are only correct when these two cohorts are the first ones created in the
# cohort store, so a re-run against a store that already holds cohorts would silently
# compare the wrong pair.
baseline_cohort_id = baseline_cohort.metadata['id']
study_cohort_id = study_cohort.metadata['id']

# In the recorded output the result lists Hellinger distances between the two cohorts'
# age and gender distributions.
result = bias.compare_cohorts(baseline_cohort_id, study_cohort_id)
print(result)

[{'age_hellinger_distance': 0.7270707021318757}, {'gender_hellinger_distance': 0.005516329339504949}]


In [14]:
# Release resources once the analysis is done: per the recorded output this closes the
# connections to both the BiasDatabase and the OMOP CDM database.
bias.cleanup()

Connection to BiasDatabase closed.
Connection to the OMOP CDM database closed.
