In [1]:
# TMPDIR and --target are set explicitly in the pip install command to work around a kernel crash:
# by default pip install uses /tmp, which has only a small ephemeral local storage quota allocated to it.
!TMPDIR=/home/hyi/temp pip install git+https://github.com/vaclab/BiasAnalyzer.git --target /home/hyi/temp --upgrade

Collecting git+https://github.com/vaclab/BiasAnalyzer.git
  Cloning https://github.com/vaclab/BiasAnalyzer.git to ./temp/pip-req-build-5rfwe_wl
  Running command git clone --filter=blob:none --quiet https://github.com/vaclab/BiasAnalyzer.git /home/hyi/temp/pip-req-build-5rfwe_wl
  Resolved https://github.com/vaclab/BiasAnalyzer.git to commit 8709e09238d56d7b3fa7d0b20140cf448045c621
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting duckdb<2.0.0,>=1.1.1 (from BiasAnalyzer==0.1.0)
  Obtaining dependency information for duckdb<2.0.0,>=1.1.1 from https://files.pythonhosted.org/packages/bf/56/f627b6fcd4aa34015a15449d852ccb78d7cc6eda654aa20c1d378e99fa76/duckdb-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached duckdb-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Collecting duckdb-engine

In [2]:
# Standard-library imports.
import sys
import time

# Make the pip --target folder (where the BiasAnalyzer package was installed) importable.
# This extends sys.path for the running kernel only; it must happen before importing
# anything that lives in that folder.
sys.path.append('/home/hyi/temp')

import pandas as pd

# Show DataFrames in full: no row/column truncation and a wide console width.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
from biasanalyzer.api import BIAS

In [4]:
# Create the BIAS object that every later cell works through.
# No configuration is loaded at this point; set_config() is called next to supply one.
bias = BIAS()

no configuration file specified. Call set_config(config_file_path) next to specify configurations
Cohort Definition table created.
Cohort table created.


In [5]:
# Load the configuration from a YAML file.
# NOTE(review): this is an absolute, machine-specific path - adjust it for your environment.
bias.set_config('/home/hyi/bias/config/config.yaml')

configuration specified in /home/hyi/bias/config/config.yaml loaded successfully


In [6]:
# Connect to the root OMOP CDM database.
# The configuration file must include the root_omop_cdm_database configuration info; an example is shown
# in https://github.com/hyi/HealthDataBias/blob/main/tests/assets/test_config.yaml
bias.set_root_omop()

Connected to the OMOP CDM database (read-only).


In [7]:
# A search with neither a domain nor a vocabulary is not constrained enough:
# the library only reports that one of them must be set.
bias.get_concepts("COVID-19")

# Constrain the search by both domain and vocabulary (passed positionally).
concepts = bias.get_concepts("COVID-19", "Condition", "SNOMED")
concepts_frame = pd.DataFrame(concepts)
print(f'concepts for COVID-19 in Condition domain with SNOMED vocabulary:\n {concepts_frame}')

# Constrain the search by domain only.
concepts = bias.get_concepts("COVID-19", domain="Condition")
concepts_frame = pd.DataFrame(concepts)
print(f'concepts for COVID-19 in Condition domain:\n {concepts_frame}')

# Constrain the search by vocabulary only.
concepts = bias.get_concepts("COVID-19", vocabulary="SNOMED")
concepts_frame = pd.DataFrame(concepts)
print(f'concepts for COVID-19 in SNOMED vocabulary:\n {concepts_frame}')

either domain or vocabulary must be set to constrain the number of returned concepts
concepts for COVID-19 in Condition domain with SNOMED vocabulary:
    concept_id                                       concept_name valid_start_date valid_end_date  domain_id vocabulary_id
0      703440  COVID-19 confirmed using clinical diagnostic c...       2020-04-01     2099-12-31  Condition        SNOMED
1      703441              COVID-19 confirmed by laboratory test       2020-04-01     2099-12-31  Condition        SNOMED
2      703445  Low risk category for developing complication ...       2020-04-01     2099-12-31  Condition        SNOMED
3      703446  Moderate risk category for developing complica...       2020-04-01     2099-12-31  Condition        SNOMED
4      703447  High risk category for developing complication...       2020-04-01     2099-12-31  Condition        SNOMED
5    37310269                                           COVID-19       2020-02-04     2020-10-28  Condition        S

In [8]:
# SQL that selects the cohort: female patients (gender_concept_id 8532) born after 2000
# who have a condition_occurrence record for COVID-19 (condition_concept_id 37311061).
# The condition start/end dates become the cohort start/end dates.
# One clause per entry; the clauses are joined with single spaces into one statement.
cohort_query = ' '.join((
    'SELECT c.person_id,',
    'c.condition_start_date as cohort_start_date,',
    'c.condition_end_date as cohort_end_date',
    'FROM condition_occurrence c',
    'JOIN person p ON c.person_id = p.person_id',
    'WHERE c.condition_concept_id = 37311061',
    'AND p.gender_concept_id = 8532',
    'AND p.year_of_birth > 2000',
))

# Create the cohort from the SQL query. Arguments are positional; judging by the printed
# metadata they are name, description, query and creator - TODO confirm against the API.
cohort_data = bias.create_cohort(
    'Young female COVID-19 patients',
    'Female patients with COVID-19 condition under 24 years old',
    cohort_query,
    'system',
)

# Show the stored cohort definition, then a small sample of the cohort records.
md = cohort_data.metadata
print(f"Young female COVID-19 patient cohort definition: {md}")
first_five_patients = cohort_data.data[:5]
print(f"The first five patients in the cohort: {first_five_patients}")

Cohort definition inserted successfully.
Cohort Young female COVID-19 patients successfully created.
cohort created successfully
Young female COVID-19 patient cohort definition: {'id': 1, 'name': 'Young female COVID-19 patients', 'description': 'Female patients with COVID-19 condition under 24 years old', 'created_date': datetime.date(2024, 11, 14), 'creation_info': 'SELECT c.person_id, c.condition_start_date as cohort_start_date, c.condition_end_date as cohort_end_date FROM condition_occurrence c JOIN person p ON c.person_id = p.person_id WHERE c.condition_concept_id = 37311061 AND p.gender_concept_id = 8532 AND p.year_of_birth > 2000', 'created_by': 'system'}
The first five patients in the cohort: [{'subject_id': 20342, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 3)}, {'subject_id': 20343, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 9), 'cohort_end_date': datetime.date(2020, 4, 7)}, {

In [9]:
# Get stats of the cohort: the default stats first, then the stats for each
# demographic variable (age, gender, race, ethnicity).
cohort_stats = cohort_data.get_stats()
print(f'the cohort stats: {cohort_stats}')
# age stats
cohort_age_stats = cohort_data.get_stats("age")
print(f'the cohort age stats: {cohort_age_stats}')
# gender stats
cohort_gender_stats = cohort_data.get_stats("gender")
print(f'the cohort gender stats: {cohort_gender_stats}')
# race stats
cohort_race_stats = cohort_data.get_stats("race")
print(f'the cohort race stats: {cohort_race_stats}')
# ethnicity stats
cohort_ethnicity_stats = cohort_data.get_stats("ethnicity")
print(f'the cohort ethnicity stats: {cohort_ethnicity_stats}')

the cohort stats: [{'total_count': 9623, 'earliest_start_date': datetime.date(2020, 1, 18), 'latest_start_date': datetime.date(2020, 3, 30), 'earliest_end_date': datetime.date(2020, 2, 10), 'latest_end_date': datetime.date(2020, 5, 3), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 24.26, 'median_duration': 24, 'stddev_duration': 7.2}]
the cohort age stats: [{'total_count': 9623, 'min_age': 0, 'max_age': 19, 'avg_age': 10.39, 'median_age': 11, 'stddev_age': 5.65}]
the cohort gender stats: [{'gender': 'female', 'gender_count': 9623, 'probability': 1.0}]
the cohort race stats: [{'race': 'Asian', 'race_count': 680, 'probability': 0.07}, {'race': 'Other', 'race_count': 53, 'probability': 0.01}, {'race': 'White', 'race_count': 8081, 'probability': 0.84}, {'race': 'Black or African American', 'race_count': 809, 'probability': 0.08}]
the cohort ethnicity stats: [{'ethnicity': 'other', 'ethnicity_count': 9623, 'probability': 1.0}]


In [10]:
# Get the discrete probability distribution of the age variable in the cohort
# and print it.
cohort_age_distr = cohort_data.get_distributions('age')
print(f'the cohort age discrete probability distribution: {cohort_age_distr}')

the cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 4744, 'probability': 0.493}, {'age_bin': '11-20', 'bin_count': 4879, 'probability': 0.507}, {'age_bin': '21-30', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '31-40', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '41-50', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '51-60', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '61-70', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '71-80', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '81-90', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '91+', 'bin_count': 0, 'probability': 0.0}]


In [11]:
# Get cohort concept prevalence and report how long the whole step takes.
# time.perf_counter() is used instead of time.time(): it is a monotonic, high-resolution
# clock meant for measuring elapsed durations, so system clock adjustments cannot skew it.
t1 = time.perf_counter()
cohort_concepts = cohort_data.get_concept_stats()
# Show the concept stats for the condition_occurrence entry as a table.
print(pd.DataFrame(cohort_concepts["condition_occurrence"]))
print(f'the time taken to get cohort concept stats is {time.perf_counter() - t1}s')

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    concept_id                                       concept_name  count_in_cohort  prevalence
0     37311061                                           COVID-19             9623    1.000000
1       437663                                              Fever             8161    0.848072
2       254761                                              Cough             6223    0.646680
3      4289517                                      Loss of taste             4598    0.477814
4      4223659                                            Fatigue             3553    0.369220
5      4089228                                     Sputum finding             3045    0.316429
6       312437                                            Dyspnea             1538    0.159825
7       314754                                           Wheezing             1538    0.159825
8        77074                                         Joint pain             1351    0.140393
9       442752                                    

In [12]:
# Clean up at the end of the session; this closes the connections to the BiasDatabase
# and to the OMOP CDM database.
bias.cleanup()

Connection to BiasDatabase closed.
Connection to the OMOP CDM database closed.
