Filtering to get only main diagnosis... should this be done? Why aren't the other diagnoses important?

Instead we could get the encounter/main diagnoses in another table, then join on the entire diagnoses table to keep the other diagnoses

In [None]:
from typing import Iterator
from cyclops.query import gemini
import cyclops.query.process as qp

from sqlalchemy import select, func

from cyclops.query.util import get_column

from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    ENCOUNTER_ID,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
)

In [7]:
def most_recent_encounter(table):
    """Get the most recent encounter for each patient."""
    
    # First most recent admission
    recent_admits = qp.GroupByAggregate(
        SUBJECT_ID,
        {
            ADMIT_TIMESTAMP: "max",
            SUBJECT_ID: ("count", "prev_encounter_count")
        }
    )(table)

    # Keep only most recent admission, i.e., encounter
    table = qp.Join(
        recent_admits,
        on=[SUBJECT_ID, ADMIT_TIMESTAMP],
        join_table_cols=["prev_encounter_count"],
    )(table)
    
    # Subtract one from encounter count to get previous encounters count
    table = qp.AddNumeric("prev_encounter_count", -1)(table)
    
    return table


def non_cardiac_diagnoses():
    """Get non-cardiac diagnoses."""
    diagnoses = gemini.diagnoses(diagnosis_types="M").query
    diagnoses = qp.Limit(10000)(diagnoses)
    diagnoses = qp.Drop(["ccsr_3", "ccsr_4", "ccsr_5"])(diagnoses)
    
    # Drop ER diagnoses
    diagnoses = qp.ConditionEquals("is_er_diagnosis", False)(diagnoses)
    
    # Keep only the encounters with a non-cardiac main diagnosis
    diagnoses = qp.ConditionStartsWith('ccsr_1', 'CIR', not_=True)(diagnoses)
    
    return diagnoses


def get_cohort() -> Iterator:
    """Get cohort.
    
    Get cohort of pre-Covid, GIM patients admitted for
    non-cardiac main diagnoses.
    """

    # Get only pre-Covid encounters
    table = gemini.patient_encounters(
        sex=["M", "F"],
        before_date="2020-01-23",
        died_binarize_col="outcome_death",
    ).query
    
    # Do not filter rows before this point - Need all encounters
    # since they are counted for each subject
    table = qp.Limit(10000)(table)
    table = most_recent_encounter(table)
    
    # Only keep encounters where most responsible physician is GIM
    table = qp.ConditionEquals("mrp_gim", "y")(table)
    
    # Filter columns
    keep = [
        ENCOUNTER_ID, SUBJECT_ID, ADMIT_TIMESTAMP, AGE, SEX, HOSPITAL_ID,
        "outcome_death", 'admit_category', "readmission",
        "institution_from_type", "from_nursing_home_mapped",
        "from_acute_care_institution_mapped"
    ]
    table = qp.FilterColumns(keep)(table)
    
    # Remove null SUBJECT_ID
    table = qp.DropNulls(SUBJECT_ID)(table)

    table = qp.ReorderAfter(ADMIT_TIMESTAMP, SUBJECT_ID)(table)

    diagnoses = non_cardiac_diagnoses()
    diagnoses = qp.Limit(10000)(diagnoses)
    
    table = qp.Join(diagnoses, on=ENCOUNTER_ID)(table)
    
    #return table#, diagnoses
    return gemini.get_interface(table)

In [8]:
pe = get_cohort()

In [9]:
gemini.get_interface(pe).run(limit=10000)

2022-06-02 16:01:11,308 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-06-02 16:01:11,309 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 8.437839 s


Unnamed: 0,encounter_id,subject_id,admit_timestamp,age,sex,hospital_id,outcome_death,admit_category,readmission,institution_from_type,...,from_acute_care_institution_mapped,diagnosis_code,diagnosis_type,diagnosis_type_description,ip_diagnosis_cluster,is_er_diagnosis,ccsr_default,ccsr_1,ccsr_2,row_id


In [7]:
gemini.get_interface(d).run(limit=10000)

2022-06-02 15:51:35,908 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-06-02 15:51:35,910 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.078487 s


Unnamed: 0,encounter_id,diagnosis_code,diagnosis_type,diagnosis_type_description,ip_diagnosis_cluster,is_er_diagnosis,ccsr_default,ccsr_1,ccsr_2,row_id
0,14429103,K701,M,Most Responsible Diagnosis,,False,INF007,INF007,MBD017,1670165
1,14429126,N179,M,Most Responsible Diagnosis,,False,GEN002,GEN002,,1670173
2,14429127,K920,M,Most Responsible Diagnosis,,False,DIG021,DIG021,,1670186
3,14429138,E1110,M,Most Responsible Diagnosis,,False,END003,END003,END005,1670194
4,14429141,D700,M,Most Responsible Diagnosis,A,False,BLD007,BLD007,,1670198
...,...,...,...,...,...,...,...,...,...,...
9995,14727476,J690,M,Most Responsible Diagnosis,,False,RSP010,RSP010,,1787030
9996,14726757,M329,M,Most Responsible Diagnosis,,False,MUS024,MUS024,,1786793
9997,14727535,N136,M,Most Responsible Diagnosis,,False,GEN004,GEN004,,1787061
9998,14727580,J441,M,Most Responsible Diagnosis,,False,RSP008,RSP008,,1787067


In [6]:
gemini.patient_diagnoses(
    patient_encounters_table=gemini.patient_encounters().query,
    diagnoses_table=gemini.diagnoses().query
).run(limit=1000)

2022-06-02 15:44:03,162 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-06-02 15:44:03,165 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.050704 s


Unnamed: 0,encounter_id,subject_id,city,province,country,language,total_direct_cost,total_indirect_cost,total_cost,hospital_id,...,diagnosis_type_description,ip_diagnosis_cluster,is_er_diagnosis,ccsr_default,ccsr_1,ccsr_2,ccsr_3,ccsr_4,ccsr_5,row_id
0,14398453,3336fefa8fab9a0b121b31174f1ad77c413984ee24b5cb...,TORONTO,Ontario,Canada,English,3892.0000,1393.0000,5284.000,MSH,...,Secondary Diagnosis,,False,END003,END003,,,,,1658601
1,14398453,3336fefa8fab9a0b121b31174f1ad77c413984ee24b5cb...,TORONTO,Ontario,Canada,English,3892.0000,1393.0000,5284.000,MSH,...,Secondary Diagnosis,,False,CIR017,CIR017,,,,,1658602
2,14398453,3336fefa8fab9a0b121b31174f1ad77c413984ee24b5cb...,TORONTO,Ontario,Canada,English,3892.0000,1393.0000,5284.000,MSH,...,Pre-Admit Comorbidity,,False,BLD001,BLD001,,,,,1658603
3,14398453,3336fefa8fab9a0b121b31174f1ad77c413984ee24b5cb...,TORONTO,Ontario,Canada,English,3892.0000,1393.0000,5284.000,MSH,...,Secondary Diagnosis,,False,CIR007,CIR007,,,,,1658604
4,14398453,3336fefa8fab9a0b121b31174f1ad77c413984ee24b5cb...,TORONTO,Ontario,Canada,English,3892.0000,1393.0000,5284.000,MSH,...,Pre-Admit Comorbidity,,False,END011,END011,,,,,1658605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,14401595,0b4567e2039c6931a9f83907af5fc4d9d7351fa32ef915...,TORONTO,ON,CA,,2721.6106,1176.8622,3898.473,MSH,...,Secondary Diagnosis,,False,,,,,,,1659844
996,14401595,0b4567e2039c6931a9f83907af5fc4d9d7351fa32ef915...,TORONTO,ON,CA,,2721.6106,1176.8622,3898.473,MSH,...,Secondary Diagnosis,,False,CIR007,CIR007,,,,,1659845
997,14401595,0b4567e2039c6931a9f83907af5fc4d9d7351fa32ef915...,TORONTO,ON,CA,,2721.6106,1176.8622,3898.473,MSH,...,Secondary Diagnosis,,False,MUS013,MUS013,,,,,1659846
998,14401595,0b4567e2039c6931a9f83907af5fc4d9d7351fa32ef915...,TORONTO,ON,CA,,2721.6106,1176.8622,3898.473,MSH,...,Most Responsible Diagnosis,,True,RSP010,RSP010,,,,,1659847


In [6]:
table = get_cohort()
df = gemini.get_interface(table).run(limit=1000)
df

2022-06-02 15:23:36,566 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-06-02 15:23:36,570 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 112.438915 s


Unnamed: 0,encounter_id,diagnosis_code,diagnosis_type,diagnosis_type_description,ip_diagnosis_cluster,is_er_diagnosis,ccsr_default,ccsr_1,ccsr_2,row_id,...,age,sex,hospital_id,outcome_death,admit_category,readmission,institution_from_type,from_nursing_home_mapped,from_acute_care_institution_mapped,prev_encounter_count
0,13656342,N179,M,Most Responsible Diagnosis,,False,GEN002,GEN002,,1178194,...,80,F,UHNTW,False,U,9,HF,True,False,0
1,13656405,T402,M,Most Responsible Diagnosis,,False,INJ022,EXT014,EXT023,1178304,...,60,M,UHNTW,False,U,9,,False,False,0
2,13657431,N179,M,Most Responsible Diagnosis,,False,GEN002,GEN002,,1179602,...,56,F,UHNTW,False,U,5,,False,False,0
3,13657844,K704,M,Most Responsible Diagnosis,,False,DIG018,DIG018,MBD017,1180083,...,40,F,UHNTW,True,U,9,,False,False,0
4,13658565,C833,M,Most Responsible Diagnosis,,False,NEO058,NEO058,,1180969,...,79,M,UHNTW,False,U,9,,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,13704867,R55,M,Most Responsible Diagnosis,,False,SYM001,SYM001,,1229686,...,85,F,UHNTW,False,U,5,,False,False,0
72,13704914,C7988,M,Most Responsible Diagnosis,,False,NEO070,NEO070,,1229736,...,90,F,UHNTW,False,U,9,,False,False,0
73,13705062,R410,M,Most Responsible Diagnosis,,False,SYM010,SYM010,,1229879,...,80,M,UHNTW,False,U,5,,False,False,0
74,13705605,E870,M,Most Responsible Diagnosis,,False,END011,END011,,1230472,...,42,M,UHNTW,False,U,5,,False,False,0


In [3]:
table = non_cardiac_diagnoses()
df = gemini.get_interface(table).run(limit=1000)
df

2022-06-02 15:51:20,171 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-06-02 15:51:20,174 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.230590 s


Unnamed: 0,encounter_id,diagnosis_code,diagnosis_type,diagnosis_type_description,ip_diagnosis_cluster,is_er_diagnosis,ccsr_default,ccsr_1,ccsr_2,row_id
0,14398498,D700,M,Most Responsible Diagnosis,A,False,BLD007,BLD007,,1658616
1,14398522,F03,M,Most Responsible Diagnosis,,False,NVS011,NVS011,,1658624
2,14398575,E871,M,Most Responsible Diagnosis,,False,END011,END011,,1658628
3,14398724,L022,M,Most Responsible Diagnosis,,False,SKN001,SKN001,,1658634
4,14398735,F840,M,Most Responsible Diagnosis,,False,MBD014,MBD014,,1658641
...,...,...,...,...,...,...,...,...,...,...
995,14429761,Z751,M,Most Responsible Diagnosis,,False,FAC025,FAC025,,1670408
996,14429279,N308,M,Most Responsible Diagnosis,,False,GEN004,GEN004,,1670283
997,14429103,K701,M,Most Responsible Diagnosis,,False,INF007,INF007,MBD017,1670165
998,14429126,N179,M,Most Responsible Diagnosis,,False,GEN002,GEN002,,1670173


In [10]:
df.columns

Index(['encounter_id', 'subject_id', 'admit_timestamp', 'city', 'province',
       'country', 'language', 'total_direct_cost', 'total_indirect_cost',
       'total_cost', 'hospital_id', 'sex', 'age', 'discharge_timestamp',
       'admit_category', 'discharge_disposition', 'responsibility_for_payment',
       'province_territory_issuing_health_card_number', 'number_of_alc_days',
       'institution_from', 'institution_from_type', 'institution_to',
       'institution_to_type', 'readmission', 'residence_code',
       'admitting_service_raw', 'discharging_service_raw', 'mrp_service',
       'cmg', 'admitting_physician', 'discharging_physician', 'mrp',
       'admitting_physician_gim', 'discharging_physician_gim', 'mrp_gim',
       'admitting_service_mapped', 'discharging_service_mapped',
       'from_nursing_home_mapped', 'from_acute_care_institution_mapped',
       'los_derived', 'del_present', 'gemini_cohort', 'discharge_description',
       'outcome_death', 'prev_encounter_count'],
   

In [None]:
from cyclops.workflow.task import QueryTask
from cyclops.workflow.queries import register_query

# Register query creation functions.
register_query(get_cohort)

QueryTask()...

# How to create?
# How to run?

In [None]:
query_interface

In [10]:
import luigi
from cyclops.workflow.task import QueryTask

luigi.build([QueryTask(get_cohort())], workers=1, local_scheduler=True)

ModuleNotFoundError: No module named 'luigi'