In [None]:
import csv
import math
import re
import time
from getpass import getpass

# import ipdb
# import nltk
import numpy as np
import pandas as pd
import psycopg2

# nltk.download('wordnet')
# from nltk.stem import WordNetLemmatizer
from sklearn.cluster import FeatureAgglomeration
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    confusion_matrix,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder, StandardScaler

# from nltk.corpus import stopwords

In [None]:
# --- Experiment configuration ------------------------------------------------
# The database username is prompted for at run time rather than stored here.
USERNAME = getpass("Username")
# Defaults consumed by DeliriumExperimenter further down; the trailing comment
# on each line lists the accepted values.
SPLIT_METHOD = "quarter"  # quarter, random, ordered
SCORING = "f1"  # f1, roc_auc, f1_weighted
VECTORIZER = "tfidf"  # tfidf, count
DEL_LABEL_WEIGHT = (
    2.9  # for class imbalance, use 1 to equally weigh positives and negatives
)
DIMENSIONALITY_REDUCTION = None  # pca, clustering
SCALING = None  # maxabs, standard (necessary if using dimensionality reduction)
# "aggregate" creates one boolean column per keyword group, "granular" creates
# one boolean column per keyword (see the imaging cells in section 1.11).
HARDCODED_IMAGING_SYNONYMOUS_DIAGNOSES = "granular"  # aggregate, granular
HARDCODED_IMAGING_TRIGGER_WORDS = "granular"  # aggregate, granular
HARDCODED_IMAGING_SUPPORTING_WORDS = "granular"  # aggregate, granular
# When True, only the per-test counts ("_n" columns) of the lab features are kept.
LAB_FREQUENCY_ONLY = False  # True, False
# Absolute Windows paths to the medication lookup CSVs read in section 1.10.
# NOTE(review): machine-specific; adjust before running elsewhere.
beers_list_path = "H:\\repos\\delirium\\beers_list.csv"
antipsychotics_path = "H:\\repos\\delirium\\antipsychotics.csv"
nootropics_path = "H:\\repos\\delirium\\nootropics.csv"
benzodiazepine_path = "H:\\repos\\delirium\\benzodiazepine.csv"
antibiotics_path = "H:\\repos\\delirium\\antibiotics.csv"
sedatives_path = "H:\\repos\\delirium\\sedatives.csv"

# 1 Query

Create connection:

In [None]:
# Connection used by the cohort queries in this section. The password is
# prompted for (with an empty prompt string) and is never stored in the notebook.
conn = psycopg2.connect(
    host="db.gemini-hpc.ca",
    database="delirium_v3_0_0",
    user=USERNAME,
    password=getpass(""),
)

In [None]:
# Second connection (different host and database); in this section it is only
# used to read `lookup_lab_concept`. Prompts for a password a second time.
conn_sbk = psycopg2.connect(
    host="172.27.12.113",
    database="delirium_sbk_v2_2_0",
    user=USERNAME,
    password=getpass(""),
)

Just diagnostics:

In [None]:
query_start = time.time()

## 1.1 `ip_administrative`

In [None]:
# Base cohort: administrative fields plus the `del_present` label for every
# encounter whose label is non-null and not equal to 3. The same label filter is
# repeated in every other query in this section so all tables cover the same
# encounters.
# NOTE(review): the meaning of del_present = 3 is not documented here — confirm.
ip_administrative_query = """
    select
           ia.admit_date_time,
           ia.genc_id,
           ia.admit_category,
           ia.discharge_disposition,
           ia.number_of_alc_days,
           ia.gender,
           ia.age,
           ia.institution_to,
           ia.institution_to_type,
           ia.institution_from,
           ia.institution_from_type,
           ia.del_present
    from ip_administrative ia
    where ia.del_present is not null and ia.del_present <> 3;
"""

ip_administrative = pd.read_sql_query(ip_administrative_query, conn)

## 1.2 `room_transfer`

In [None]:
room_transfer_query = """
    select
        rt.genc_id, count(*) as room_transfers
    from room_transfer rt
        inner join ip_administrative ia
            on rt.genc_id = ia.genc_id
    where ia.del_present is not null and ia.del_present <> 3
    group by rt.genc_id;
"""

room_transfer = pd.read_sql_query(room_transfer_query, conn)

## 1.3 `ip_scu`

In [None]:
ip_scu_query = """
    select
        scu.genc_id, count(*) as icu_transfers
    from ip_scu scu
        inner join ip_administrative ia
            on scu.genc_id = ia.genc_id
    where ia.del_present is not null and ia.del_present <> 3
    group by scu.genc_id;
"""

ip_scu = pd.read_sql_query(ip_scu_query, conn)

## 1.4 `er_administrative`

In [None]:
# Emergency-room fields for the cohort. `er_los_derived` is the time between
# triage and leaving the ER expressed in whole hours: the day component of the
# interval times 24 plus its hour component (minutes and seconds are dropped).
er_administrative_query = """
    select
        ea.genc_id,
        ea.admit_via_ambulance,
        ea.triage_level,
        date_part('day', ea.left_er_date_time::timestamp - ea.triage_date_time::timestamp) * 24 + date_part('hour', ea.left_er_date_time::timestamp - ea.triage_date_time::timestamp) as er_los_derived
    from er_administrative ea
        inner join ip_administrative ia
            on ea.genc_id = ia.genc_id
    where ia.del_present is not null and ia.del_present <> 3;
"""

er_administrative = pd.read_sql_query(er_administrative_query, conn)

## 1.5 `blood_transfusion`

In [None]:
blood_transfusion_query = """
    select
        bt.genc_id, 
        count(bt.*) as total_transfusions,
        count(distinct bt.blood_product_raw) as unique_transfusions
    from blood_transfusion bt
        inner join ip_administrative ia
            on bt.genc_id = ia.genc_id
    where ia.del_present is not null and ia.del_present <> 3
    group by bt.genc_id;
"""

blood_transfusion = pd.read_sql_query(blood_transfusion_query, conn)

## 1.6 `delirium_icd10`

In [None]:
delirium_icd10_query = """
    select
        a.genc_id, bool_or(a.icd_delirium) as icd_delirium
    from (
        select distinct
            d.genc_id,
            case when d.diagnosis_code like 'F05%' then TRUE else FALSE end as icd_delirium
        from diagnosis d
            inner join ip_administrative ia
                on d.genc_id = ia.genc_id
        where ia.del_present is not null and ia.del_present <> 3
    ) a
    group by a.genc_id;
"""

delirium_icd10 = pd.read_sql_query(delirium_icd10_query, conn)

## 1.7 `diagnosis`

In [None]:
diagnosis_query = """
    select d.genc_id, d.ccsr_no, substring(lc.ccsr_desc, 1, 20) as ccsr_code
    from (
             (with ccsr as (
                 select i.genc_id,
                        i.ccsr_default,
                        i.ccsr_1,
                        i.ccsr_2,
                        i.ccsr_3,
                        i.ccsr_4,
                        i.ccsr_5
                 from diagnosis i
                          inner join ip_administrative a
                                     on i.genc_id = a.genc_id
                 where a.del_present is not null and a.del_present <> 3
             )
             select t.genc_id, m.ccsr_no, m.ccsr_code
             from ccsr t
                      cross join lateral (
                 values (1, ccsr_default),
                        (1, ccsr_1),
                        (1, ccsr_2),
                        (1, ccsr_3),
                        (1, ccsr_4),
                        (1, ccsr_5)
                 ) as m(ccsr_no, ccsr_code)
             where ccsr_code is not null
               and ccsr_code <> '')
         ) d
    inner join (
        select
               i.ccsr_default,
               count(*)
        from diagnosis i
        group by i.ccsr_default
        order by count(*) desc
        limit 250
        ) common
    on d.ccsr_code = common.ccsr_default
    inner join lookup_ccsr lc on d.ccsr_code = lc.ccsr;
"""

diagnosis = pd.read_sql_query(diagnosis_query, conn)

96% of codes are mapped in `delirium_sbk_v2_2_0.ipdiagnosis` and 90% are mapped in `delirium_v3_1_0.diagnosis`. If we had `lookup_icd10_ca_to_ccsr` in `delirium_v3_1_0` we would probably have identical coverage.

Doing a weighted one-hot encoding where the weight is given by the ccsr_no. This may or may not be necessary. Instead we could use the diagnosis "type" - but this might be hard to code as a "weight". `@TODO`: Check that this makes sense.

In [None]:
# Long -> wide: one row per genc_id and one `diag_<description>` column per CCSR
# category (`ccsr_code` holds the first 20 characters of the CCSR description).
# Each cell is the largest `ccsr_no` seen for that encounter/category, or 0 when
# the category is absent. The query above emits ccsr_no = 1 for every position,
# so as written this is a plain 0/1 one-hot encoding.
diagnosis = (
    diagnosis.pivot_table(
        index="genc_id", columns="ccsr_code", values="ccsr_no", aggfunc="max"
    )
    .fillna(0)
    .add_prefix("diag_")
)

## 1.8 `lab`

In [None]:
lookup_lab_concept_query = """
    select concept_id::int, concept_desc from lookup_lab_concept
"""

lookup_lab_concept = pd.read_sql_query(lookup_lab_concept_query, conn_sbk)

In [None]:
lab_query = """
    select stats_long.*
    from (
             with lab_stats as (
                 select l.genc_id,
                        l.test_type_mapped_omop,
                        min(l.result_value::double precision)                    as minimum,
                        --percentile_disc(0.25)
                        --within group (order by l.result_value::double precision) as p25,
                        percentile_disc(0.5)
                        within group (order by l.result_value::double precision) as median,
                        --percentile_disc(0.75)
                        --within group (order by l.result_value::double precision) as p75,
                        max(l.result_value::double precision)                    as maximum
                 from lab l
                          inner join ip_administrative a
                                     on l.genc_id = a.genc_id
                 where l.result_value ~ '^[0-9]+(\\.[0-9]+)?$'
                   and a.del_present is not null and a.del_present <> 3
                 group by l.genc_id, l.test_type_mapped_omop
             )
             select t.genc_id, t.test_type_mapped_omop, m.stat, m.value
             from lab_stats t
                      cross join lateral (
                 values ('min', minimum),
                        --('p25', p25),
                        ('median', median),
                        --('p75', p75),
                        ('max', maximum)--,
                        --('n', n_tests)
                 ) as m(stat, value)
         ) stats_long
    inner join (
        select
               l.test_type_mapped_omop,
               count(l.*)
        from lab l
        where l.test_type_mapped_omop is not null
        group by l.test_type_mapped_omop
        order by count(l.*) desc
        limit 250
        ) common
    on stats_long.test_type_mapped_omop = common.test_type_mapped_omop
    
union

    select stats_long_n.*
    from (
             with lab_stats_n as (
                 select l.genc_id,
                        l.test_type_mapped_omop,
                        count(*) as n_tests
                 from lab l
                          inner join ip_administrative a
                                     on l.genc_id = a.genc_id
                 where a.del_present is not null and a.del_present <> 3
                 group by l.genc_id, l.test_type_mapped_omop
             )
             select t.genc_id, t.test_type_mapped_omop, m.stat, m.value
             from lab_stats_n t
                      cross join lateral (
                 values ('n', n_tests)
                 ) as m(stat, value)
         ) stats_long_n
    inner join (
        select
               l.test_type_mapped_omop,
               count(l.*)
        from lab l
        where l.test_type_mapped_omop is not null
        group by l.test_type_mapped_omop
        order by count(l.*) desc
        limit 250
        ) common_n
    on stats_long_n.test_type_mapped_omop = common_n.test_type_mapped_omop;
"""

lab = pd.read_sql_query(lab_query, conn)

Give english names to the lab tests by joining with concept map:

In [None]:
lab = pd.merge(
    lab,
    lookup_lab_concept,
    left_on="test_type_mapped_omop",
    right_on="concept_id",
    how="left",
).drop(["test_type_mapped_omop", "concept_id"], axis=1)

In [None]:
lab.loc[:, "concept_desc"] = lab.loc[:, "concept_desc"].str.slice(
    stop=20
)  # shorten names

In [None]:
lab = lab.pivot_table(index="genc_id", columns=["concept_desc", "stat"], values="value")

In [None]:
lab.columns = ["_".join(map(str, col)).strip() for col in lab.columns.values]

In [None]:
lab = lab.add_prefix("lab_")

In [None]:
if LAB_FREQUENCY_ONLY:
    labs_to_drop = lab.columns[~lab.columns.str.endswith("_n")].tolist()
    lab.drop(labs_to_drop, axis=1, inplace=True)

## 1.9 `intervention`

In [None]:
intervention_query = """
    select
        i.genc_id, 
        substring(i.intervention_code, 1, 2) as cci, 1 as value 
    from intervention i 
        inner join ip_administrative ia 
            on i.genc_id = ia.genc_id 
    where ia.del_present is not null and ia.del_present <> 3
        and i.intervention_code is not null;
"""

intervention = pd.read_sql_query(intervention_query, conn)

In [None]:
intervention = intervention.pivot_table(
    index="genc_id", columns="cci", values="value", aggfunc="max"
).add_prefix("intervention_")

In [None]:
# Drop the column produced by an empty intervention code prefix, if there is one.
# `errors="ignore"` keeps the cell from raising KeyError when no such column
# exists (e.g. no empty codes in the data, or the cell is simply run twice).
intervention.drop(
    "intervention_", axis=1, inplace=True, errors="ignore"
)  # NULL intervention

Look for very specific interventions in the data.

`@TODO`: Add rare interventions only looking at training data

In [None]:
rare_interventions_query = """
    select
        a.genc_id,
        max(a."CT head without enha") as "CT head without enha",
        max(a."CT brain without enh") as "CT brain without enh",
        max(a."Drain bladder PO &tu") as "Drain bladder PO &tu",
        max(a."Xray thor cav withou") as "Xray thor cav withou",
        max(a."Specimen collect NEC") as "Specimen collect NEC"
    from (
        select 
            i.genc_id, 
            case when i.intervention_code = '3ER20VA' then 1 else 0
                end as "CT head without enha",
            case when i.intervention_code = '3AN20WA' then 1 else 0
                end as "CT brain without enh",
            case when i.intervention_code = '1PM52CATS' then 1 else 0
                end as "Drain bladder PO &tu",
            case when i.intervention_code = '3GY10VA' then 1 else 0
                end as "Xray thor cav withou",
            case when i.intervention_code = '2ZZ13RK' then 1 else 0
                end as "Specimen collect NEC"
        from intervention i 
            inner join ip_administrative ia 
                on i.genc_id = ia.genc_id 
        where ia.del_present is not null and ia.del_present <> 3
    ) a
    group by a.genc_id;
"""

rare_interventions = pd.read_sql_query(rare_interventions_query, conn)

In [None]:
rare_interventions = rare_interventions.set_index("genc_id").add_prefix(
    "rare_intervention_"
)

## 1.10 `pharmacy`

In [None]:
# One row per cohort encounter with its medications concatenated into a single
# string plus a count of medication rows.
# The encounter id is taken from (and grouped by) `ia`, the preserved side of
# the right join. Grouping by `r.genc_id` would collapse every encounter that
# has no pharmacy rows into a single row whose genc_id is NULL.
pharmacy_query = """
    select
        ia.genc_id,
        string_agg(r.med_id_generic_name_raw, ', ') as medications,
        count(r.med_id_generic_name_raw) as n_meds
    from pharmacy r
        right join ip_administrative ia
            on ia.genc_id = r.genc_id
    where ia.del_present is not null and ia.del_present <> 3
    group by ia.genc_id
"""

pharmacy = pd.read_sql_query(pharmacy_query, conn)

In [None]:
# Strip every character that is neither a word character nor whitespace.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
# The raw string avoids invalid-escape warnings for `\w` and `\s`.
pharmacy.loc[:, "medications"] = pharmacy.loc[:, "medications"].str.replace(
    r"[^\w\s]", "", regex=True
)

Find number of Beer's List medications per encounter. Cross check against Beer's List (csv) and create a count of Beer's List medications per encounter.

In [None]:
beers_list = pd.read_csv(beers_list_path)

In [None]:
pharmacy_raw_query = """
    select
        p.genc_id, 
        p.med_id_generic_name_raw 
        from pharmacy p 
            inner join ip_administrative ia 
                on p.genc_id = ia.genc_id 
        where ia.del_present is not null and ia.del_present <> 3;
"""

pharmacy_raw = pd.read_sql_query(pharmacy_raw_query, conn)

In [None]:
def in_beers_list(x):
    """Return True when medication name `x` matches any Beers-list entry.

    Every string in ``beers_list.Medication`` is used as a case-insensitive
    regular expression and searched for anywhere inside `x`.

    Parameters
    ----------
    x : object
        Raw generic medication name. Anything that is not a string (for
        example a NULL coming back from the database as None/NaN) cannot match
        and yields False instead of raising TypeError.

    Returns
    -------
    bool
    """
    if not isinstance(x, str):
        return False

    for drug in beers_list.Medication:
        # Blank CSV cells are read as NaN; skip them rather than crash.
        if isinstance(drug, str) and re.search(drug, x, re.IGNORECASE):
            return True
    return False

In [None]:
in_beers = pharmacy_raw.med_id_generic_name_raw.apply(in_beers_list)
pharmacy_raw.loc[:, "in_beers"] = in_beers

In [None]:
pharmacy_beers = pharmacy_raw.groupby("genc_id")["in_beers"].sum()

Some drug classes can be mapped manually. We have CSV files with mapped names for the drugs of interest. Cross-check the pharmacy table for medications in each of these groups (per encounter):

In [None]:
def mapped_meds(path, name):
    """Flag each row of `pharmacy_raw` whose medication belongs to a drug class.

    Adds (or overwrites) the boolean column `name` on the module-level
    `pharmacy_raw` frame in place; nothing is returned.

    Parameters
    ----------
    path : str
        CSV file with a ``med_id_generic_name_raw`` column listing the
        medication names of one drug class. Each name is used as a
        case-insensitive regular expression.
    name : str
        Name of the column to write.
    """
    # Read the mapping for *this* drug class. Reading `antipsychotics_path`
    # here would make every class column a copy of the antipsychotics flag.
    mappings = pd.read_csv(path)
    # Blank CSV cells are read as NaN and cannot be used as patterns.
    patterns = [p for p in mappings.med_id_generic_name_raw if isinstance(p, str)]

    def in_class(x):
        # NULL medication names cannot match anything.
        if not isinstance(x, str):
            return False
        return any(re.search(p, x, re.IGNORECASE) for p in patterns)

    mask = pharmacy_raw.med_id_generic_name_raw.apply(in_class)
    pharmacy_raw.loc[:, name] = mask

In [None]:
mapped_meds(antipsychotics_path, "n_antipsychotics")

In [None]:
mapped_meds(nootropics_path, "n_nootropics")

In [None]:
mapped_meds(benzodiazepine_path, "n_benzodiazepine")

In [None]:
mapped_meds(antibiotics_path, "n_antibiotics")

In [None]:
mapped_meds(sedatives_path, "n_sedatives")

In [None]:
pharmacy_meds_mapped = pharmacy_raw.groupby("genc_id")[
    [
        "n_nootropics",
        "n_antipsychotics",
        "n_benzodiazepine",
        "n_antibiotics",
        "n_sedatives",
    ]
].sum()

## 1.11 `imaging`

In [None]:
# One row per cohort encounter with all of its imaging reports concatenated.
# The encounter id is taken from (and grouped by) `ia`, the preserved side of
# the right join. Grouping by `i.genc_id` would collapse every encounter that
# has no imaging rows into a single row whose genc_id is NULL.
imaging_query = """
    select
        ia.genc_id,
        string_agg(i.test_result, ', ') as test_result
    from imaging i
        right join ip_administrative ia
            on ia.genc_id = i.genc_id
    where ia.del_present is not null and ia.del_present <> 3
    group by ia.genc_id
"""

imaging = pd.read_sql_query(imaging_query, conn)

In [None]:
# Strip every character that is neither a word character nor whitespace.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
# The raw string avoids invalid-escape warnings for `\w` and `\s`.
imaging.loc[:, "test_result"] = imaging.loc[:, "test_result"].str.replace(
    r"[^\w\s]", "", regex=True
)

Create hard-coded imaging keyword features based on recommended patterns as described in the chart abstracting manual:

In [None]:
# Keyword dictionaries for the hard-coded imaging features.
# Key   = feature label (used to build the column name; do not rename).
# Value = regular expression searched case-insensitively in the report text.
# Punctuation may already have been stripped from the text, so patterns for
# hyphenated terms accept the hyphenated, spaced and fused spellings.
synonymous_diagnoses = {
    "imag_Acute brain syndrome": "acute brain syndrome",
    "imag_Acute brain failure": "acute brain failure",
    "imag_Acute cerebral insufficiency": "acute cerebral insufficiency",
    "imag_Acute organic psychosis": "acute organic psychosis",
    "imag_Acute organic brain syndrome": "acute organic brain syndrome",
    "imag_ICU Psychosis": "icu psychosis",
    "imag_Metabolic encephalopathy": "metabolic encephalopathy",
    "imag_Pseudosenility": "pseudosenility",
    "imag_Reversible dementia": "reversible dementia",
    # Accepts "toxic-metabolic", "toxic metabolic" and "toxicmetabolic", and both
    # the correct spelling "encephalopathy" and the variant "encepalopathy".
    "imag_Toxic-metabolic encepalopathy": "toxic[ -]?metabolic enceph?alopathy",
    "imag_Toxic psychosis": "toxic psychosis",
}

trigger_words = {
    "imag_Acute confusion": "acute confusion",
    #'imag_Acute': 'acute', # assume this will create lots of false positives
    "imag_Acute mental status change (MS)": "acute mental status change",
    "imag_Altered mental status (AMS)": "altered mental status",
    "imag_Alert and Disoriented <3": "alert and disoriented",
    "imag_Confus*": "confus[A-Za-z]*",
    "imag_Disorient*": "disorient[A-Za-z]*",
    "imag_Deliri*": "deliri[A-Za-z]*",
    "imag_Encephalopathy": "encephalopathy",
    "imag_Hallucinati*": "hallucinati[A-Za-z]*",
    "imag_Mental status change": "mental status change",
    "imag_Reorient*": "reorient[A-Za-z]*",
}

supporting_words = {
    # "*" added so the pattern matches the stem itself, as the label promises.
    "imag_Agitat*": "agitat[A-Za-z]*",
    "imag_Alarm": "alarm",
    "imag_Anxi*": "anxi[A-Za-z]*",
    "imag_Attent*": "attent[A-Za-z]*",
    "imag_Combative": "combative",
    "imag_Commands": "commands",
    "imag_Delusion": "delusion",
    "imag_Distract*": "distract[A-Za-z]*",
    "imag_Fall": "fall",
    "imag_Fluctu*": "fluctu[A-Za-z]*",
    "imag_Forget*": "forget[A-Za-z]*",
    "imag_Hypoactive": "hypoactive",
    "imag_Illusion": "illusion",
    "imag_Impuls*": "impuls[A-Za-z]*",
    "imag_Letharg*": "letharg[A-Za-z]*",
    "imag_Multifactorial": "multifactorial",
    "imag_Not Cooperative": "not cooperative",
    # Hyphen made optional: it is removed when punctuation is stripped.
    "imag_Non-responsiveness": "non-?responsiveness",
    "imag_Narcotic*": "narcotic[A-Za-z]*",
    "imag_Out of Bed or OOB": "(out of bed)|(OOB)",
    "imag_Redirected": "redirected",
    "imag_Refus*": "refus[A-Za-z]*",
    "imag_Restless": "restless",
    "imag_Sedat*": "sedat[A-Za-z]*",
    "imag_Sleepy": "sleepy",
    # Stem is "somnolen" so that "somnolence" matches as well as "somnolent".
    "imag_Somnolen*": "somnolen[A-Za-z]*",
    "imag_Unrespons*": "unrespons[A-Za-z]*",
    "imag_Uncooperative": "uncooperative",
    # "*" added so the bare word "resist" matches, as the label promises.
    "imag_Resist*": "resist[A-Za-z]*",
    "imag_Inappropriate": "inappropriate",
    "imag_Altered level of consciousness": "altered level of consciousness",
    "imag_Diminished level of responsiveness": "diminished level of responsiveness",
    "imag_Difficulty with arousal": "difficulty with arousal",
    "imag_New onset coma": "new onset coma",
    "imag_Stupor": "stupor",
    "imag_Lethargy": "lethargy",
    "imag_Obtunded": "obtunded",
}

In [None]:
def re_search_dict(x, d):
    """Tell whether any pattern stored in `d` occurs in the text `x`.

    Only the values of `d` are used; each is searched case-insensitively as a
    regular expression. Non-string `x` (e.g. None or NaN) gives False.
    """
    if not isinstance(x, str):
        return False

    return any(re.search(pattern, x, re.IGNORECASE) for pattern in d.values())


def re_search_value(x, v):
    """Tell whether the regular expression `v` occurs in the text `x`.

    The search is case-insensitive. Non-string `x` (e.g. None or NaN) gives
    False.
    """
    found = isinstance(x, str) and re.search(v, x, re.IGNORECASE) is not None
    return found

In [None]:
# Build the "synonymous diagnoses" keyword features from the imaging text.
# aggregate: a single boolean column, True when any pattern in the group matches.
# granular:  one boolean column per pattern, named "imag_regex_" + key. The keys
#            already start with "imag_", so columns look like "imag_regex_imag_...".
# Any other setting creates no columns for this group.
if HARDCODED_IMAGING_SYNONYMOUS_DIAGNOSES == "aggregate":
    imaging.loc[:, "imag_Synonymous diagnoses"] = imaging.loc[:, "test_result"].apply(
        re_search_dict, d=synonymous_diagnoses
    )

elif HARDCODED_IMAGING_SYNONYMOUS_DIAGNOSES == "granular":
    for key, value in synonymous_diagnoses.items():
        imaging.loc[:, "imag_regex_" + key] = imaging.loc[:, "test_result"].apply(
            re_search_value, v=value
        )

In [None]:
# Build the "trigger words" keyword features from the imaging text.
# aggregate: a single boolean column, True when any pattern in the group matches.
# granular:  one boolean column per pattern, named "imag_regex_" + key.
# Any other setting creates no columns for this group.
if HARDCODED_IMAGING_TRIGGER_WORDS == "aggregate":
    imaging.loc[:, "imag_Trigger words"] = imaging.loc[:, "test_result"].apply(
        re_search_dict, d=trigger_words
    )

elif HARDCODED_IMAGING_TRIGGER_WORDS == "granular":
    for key, value in trigger_words.items():
        imaging.loc[:, "imag_regex_" + key] = imaging.loc[:, "test_result"].apply(
            re_search_value, v=value
        )

In [None]:
# Build the "supporting words" keyword features from the imaging text.
# aggregate: a single boolean column, True when any pattern in the group matches.
# granular:  one boolean column per pattern, named "imag_regex_" + key.
# Any other setting creates no columns for this group.
if HARDCODED_IMAGING_SUPPORTING_WORDS == "aggregate":
    imaging.loc[:, "imag_Supporting words"] = imaging.loc[:, "test_result"].apply(
        re_search_dict, d=supporting_words
    )

elif HARDCODED_IMAGING_SUPPORTING_WORDS == "granular":
    for key, value in supporting_words.items():
        imaging.loc[:, "imag_regex_" + key] = imaging.loc[:, "test_result"].apply(
            re_search_value, v=value
        )

Just diagnostics:

In [None]:
query_end = time.time()
query_time = query_end - query_start

# 2 Processing

## 2.1 Combine Tables

Check missing values in `ip_administrative`:

In [None]:
ip_administrative.columns[ip_administrative.isna().any()].tolist()

Use `institution_from` and `institution_to` to determine if patient was from or was sent to a nursing home:

In [None]:
# Institution numbers and institution-type codes that are treated as a nursing home.
# NOTE(review): the meaning of the individual codes is not documented here —
# confirm against the data dictionary.
institution_list = [4, 9, 50004, 50009]
institution_type_list = ["CR", "HF", "LT", "NH", "TM"]

# 1 when either the destination institution number or its type is in the lists.
ip_administrative.loc[:, "to_nursing_home"] = np.where(
    ip_administrative.institution_to.isin(institution_list)
    | ip_administrative.institution_to_type.isin(institution_type_list),
    1,
    0,
)
# Same rule applied to the institution the patient came from.
ip_administrative.loc[:, "from_nursing_home"] = np.where(
    ip_administrative.institution_from.isin(institution_list)
    | ip_administrative.institution_from_type.isin(institution_type_list),
    1,
    0,
)

# The raw institution columns are replaced by the two flags above. Because the
# frame is reassigned without them, this cell cannot be run a second time
# without re-running the `ip_administrative` query first.
ip_administrative = ip_administrative.drop(
    [
        "institution_to",
        "institution_to_type",
        "institution_from",
        "institution_from_type",
    ],
    axis=1,
)

Merge room transfer:

In [None]:
data = pd.merge(ip_administrative, room_transfer, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Fill missing `room_transfers` with 0:

In [None]:
data.fillna({"room_transfers": 0}, inplace=True)

Merge `ip_scu`:

In [None]:
data = pd.merge(data, ip_scu, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Fill missing `icu_transfers` with 0:

In [None]:
data.fillna({"icu_transfers": 0}, inplace=True)

Merge `er_administrative`:

In [None]:
data = pd.merge(data, er_administrative, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

In [None]:
data.admit_via_ambulance.unique()

`@TODO`: Apply processing function to column

In [None]:
data.triage_level.unique()

`@TODO`: Apply processing function to column

Fill missing `er_los_derived` with 0:

In [None]:
data.fillna({"er_los_derived": 0}, inplace=True)

Merge `blood_transfusion`:

In [None]:
data = pd.merge(data, blood_transfusion, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Fill missing `total_transfusions` and `unique_transfusions` with 0:

In [None]:
data.fillna({"total_transfusions": 0, "unique_transfusions": 0}, inplace=True)

Merge `delirium_icd10`:

In [None]:
data = pd.merge(data, delirium_icd10, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

In [None]:
data.icd_delirium.unique()

Impute missing `ICD10` code for Delirium with `False`:

In [None]:
data.fillna({"icd_delirium": False}, inplace=True)

Merge `diagnosis`:

In [None]:
data = pd.merge(data, diagnosis, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Impute `nan` diagnoses as 0:

In [None]:
diagnosis_imputer_mask = np.where(diagnosis.columns.str.startswith("diag_"), 0, np.nan)
diagnosis_imputer = {k: v for k, v in zip(diagnosis.columns, diagnosis_imputer_mask)}

data.fillna(diagnosis_imputer, inplace=True)

Merge `lab`:

In [None]:
data = pd.merge(data, lab, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

We will want to try a couple of imputation strategies with lab stats, so this will be part of the pipeline and not done at this step. However, when frequency is `nan`, then it should be 0:

In [None]:
lab_imputer_mask = np.where(
    lab.columns.str.endswith("_n") & lab.columns.str.startswith("lab_"), 0, np.nan
)
lab_imputer = {k: v for k, v in zip(lab.columns, lab_imputer_mask)}

data.fillna(lab_imputer, inplace=True)

Merge `intervention`:

In [None]:
data = pd.merge(data, intervention, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Missing intervention codes should be imputed with 0:

In [None]:
intervention_imputer_mask = np.where(
    intervention.columns.str.startswith("intervention_"), 0, np.nan
)
intervention_imputer = {
    k: v for k, v in zip(intervention.columns, intervention_imputer_mask)
}

data.fillna(intervention_imputer, inplace=True)

Merge `rare_interventions`:

In [None]:
data = pd.merge(data, rare_interventions, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Missing `rare_interventions` should be imputed with 0:

In [None]:
rare_interventions_imputer_mask = np.where(
    rare_interventions.columns.str.startswith("rare_intervention_"), 0, np.nan
)
rare_interventions_imputer = {
    k: v for k, v in zip(rare_interventions.columns, rare_interventions_imputer_mask)
}

data.fillna(rare_interventions_imputer, inplace=True)

Merge `pharmacy`:

In [None]:
data = pd.merge(data, pharmacy, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Impute missing `medications` with an empty string and impute missing `n_meds` with 0:

In [None]:
data.fillna({"medications": "", "n_meds": 0}, inplace=True)

Merge `pharmacy_beers` and `pharmacy_meds_mapped`:

In [None]:
data = pd.merge(data, pharmacy_beers, on="genc_id", how="left")
data = pd.merge(data, pharmacy_meds_mapped, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Impute missing medication counts (Beers List and mapped drug classes) for encounters without pharmacy records:

In [None]:
# Encounters without any pharmacy rows have no entry in `pharmacy_beers` or
# `pharmacy_meds_mapped`, so the left merges leave NaN for them.
# All six columns are per-encounter sums, so the neutral value is 0. Filling the
# numeric `in_beers` count with False would turn it into a mixed object column.
data.fillna(
    {
        "in_beers": 0,
        "n_antipsychotics": 0,
        "n_antibiotics": 0,
        "n_sedatives": 0,
        "n_benzodiazepine": 0,
        "n_nootropics": 0,
    },
    inplace=True,
)

Merge `imaging`:

In [None]:
data = pd.merge(data, imaging, on="genc_id", how="left")
data.columns[data.isna().any()].tolist()

Impute missing `test_result` with an empty string and missing aggregate keyword flags with `False`:

In [None]:
data.fillna(
    {
        "test_result": "",
        "imag_Synonymous diagnoses": False,
        "imag_Trigger words": False,
        "imag_Supporting words": False,
    },
    inplace=True,
)

In [None]:
# Granular keyword columns ("imag_regex_...") exist as soon as ANY of the three
# keyword groups is set to "granular", so all three switches are checked.
if "granular" in (
    HARDCODED_IMAGING_SYNONYMOUS_DIAGNOSES,
    HARDCODED_IMAGING_TRIGGER_WORDS,
    HARDCODED_IMAGING_SUPPORTING_WORDS,
):
    # `data` is a module-level frame here (there is no `self` outside a class).
    imag_regex_to_impute = data.columns[
        data.columns.str.startswith("imag_regex_")
    ].tolist()

    # `fillna` takes the column -> value mapping as its first argument.
    # The flags are booleans, so a missing report is imputed as False, matching
    # the aggregate keyword columns filled in the previous cell.
    data.fillna({col: False for col in imag_regex_to_impute}, inplace=True)

In [None]:
data.set_index("genc_id", inplace=True)

## 2.2 Custom Column Cleanup (CCC)

`triage_level` should not have an `L` prefix:

In [None]:
def trim_l(x):
    """Convert a raw `triage_level` value to an integer.

    String values may carry an ``L`` prefix (``"L3"``) or be plain digits
    (``"3"``); both become the integer 3. Surrounding whitespace is ignored
    and all digits after the prefix are used.

    Parameters
    ----------
    x : object
        Raw value. Non-strings (numbers, None, NaN) are returned unchanged.

    Returns
    -------
    int, float or the original object
        The integer level; NaN for a string with no digits left (``""`` or
        ``"L"``), so it is handled like any other missing value instead of
        raising IndexError.

    Raises
    ------
    ValueError
        If the remaining text is not an integer. Unexpected codes should
        surface rather than be silently dropped.
    """
    if not isinstance(x, str):
        return x

    digits = x.strip()
    if digits[:1] in ("L", "l"):
        digits = digits[1:]

    if not digits:
        return float("nan")

    return int(digits)

In [None]:
data.loc[:, "triage_level"] = data.loc[:, "triage_level"].apply(trim_l)

`admit_via_ambulance` sometimes uses `GROUND` which means `G`:

In [None]:
def GROUND_to_G(x):
    """Map the long-form ambulance code "GROUND" to "G"; pass anything else through."""
    return "G" if x == "GROUND" else x

In [None]:
data.loc[:, "admit_via_ambulance"] = data.loc[:, "admit_via_ambulance"].apply(
    GROUND_to_G
)

Make `del_present` 0/1

In [None]:
# Recode the label: 2 -> 0 and every other value -> 1 (NULL and 3 were already
# excluded by the queries). NOTE(review): presumably 1 = delirium present and
# 2 = absent — confirm.
# Not idempotent: running this cell a second time maps the 0s produced here to 1.
data.loc[:, "del_present"] = np.where(data.loc[:, "del_present"] == 2, 0, 1)

In [None]:
data.columns.to_frame().to_csv(
    "H:\\repos\\delirium\\delirium_features.csv", index=False
)

## 2.3 Splitting Strategies

`@TODO` Move these as static methods in `DeliriumExperimenter` class

In [None]:
def hold_out_one_quarter(data, datetime_colname, target_colname, year=2015, quarter=1):
    """Split `data` so that one calendar quarter forms the test set.

    Rows whose `datetime_colname` falls in the given `year` and `quarter` go
    to the test set; every other row goes to the training set. The datetime
    and target columns are removed from the feature frames.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    when = data.loc[:, datetime_colname].dt
    held_out = (when.quarter == quarter) & (when.year == year)

    features = data.drop([datetime_colname, target_colname], axis=1)
    target = data.loc[:, target_colname]

    return (
        features[~held_out],
        features[held_out],
        target[~held_out],
        target[held_out],
    )

In [None]:
def random_split(
    data, datetime_colname, target_colname, test_size=0.1, random_state=42
):
    """Stratified random train/test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Full feature frame including the datetime and target columns.
    datetime_colname, target_colname : str
        Columns removed from the features; `target_colname` becomes `y`.
    test_size : float, default 0.1
        Fraction of rows placed in the test set.
    random_state : int, default 42
        Seed that makes the split reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    X = data.drop([datetime_colname, target_colname], axis=1)
    y = data.loc[:, target_colname]

    # The notebook imports `train_test_split` directly; the name
    # `model_selection` is never bound, so qualifying the call with it raises
    # NameError.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    return X_train, X_test, y_train, y_test

In [None]:
def ordered_split(data, datetime_colname, target_colname, train_fraction=0.9):
    """Chronological train/test split.

    Rows are sorted by `datetime_colname`; the earliest `train_fraction` of
    them form the training set and the remainder the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Full feature frame including the datetime and target columns.
    datetime_colname, target_colname : str
        Columns removed from the features; `target_colname` becomes `y`.
    train_fraction : float, default 0.9
        Share of rows (after sorting) assigned to the training set.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    data = data.sort_values(datetime_colname)

    X = data.drop([datetime_colname, target_colname], axis=1)
    y = data.loc[:, target_colname]

    # Split the feature frame built above at the same row position as `y`.
    # Splitting some other frame would pull in an undefined global and leave
    # the target column inside the features.
    cut = int(train_fraction * len(data))

    X_train, X_test = X.iloc[:cut], X.iloc[cut:]
    y_train, y_test = y.iloc[:cut], y.iloc[cut:]

    return X_train, X_test, y_train, y_test

# 3 Modelling

## 3.1 Tokenizers

In [None]:
def wordnet_stemming(text):
    """Tokenise `text` and return the WordNet lemma of every token.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        One lemma per token, in token order.
    """
    # Imported inside the function because nltk is not imported at module
    # level in this notebook; without these lines the names are undefined.
    # NOTE(review): the nltk tokenizer and WordNet data must already be
    # downloaded in the environment — confirm.
    import nltk
    from nltk.stem import WordNetLemmatizer

    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

Create stemmed stopwords:

In [None]:
# Imported here because the stop-word corpus is not imported at module level in
# this notebook; without this line `stopwords` is undefined.
from nltk.corpus import stopwords

# Run every English stop word through the same tokenizer/lemmatizer that the
# vectorizers use, so the stop-word list matches the tokens they produce.
stemmed_stopwords = [
    stemmed_stopword
    for stopword in stopwords.words("english")
    for stemmed_stopword in wordnet_stemming(stopword)
]

## 3.2 Vectorizers

In [None]:
# TF-IDF features over single tokens (unigrams only), keeping at most 1000
# terms. Text is tokenised and lemmatised by `wordnet_stemming`, and the stop
# words were passed through the same function so they match its output.
tfidf_stopwords_stem = TfidfVectorizer(
    stop_words=stemmed_stopwords,
    ngram_range=(1, 1),
    max_features=1000,
    tokenizer=wordnet_stemming,
)

In [None]:
# Raw term-count alternative to the TF-IDF vectorizer, configured identically:
# unigrams only, at most 1000 terms, tokenised by `wordnet_stemming`, with the
# pre-lemmatised stop-word list.
count_stopwords_stem = CountVectorizer(
    stop_words=stemmed_stopwords,
    ngram_range=(1, 1),
    max_features=1000,
    tokenizer=wordnet_stemming,
)

# 4 Experimentation

Below is a class that allows faster experimentation with the delirium classifier. The class is initialized with particular parameters for the pipeline and then outputs performance metrics for comparison.

In [None]:
class DeliriumExperimenter:
    """Run one end-to-end delirium-classifier experiment.

    Instantiating the class performs the whole experiment: it splits ``data``
    into train and test sets, assembles a preprocessing + gradient-boosting
    pipeline, tunes it with a randomized search, evaluates the tuned model on
    the test set and writes one row of settings and metrics to a CSV file.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with the datetime column (``admit_date_time``), the target
        column (``del_present``), the categorical columns listed in
        ``initialize_preprocessor``, the text columns ``medications`` and
        ``test_result`` and any ``lab_*`` columns. The frame passed in is not
        modified.
    lab_frequency_only : bool
        If true, drop every ``lab_*`` column that does not end in ``_n``.
    first_experiment : bool
        If true, the metrics file is overwritten and a header row is written;
        otherwise the row is appended without a header.
    split_method : {"quarter", "random", "ordered"}
        Train/test splitting strategy.
    scoring : str
        Scoring name handed to the randomized search.
    vectorizer : {"tfidf", "count"}
        Which module-level vectorizer is applied to the text columns.
    del_label_weight : float
        Sample weight given to rows whose label equals 1; all other rows get
        weight 1.
    dimensionality_reduction : {None, "pca"}
        ``"pca"`` appends a ``TruncatedSVD`` step; any other value adds none.
    scaler : {None, "maxabs", "standard"}
        Optional scaling step placed after preprocessing.
    metrics_filename : str
        Path of the CSV file that collects experiment results.
    random_state : int or None
        Seed for the classifier and the randomized search so that a run can
        be reproduced; ``None`` leaves both unseeded.
    """

    def __init__(
        self,
        data,
        lab_frequency_only=LAB_FREQUENCY_ONLY,
        first_experiment=False,
        split_method=SPLIT_METHOD,
        scoring=SCORING,
        vectorizer=VECTORIZER,
        del_label_weight=DEL_LABEL_WEIGHT,
        dimensionality_reduction=DIMENSIONALITY_REDUCTION,
        scaler=SCALING,
        metrics_filename="H:\\delirium_metrics_separate_keywords.csv",
        random_state=42,
    ):

        if lab_frequency_only:
            lab_value_columns = data.columns[
                data.columns.str.startswith("lab_")
                & ~data.columns.str.endswith("_n")
            ]
            # Drop on a copy: mutating the caller's frame in place would
            # silently strip these columns from every later experiment that
            # reuses the same DataFrame.
            data = data.drop(columns=lab_value_columns)

        # set variables
        self.data = data
        self.scoring = scoring
        self.vectorizer = vectorizer
        self.del_label_weight = del_label_weight
        self.dimensionality_reduction = dimensionality_reduction
        self.scaler = scaler
        self.random_state = random_state
        self.params = {}
        # The keys below become CSV columns, so they are kept stable.
        self.metrics = {  # @TODO: Not clean design
            "split_method": split_method,
            "scoring": self.scoring,
            "vectorizer": self.vectorizer,
            "del_label_weight": self.del_label_weight,
            "dimensionality_reduction": self.dimensionality_reduction,
            "scaler": self.scaler,
            "model": "GradientBoostingClassifier",
            "hardcoded_imaging_synonymous_diagnoses": HARDCODED_IMAGING_SYNONYMOUS_DIAGNOSES,
            "hardcoded_imaging_trigger_words": HARDCODED_IMAGING_TRIGGER_WORDS,
            "hardcoded_imaging_supporting_words": HARDCODED_IMAGING_SUPPORTING_WORDS,
        }

        # create training and test data
        self.create_train_test(split_method)

        # develop pipeline skeleton
        self.initialize_preprocessor()
        self.add_vectorizer()

        self.column_transformer = ColumnTransformer(
            self.transformer_list, remainder="passthrough"
        )

        self.initialize_pipeline()
        self.add_scaling()
        self.add_dimensionality_reduction()

        self.pipeline_list = self.pipeline_list + [
            ("classifier", GradientBoostingClassifier(random_state=self.random_state))
        ]

        self.pipe = Pipeline(self.pipeline_list)

        # tune the model; entries added by add_dimensionality_reduction are kept
        self.params = {
            **self.params,
            "preprocessor__impute_labs__strategy": ["mean", "constant"],
            "preprocessor__medications_nlp__ngram_range": [(1, 2), (2, 3)],
            "preprocessor__medications_nlp__max_features": [1000, 5000, 10000],
            "preprocessor__imaging_nlp__ngram_range": [(1, 1), (1, 2)],
            "preprocessor__imaging_nlp__max_features": [1000, 5000, 10000],
            "classifier__n_estimators": [100, 250, 500],
            "classifier__learning_rate": [0.1, 0.01, 0.001],
            "classifier__max_depth": [2, 5, 10],
        }
        self.create_tuner()
        self.tune()

        self.evaluate(self.X_test, self.y_test, "test")
        self.save_experiment(metrics_filename, first_experiment)

    def create_train_test(
        self,
        method,
        datetime_colname="admit_date_time",
        target_colname="del_present",
        year=2015,
        quarter=1,
    ):
        """Split ``self.data`` and store the pieces on the instance.

        Sets ``self.X`` / ``self.y`` (all rows) and ``self.X_train``,
        ``self.X_test``, ``self.y_train``, ``self.y_test``.

        Raises
        ------
        ValueError
            If ``method`` is not "quarter", "random" or "ordered".
        """
        # NOTE(review): ``year`` and ``quarter`` are accepted but never
        # forwarded to ``hold_out_one_quarter``; confirm against that
        # function's signature whether they should be.
        if method == "quarter":
            splitter = hold_out_one_quarter
        elif method == "random":
            splitter = random_split
        elif method == "ordered":
            splitter = ordered_split
        else:
            raise ValueError(
                "Unknown split method {!r}; expected 'quarter', 'random' or "
                "'ordered'".format(method)
            )

        # Use the instance's frame (which honours ``lab_frequency_only``)
        # rather than whatever global happens to be called ``data``.
        X_train, X_test, y_train, y_test = splitter(
            self.data, datetime_colname, target_colname
        )

        self.X = self.data.drop([target_colname, datetime_colname], axis=1)
        self.y = self.data.loc[:, target_colname]

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def initialize_preprocessor(self):
        """Start ``self.transformer_list`` with the non-text transformers.

        Categorical columns are one-hot encoded and every ``lab_*`` column
        not ending in ``_n`` is imputed.
        """
        lab_value_columns = self.data.columns[
            self.data.columns.str.startswith("lab_")
            & ~self.data.columns.str.endswith("_n")
        ].tolist()

        categorical_columns = [
            "gender",
            "discharge_disposition",
            "admit_category",
            "admit_via_ambulance",
            "triage_level",
        ]

        self.transformer_list = [
            ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
            (
                "impute_labs",
                # The strategy is overridden by the search grid ("mean" or
                # "constant"); -999 is the fill value for "constant".
                SimpleImputer(strategy="constant", fill_value=-999),
                lab_value_columns,
            ),
        ]

    def add_vectorizer(self):
        """Append the text vectorizers for ``medications`` and ``test_result``.

        Nothing is appended when ``self.vectorizer`` is neither "tfidf" nor
        "count".
        """
        if self.vectorizer == "tfidf":
            text_vectorizer = tfidf_stopwords_stem
        elif self.vectorizer == "count":
            text_vectorizer = count_stopwords_stem
        else:
            return

        # The same module-level vectorizer object is listed for both columns.
        self.transformer_list = self.transformer_list + [
            ("medications_nlp", text_vectorizer, "medications"),
            ("imaging_nlp", text_vectorizer, "test_result"),
        ]

    def initialize_pipeline(self):
        """Start ``self.pipeline_list`` with the column transformer step."""
        self.pipeline_list = [("preprocessor", self.column_transformer)]

    def add_scaling(self):
        """Append the optional "scaling" step selected by ``self.scaler``."""
        if self.scaler == "maxabs":
            scaling_step = MaxAbsScaler()
        elif self.scaler == "standard":
            # Centering is switched off when the "pca" (TruncatedSVD) step
            # follows.
            scaling_step = StandardScaler(
                with_mean=self.dimensionality_reduction != "pca"
            )
        else:
            return

        self.pipeline_list = self.pipeline_list + [("scaling", scaling_step)]

    def add_dimensionality_reduction(self):
        """Append a ``TruncatedSVD`` step and its search grid for "pca"."""
        # NOTE(review): this is the column count of the raw frame (target and
        # datetime included), not the feature count after one-hot encoding and
        # vectorization - confirm the candidate sizes below are intended.
        columns = len(self.data.columns)

        if self.dimensionality_reduction == "pca":
            self.params = {
                **self.params,
                "dimensionality_reduction__n_components": [
                    math.ceil(columns / 2),
                    math.ceil(columns * 0.9),
                    500,
                ],
            }
            self.pipeline_list = self.pipeline_list + [
                (
                    "dimensionality_reduction",
                    TruncatedSVD(n_components=math.ceil(columns * 0.9)),
                )
            ]

    def create_tuner(self):
        """Create the randomized hyper-parameter search over ``self.pipe``."""
        self.tuner = RandomizedSearchCV(
            self.pipe,
            param_distributions=self.params,
            cv=5,
            n_iter=12,  # @TODO make higher
            scoring=self.scoring,
            random_state=self.random_state,
        )

    def tune(self):
        """Fit the search on the training data and record the outcome.

        Rows labelled 1 are weighted by ``self.del_label_weight``; all other
        rows get weight 1. Tuning time, best score and best parameters are
        merged into ``self.metrics``.
        """
        sample_weight = np.where(self.y_train == 1, self.del_label_weight, 1)

        tuning_start = time.time()
        self.tuner.fit(
            self.X_train,
            self.y_train,
            classifier__sample_weight=sample_weight,
        )
        self.tuning_time = time.time() - tuning_start

        self.metrics = {
            **self.metrics,
            "tuning_time": self.tuning_time,
            "best_score": self.tuner.best_score_,
            **self.tuner.best_params_,
        }

    def evaluate(self, X, y, name):
        """Score the tuned model on ``(X, y)`` and store ``<name>_*`` metrics."""
        y_pred = self.tuner.predict(X)

        # ROC AUC is a ranking metric: score it on the predicted probability
        # of the positive class, not on the thresholded labels. Rows written
        # by earlier versions used hard labels for this column.
        y_score = self.tuner.predict_proba(X)[:, 1]
        roc_auc = roc_auc_score(y, y_score)

        precision, recall, f1, support = precision_recall_fscore_support(
            y, y_pred, average="binary"
        )
        tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

        prefix = name + "_"
        self.metrics = {
            **self.metrics,
            prefix + "roc_auc": roc_auc,
            prefix + "precision": precision,
            prefix + "recall": recall,
            prefix + "f1": f1,
            prefix + "support": support,
            prefix + "npv": tn / (tn + fn),
            prefix + "specificity": tn / (tn + fp),
            prefix + "fpr": fp / (fp + tn),
            prefix + "fnr": fn / (tp + fn),
            prefix + "fdr": fp / (tp + fp),
            prefix + "accuracy": (tp + tn) / (tp + fp + fn + tn),
        }

    def save_experiment(self, metrics_filename, first_experiment):
        """Write ``self.metrics`` as one CSV row.

        The file is overwritten (with a header) when ``first_experiment`` is
        true and appended to (without a header) otherwise.
        """
        # NOTE(review): in append mode the field names come from this run's
        # metrics, so rows from runs with a different set of tuned parameters
        # will not line up with the header already in the file.
        mode = "w" if first_experiment else "a"

        # newline="" is what the csv module expects; without it every row is
        # followed by a blank line on Windows.
        with open(metrics_filename, mode, newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=list(self.metrics.keys()))
            if first_experiment:
                writer.writeheader()
            writer.writerow(self.metrics)

## 4.1 Experiments

In [None]:
# Single run: TF-IDF text features, standard scaling and the "pca" reduction
# step, keeping all lab columns; the result row is appended to the metrics
# file (first_experiment=False).
DeliriumExperimenter(
    data,
    vectorizer="tfidf",
    scaler="standard",
    dimensionality_reduction="pca",
    lab_frequency_only=False,
    first_experiment=False,
)