# Modeling for infection diagnosis

## Setup, get data & import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import spacy
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()  # Enable progress bar for pandas (change apply to progress_apply)
import os

## Load MIMIC data

In [2]:
MIMIC_3_DIR = '../mimic/mimic-iii-clinical-database-1.4'
# NOTE: this changes the process-wide working directory, so every relative
# path used after this cell resolves inside MIMIC_3_DIR.
os.chdir(MIMIC_3_DIR)


def _load_mimic_table(csv_name, **read_csv_kwargs):
    """Read one MIMIC CSV, index it by ROW_ID and lower-case its column names.

    Args:
        csv_name: File name of the table, relative to the current directory.
        **read_csv_kwargs: Extra keyword arguments forwarded to pd.read_csv.

    Returns:
        The loaded DataFrame, indexed by ROW_ID, with lower-case columns.
    """
    table = pd.read_csv(csv_name, **read_csv_kwargs).set_index('ROW_ID')
    table.columns = table.columns.str.lower()
    return table


diagnoses_df = _load_mimic_table('DIAGNOSES_ICD.csv')

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
notes_df = _load_mimic_table('NOTEEVENTS.csv', low_memory=False)

microbio_df = _load_mimic_table('MICROBIOLOGYEVENTS.csv')

patients_df = _load_mimic_table('PATIENTS.csv')

# NOTE(review): the saved output of this cell echoes the PRESCRIPTIONS read
# line, which looks like a pandas mixed-type DtypeWarning - confirm. Reading
# with low_memory=False (as already done for NOTEEVENTS) gives each column a
# single consistently inferred dtype.
prescriptions_df = _load_mimic_table('PRESCRIPTIONS.csv', low_memory=False)

  prescriptions_df = pd.read_csv('PRESCRIPTIONS.csv').set_index('ROW_ID')


In [3]:
def _report_and_dedupe(frame, duplicates_label):
    """Print the frame's shape and duplicate-row count, then return it de-duplicated.

    Duplicates are judged on column values only; the ROW_ID index is not part
    of the comparison.
    """
    print(frame.shape)
    print(duplicates_label, frame.duplicated().sum())
    return frame.drop_duplicates()


# Each table is reported and then replaced by its duplicate-free version
diagnoses_df = _report_and_dedupe(diagnoses_df, "Duplicates in diagnoses_df:")
notes_df = _report_and_dedupe(notes_df, "Duplicates in notes_df:")
microbio_df = _report_and_dedupe(microbio_df, "Duplicates in microbio_df:")
patients_df = _report_and_dedupe(patients_df, "Duplicates in patients_df:")
prescriptions_df = _report_and_dedupe(prescriptions_df, "Duplicates in prescriptions_df:")

(651047, 4)
Duplicates in diagnoses_df: 0
(2083180, 10)
Duplicates in notes_df: 4863
(631726, 15)
Duplicates in microbio_df: 18414
(46520, 7)
Duplicates in patients_df: 0
(4156450, 18)
Duplicates in prescriptions_df: 236072


## Get & set patient ages for later analysis

In [4]:
# Calculate age function
def calculate_age(dob, dod):
    """Return the age in completed years at the date of death.

    Args:
        dob: Date of birth as a "%Y-%m-%d %H:%M:%S" string.
        dod: Date of death in the same format, or a missing value.

    Returns:
        The number of whole calendar years between dob and dod as an int,
        or None when either date is missing.
    """
    # Check both dates before parsing: strptime raises on a missing value.
    if pd.isna(dob) or pd.isna(dod):
        return None
    dob_date = datetime.strptime(dob, "%Y-%m-%d %H:%M:%S")
    dod_date = datetime.strptime(dod, "%Y-%m-%d %H:%M:%S")
    # Count calendar years rather than days // 365: the day-based form ignores
    # leap days and reports one year too many shortly before a birthday.
    birthday_not_reached = (dod_date.month, dod_date.day) < (dob_date.month, dob_date.day)
    return dod_date.year - dob_date.year - int(birthday_not_reached)

# Build the "age" column row by row from each patient's dob/dod pair
def _age_for_row(patient_row):
    return calculate_age(patient_row['dob'], patient_row['dod'])


patients_df['age'] = patients_df.apply(_age_for_row, axis=1)

# Keep only rows whose age is strictly below 120.
# NOTE(review): a missing age does not satisfy "< 120", so patients for whom
# calculate_age returned None appear to be dropped here as well - confirm
# that this is intended.
age_below_120 = patients_df['age'] < 120
filtered_patients_df = patients_df[age_below_120]

## Identify common bloodborne pathogens

In [5]:
# ICD-9 code -> description for the blood infections of interest
infection_types = {
    "0380": "Streptococcal septicemia",
    "03810": "Staphylococcal septicemia, unspecified",
    "0382": "Pneumococcal septicemia [Streptococcus pneumoniae septicemia]",
    "0383": "Septicemia due to anaerobes",
    "03842": "Septicemia due to escherichia coli [E. coli]",
    "03843": "Septicemia due to pseudomonas",
}

# The codes to look for are the keys of the mapping, in the same order
infection_codes = list(infection_types)

# Keep only the diagnosis rows carrying one of those codes (as an independent copy)
has_infection_code = diagnoses_df['icd9_code'].isin(infection_codes)
infection_df = diagnoses_df[has_infection_code].copy()

# Attach the human-readable description for each code
infection_df['infection_type'] = infection_df['icd9_code'].map(infection_types)

## Merge and process Infection, Notes, & Patients dataframes

In [6]:
# Maximum number of characters kept from each patient's joined notes.
# The limit is 5,000 characters (not 50K).
MAX_NOTE_CHARS = 5000


def _join_and_truncate_notes(notes):
    """Join a patient's distinct note texts with spaces and cap the length.

    Non-string entries inside the array (e.g. a missing note text) are skipped
    so the join cannot fail on them. A value that is neither an array nor a
    string is returned unchanged.
    """
    if isinstance(notes, np.ndarray):
        notes = ' '.join(note for note in notes if isinstance(note, str))
    if isinstance(notes, str):
        return notes[:MAX_NOTE_CHARS]
    return notes


# Aggregate notes by subject_id: one row per patient with that patient's
# distinct note texts
aggregated_notes_df = notes_df.groupby('subject_id')['text'].unique().reset_index()
aggregated_notes_df = aggregated_notes_df.rename(columns={'text': 'unique_notes'})
# Ensure 'unique_notes' is a single string, limited to MAX_NOTE_CHARS characters
aggregated_notes_df['unique_notes'] = aggregated_notes_df['unique_notes'].apply(_join_and_truncate_notes)


# merge notes with infection data
merged_inf_notes_df = pd.merge(infection_df, aggregated_notes_df, on='subject_id', how='inner')

# merge notes with infection data with patients
merged_inf_notes_pats_df = pd.merge(merged_inf_notes_df, filtered_patients_df, on='subject_id', how='inner')

# Keep only relevant features from merged data
merged_inf_notes_pats_df = merged_inf_notes_pats_df[['subject_id', 'infection_type', 'unique_notes', 'gender', 'expire_flag', 'age']]

# Drop rows with missing age. The result is reassigned instead of using
# inplace=True, because an in-place change on a column subset is something
# pandas may flag with SettingWithCopyWarning.
merged_inf_notes_pats_df = merged_inf_notes_pats_df.dropna(subset=['age'])

# Drop duplicate notes
# merged_inf_notes_pats_df.drop_duplicates(subset=['text'], inplace=True)

# Because dataset is large, sample 1% of the data
# merged_inf_notes_pats_df = merged_inf_notes_pats_df.sample(frac=0.01, random_state=42)
 

In [7]:
aggregated_notes_df.head()

# Pull out the note stored under row label 350 once and reuse it below
sample_note = aggregated_notes_df['unique_notes'][350]

# Show the whole note, then its character count
print(sample_note)
print(len(sample_note))

# Character count of every note, computed once for both lookups
note_lengths = aggregated_notes_df['unique_notes'].apply(len)
# Row label of the longest note, then of the shortest note
print(note_lengths.idxmax())
print(note_lengths.idxmin())

# Python type of the value stored in the 'unique_notes' column
print(type(sample_note))

Admission Date:  [**2114-7-8**]       Discharge Date:  [**2114-7-16**]

Date of Birth:   [**2069-7-16**]       Sex:  M

Service:  Trauma

HISTORY OF PRESENT ILLNESS:  1. Closed head injury consisting
of a left frontal subarachnoid hemorrhage which was stable.
2. Left elbow fracture.  3. Left scapula fracture.  4. Left
distal radioulnar joint separation.  5. Altered mental status
secondary to #1.

PHYSICAL EXAMINATION:  Examination on discharge includes in
general the patient is wearing a cervical collar and appears
alert and oriented x 1 and is in no acute distress.  HEENT
shows the pupils to be equal, round, and reactive to light,
extraocular motion is intact.  Neck examination shows the
trachea is midline, no jugular venous distension is noted.
Cardiovascularly he has regular rate and rhythm.  Pulmonary
showed that the lungs are clear to auscultation bilaterally,
no wheezes are heard.  Abdomen is soft, nondistended and
nontender.  Bowel sounds are normal and active.  Extremities
show

## Process & merge Prescriptions dataframe

In [8]:
# Report the prescriptions table size before trimming it
print(prescriptions_df.shape)

# Keep the drug-related columns and discard rows without a generic drug name
prescription_columns = [
    'subject_id',
    'drug_name_generic',
    'formulary_drug_cd',
    'prod_strength',
    'dose_val_rx',
    'dose_unit_rx',
    'form_val_disp',
    'form_unit_disp',
    'route',
]
prescriptions_df = prescriptions_df[prescription_columns].dropna(subset=['drug_name_generic'])

# One row per subject_id holding the distinct generic drug names for that patient
drugs_per_subject = prescriptions_df.groupby('subject_id')['drug_name_generic'].unique()
aggregated_prescriptions_df = drugs_per_subject.reset_index()
aggregated_prescriptions_df.rename(columns={'drug_name_generic': 'unique_prescriptions'}, inplace=True)

# Inner-join the infection/notes/patients table with the per-patient prescriptions
data_df = merged_inf_notes_pats_df.merge(aggregated_prescriptions_df, on='subject_id', how='inner')
data_df.shape

(3920378, 18)


(567, 7)

In [12]:
# Load SciSpacy Model (en_core_sci_md)
nlp = spacy.load("en_core_sci_md")


def get_note_embedding(text):
    """Compute a note embedding by averaging the word vectors in the note.

    Args:
        text: The note as a string, or a numpy array of strings, which is
            joined with single spaces before processing.

    Returns:
        A 1-D numpy array: the mean of the vectors of all tokens that have
        one. If no token has a vector, or processing raises, a zero vector
        of the loaded model's vector width is returned instead, so fallback
        rows have the same shape as real embeddings.
    """
    # Take the width from the loaded model instead of hard-coding 300, which
    # only matches models whose word vectors happen to be 300-dimensional.
    embedding_dim = nlp.vocab.vectors_length

    try:
        # If text is a numpy array, convert to string
        if isinstance(text, np.ndarray):
            text = ' '.join(text)

        # Process text with SciSpacy model
        doc = nlp(text)

        # Extract word vectors for all words that have embeddings
        vectors = [token.vector for token in doc if token.has_vector]

        # If the note has no word vectors (rare case), return zeros
        if not vectors:
            return np.zeros((embedding_dim,))

        # Compute note embedding (average of word embeddings)
        return np.mean(vectors, axis=0)

    except Exception as e:
        # Deliberate best effort: report the failure and keep the row, so one
        # bad note does not abort a long-running apply.
        print(f"Error processing text: {text}")
        print(f"Error: {e}")
        return np.zeros((embedding_dim,))  # Return a zero vector if error occurs


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [None]:
# Embed every patient's aggregated notes; progress_apply shows a progress bar
notes_to_embed = data_df["unique_notes"]
data_df["sentence_embeddings"] = notes_to_embed.progress_apply(get_note_embedding)

100%|██████████| 567/567 [21:15<00:00,  2.25s/it]    


In [None]:
data_df.head()

# Scratch code kept for reference (disabled):
# drop test_unique_prescriptions and another_test_unique_prescriptions columns
# data_df.drop(columns=['unique_prescriptions', 'another_test_unique_prescriptions'], inplace=True)
# data_df.dropna(subset=['test_unique_prescriptions', 'another_test_unique_prescriptions'], inplace=True)
# change column name to unique_prescriptions
# data_df.rename(columns={'test_unique_prescriptions': 'unique_prescriptions'}, inplace=True)
# data_df.drop(columns=['text_embeddings'], inplace=True)

# Report the Python type of the value stored under row label 0 of 'unique_prescriptions'
first_prescriptions = data_df['unique_prescriptions'][0]
print(type(first_prescriptions))



AttributeError: 'Series' object has no attribute 'type'

In [None]:
import ast

import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Load dataset
# df = pd.read_csv("your_data.csv")  # Replace with actual data

# Encode categorical variables
# NOTE: both encodings below overwrite their source column, so this cell is
# meant to run once per freshly built data_df.
le_infection = LabelEncoder()
data_df['infection_type'] = le_infection.fit_transform(data_df['infection_type'])

data_df['gender'] = data_df['gender'].map({'M': 0, 'F': 1})  # Convert gender to binary

# Encode prescriptions as multi-label binary
mlb = MultiLabelBinarizer()
# Convert string lists to actual lists. ast.literal_eval only accepts Python
# literals, so unlike eval it cannot execute code found in a cell value.
# Values that are not strings are left as they are.
data_df['unique_prescriptions'] = data_df['unique_prescriptions'].progress_apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
prescription_matrix = mlb.fit_transform(data_df['unique_prescriptions'])

# Convert text data to SciSpacy embeddings. Reuse the 'sentence_embeddings'
# column when it exists: it holds get_note_embedding applied to the same
# 'unique_notes' column, so the slow pass over every note is not repeated.
if 'sentence_embeddings' in data_df.columns:
    data_df['text_embeddings'] = data_df['sentence_embeddings']
else:
    data_df['text_embeddings'] = data_df['unique_notes'].apply(get_note_embedding)
