# Late Fusion Model Embeddings Generation

Purpose of this notebook: Generate embeddings for Late Fusion Model

Based on paper: Soenksen, L.R., Ma, Y., Zeng, C. et al. Integrated multimodal artificial intelligence framework for healthcare applications. npj Digit. Med. 5, 149 (2022). https://doi.org/10.1038/s41746-022-00689-4

Using our best tabular, notes, and imaging models, getting the embeddings and classifications for additional modelling

Goal: model using embeddings alone, or embeddings + classification (approach done in Soenksen et al)

In [1]:
#relevant code chunks to adapt to our purpose

In [32]:
#imports adapted from HAIM API.py
# Base
import cv2
import math
import copy
import pickle
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import datetime as dt
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob
from shutil import copyfile

from dask import dataframe as dd
from dask.diagnostics import ProgressBar
ProgressBar().register()

# Core AI/ML
import tensorflow as tf
import torch
import torch.nn.functional as F
import torchvision, torchvision.transforms
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary

# Scikit-learn
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

# NLP
from torch import nn
from transformers import AutoTokenizer, AutoModel, logging
logging.set_verbosity_error()
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Computer Vision
import cv2
import skimage, skimage.io
import torchxrayvision as xrv
from torchvision import transforms
import timm
from PIL import Image

# Warning handling
import warnings
warnings.filterwarnings("ignore")

# Data Mapping

In [3]:
target_variables_dict = {
    'no_finding': 0,
    'atelectasis': 1,
    'cardiomegaly': 2,
    'lung_opacity': 3,
    'pleural_effusion': 4,
}

In [4]:
#run processing notebook to get data files
%run processing_data.ipynb

Loading data files... C:/Users/Carolyn/Documents/MIDS/210 Capstone/fusion_data\test_set__chexpert__4_findings__single_label__unbalanced.json test
Loading data files... C:/Users/Carolyn/Documents/MIDS/210 Capstone/fusion_data\train_set__chexpert__4_findings__single_label__balanced.json train
Loading data files... C:/Users/Carolyn/Documents/MIDS/210 Capstone/fusion_data\validation_set__chexpert__4_findings__single_label__unbalanced.json validate
Total Cols
 Index(['patient_id', 'visit_id', 'study_id', 'temperature', 'heartrate',
       'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity',
       'positive_label_total', 'finding_names', 'radiology_note',
       'discharge_note', 'chief_complaint',
       'major_surgical_or_invasive_procedure', 'history_of_present_illness',
       'past_medical_history', 'family_history', 'atelectasis', 'cardiomegaly',
       'lung_opacity', 'pleural_effusion', 'dataset_type'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16877 en

# Notes

## Notes Model Setup

In [5]:
#original biobert models
biobert_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
biobert_model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [6]:
# Select the Bio_Discharge_Summary_BERT model
MODEL_CHECKPOINT = 'emilyalsentzer/Bio_ClinicalBERT'

In [7]:
# Select parameters
NUM_CLASSES = 5
MAX_SEQUENCE_LENGTH = 512
NUM_EPOCHS = 15
BATCH_SIZE = 16
LEARNING_RATE = 0.00005

In [8]:
class MulticlassClassification(nn.Module):

    def __init__(self, checkpoint, num_classes, hidden_size=201, dropout_prob=0.3, freeze_bert=True):
        super(MulticlassClassification, self).__init__()

        self.model = AutoModel.from_pretrained(checkpoint)
        self.hidden_size = hidden_size
        self.dropout_prob = dropout_prob
        self.num_classes = num_classes
        self.freeze_bert = freeze_bert

        for param in self.model.parameters():
            param.requires_grad = not self.freeze_bert

        self.pooler_layer = nn.Linear(self.model.config.hidden_size, hidden_size) # maps the output of the BERT model's hidden state to the hidden_size
        self.relu = nn.ReLU() # introduces non-linearity to the model
        self.dropout = nn.Dropout(dropout_prob) # applied for regularization
        self.classification_layer = nn.Linear(hidden_size, num_classes) # projects the hidden_size down to the number of target classes

    def forward(self, input_ids, token_type_ids = None, attention_mask = None):
        outputs = self.model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)

        pooler_output = outputs.pooler_output
        hidden = self.pooler_layer(pooler_output)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        classification = self.classification_layer(hidden) # logits for each class

        return classification, hidden ###STEVEN - ONLY CHANGE I MADE TO THE MODEL

    def unfreeze_bert_layers(self, n_layers):
        """Unfreezes the top n layers of the BERT model."""
        layers_to_unfreeze = list(self.model.encoder.layer[-n_layers:])
        for layer in layers_to_unfreeze:
            for param in layer.parameters():
                param.requires_grad = True

In [9]:
def load_model(model, checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))
    return model

In [10]:
CHECKPOINT_FOLDER = "./checkpoints"
MODEL_NAME_FOLDER = "./model_findings"

esteban_model = MulticlassClassification(
    checkpoint=MODEL_CHECKPOINT,
    num_classes=NUM_CLASSES,
    freeze_bert=False,
    )

CHECKPOINT_FILE = r"C:\Users\Carolyn\Documents\MIDS\210 Capstone\mids-210-radiology-triage-models-spring24\pretrained_weights\bio_clinical_bert\bio_clinical_bert__balanced__unfrozen_layers__best.pt"

esteban_model.load_state_dict(torch.load(CHECKPOINT_FILE, map_location=torch.device('cpu')))

<All keys matched successfully>

In [11]:
esteban_model.eval()

MulticlassClassification(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

## Notes Embeddings

In [12]:
def get_biobert_classifications(text, model = "MMMM"):
    # Inputs:
    #   text -> Input text (str)
    #
    # Outputs:
    #   classifications -> Final Biobert classifications = (1,num_classifcations)
    #   hidden_embeddings -> Last hidden layer in Biobert model with vector dimensionality = (1,hidder_layer_size)
  
    # %% EXAMPLE OF USE
    # classifications, hidden_embeddings = get_biobert_embeddings(text)
    if model == "HAIM": #to test the HAIM model
        model = biobert_model
    
        tokens_pt = biobert_tokenizer(text, return_tensors="pt")
        outputs = model(**tokens_pt)
        last_hidden_state = outputs.last_hidden_state
        pooler_output = outputs.pooler_output
        classifications = pooler_output.detach().numpy()
        
    else:
        model = esteban_model

        tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
        outputs, hidden_outputs = model(**tokens_pt)
        prob = torch.softmax(outputs, dim=1)
        classifications = prob.detach().numpy()

    return classifications

In [13]:
def get_biobert_embeddings(text, model = "MMMM"):
    # Inputs:
    #   text -> Input text (str)
    #
    # Outputs:
    #   classifications -> Final Biobert classifications = (1,num_classifcations)
    #   hidden_embeddings -> Last hidden layer in Biobert model with vector dimensionality = (1,hidder_layer_size)
  
    # %% EXAMPLE OF USE
    # classifications, hidden_embeddings = get_biobert_embeddings(text)
    if model == "HAIM": #to test the HAIM model
        model = biobert_model
    
        tokens_pt = biobert_tokenizer(text, return_tensors="pt")
        outputs = model(**tokens_pt)
        last_hidden_state = outputs.last_hidden_state
        pooler_output = outputs.pooler_output
        hidden_embeddings = last_hidden_state.detach().numpy()
        
    else:
        model = esteban_model

        tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
        outputs, hidden_outputs = model(**tokens_pt)
        hidden_embeddings = hidden_outputs.detach().numpy()

    return hidden_embeddings

In [14]:
#final embeddings layer converted to classification softmax probabilities
def get_biobert_classifications(text):
    # Inputs:
    #   text -> Input text (str)
    #
    # Outputs:
    #   hidden_embeddings -> Last hidden layer in Biobert model with vector dimensionality = (1,hidder_layer_size)
  
    # %% EXAMPLE OF USE
    # classifications, hidden_embeddings = get_biobert_embeddings(text)
    model = esteban_model
    
    tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
    outputs, hidden_outputs = model(**tokens_pt)
    prob = torch.softmax(outputs, dim=1) #translate to softmax probabilities
    classifications = prob.detach().numpy()

    return classifications

In [22]:
#final embeddings layer converted to classification softmax probabilities
def get_biobert_classifications_sigmoid(text):
    # Inputs:
    #   text -> Input text (str)
    #
    # Outputs:
    #   hidden_embeddings -> Last hidden layer in Biobert model with vector dimensionality = (1,hidder_layer_size)
  
    # %% EXAMPLE OF USE
    # classifications, hidden_embeddings = get_biobert_embeddings(text)
    model = esteban_model
    
    tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
    outputs, hidden_outputs = model(**tokens_pt)
    prob = torch.sigmoid(outputs) #translate to softmax probabilities
    classifications = prob.detach().numpy()

    return classifications

In [16]:
#final embeddings layer (5 embeddings for 5 classes)
def get_biobert_embeddings(text):
    # Inputs:
    #   text -> Input text (str)
    #
    # Outputs:
    #   classifications -> Final Biobert classifications = (1,num_classifcations)
    #   hidden_embeddings -> Last hidden layer in Biobert model with vector dimensionality = (1,hidder_layer_size)
  
    # %% EXAMPLE OF USE
    # classifications, hidden_embeddings = get_biobert_embeddings(text)
    model = esteban_model

    tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
    outputs, hidden_outputs = model(**tokens_pt)
    embeddings = outputs.detach().numpy()

    return embeddings

In [17]:
#previous embeddings layer (size 201)
def get_biobert_hidden_embeddings(text):
    # Inputs:
    #   text -> Input text (str)
    #
    # Outputs:
    #   hidden_embeddings -> Last hidden layer in Biobert model with vector dimensionality = (1,hidder_layer_size)
  
    # %% EXAMPLE OF USE
    # classifications, hidden_embeddings = get_biobert_embeddings(text)
    model = esteban_model
    
    tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
    outputs, hidden_outputs = model(**tokens_pt)
    hidden_embeddings = hidden_outputs.detach().numpy()

    return hidden_embeddings

In [18]:
test = get_biobert_classifications(notes_train_df.history_of_present_illness[0])
test

array([[0.12290624, 0.30666423, 0.04976013, 0.39768472, 0.12298463]],
      dtype=float32)

In [259]:
#run on EC2 instance
#notes_train_df['notes_classifications'] = notes_train_df.history_of_present_illness.apply(get_biobert_classifications)
#notes_train_df['notes_embeddings'] = notes_train_df.history_of_present_illness.apply(get_biobert_embeddings)
#notes_val_df['notes_classifications'] = notes_val_df.history_of_present_illness.apply(get_biobert_classifications)
#notes_val_df['notes_embeddings'] = notes_val_df.history_of_present_illness.apply(get_biobert_embeddings)
#notes_test_df['notes_classifications'] = notes_test_df.history_of_present_illness.apply(get_biobert_classifications)
#notes_test_df['notes_embeddings'] = notes_test_df.history_of_present_illness.apply(get_biobert_embeddings)

In [303]:
print(notes_train_df.shape)
print(notes_val_df.shape)
print(notes_test_df.shape)

(2086, 6)
(1924, 6)
(1920, 6)


In [20]:
notes_train_df_sample = notes_train_df.loc[0:5]

In [23]:
notes_train_df_sample['notes_classifications'] = notes_train_df_sample.history_of_present_illness.apply(get_biobert_classifications)
notes_train_df_sample['notes_classifications_sigmoid'] = notes_train_df_sample.history_of_present_illness.apply(get_biobert_classifications_sigmoid)
notes_train_df_sample['notes_embeddings'] = notes_train_df_sample.history_of_present_illness.apply(get_biobert_embeddings)
notes_train_df_sample['notes_hidden_embeddings'] = notes_train_df_sample.history_of_present_illness.apply(get_biobert_hidden_embeddings)


In [24]:
notes_preds = pd.read_csv(r"C:\Users\Carolyn\Downloads\train_valid_test_results.csv")
notes_preds.columns

Index(['split', 'patient_id', 'visit_id', 'study_id', '0_no_finding_prob',
       '1_atelectasis_prob', '2_cardiomegaly_prob', '3_lung_opacity_prob',
       '4_pleural_effusion_prob', 'prediction_model', 'label_model'],
      dtype='object')

In [25]:
test = pd.merge(notes_train_df_sample, notes_preds[['patient_id','0_no_finding_prob',
       '1_atelectasis_prob', '2_cardiomegaly_prob', '3_lung_opacity_prob',
       '4_pleural_effusion_prob']],on='patient_id', how='left')
test

Unnamed: 0,patient_id,chief_complaint,history_of_present_illness,past_medical_history,family_history,finding_names,notes_classifications,notes_classifications_sigmoid,notes_embeddings,notes_hidden_embeddings,0_no_finding_prob,1_atelectasis_prob,2_cardiomegaly_prob,3_lung_opacity_prob,4_pleural_effusion_prob
0,11388716,SOB,"[UNK] with history of DVT on warfarin, silent ...",ONCOLOGIC HISTORY: \n- Early [UNK]: presented ...,Mother had an MI in her [UNK]. Father had an ...,atelectasis,"[[0.12290624, 0.30666423, 0.049760133, 0.39768...","[[0.43358842, 0.6563581, 0.23659606, 0.7123884...","[[-0.26722527, 0.64710647, -1.171433, 0.907012...","[[0.0, 0.7634526, 1.3019177, 0.0, 0.0, 0.0, 0....",0.018453,0.066146,0.015534,0.57169,0.328177
1,11539363,Chest pain,"Mr [UNK] is a [UNK] yo M w/IDDM, HTN, HLD, has...",- IDDM \n - HTN \n - HLD,Noncontributory,atelectasis,"[[0.47425452, 0.1929495, 0.2165972, 0.0819284,...","[[0.7678642, 0.5737029, 0.60170776, 0.3636382,...","[[1.1962906, 0.29697505, 0.41258588, -0.559607...","[[0.0, 0.953765, 1.2499045, 0.0, 0.0, 0.483167...",0.089891,0.290761,0.107167,0.101473,0.410708
2,10833304,"Lip swelling, shortness of breath","This is a [UNK] F PMhx hypothyroidism, depress...","- HTN\n- Hypothyroidism\n- Urticaria, chronic ...","Estranged from family, no known history of mal...",atelectasis,"[[0.46157345, 0.21572524, 0.1611983, 0.1239591...","[[0.7669955, 0.6060617, 0.53479797, 0.4692224,...","[[1.191423, 0.4307874, 0.13941717, -0.12326611...","[[0.057520777, 0.75045353, 1.1770848, 0.0, 0.0...",0.274893,0.599116,0.027246,0.072365,0.02638
3,19849119,Confusion,"[UNK] PMHx Hep C Cirrhosis, HTN, DM II present...","PMH: DMII, Cirrhosis, grade 1 esophageal varic...",Brothers x3 both with MI's. Father [UNK] arthr...,atelectasis,"[[0.23268963, 0.4965662, 0.054956377, 0.093964...","[[0.60900843, 0.7687306, 0.26893756, 0.3861229...","[[0.44314617, 1.2011575, -1.0000197, -0.463639...","[[0.0, 1.2091765, 1.6213006, 0.0, 0.0, 0.07568...",0.153558,0.681795,0.020531,0.060097,0.08402
4,11749991,"N/V, Abdominal Pain","[UNK] with hx of COPD, CHF, esophageal ulcer, ...",- HTN\n- CVA\n- CHF\n- Restles leg\n- Fibromya...,Not obtained,atelectasis,"[[0.27171558, 0.46165487, 0.045886148, 0.16378...","[[0.66237664, 0.7692294, 0.24886234, 0.5418233...","[[0.6739033, 1.2039651, -1.1046891, 0.16768499...","[[0.025130361, 1.3001677, 1.832436, 0.0, 0.0, ...",0.112563,0.662661,0.016461,0.143429,0.064886
5,17585582,Abdominal Pain,This is an [UNK] gentleman with a history of S...,"1. CARDIAC RISK FACTORS:: -Diabetes,- Dyslipid...","Family history of heart disease, multiple myel...",atelectasis,"[[0.4821642, 0.23429148, 0.12681454, 0.1187876...","[[0.7749602, 0.6259348, 0.47526413, 0.4589891,...","[[1.2365347, 0.514816, -0.09902414, -0.1644129...","[[0.0, 0.9920984, 1.2240568, 0.0, 0.0, 1.07559...",0.310272,0.569537,0.030307,0.062832,0.027053


In [345]:
test.to_csv("test_notes_embeddings_comparison.csv")

# Image Embeddings

In [26]:
#create Cinthya's model
cinthya_model = torch.load(r"C:\Users\Carolyn\Documents\MIDS\210 Capstone\mids-210-radiology-triage-models-spring24\pretrained_weights\EfficientNet-B3-Unb.pth", map_location=torch.device('cpu'))

In [27]:
cinthya_model

EfficientNet(
  (conv_stem): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNormAct2d(
    40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
        (bn1): BatchNormAct2d(
          40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
          (gate): Sigmoid()
        )
        (conv_pw): Conv2d(40, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNormAct2d(
    

In [28]:
class MedicalImageDataset(Dataset):
    def __init__(self, data, root_dir, transform=None):
        self.annotations = data
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        img_id = self.annotations.iloc[index, 1]
        img_path = os.path.join(self.root_dir, str(img_id))
        try:
            image = Image.open(img_path)
        except FileNotFoundError:
            print(f"File {img_path} not used in this dataset.")
            return None
        #print(f"File {img_path} found.")

        if self.transform:
            image = self.transform(image)

        return img_id, image

In [36]:
#based off of cinthya's definition of pulling data
def get_chest_xray_embeddings(model, data, root_dir):

    #convert img files into DataLoader Cinthya's model is expecting
    loader_transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    
    dataset = MedicalImageDataset(data=data,
                                        root_dir=root_dir,
                                        transform=loader_transform)
    dataset = [data for data in dataset if data is not None]
    
    dataset_loader = DataLoader(test, batch_size=32, shuffle=False)
    
    model.eval()
    if torch.cuda.is_available():
        model.cuda()
    
    all_ids = []
    all_probs = []
    #all_probs_sigmoid = []
    all_embeddings = []
    all_densefeatures = []
    with torch.no_grad():
        for data in dataset_loader:
            if data is None:  # Skip the loop iteration if the dataset returned None
                continue
            img_ids, images = data  

            if torch.cuda.is_available():
                images = images.cuda()

            #for final embeddings and probabilities
            outputs = model(images)
            probabilities = torch.softmax(outputs, dim=0).cpu().numpy()
            #probs_sigmoid = torch.sigmoid(outputs).cpu().numpy()
            final_embeddings = outputs.cpu().numpy()
            all_probs.extend(probabilities)
            #all_probs_sigmoid.extend(probs_sigmoid)
            all_embeddings.extend(final_embeddings)
            all_ids.extend(list(img_ids))  # Make sure img_ids is iterable

            #for hidden embeddings (one layer up)
            feats = model.forward_features(images)
            feats = F.relu(feats, inplace=True)
            feats = F.adaptive_avg_pool2d(feats, (1, 1))
            densefeatures = feats.cpu().detach().numpy()
            all_densefeatures.extend(densefeatures)
            
    return all_ids, all_probs, all_embeddings, all_densefeatures

In [37]:
img_train_df_test = img_train_df.loc[0:5]

In [38]:
img_train_df_test

Unnamed: 0,patient_id,dicom_id,finding_names
0,11388716,7d6ea06e-554c2ccb-4d9ecefe-eb5ca0e0-7049fa19.jpg,atelectasis
1,11539363,b4b4f64f-ba5163c8-1b7ce58e-a0030af6-4d09dec1.jpg,atelectasis
2,10833304,3b09e882-c5f0d93b-106c7c94-29b839e8-00e7a950.jpg,atelectasis
3,19849119,3100385d-f9e0610a-e32f54df-3c17088b-4da3d491.jpg,atelectasis
4,11749991,c83dc36c-6c58d087-0ef18130-dd3e8cbf-faa5e609.jpg,atelectasis
5,17585582,02e93e76-cc72090c-b446147e-46bc26bc-547f10f0.jpg,atelectasis


In [39]:
train_ids, train_classifications, train_embeddings, train_densefeatures = get_chest_xray_embeddings(cinthya_model, img_train_df_test, root_dir=r"C:\Users\Carolyn\Documents\MIDS\210 Capstone\fusion_data\train_4_bal_s")

KeyError: 0

In [88]:
img_train_embeddings = pd.read_pickle(r"C:\Users\Carolyn\Downloads\img_train_embeddings.pkl")
image_preds_train = pd.read_csv(r"C:\Users\Carolyn\Downloads\train_bal_results.csv")

In [89]:
image_preds_train = image_preds_train.rename(columns={"img_id": "dicom_id"})

In [90]:
test = pd.merge(img_train_embeddings, image_preds_train, on='dicom_id', how='left')

In [91]:
test.tail()

Unnamed: 0,patient_id,dicom_id,finding_names,img_classifications,img_embeddings,img_densefeatures,No_Findings,Atelectasis,Cardiomegaly,Lung_Opacity,Pleural_Effusion
2081,16937222,69ef21d0-910dbded-aaef3c5b-4cda358b-8dee10a4.jpg,no_finding,"[0.7430532, 0.12572886, 0.044390652, 0.0841130...","[0.32454368, -1.4520963, -2.493195, -1.8540618...","[0.28614873, 0.0, 0.14239393, 0.11818148, 0.02...",0.627783,0.021455,0.144979,0.197374,0.008409
2082,12036096,ddf2c0fa-f7fb72b0-20309c3a-7c99f287-26e330cd.jpg,no_finding,"[0.994934, 0.0037281248, 0.0010363812, 0.00021...","[3.3385503, -2.2482204, -3.528391, -5.123619, ...","[0.280284, 0.0, 0.040529415, 0.02130972, 0.068...",0.887739,0.102148,0.008171,0.001313,0.00063
2083,17385589,3b1432b1-8159b96d-f3088b3f-a36e5416-b7a3463b.jpg,no_finding,"[0.9999316, 2.9949833e-05, 1.913851e-06, 3.558...","[4.942763, -5.4731555, -8.223561, -5.300732, -...","[0.31266996, 0.003200526, 0.05088086, 0.044937...",0.999967,1.8e-05,2e-06,1.1e-05,3e-06
2084,18088200,1e6ed6ad-8dbe2951-bd1403ff-307b58bd-27c725fe.jpg,no_finding,"[0.89402574, 0.032319363, 0.0028031713, 0.0700...","[1.2237699, -2.0962982, -4.5412135, -1.3231226...","[0.30471715, 0.0, 0.15632595, 0.15642409, 0.03...",0.316924,0.118666,0.003013,0.558805,0.002591
2085,12078448,4bbc5ac0-528eaa47-0f80e7c5-c66263cb-cea38862.jpg,no_finding,"[0.9976641, 0.0018453909, 0.000387495, 7.80137...","[3.1906168, -3.102109, -4.6628523, -6.265671, ...","[0.22868997, 0.000105697574, 0.03994514, 0.000...",0.971031,0.023259,0.005036,0.000479,0.000195


In [96]:
np.argmax(test.img_classifications[700])

3

In [100]:
test['index_max'] = test.img_classifications.apply(lambda x: np.argmax(x))
test.head()

Unnamed: 0,patient_id,dicom_id,finding_names,img_classifications,img_embeddings,img_densefeatures,No_Findings,Atelectasis,Cardiomegaly,Lung_Opacity,Pleural_Effusion,index_max
0,11388716,7d6ea06e-554c2ccb-4d9ecefe-eb5ca0e0-7049fa19.jpg,atelectasis,"[0.0026789217, 0.9865623, 0.001634778, 0.00850...","[-3.2017438, 2.7070682, -3.695651, -2.0468888,...","[0.27610707, 0.0, 0.089533396, 0.29763734, 0.1...",0.000967,0.996075,0.000822,0.0019,0.000236,1
1,11539363,b4b4f64f-ba5163c8-1b7ce58e-a0030af6-4d09dec1.jpg,atelectasis,"[0.08505852, 0.90619695, 0.004783125, 0.003748...","[-0.72294307, 1.6429741, -3.6011887, -3.844996...","[0.24303229, 0.0, 0.089104556, 0.1176696, 0.06...",0.000125,0.9998,5.8e-05,1.5e-05,2e-06,1
2,10833304,3b09e882-c5f0d93b-106c7c94-29b839e8-00e7a950.jpg,atelectasis,"[0.0023914576, 0.9973139, 0.00017710082, 0.000...","[-2.322076, 3.7110865, -4.9250154, -5.4447713,...","[0.23652035, 0.0, 0.052620452, 0.0682023, 0.13...",0.000533,0.999227,6.4e-05,0.000159,1.7e-05,1
3,19849119,3100385d-f9e0610a-e32f54df-3c17088b-4da3d491.jpg,atelectasis,"[0.00016185592, 0.9996964, 2.220072e-05, 5.294...","[-4.398639, 4.329861, -6.3852215, -5.5161505, ...","[0.24093658, 0.0, 0.047212478, 0.06716528, 0.2...",2.8e-05,0.999915,7e-06,1e-05,4e-05,1
4,11749991,c83dc36c-6c58d087-0ef18130-dd3e8cbf-faa5e609.jpg,atelectasis,"[0.0021358263, 0.9971975, 0.00016448315, 0.000...","[-2.8200288, 3.3260665, -5.3838296, -5.4086094...","[0.22600123, 0.0, 0.042545523, 0.027820904, 0....",1.7e-05,0.999928,8e-06,6e-06,4.1e-05,1


In [111]:
test['cinthya_index'] = test[["No_Findings","Atelectasis","Cardiomegaly","Lung_Opacity", "Pleural_Effusion"]].idxmax(axis=1)
test.head()

Unnamed: 0,patient_id,dicom_id,finding_names,img_classifications,img_embeddings,img_densefeatures,No_Findings,Atelectasis,Cardiomegaly,Lung_Opacity,Pleural_Effusion,index_max,cinthya_index
0,11388716,7d6ea06e-554c2ccb-4d9ecefe-eb5ca0e0-7049fa19.jpg,atelectasis,"[0.0026789217, 0.9865623, 0.001634778, 0.00850...","[-3.2017438, 2.7070682, -3.695651, -2.0468888,...","[0.27610707, 0.0, 0.089533396, 0.29763734, 0.1...",0.000967,0.996075,0.000822,0.0019,0.000236,1,Atelectasis
1,11539363,b4b4f64f-ba5163c8-1b7ce58e-a0030af6-4d09dec1.jpg,atelectasis,"[0.08505852, 0.90619695, 0.004783125, 0.003748...","[-0.72294307, 1.6429741, -3.6011887, -3.844996...","[0.24303229, 0.0, 0.089104556, 0.1176696, 0.06...",0.000125,0.9998,5.8e-05,1.5e-05,2e-06,1,Atelectasis
2,10833304,3b09e882-c5f0d93b-106c7c94-29b839e8-00e7a950.jpg,atelectasis,"[0.0023914576, 0.9973139, 0.00017710082, 0.000...","[-2.322076, 3.7110865, -4.9250154, -5.4447713,...","[0.23652035, 0.0, 0.052620452, 0.0682023, 0.13...",0.000533,0.999227,6.4e-05,0.000159,1.7e-05,1,Atelectasis
3,19849119,3100385d-f9e0610a-e32f54df-3c17088b-4da3d491.jpg,atelectasis,"[0.00016185592, 0.9996964, 2.220072e-05, 5.294...","[-4.398639, 4.329861, -6.3852215, -5.5161505, ...","[0.24093658, 0.0, 0.047212478, 0.06716528, 0.2...",2.8e-05,0.999915,7e-06,1e-05,4e-05,1,Atelectasis
4,11749991,c83dc36c-6c58d087-0ef18130-dd3e8cbf-faa5e609.jpg,atelectasis,"[0.0021358263, 0.9971975, 0.00016448315, 0.000...","[-2.8200288, 3.3260665, -5.3838296, -5.4086094...","[0.22600123, 0.0, 0.042545523, 0.027820904, 0....",1.7e-05,0.999928,8e-06,6e-06,4.1e-05,1,Atelectasis


In [113]:
names_dict = {
    0 : "No_Findings",
    1 : "Atelectasis",
    2 : "Cardiomegaly",
    3 : "Lung_Opacity",
    4 : "Pleural_Effusion"
}

test['index_names'] = None

test.index_names[test.index_max == 0] = "No_Findings"
test.index_names[test.index_max == 1] = "Atelectasis"
test.index_names[test.index_max == 2] = "Cardiomegaly"
test.index_names[test.index_max == 3] = "Lung_Opacity"
test.index_names[test.index_max == 4] = "Pleural_Effusion"

test.head()

Unnamed: 0,patient_id,dicom_id,finding_names,img_classifications,img_embeddings,img_densefeatures,No_Findings,Atelectasis,Cardiomegaly,Lung_Opacity,Pleural_Effusion,index_max,cinthya_index,index_names
0,11388716,7d6ea06e-554c2ccb-4d9ecefe-eb5ca0e0-7049fa19.jpg,atelectasis,"[0.0026789217, 0.9865623, 0.001634778, 0.00850...","[-3.2017438, 2.7070682, -3.695651, -2.0468888,...","[0.27610707, 0.0, 0.089533396, 0.29763734, 0.1...",0.000967,0.996075,0.000822,0.0019,0.000236,1,Atelectasis,Atelectasis
1,11539363,b4b4f64f-ba5163c8-1b7ce58e-a0030af6-4d09dec1.jpg,atelectasis,"[0.08505852, 0.90619695, 0.004783125, 0.003748...","[-0.72294307, 1.6429741, -3.6011887, -3.844996...","[0.24303229, 0.0, 0.089104556, 0.1176696, 0.06...",0.000125,0.9998,5.8e-05,1.5e-05,2e-06,1,Atelectasis,Atelectasis
2,10833304,3b09e882-c5f0d93b-106c7c94-29b839e8-00e7a950.jpg,atelectasis,"[0.0023914576, 0.9973139, 0.00017710082, 0.000...","[-2.322076, 3.7110865, -4.9250154, -5.4447713,...","[0.23652035, 0.0, 0.052620452, 0.0682023, 0.13...",0.000533,0.999227,6.4e-05,0.000159,1.7e-05,1,Atelectasis,Atelectasis
3,19849119,3100385d-f9e0610a-e32f54df-3c17088b-4da3d491.jpg,atelectasis,"[0.00016185592, 0.9996964, 2.220072e-05, 5.294...","[-4.398639, 4.329861, -6.3852215, -5.5161505, ...","[0.24093658, 0.0, 0.047212478, 0.06716528, 0.2...",2.8e-05,0.999915,7e-06,1e-05,4e-05,1,Atelectasis,Atelectasis
4,11749991,c83dc36c-6c58d087-0ef18130-dd3e8cbf-faa5e609.jpg,atelectasis,"[0.0021358263, 0.9971975, 0.00016448315, 0.000...","[-2.8200288, 3.3260665, -5.3838296, -5.4086094...","[0.22600123, 0.0, 0.042545523, 0.027820904, 0....",1.7e-05,0.999928,8e-06,6e-06,4.1e-05,1,Atelectasis,Atelectasis


In [117]:
test[test.index_names != test.cinthya_index]

Unnamed: 0,patient_id,dicom_id,finding_names,img_classifications,img_embeddings,img_densefeatures,No_Findings,Atelectasis,Cardiomegaly,Lung_Opacity,Pleural_Effusion,index_max,cinthya_index,index_names
9,15107987,a9a3f13d-628513d1-8dce8bb7-f10ba272-d0024848.jpg,atelectasis,"[0.7133208, 0.2544471, 0.0027581349, 0.0274035...","[0.34616393, -0.68467426, -5.209213, -2.913094...","[0.24335423, 0.0, 0.14474876, 0.1739217, 0.169...",0.002892,0.995713,0.000102,0.001152,0.000141,0,Atelectasis,No_Findings
12,17751692,b997a8d2-1b96550c-a5805db6-7163be8a-02e71e78.jpg,atelectasis,"[0.5931087, 0.4054608, 0.00045778, 0.000736896...","[0.80672777, 0.42637432, -6.3600163, -5.883958...","[0.26021403, 0.0, 0.04485948, 0.027659187, 0.1...",0.001580,0.998362,0.000018,0.000030,0.000010,0,Atelectasis,No_Findings
20,10001217,a917c883-720a5bbf-02c84fc6-98ad00ac-c562ff80.jpg,atelectasis,"[0.9366648, 0.06055419, 0.00034124273, 0.00222...","[1.6857357, -1.0530509, -6.231751, -4.3584423,...","[0.25776124, 0.00062956417, 0.08351371, 0.0881...",0.012087,0.986644,0.000087,0.001076,0.000107,0,Atelectasis,No_Findings
22,10306584,6b8037ae-cea888e0-a1f3b44c-30e1d8c1-1a64b62f.jpg,atelectasis,"[0.7004442, 0.29855207, 0.00018421662, 0.00072...","[1.235228, 0.38245755, -7.0081296, -5.6436353,...","[0.2707543, 0.0, 0.04281694, 0.01327201, 0.053...",0.001684,0.998268,0.000019,0.000026,0.000003,0,Atelectasis,No_Findings
38,12931268,8ddcf226-8809a4e0-25cf46d6-40ff7743-b8bfbfe2.jpg,atelectasis,"[0.7521169, 0.24401994, 0.0036009937, 0.000202...","[0.97483337, -0.15080851, -4.3668485, -7.24702...","[0.23023964, 0.0, 0.04079595, 0.0, 0.22496933,...",0.001100,0.998306,0.000579,0.000009,0.000007,0,Atelectasis,No_Findings
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2038,18553288,04f2c220-57aec172-0dffcc31-fffc5812-9a601aaf.jpg,no_finding,"[0.36277196, 0.0070187505, 0.15907116, 0.46544...","[-0.31879258, -4.263982, -1.1432153, -0.069573...","[0.32054085, 0.0, 0.27061754, 0.2691546, 0.061...",0.712486,0.003768,0.139353,0.139977,0.004416,3,No_Findings,Lung_Opacity
2061,12312635,f6f8fa94-9861ec44-fca8ebcf-95bc1fe3-a69324dd.jpg,no_finding,"[0.6856424, 0.015133883, 0.16403697, 0.1332271...","[0.3199906, -3.4934294, -1.1102738, -1.3183099...","[0.33273733, 0.0, 0.17955476, 0.10640761, 0.05...",0.090674,0.004066,0.240808,0.662490,0.001961,0,Lung_Opacity,No_Findings
2064,15823696,959ad5ca-4eac34d7-09e64594-a5e8dedc-81255d80.jpg,no_finding,"[0.85356736, 0.00042644757, 0.14321464, 0.0006...","[1.0920707, -6.5096197, -0.6930092, -6.1644964...","[0.27198488, 0.0, 0.057585906, 0.00016376641, ...",0.250014,0.000511,0.747710,0.000484,0.001281,0,Cardiomegaly,No_Findings
2071,15712703,18b34624-0a74afa1-e9a0b0fa-6761191e-732f0e30.jpg,no_finding,"[0.16730496, 0.715085, 0.0010436748, 0.0858354...","[-0.89034325, 0.56223994, -5.9674134, -1.55772...","[0.27168033, 0.001014771, 0.06455391, 0.213756...",0.640111,0.141175,0.003093,0.081221,0.134399,1,No_Findings,Atelectasis


In [118]:
test[test.index_names == test.cinthya_index]

Unnamed: 0,patient_id,dicom_id,finding_names,img_classifications,img_embeddings,img_densefeatures,No_Findings,Atelectasis,Cardiomegaly,Lung_Opacity,Pleural_Effusion,index_max,cinthya_index,index_names
0,11388716,7d6ea06e-554c2ccb-4d9ecefe-eb5ca0e0-7049fa19.jpg,atelectasis,"[0.0026789217, 0.9865623, 0.001634778, 0.00850...","[-3.2017438, 2.7070682, -3.695651, -2.0468888,...","[0.27610707, 0.0, 0.089533396, 0.29763734, 0.1...",0.000967,0.996075,0.000822,0.001900,0.000236,1,Atelectasis,Atelectasis
1,11539363,b4b4f64f-ba5163c8-1b7ce58e-a0030af6-4d09dec1.jpg,atelectasis,"[0.08505852, 0.90619695, 0.004783125, 0.003748...","[-0.72294307, 1.6429741, -3.6011887, -3.844996...","[0.24303229, 0.0, 0.089104556, 0.1176696, 0.06...",0.000125,0.999800,0.000058,0.000015,0.000002,1,Atelectasis,Atelectasis
2,10833304,3b09e882-c5f0d93b-106c7c94-29b839e8-00e7a950.jpg,atelectasis,"[0.0023914576, 0.9973139, 0.00017710082, 0.000...","[-2.322076, 3.7110865, -4.9250154, -5.4447713,...","[0.23652035, 0.0, 0.052620452, 0.0682023, 0.13...",0.000533,0.999227,0.000064,0.000159,0.000017,1,Atelectasis,Atelectasis
3,19849119,3100385d-f9e0610a-e32f54df-3c17088b-4da3d491.jpg,atelectasis,"[0.00016185592, 0.9996964, 2.220072e-05, 5.294...","[-4.398639, 4.329861, -6.3852215, -5.5161505, ...","[0.24093658, 0.0, 0.047212478, 0.06716528, 0.2...",0.000028,0.999915,0.000007,0.000010,0.000040,1,Atelectasis,Atelectasis
4,11749991,c83dc36c-6c58d087-0ef18130-dd3e8cbf-faa5e609.jpg,atelectasis,"[0.0021358263, 0.9971975, 0.00016448315, 0.000...","[-2.8200288, 3.3260665, -5.3838296, -5.4086094...","[0.22600123, 0.0, 0.042545523, 0.027820904, 0....",0.000017,0.999928,0.000008,0.000006,0.000041,1,Atelectasis,Atelectasis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080,18251680,989c3279-38204566-e293687e-5fb57276-186bcf8c.jpg,no_finding,"[0.9994404, 0.00010507901, 1.3011231e-05, 0.00...","[3.9588208, -5.201418, -7.2903166, -3.7777133,...","[0.2615246, 0.0, 0.15933138, 0.15444286, 0.007...",0.999822,0.000027,0.000007,0.000141,0.000003,0,No_Findings,No_Findings
2081,16937222,69ef21d0-910dbded-aaef3c5b-4cda358b-8dee10a4.jpg,no_finding,"[0.7430532, 0.12572886, 0.044390652, 0.0841130...","[0.32454368, -1.4520963, -2.493195, -1.8540618...","[0.28614873, 0.0, 0.14239393, 0.11818148, 0.02...",0.627783,0.021455,0.144979,0.197374,0.008409,0,No_Findings,No_Findings
2082,12036096,ddf2c0fa-f7fb72b0-20309c3a-7c99f287-26e330cd.jpg,no_finding,"[0.994934, 0.0037281248, 0.0010363812, 0.00021...","[3.3385503, -2.2482204, -3.528391, -5.123619, ...","[0.280284, 0.0, 0.040529415, 0.02130972, 0.068...",0.887739,0.102148,0.008171,0.001313,0.000630,0,No_Findings,No_Findings
2083,17385589,3b1432b1-8159b96d-f3088b3f-a36e5416-b7a3463b.jpg,no_finding,"[0.9999316, 2.9949833e-05, 1.913851e-06, 3.558...","[4.942763, -5.4731555, -8.223561, -5.300732, -...","[0.31266996, 0.003200526, 0.05088086, 0.044937...",0.999967,0.000018,0.000002,0.000011,0.000003,0,No_Findings,No_Findings


# Generate Tabular Embeddings

In [5]:
#tabular embeddings
#need to build our own models for these
#for now, append tabular data similarly to demographic data (as raw values)

def get_tabular_embeddings(dt_patient, verbose=0):
    # Inputs:
    #   dt_patient -> Timebound mimic patient structure
    #   verbose -> Flag to print found keyword outputs (0,1)
    #
    # Outputs:
    #   base_embeddings -> Core base embeddings for the selected patient

    # %% EXAMPLE OF USE
    # base_embeddings = get_demographic_embeddings(dt_patient, verbose=1)

    # Retrieve dt_patient and get embeddings 
    demo_embeddings =  dt_patient[['temperature', 'heartrate','resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity']].values[0]
    if verbose >= 1:
        print(demo_embeddings)
    return demo_embeddings


In [9]:
tabular_train_df.head()

Unnamed: 0,patient_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,finding_names
0,11388716,98.8,106.0,22.0,96.0,93.0,67.0,0,2.0,atelectasis
1,11539363,99.1,80.0,16.0,97.0,162.0,67.0,0,3.0,atelectasis
2,10833304,97.0,98.0,14.0,100.0,159.0,88.0,2,2.0,atelectasis
3,19849119,98.6,92.0,20.0,98.0,127.0,70.0,0,2.0,atelectasis
4,11749991,100.6,110.0,16.0,97.0,166.0,100.0,8,2.0,atelectasis


# Concatenate Embeddings together and save output file

In [6]:
#HAIM's concatentation
def process_cxr_embeddings_patient_id(patient_id, dt_patient, df_init):
    
    # TABULAR EMBEDDINGS EXTRACTION
    tabular_embeddings = get_tabular_embeddings(dt_patient, verbose=0)
    gc.collect() #Clear memory
    
    # NOTES EMBEDDINGS
    aggregated_notes_embeddings, aggregated_notes_hidden_embeddings = get_biobert_embeddings(patient, note_type = 'radnotes')
    gc.collect() #Clear memory

    # CHEST XRAY VISION EMBEDDINGS EXTRACTION
    #aggregated_densefeature_embeddings, _, aggregated_prediction_embeddings, _, _ = get_chest_xray_embeddings(dt_patient, verbose=0)
    #gc.collect() #Clear memory
    
    # CHEST XRAY VISION SINGLE-IMAGE EMBEDDINGS EXTRACTION
    print('getting xray')
    img = df_imcxr[idx]
    densefeature_embeddings, prediction_embeddings = get_single_chest_xray_embeddings(img)
    gc.collect() #Clear memory

    # Create Dataframes filteed by ordered sample number for Fusion
    df_patient_ids_fusion = pd.DataFrame([patient_id],columns=['patient_id'])
    df_tabular_embeddings_fusion = pd.DataFrame(tabular_embeddings.reshape(1,-1), columns=['de_'+str(i) for i in range(demo_embeddings.shape[0])])
    
    df_vision_dense_embeddings_fusion = pd.DataFrame(densefeature_embeddings.reshape(1,-1), columns=['vd_'+str(i) for i in range(densefeature_embeddings.shape[0])])
    df_vision_predictions_embeddings_fusion = pd.DataFrame(prediction_embeddings.reshape(1,-1), columns=['vp_'+str(i) for i in range(prediction_embeddings.shape[0])])
    df_notes_embeddings_fusion = pd.DataFrame(aggregated_notes_embeddings.reshape(1,-1), columns=['n_rad_'+str(i) for i in range(aggregated_rad_embeddings.shape[0])])
    df_notes_hidden_embeddings_fusion = pd.DataFrame(aggregated_notes_hidden_embeddings.reshape(1,-1), columns=['n_rad_'+str(i) for i in range(aggregated_rad_embeddings.shape[0])])

    # Vision targets
    cxr_target_columns = ['split','Atelectasis','Cardiomegaly','Consolidation','Edema','Enlarged Cardiomediastinum','Fracture','Lung Lesion','Lung Opacity','No Finding','Pleural Effusion','Pleural Other','Pneumonia','Pneumothorax','Support Devices', 'PerformedProcedureStepDescription','ViewPosition']
    df_vision_targets_fusion = df_stay_cxr.loc[idx:idx][cxr_target_columns].reset_index(drop=True)

    # Embeddings FUSION
    df_fusion = df_patient_ids_fusion
    df_fusion = pd.concat([df_fusion, df_init], axis=1)
    df_fusion = pd.concat([df_fusion, df_tabular_embeddings_fusion], axis=1)
    df_fusion = pd.concat([df_fusion, df_vision_dense_embeddings_fusion], axis=1)
    df_fusion = pd.concat([df_fusion, df_vision_predictions_embeddings_fusion], axis=1)
    df_fusion = pd.concat([df_fusion, df_notes_embeddings_fusion], axis=1)
    
    #Add targets
    df_fusion = pd.concat([df_fusion, df_vision_targets_fusion], axis=1)
    gc.collect() #Clear memory
    
    return df_fusion


In [82]:
# test tabular "embeddings"

In [83]:
test_tabular_embedding = get_tabular_embeddings(train_df, verbose = 1)

[ 97.6 117.   18.   95.  128.   74.   10.    3. ]
