# Late Fusion Model Embeddings Generation

Purpose of this notebook: generate the per-modality embeddings that feed the Late Fusion Model.

Based on paper: Soenksen, L.R., Ma, Y., Zeng, C. et al. Integrated multimodal artificial intelligence framework for healthcare applications. npj Digit. Med. 5, 149 (2022). https://doi.org/10.1038/s41746-022-00689-4

Using our best tabular, notes, and imaging models, we extract each model's embeddings and classifications for use in additional (fusion) modelling.

Goal: train a fusion model on the embeddings alone, or on embeddings + classifications (the approach taken in Soenksen et al.).

In [1]:
#relevant code chunks to adapt to our purpose

In [2]:
#imports adapted from HAIM API.py
# Base
import os
import cv2
import math
import copy
import pickle
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import datetime as dt
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob
from shutil import copyfile

# Core AI/ML
#import tensorflow as tf
import torch
import torch.nn.functional as F
import torchvision, torchvision.transforms #causing problems
from torch.utils.data import Dataset, DataLoader
#from torchinfo import summary

# Scikit-learn
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# NLP
from torch import nn
from transformers import AutoTokenizer, AutoModel, logging
logging.set_verbosity_error()
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Computer Vision
import timm
from PIL import Image

# Warning handling
import warnings
warnings.filterwarnings("ignore")

# Data Mapping

In [3]:
# Finding name -> integer class index; the position in the list defines the index.
target_variables_dict = {
    finding: class_index
    for class_index, finding in enumerate(
        ['no_finding', 'atelectasis', 'cardiomegaly', 'lung_opacity', 'pleural_effusion']
    )
}

In [4]:
#run processing notebook to get data files
# NOTE(review): later cells use names that are not defined anywhere in this notebook
# (e.g. notes_train_df, img_train_df, tabular_train_df, combined_train_df); presumably
# this %run is what puts them into the kernel namespace -- confirm.
%run processing_data_ec2.ipynb

Loading data files... ../data/s3/fusion_data/test_set__chexpert__4_findings__single_label__unbalanced.json test
Loading data files... ../data/s3/fusion_data/train_set__chexpert__4_findings__single_label__balanced.json train
Loading data files... ../data/s3/fusion_data/validation_set__chexpert__4_findings__single_label__unbalanced.json validate
Total Cols
 Index(['patient_id', 'visit_id', 'study_id', 'temperature', 'heartrate',
       'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity',
       'positive_label_total', 'finding_names', 'radiology_note',
       'discharge_note', 'chief_complaint',
       'major_surgical_or_invasive_procedure', 'history_of_present_illness',
       'past_medical_history', 'family_history', 'atelectasis', 'cardiomegaly',
       'lung_opacity', 'pleural_effusion', 'dataset_type'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16877 entries, 0 to 16876
Data columns (total 40 columns):
 #   Column                                      No

# Notes

## Notes Model Setup

In [21]:
#original biobert models
# Both objects come from the same Hugging Face hub id (base weights, not the fine-tuned classifier).
# NOTE(review): biobert_model is not referenced by any later cell visible in this notebook.
_BIOBERT_HUB_ID = "emilyalsentzer/Bio_ClinicalBERT"
biobert_tokenizer = AutoTokenizer.from_pretrained(_BIOBERT_HUB_ID)
biobert_model = AutoModel.from_pretrained(_BIOBERT_HUB_ID)

In [22]:
# Select the Bio_ClinicalBERT checkpoint (not the Bio_Discharge_Summary_BERT variant);
# it is passed to AutoModel.from_pretrained as the encoder of the notes classifier.
MODEL_CHECKPOINT = 'emilyalsentzer/Bio_ClinicalBERT'

In [23]:
# Select parameters
NUM_CLASSES = 5                  # size of the classifier's output layer
MAX_SEQUENCE_LENGTH = 512        # tokenizer pads/truncates every note to this many tokens
# Training settings; NOTE(review): not referenced by any cell visible in this notebook.
NUM_EPOCHS, BATCH_SIZE = 15, 16
LEARNING_RATE = 5e-5

In [24]:
class MulticlassClassification(nn.Module):
    """Pre-trained transformer encoder followed by a small classification head.

    The encoder's pooled output is projected to ``hidden_size``, passed through
    ReLU and dropout, and finally mapped to ``num_classes`` logits.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
        hidden_size: Width of the intermediate layer (also returned by ``forward``).
        dropout_prob: Dropout probability applied after the ReLU.
        freeze_bert: When True, every encoder parameter gets ``requires_grad=False``.
    """

    def __init__(self, checkpoint, num_classes, hidden_size=201, dropout_prob=0.3, freeze_bert=True):
        super().__init__()

        self.model = AutoModel.from_pretrained(checkpoint)
        self.hidden_size = hidden_size
        self.dropout_prob = dropout_prob
        self.num_classes = num_classes
        self.freeze_bert = freeze_bert

        # Encoder weights are trainable only when freeze_bert is False.
        for param in self.model.parameters():
            param.requires_grad = not self.freeze_bert

        # Attribute names below are part of the saved state_dict keys; do not rename.
        self.pooler_layer = nn.Linear(self.model.config.hidden_size, hidden_size)  # encoder width -> hidden_size
        self.relu = nn.ReLU()  # non-linearity
        self.dropout = nn.Dropout(dropout_prob)  # regularization (inactive in eval mode)
        self.classification_layer = nn.Linear(hidden_size, num_classes)  # hidden_size -> class logits

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Run the encoder and the head.

        Returns:
            A ``(classification, hidden)`` tuple: the class logits and the
            ``hidden_size``-wide activations that feed the final linear layer.
        """
        outputs = self.model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)

        pooler_output = outputs.pooler_output
        hidden = self.pooler_layer(pooler_output)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        classification = self.classification_layer(hidden)  # logits for each class

        # The hidden activations are returned alongside the logits so they can be used as embeddings.
        return classification, hidden

    def unfreeze_bert_layers(self, n_layers):
        """Unfreezes the top n layers of the BERT model.

        ``n_layers == 0`` is a no-op. (Slicing with ``[-0:]`` would select every
        layer, so zero has to be handled explicitly.)

        Raises:
            ValueError: if ``n_layers`` is negative.
        """
        if n_layers < 0:
            raise ValueError(f"n_layers must be >= 0, got {n_layers}")
        if n_layers == 0:
            return

        for layer in self.model.encoder.layer[-n_layers:]:
            for param in layer.parameters():
                param.requires_grad = True

In [25]:
def load_model(model, checkpoint_path):
    """Load the state dict stored at ``checkpoint_path`` (mapped to CPU) into ``model`` and return ``model``."""
    cpu_device = torch.device('cpu')
    saved_state = torch.load(checkpoint_path, map_location=cpu_device)
    model.load_state_dict(saved_state)
    return model

In [26]:
# Folder names; NOTE(review): neither is referenced by the cells visible in this notebook.
CHECKPOINT_FOLDER, MODEL_NAME_FOLDER = "./checkpoints", "./model_findings"

# Saved state dict of the notes classifier.
CHECKPOINT_FILE = r"../data/s3/fusion_data/models/bio_clinical_bert__balanced__unfrozen_layers__best.pt"

# Build the architecture (encoder left unfrozen), then load the saved weights on the CPU.
esteban_model = MulticlassClassification(MODEL_CHECKPOINT, NUM_CLASSES, freeze_bert=False)

esteban_model.load_state_dict(torch.load(CHECKPOINT_FILE, map_location=torch.device('cpu')))

<All keys matched successfully>

In [27]:
# Switch the notes model to evaluation mode before extracting embeddings
# (the head contains an nn.Dropout layer, which is only active in training mode).
esteban_model.eval()

MulticlassClassification(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

## Notes Embeddings

In [28]:
#final embeddings layer (5 embeddings for 5 classes)
def get_biobert_embeddings(text):
    """Return the notes classifier's final-layer logits (no softmax) for one note.

    Args:
        text: Input text (str). It is padded/truncated to MAX_SEQUENCE_LENGTH tokens.

    Returns:
        numpy array of the classification-layer output; shape (1, num_classes)
        for a single input string.

    Note:
        Uses the module-level ``esteban_model`` and ``biobert_tokenizer``. The model
        is expected to be in eval mode already; otherwise its dropout layer is active.

    Example:
        embeddings = get_biobert_embeddings(text)
    """
    model = esteban_model

    tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
    # Inference only: skip autograd bookkeeping so no graph is built per note.
    with torch.no_grad():
        outputs, _ = model(**tokens_pt)

    return outputs.numpy()

In [29]:
#previous embeddings layer (size 201)
def get_biobert_hidden_embeddings(text):
    """Return the activations that feed the notes classifier's final layer for one note.

    Args:
        text: Input text (str). It is padded/truncated to MAX_SEQUENCE_LENGTH tokens.

    Returns:
        numpy array holding the second element returned by the model's forward
        pass (the ``hidden_size``-wide layer); shape (1, hidden_size) for a
        single input string.

    Note:
        Uses the module-level ``esteban_model`` and ``biobert_tokenizer``. The model
        is expected to be in eval mode already; otherwise its dropout layer is active.

    Example:
        hidden_embeddings = get_biobert_hidden_embeddings(text)
    """
    model = esteban_model

    tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
    # Inference only: skip autograd bookkeeping so no graph is built per note.
    with torch.no_grad():
        _, hidden_outputs = model(**tokens_pt)

    return hidden_outputs.numpy()

In [30]:
#final embeddings layer converted to classification softmax probabilities
def get_biobert_classifications(text):
    """Return the notes classifier's softmax class probabilities for one note.

    Args:
        text: Input text (str). It is padded/truncated to MAX_SEQUENCE_LENGTH tokens.

    Returns:
        numpy array of softmax probabilities computed over dim 1 of the logits;
        shape (1, num_classes) for a single input string.

    Note:
        Uses the module-level ``esteban_model`` and ``biobert_tokenizer``. The model
        is expected to be in eval mode already; otherwise its dropout layer is active.

    Example:
        classifications = get_biobert_classifications(text)
    """
    model = esteban_model

    tokens_pt = biobert_tokenizer(text, return_tensors='pt',  add_special_tokens=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, truncation=True) # (input_ids, attention_mask, token_type_ids)
    # Inference only: skip autograd bookkeeping so no graph is built per note.
    with torch.no_grad():
        outputs, _ = model(**tokens_pt)
        prob = torch.softmax(outputs, dim=1) #translate to softmax probabilities

    return prob.numpy()

In [32]:
#run on EC2 instance
# For every split, run the three extraction helpers over the history_of_present_illness text.
# NOTE(review): each helper performs its own forward pass, so every note is encoded three times.
for _notes_df in (notes_train_df, notes_val_df, notes_test_df):
    _hpi_text = _notes_df.history_of_present_illness
    _notes_df['notes_classifications'] = _hpi_text.apply(get_biobert_classifications)
    _notes_df['notes_embeddings'] = _hpi_text.apply(get_biobert_embeddings)
    _notes_df['notes_hidden_embeddings'] = _hpi_text.apply(get_biobert_hidden_embeddings)

In [33]:
# Print (rows, columns) of each notes split, in train / validation / test order.
for _notes_df in (notes_train_df, notes_val_df, notes_test_df):
    print(_notes_df.shape)

(2086, 9)
(1924, 9)
(1920, 9)


In [34]:
#save embeddings and classifications file
#notes_train_df.to_csv("../data/s3/fusion_data/notes_train_embeddings.csv")
#notes_val_df.to_csv("../data/s3/fusion_data/notes_val_embeddings.csv")
#notes_test_df.to_csv("../data/s3/fusion_data/notes_test_embeddings.csv")

In [36]:
# Persist the notes splits (with their array-valued embedding columns) as pickle files.
for _notes_df, _pickle_path in (
    (notes_train_df, "../data/s3/fusion_data/notes_train_embeddings.pkl"),
    (notes_val_df, "../data/s3/fusion_data/notes_val_embeddings.pkl"),
    (notes_test_df, "../data/s3/fusion_data/notes_test_embeddings.pkl"),
):
    _notes_df.to_pickle(_pickle_path)

# Image Embeddings

In [47]:
#create Cinthya's model
# torch.load returns the object stored in the .pth file, mapped onto the CPU; the next cell
# displays it as an EfficientNet module (i.e. a whole model, not just a state dict).
# NOTE(review): loading a pickled model object can execute arbitrary code -- only load trusted files.
# TODO confirm whether the installed torch version requires weights_only=False for this file.
cinthya_model = torch.load(r"../data/s3/fusion_data/models/EfficientNet-B3-Unb_20240331.pth", map_location=torch.device('cpu'))

In [48]:
# Display the loaded image model's architecture.
cinthya_model

EfficientNet(
  (conv_stem): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNormAct2d(
    40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
        (bn1): BatchNormAct2d(
          40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
          (gate): Sigmoid()
        )
        (conv_pw): Conv2d(40, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNormAct2d(
    

In [49]:
class MedicalImageDataset(Dataset):
    """Dataset that yields ``(img_id, image)`` for each row of an annotation table.

    Args:
        data: Table indexed positionally with ``.iloc``; the image identifier is
            read from the column at position 1.
        root_dir: Directory that is joined with the identifier to form the file path.
        transform: Optional callable applied to the opened PIL image.
    """

    def __init__(self, data, root_dir, transform=None):
        self.annotations, self.root_dir, self.transform = data, root_dir, transform

    def __len__(self):
        """Number of annotation rows."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(img_id, image)``, or ``None`` when the image file does not exist."""
        # Identifier lives in the second column (position 1) of the annotation table.
        img_id = self.annotations.iloc[index, 1]
        img_path = os.path.join(self.root_dir, str(img_id))

        try:
            opened = Image.open(img_path)
        except FileNotFoundError:
            # Missing files are reported and signalled with None; callers must filter them out.
            print(f"File {img_path} not used in this dataset.")
            return None

        if not self.transform:
            return img_id, opened
        return img_id, self.transform(opened)

In [50]:
#based off of cinthya's definition of pulling data
def get_chest_xray_embeddings(model, data, root_dir):
    """Run the image model over every available image listed in ``data``.

    Args:
        model: Image classifier exposing ``__call__`` and ``forward_features``.
        data: Annotation table handed to ``MedicalImageDataset``.
        root_dir: Directory containing the image files.

    Returns:
        Four parallel lists ``(ids, probabilities, logits, dense_features)`` with one
        entry per image that could be opened. ``dense_features`` are the ReLU'd,
        globally average-pooled outputs of ``model.forward_features`` and keep their
        trailing 1x1 spatial dimensions.

    Side effects:
        Puts ``model`` in eval mode and moves it to the GPU when CUDA is available.
    """
    #convert img files into DataLoader Cinthya's model is expecting
    to_normalized_tensor = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image_dataset = MedicalImageDataset(data=data, root_dir=root_dir, transform=to_normalized_tensor)

    # Every sample is loaded up front; rows whose file is missing come back as None and are dropped here.
    available_samples = [sample for sample in image_dataset if sample is not None]
    batches = DataLoader(available_samples, batch_size=32, shuffle=False)

    model.eval()
    if torch.cuda.is_available():
        model.cuda()

    all_ids, all_probs, all_embeddings, all_densefeatures = [], [], [], []
    with torch.no_grad():
        for img_ids, images in batches:
            if torch.cuda.is_available():
                images = images.cuda()

            #for final embeddings and probabilities
            logits = model(images)
            all_probs.extend(torch.softmax(logits, dim=1).cpu().numpy())
            all_embeddings.extend(logits.cpu().numpy())
            all_ids.extend(list(img_ids))  # Make sure img_ids is iterable

            #for hidden embeddings (one layer up)
            pooled = F.adaptive_avg_pool2d(
                F.relu(model.forward_features(images), inplace=True),
                (1, 1),
            )
            all_densefeatures.extend(pooled.cpu().numpy())

    return all_ids, all_probs, all_embeddings, all_densefeatures

In [51]:
# Extract ids, probabilities, logits and dense features for each image split.
train_ids, train_classifications, train_embeddings, train_densefeatures = get_chest_xray_embeddings(
    model=cinthya_model,
    data=img_train_df,
    root_dir=r"../data/s3/fusion_data/train_4_bal_s",
)
val_ids, val_classifications, val_embeddings, val_densefeatures = get_chest_xray_embeddings(
    model=cinthya_model,
    data=img_val_df,
    root_dir=r"../data/s3/fusion_data/val_4_unb_s",
)
test_ids, test_classifications, test_embeddings, test_densefeatures = get_chest_xray_embeddings(
    model=cinthya_model,
    data=img_test_df,
    root_dir=r"../data/s3/fusion_data/test_4_unb_s",
)

In [52]:
#append embeddings to training, validation, and test datasets
def _image_embedding_frame(ids, classifications, embeddings, densefeatures):
    """Build a frame keyed by dicom_id with one array-valued column per model output."""
    frame = pd.DataFrame()
    frame['dicom_id'] = ids
    frame['img_classifications'] = classifications
    frame['img_embeddings'] = embeddings
    frame['img_densefeatures'] = densefeatures
    # Flatten each dense-feature array to 1-D.
    frame['img_densefeatures'] = frame['img_densefeatures'].apply(np.ravel)
    return frame

# NOTE: this cell rebinds img_*_df, so running it twice merges the embedding columns a second time.

#train
img_train_embeddings = _image_embedding_frame(train_ids, train_classifications, train_embeddings, train_densefeatures)
img_train_df = img_train_df.merge(img_train_embeddings, on='dicom_id', how='left')
print(f"Training image embeddings dataset is size {img_train_df.shape}")

#validation
img_val_embeddings = _image_embedding_frame(val_ids, val_classifications, val_embeddings, val_densefeatures)
img_val_df = img_val_df.merge(img_val_embeddings, on='dicom_id', how='left')
print(f"Validation image embeddings dataset is size {img_val_df.shape}")

#test
img_test_embeddings = _image_embedding_frame(test_ids, test_classifications, test_embeddings, test_densefeatures)
img_test_df = img_test_df.merge(img_test_embeddings, on='dicom_id', how='left')
print(f"Test image embeddings dataset is size {img_test_df.shape}")


Training image embeddings dataset is size (2086, 6)
Validation image embeddings dataset is size (1924, 6)
Test image embeddings dataset is size (1924, 6)


In [54]:
#save img_embedding files on S3
#img_train_df.to_csv("../data/s3/fusion_data/img_train_embeddings.csv")
#img_val_df.to_csv("../data/s3/fusion_data/img_val_embeddings.csv")
#img_test_df.to_csv("../data/s3/fusion_data/img_test_embeddings.csv")

In [64]:
#save pickle files
for _img_df, _pickle_path in (
    (img_train_df, "../data/s3/fusion_data/img_train_embeddings.pkl"),
    (img_val_df, "../data/s3/fusion_data/img_val_embeddings.pkl"),
    (img_test_df, "../data/s3/fusion_data/img_test_embeddings.pkl"),
):
    _img_df.to_pickle(_pickle_path)

# Generate Tabular Embeddings

In [5]:
#unpack Adam's tabular model
# load
# The file is opened in a context manager so the handle is closed once the model is read.
# NOTE(review): pickle.load can execute arbitrary code -- only unpickle files from a trusted source.
with open(r"../data/s3/fusion_data/models/xgb_tabular_model_4-1-24.pkl", "rb") as _model_file:
    adam_model = pickle.load(_model_file)
adam_model

In [12]:
# Preview the first rows of the tabular training split.
tabular_train_df.head()

Unnamed: 0,patient_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,finding_names,target
0,11388716,98.8,106.0,22.0,96.0,93.0,67.0,0,2.0,atelectasis,1
1,11539363,99.1,80.0,16.0,97.0,162.0,67.0,0,3.0,atelectasis,1
2,10833304,97.0,98.0,14.0,100.0,159.0,88.0,2,2.0,atelectasis,1
3,19849119,98.6,92.0,20.0,98.0,127.0,70.0,0,2.0,atelectasis,1
4,11749991,100.6,110.0,16.0,97.0,166.0,100.0,8,2.0,atelectasis,1


In [10]:
#rename findings to indexed conditions
# NOTE(review): target_variables_dict spells the first label 'no_finding'; because unmatched
# values fall back to 0 below, either spelling ends up as class 0.
_FINDING_LABELS = ('no_findings', 'atelectasis', 'cardiomegaly', 'lung_opacity', 'pleural_effusion')
choices = [0, 1, 2, 3, 4]

def _encode_findings(frame):
    """Return the integer class of each row's finding_names; anything unmatched becomes 0."""
    finding = frame['finding_names']
    matches = [(finding == label) for label in _FINDING_LABELS]
    return np.select(matches, choices, default=0)

# train, val, test
for _tabular_df in (tabular_train_df, tabular_val_df, tabular_test_df):
    _tabular_df['target'] = _encode_findings(_tabular_df)

for _tabular_df in (tabular_train_df, tabular_val_df, tabular_test_df):
    print(_tabular_df.shape)

(2086, 11)
(1924, 11)
(1920, 11)


In [16]:
# Define columns for X and y
X_columns = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity']
y_columns = ['target']  # as per team agreement

# Index every split by patient_id. The guard keeps the cell re-runnable: once patient_id
# has been moved into the index, a second set_index('patient_id') would raise KeyError.
for _tabular_df in (tabular_train_df, tabular_val_df, tabular_test_df):
    if _tabular_df.index.name != 'patient_id':
        _tabular_df.set_index('patient_id', inplace=True)

# Feature / label frames for each split (y_* stay one-column DataFrames).
# Split for 4_bal_s
X_train = tabular_train_df[X_columns]
y_train = tabular_train_df[y_columns]
X_val   = tabular_val_df[X_columns]     # intentionally the same as X_val_6_unb_m
y_val   = tabular_val_df[y_columns]     # intentionally the same as unb
X_test  = tabular_test_df[X_columns]    # intentionally the same as unb
y_test  = tabular_test_df[y_columns]

In [17]:
#Adam's data preprocessing
# Columns grouped by how they are preprocessed
ratio_cols   = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']
ordinal_cols = ['pain', 'acuity']

# Building blocks
# NOTE: the name `scale` rebinds the `scale` function imported from sklearn.preprocessing.
scale    = RobustScaler(with_centering=False)
encode   = OrdinalEncoder()
impRatio = SimpleImputer(strategy='mean')                      # missing ratio values -> column mean
impOrd   = SimpleImputer(strategy='constant', fill_value=-1)   # missing ordinal values -> -1

# Per-group pipelines: impute, (encode), then scale
ord_pp_steps = Pipeline([
    ('missing', impOrd),
    ('Ordinal', encode),
    ('Scale', scale),
])
ratio_pp_steps = Pipeline([
    ('mean', impRatio),
    ('Scale', scale),
])

# create the preprocessor stage of final pipeline
t = [
    ("ordinal", ord_pp_steps, ordinal_cols),
    ('ratio', ratio_pp_steps, ratio_cols),
]
preprocessor = ColumnTransformer(transformers=t)

In [18]:
# Update pipeline
# Chain the preprocessing ColumnTransformer with the loaded tabular model.
# NOTE(review): the pipeline holds adam_model itself (it is passed in directly, not copied),
# so calling u_pipe.fit(...) re-trains the loaded model -- confirm that is intended.
u_pipe = Pipeline([('preprocess',preprocessor), ('estimator', adam_model)])

In [30]:
#tabular embeddings
#because xgboost is a collection of trees, will just take classifications from xgboost for embeddings
#Adam's code for pulling probabilities

def get_tabular_embeddings(X_data, y_data, model):
    """Return the tabular model's class probabilities for ``X_data``.

    The pre-trained ``model`` is used as-is and is never re-fitted. Fitting the
    whole pipeline on each split would train the model on validation/test labels
    and then score the same rows, leaking labels into the fusion features.

    The preprocessing step of ``u_pipe`` is fitted on the first data it sees and
    reused afterwards, so call this with the training split first.
    NOTE(review): assumes ``model`` was trained on features produced by this same
    preprocessing -- confirm against the training notebook.

    Args:
        X_data: Feature frame; its index is carried into the result.
        y_data: Labels. Unused; kept for backward compatibility.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        DataFrame with the former index as a column plus one
        ``tabular_classifications{i}`` column per probability column.
    """
    from sklearn.exceptions import NotFittedError
    from sklearn.utils.validation import check_is_fitted

    preprocess = u_pipe.named_steps['preprocess']
    try:
        check_is_fitted(preprocess)
    except NotFittedError:
        # First call (expected to be the training split): learn imputation/scaling statistics.
        preprocess.fit(X_data)

    features = preprocess.transform(X_data)
    probs = model.predict_proba(features)

    # Name columns from the probability matrix itself so the count always matches,
    # even when a split does not contain every class.
    column_names = [f'tabular_classifications{i}' for i in range(probs.shape[1])]
    probs_df = pd.DataFrame(probs, columns=column_names, index=X_data.index)
    probs_df = probs_df.reset_index()

    return probs_df

In [31]:
#get classifications
tabular_train_probs = get_tabular_embeddings(X_data=X_train, y_data=y_train, model=adam_model)
tabular_val_probs = get_tabular_embeddings(X_data=X_val, y_data=y_val, model=adam_model)
tabular_test_probs = get_tabular_embeddings(X_data=X_test, y_data=y_test, model=adam_model)

In [32]:
#merge with tabular_dfs
# Left-join each split's probability columns onto it by patient_id.
_join_on_patient = dict(on='patient_id', how='left')
tabular_train_df = tabular_train_df.merge(tabular_train_probs, **_join_on_patient)
tabular_val_df = tabular_val_df.merge(tabular_val_probs, **_join_on_patient)
tabular_test_df = tabular_test_df.merge(tabular_test_probs, **_join_on_patient)

In [36]:
#save pickle files
for _tabular_df, _pickle_path in (
    (tabular_train_df, "../data/s3/fusion_data/tab_train_embeddings.pkl"),
    (tabular_val_df, "../data/s3/fusion_data/tab_val_embeddings.pkl"),
    (tabular_test_df, "../data/s3/fusion_data/tab_test_embeddings.pkl"),
):
    _tabular_df.to_pickle(_pickle_path)

# Concatenate Embeddings together and save output file

We have pulled:

- Softmax classifications for notes and images from individual models
- Final embeddings (without the softmax function applied) for individual models
- The embedding layers beneath the final embeddings for individual models

To try for fusion:
- classification alone (Adam is tackling this)
- final embedding layers alone, i.e. the pre-softmax outputs (should perform similarly to classification alone)
- hidden embedding layers alone (focus 1)
- final embedding + hidden embedding layers (focus 2)

In [5]:
#download from pickle files
# Each group is read in train / validation / test order.
notes_train_embeddings, notes_val_embeddings, notes_test_embeddings = (
    pd.read_pickle(_pickle_path) for _pickle_path in (
        "../data/s3/fusion_data/notes_train_embeddings.pkl",
        "../data/s3/fusion_data/notes_val_embeddings.pkl",
        "../data/s3/fusion_data/notes_test_embeddings.pkl",
    )
)

img_train_embeddings, img_val_embeddings, img_test_embeddings = (
    pd.read_pickle(_pickle_path) for _pickle_path in (
        "../data/s3/fusion_data/img_train_embeddings.pkl",
        "../data/s3/fusion_data/img_val_embeddings.pkl",
        "../data/s3/fusion_data/img_test_embeddings.pkl",
    )
)

tab_train_embeddings, tab_val_embeddings, tab_test_embeddings = (
    pd.read_pickle(_pickle_path) for _pickle_path in (
        "../data/s3/fusion_data/tab_train_embeddings.pkl",
        "../data/s3/fusion_data/tab_val_embeddings.pkl",
        "../data/s3/fusion_data/tab_test_embeddings.pkl",
    )
)

In [6]:
# Inspect which columns the tabular embeddings file provides.
tab_train_embeddings.columns

Index(['patient_id', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp',
       'dbp', 'pain', 'acuity', 'finding_names', 'target',
       'tabular_classifications0', 'tabular_classifications1',
       'tabular_classifications2', 'tabular_classifications3',
       'tabular_classifications4'],
      dtype='object')

In [7]:
#merge everything into one file
_NOTES_COLUMNS = ['patient_id', 'notes_classifications', 'notes_embeddings', 'notes_hidden_embeddings']
_IMG_COLUMNS = ['patient_id', 'img_classifications', 'img_embeddings', 'img_densefeatures']
_TAB_COLUMNS = ['patient_id',
                'tabular_classifications0', 'tabular_classifications1',
                'tabular_classifications2', 'tabular_classifications3',
                'tabular_classifications4']

def _attach_modalities(base_df, notes_df, img_df, tab_df):
    """Left-join the notes, image and tabular outputs onto ``base_df`` by patient_id, in that order."""
    merged = base_df
    for modality_df, columns in ((notes_df, _NOTES_COLUMNS), (img_df, _IMG_COLUMNS), (tab_df, _TAB_COLUMNS)):
        merged = pd.merge(merged, modality_df[columns], on='patient_id', how='left')
    return merged

combined_train_df = _attach_modalities(combined_train_df, notes_train_embeddings, img_train_embeddings, tab_train_embeddings)

combined_val_df = _attach_modalities(combined_val_df, notes_val_embeddings, img_val_embeddings, tab_val_embeddings)

combined_test_df = _attach_modalities(combined_test_df, notes_test_embeddings, img_test_embeddings, tab_test_embeddings)

In [8]:
# Print (rows, columns) of each combined split, in train / validation / test order.
for _combined_df in (combined_train_df, combined_val_df, combined_test_df):
    print(_combined_df.shape)

(2086, 38)
(1924, 38)
(1920, 38)


In [9]:
#save as pickle files
for _combined_df, _pickle_path in (
    (combined_train_df, "../data/s3/fusion_data/combined_train_df_all_embeddings.pkl"),
    (combined_val_df, "../data/s3/fusion_data/combined_val_df_all_embeddings.pkl"),
    (combined_test_df, "../data/s3/fusion_data/combined_test_df_all_embeddings.pkl"),
):
    _combined_df.to_pickle(_pickle_path)

In [10]:
#unravel classifications for steven
#function to unravel (de-nest) and split apart any pandas value containing a np.array
def reshape_data(data):
    """Expand every column of ``data`` into one scalar column per array element.

    Each cell is flattened with ``np.ravel``; column ``col`` becomes ``col0``,
    ``col1``, ... Scalar columns therefore come back as a single ``col0`` column.
    The number of output columns is taken from the first row.

    Args:
        data: DataFrame whose cells are scalars or numpy arrays. It is not modified.

    Returns:
        New DataFrame sharing ``data``'s index. Empty input yields an empty frame.
    """
    expanded_parts = []

    for col in data.columns:
        # Flatten into a local list instead of overwriting the caller's column.
        flat_rows = [np.ravel(cell) for cell in data[col]]
        if not flat_rows:
            continue

        # Positional access: the index is not guaranteed to contain the label 0.
        width = flat_rows[0].shape[0]
        col_names = [col + str(i) for i in range(width)]

        part = pd.DataFrame(flat_rows, index=data.index)
        part.columns = col_names
        expanded_parts.append(part)

    if not expanded_parts:
        return pd.DataFrame(index=data.index)

    return pd.concat(expanded_parts, axis=1)

In [18]:
# pull out just classifications (plus the patient_id key)
_CLASSIFICATION_COLUMNS = ['patient_id', 'notes_classifications', 'img_classifications']
train_classes_img_notes = combined_train_df[_CLASSIFICATION_COLUMNS]
val_classes_img_notes = combined_val_df[_CLASSIFICATION_COLUMNS]
test_classes_img_notes = combined_test_df[_CLASSIFICATION_COLUMNS]

In [19]:
#make each classification its own column
train_classes_img_notes, val_classes_img_notes, test_classes_img_notes = (
    reshape_data(_classes_df)
    for _classes_df in (train_classes_img_notes, val_classes_img_notes, test_classes_img_notes)
)

In [21]:
#rename patient_id0 to patient_id
_RESTORE_KEY_NAME = {"patient_id0": "patient_id"}
train_classes_img_notes = train_classes_img_notes.rename(columns=_RESTORE_KEY_NAME)
val_classes_img_notes = val_classes_img_notes.rename(columns=_RESTORE_KEY_NAME)
test_classes_img_notes = test_classes_img_notes.rename(columns=_RESTORE_KEY_NAME)

In [22]:
#join with combine dfs
# Left-join the per-class columns back onto each combined split by patient_id.
combined_train_df = combined_train_df.merge(train_classes_img_notes, on='patient_id', how='left')
combined_val_df = combined_val_df.merge(val_classes_img_notes, on='patient_id', how='left')
combined_test_df = combined_test_df.merge(test_classes_img_notes, on='patient_id', how='left')

In [24]:
#save as pickle files for steven
for _combined_df, _pickle_path in (
    (combined_train_df, "../data/s3/fusion_data/combined_train_df_all_embeddings_classifications.pkl"),
    (combined_val_df, "../data/s3/fusion_data/combined_val_df_all_embeddings_classifications.pkl"),
    (combined_test_df, "../data/s3/fusion_data/combined_test_df_all_embeddings_classifications.pkl"),
):
    _combined_df.to_pickle(_pickle_path)