# Processing data for fusion models
Purpose: to access processed data and combine for tabular, notes, and images

### Dataset selection: 4 findings, single label, balanced (see note)

Note: balanced for now because the dataset is smaller and the pipeline is still being tested; we may switch to unbalanced and, for the imaging data, perform augmentation to balance the classes.

TODOs:
- Save final dataframes for people to reference patient IDs being used (TONIGHT)
- Convert local access to EC2/S3 access so this notebook can be run by anyone (and can be accessed without problems in a pipeline) (TOMORROW)
- Change data types to 4 findings, single label, unbalanced? and just balance the imaging data alone? (RUN BOTH TONIGHT)
- For now leave balanced but test unbalanced vs balanced in the fusion context
- potentially use this notebook to generate the image generator (dig further into the image embeddings code)

This will be for early concatenation and early fusion models

Currently accessing locally --> convert to EC2 and S3 access ASAP!

In [1]:
import pandas as pd
import numpy as np
import json
import os
from glob import glob
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain, MultiOutputClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt

# Display every column when a DataFrame is rendered (None removes the column limit).
pd.set_option('display.max_columns', None)

# Seed value; defined here but not referenced by any cell visible in this notebook.
rand_state = 42

# Notes and Tabular Data

In [2]:
#notes data - currently pulling locally from google drive (need to mount to S3 and pull from there to use EC2 in the future for larger models!)
# NOTE(review): machine-specific absolute path; this variable is not referenced by the
# cells visible in this notebook (the loader below uses its own directory_path).
data_dir_processed = r"C:/Users/Carolyn/Documents/MIDS/210 Capstone/train_validation_test__datasets"

In [3]:
# Define the names of the JSON files
#train_df_filename = "/train_set__chexpert__4_findings__single_label__unbalanced.json"
#test_df_filename = "/test_set__chexpert__4_findings__single_label__unbalanced.json"
#valid_df_filename = "/validation_set__chexpert__4_findings__single_label__unbalanced.json"

#train_balanced_df_filename = "/train_set__chexpert__4_findings__single_label__balanced.json"

def load_and_combine_json_files(directory_path, search_pattern):
    """Load every JSON file matching a glob pattern and split it by dataset type.

    The dataset type of each file is inferred from its file name only (not
    from the directories above it). Names containing ``'train'`` are training
    data, ``'test'`` test data and ``'val'`` validation data, checked in that
    order; anything else is labelled ``'unknown'`` and appears only in the
    combined frame. If several files map to the same type, the last one (in
    sorted path order) is the one returned for that split.

    Parameters
    ----------
    directory_path : str
        Directory prefix. It is concatenated directly with ``search_pattern``,
        so it must end with a path separator.
    search_pattern : str
        Glob pattern selecting the files, e.g. ``'*.json'``.

    Returns
    -------
    tuple
        ``(train_df, test_df, val_df, combined_df)``. A split with no matching
        file is returned as ``None``. ``combined_df`` stacks every loaded file
        (empty DataFrame when nothing matched). Every frame carries a
        ``dataset_type`` column with the inferred type.
    """
    # Sorted so the row order of combined_df does not depend on the order in
    # which the filesystem happens to list the files.
    matched_files = sorted(glob(directory_path + search_pattern))

    # Pre-seed every split with None so a missing file yields None instead of
    # an UnboundLocalError at the return statement.
    split_frames = {'train': None, 'test': None, 'validate': None}
    loaded_frames = []

    for file in matched_files:
        # Classify on the file name only: a parent directory such as
        # ".../train_validation_test__datasets/" would otherwise make every
        # file look like training data.
        file_name = os.path.basename(file)
        if 'train' in file_name:
            dataset_type = 'train'
        elif 'test' in file_name:
            dataset_type = 'test'
        elif 'val' in file_name:
            dataset_type = 'validate'
        else:
            dataset_type = 'unknown'

        print('Loading data files...', file, dataset_type)
        with open(file) as f:
            data = json.load(f)
        df = pd.DataFrame(data)

        # Flag the dataset type. The same object is stored in split_frames, so
        # the per-split frames carry this column as well.
        df['dataset_type'] = dataset_type

        if dataset_type in split_frames:
            split_frames[dataset_type] = df
        else:
            print("Unknown dataset type!")

        loaded_frames.append(df)

    # Concatenate once at the end rather than growing a frame inside the loop.
    if loaded_frames:
        combined_df = pd.concat(loaded_frames, ignore_index=True)
    else:
        combined_df = pd.DataFrame()

    return split_frames['train'], split_frames['test'], split_frames['validate'], combined_df

# Define parameters for the function to combine JSON files
# directory_path keeps its trailing "/" because the loader concatenates it
# directly with the glob pattern.
directory_path = 'C:/Users/Carolyn/Documents/MIDS/210 Capstone/fusion_data/'
# Despite its name, pop_files holds the glob pattern, not a list of files.
pop_files = '*.json'  # This pattern can be changed based on the files you're looking for

# Load and combine the JSON files
# train_df / test_df / val_df are the per-split frames; combined_df stacks all loaded files.
train_df, test_df, val_df, combined_df = load_and_combine_json_files(directory_path, pop_files)

# List the columns available in the training split.
print('Total Cols\n',train_df.columns)

Loading data files... C:/Users/Carolyn/Documents/MIDS/210 Capstone/fusion_data\test_set__chexpert__4_findings__single_label__unbalanced.json test
Loading data files... C:/Users/Carolyn/Documents/MIDS/210 Capstone/fusion_data\train_set__chexpert__4_findings__single_label__balanced.json train
Loading data files... C:/Users/Carolyn/Documents/MIDS/210 Capstone/fusion_data\validation_set__chexpert__4_findings__single_label__unbalanced.json validate
Total Cols
 Index(['patient_id', 'visit_id', 'study_id', 'temperature', 'heartrate',
       'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity',
       'positive_label_total', 'finding_names', 'radiology_note',
       'discharge_note', 'chief_complaint',
       'major_surgical_or_invasive_procedure', 'history_of_present_illness',
       'past_medical_history', 'family_history', 'atelectasis', 'cardiomegaly',
       'lung_opacity', 'pleural_effusion', 'dataset_type'],
      dtype='object')


In [4]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,positive_label_total,finding_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history,atelectasis,cardiomegaly,lung_opacity,pleural_effusion,dataset_type
992,11388716.0,23706855,51255419.0,98.8,106.0,22.0,96.0,93.0,67.0,0,2.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n \...,SOB,,"___ with history of DVT on warfarin, silent MI...",ONCOLOGIC HISTORY: \n- Early ___: presented to...,Mother had an MI in her ___. Father had an MI...,1.0,0.0,0.0,0.0,train
1099,11539363.0,23558226,55922046.0,99.1,80.0,16.0,97.0,162.0,67.0,0,3.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\...,Chest pain,___ Cardiac Catheterization by Dr. ___,"Mr ___ is a ___ yo M w/IDDM, HTN, HLD, has not...",- IDDM \n - HTN \n - HLD,Noncontributory,1.0,0.0,0.0,0.0,train
569,10833304.0,26184887,59599449.0,97.0,98.0,14.0,100.0,159.0,88.0,2,2.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n \nAdmi...,"Lip swelling, shortness of breath",,"This is a ___ F PMhx hypothyroidism, depressio...","- HTN\n- Hypothyroidism\n- Urticaria, chronic ...","Estranged from family, no known history of mal...",1.0,0.0,0.0,0.0,train
7640,19849119.0,27397159,58419216.0,98.6,92.0,20.0,98.0,127.0,70.0,0,2.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___...,Confusion,,"___ PMHx Hep C Cirrhosis, HTN, DM II presents ...","PMH: DMII, Cirrhosis, grade 1 esophageal varic...",Brothers x3 both with MI's. Father ___ arthrit...,1.0,0.0,0.0,0.0,train
1255,11749991.0,20503367,55801381.0,100.6,110.0,16.0,97.0,166.0,100.0,8,2.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: __...,"N/V, Abdominal Pain",,"___ with hx of COPD, CHF, esophageal ulcer, an...",- HTN\n- CVA\n- CHF\n- Restles leg\n- Fibromya...,Not obtained,1.0,0.0,0.0,0.0,train


In [5]:
def return_column_values_sum_and_percentage(dataframe_input, column_input):
    """Summarise one numeric column as formatted value, share and running share.

    Parameters
    ----------
    dataframe_input : pandas.DataFrame
        Frame holding the values to summarise; it is not modified.
    column_input : str
        Name of the column whose values are summarised.

    Returns
    -------
    pandas.DataFrame
        Same index as ``dataframe_input`` with three string columns:
        ``sum`` (value with thousands separators), ``percentage`` (value as a
        share of the column total, one decimal, ``%`` suffix) and
        ``cumsum_percentage`` (running total of that share, same format).
    """
    def _as_percent_text(fractions):
        # 0.1667 -> "16.7%"
        return fractions.mul(100).round(1).astype(str) + '%'

    values = dataframe_input[column_input]
    shares = values / values.sum()
    running_shares = shares.cumsum()

    return pd.DataFrame({
        'sum': values.apply(lambda value: "{:,}".format(value)),
        'percentage': _as_percent_text(shares),
        'cumsum_percentage': _as_percent_text(running_shares),
    })

In [6]:
# Count training studies per finding (largest class first) and show each class's share.
return_column_values_sum_and_percentage(
    train_df
    .groupby("finding_names")
    .agg({"study_id": "count"})
    .sort_values("study_id", ascending=False),
    "study_id",
)

Unnamed: 0_level_0,sum,percentage,cumsum_percentage
finding_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no_finding,706,33.3%,33.3%
atelectasis,353,16.7%,50.0%
cardiomegaly,353,16.7%,66.7%
lung_opacity,353,16.7%,83.3%
pleural_effusion,353,16.7%,100.0%


# Images

In [7]:
# Define image directories
# NOTE(review): machine-specific absolute path; the notebook header lists moving to EC2/S3 access as a TODO.
local_base_dir = 'C:/Users/Carolyn/Documents/MIDS/210 Capstone'
# File name of the processed image CSV that the next cell reads.
latest_processed_file = 'Processed_Image_Data_March_11_2024.csv'
# Full path to that CSV.
latest_file_path = os.path.join(local_base_dir, latest_processed_file)

In [8]:
# Read the CSV file into a dataframe
image_loc_df = pd.read_csv(latest_file_path)

# Keep the rows whose split flag column equals 1.
# Presumably the flag names encode "4 findings, balanced/unbalanced, single label" — TODO confirm.
train_img_loc_df = image_loc_df[image_loc_df.train_4_bal_s == 1]
val_img_loc_df = image_loc_df[image_loc_df.val_4_unb_s == 1]
test_img_loc_df = image_loc_df[image_loc_df.test_4_unb_s == 1]

# Summarise the columns, non-null counts and dtypes of the full (unfiltered) image frame.
image_loc_df.info()
print("\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16877 entries, 0 to 16876
Data columns (total 40 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   patient_id                                  16877 non-null  int64  
 1   study_id                                    16877 non-null  int64  
 2   atelectasis                                 16877 non-null  float64
 3   cardiomegaly                                16877 non-null  float64
 4   edema                                       16877 non-null  float64
 5   lung_opacity                                16877 non-null  float64
 6   pleural_effusion                            16877 non-null  float64
 7   pneumonia                                   16877 non-null  float64
 8   prev_data_type                              16877 non-null  object 
 9   prev_is_sample                              16877 non-null  bool   
 10  dicom_id  

# Combine Files together for final dataset for fusion models

In [9]:
#convert patient ids in tab/notes df to int
# NOTE(review): only train_df is cast here; val_df and test_df keep the dtype they were
# loaded with — confirm that is intended before they are merged on patient_id.
# This assignment replaces train_df's patient_id column in place.
train_df.patient_id = train_df.patient_id.astype('int64')

In [10]:
# Attach each patient's dicom_id to the tabular/notes rows, then drop incomplete rows.
# - The join key is patient_id only.
#   NOTE(review): if a patient can have several studies/images this produces one row per
#   (record, image) pair — confirm whether study_id should be part of the key.
# - .dropna() removes a row when ANY column is missing, not only dicom_id.
# - .reset_index() keeps the pre-reset row labels as a new 'index' column.
combined_train_df = pd.merge(train_df, train_img_loc_df[['patient_id', 'dicom_id']], on='patient_id', how='left').dropna().reset_index()
combined_val_df = pd.merge(val_df, val_img_loc_df[['patient_id', 'dicom_id']], on='patient_id', how='left').dropna().reset_index()
combined_test_df = pd.merge(test_df, test_img_loc_df[['patient_id', 'dicom_id']], on='patient_id', how='left').dropna().reset_index()

print("Create combined train, validation, and test datasets from note/tabular and imaging data of sizes: ")
print(combined_train_df.shape)
print(combined_val_df.shape)
print(combined_test_df.shape)

# Sanity check only: the frames were already passed through dropna(), so these counts are 0.
# Each message now names the frame it actually reports on (the validation and test lines
# were previously labelled combined_train_df).
print("Null values for image IDs in combined_train_df:", combined_train_df.dicom_id.isna().sum())
print("Null values for image IDs in combined_val_df:", combined_val_df.dicom_id.isna().sum())
print("Null values for image IDs in combined_test_df:", combined_test_df.dicom_id.isna().sum())
print("\n")

Create combined train, validation, and test datasets from note/tabular and imaging data of sizes: 
(2086, 27)
(1924, 27)
(1920, 27)
Null values for image IDs in combined_train_df: 0
Null values for image IDs in combined_train_df: 0
Null values for image IDs in combined_train_df: 0


# Use DICOM_ID to identify images for appended early fusion

In [11]:
## TODO use dicom_id to pull image values into array for early fusion


# Separate files for late fusion approach

In [12]:
# List the columns of the merged training frame.
print(f"Combined dataset has columns: {combined_train_df.columns}")
print("\n")

Index(['index', 'patient_id', 'visit_id', 'study_id', 'temperature',
       'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity',
       'positive_label_total', 'finding_names', 'radiology_note',
       'discharge_note', 'chief_complaint',
       'major_surgical_or_invasive_procedure', 'history_of_present_illness',
       'past_medical_history', 'family_history', 'atelectasis', 'cardiomegaly',
       'lung_opacity', 'pleural_effusion', 'dataset_type', 'dicom_id'],
      dtype='object')

In [13]:
# List the distinct values of finding_names present in the merged training frame.
print(f"Combined dataset has the following findings (true labels): {combined_train_df.finding_names.unique()}")
print("\n")

array(['atelectasis', 'cardiomegaly', 'lung_opacity', 'pleural_effusion',
       'no_finding'], dtype=object)

In [19]:
## separate out fusion_df into tabular, images, and notes alone

# Column subsets per modality, defined once so train/val/test cannot drift apart.
# Every subset keeps patient_id and finding_names.
tabular_columns = ['patient_id', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity', 'finding_names']
notes_columns = ['patient_id', 'chief_complaint', 'history_of_present_illness', 'past_medical_history', 'family_history', 'finding_names']
image_columns = ['patient_id', 'dicom_id', 'finding_names']

# .copy() makes each subset an independent frame, so later in-place edits (e.g. the
# text replacement on the notes frames) neither touch combined_*_df nor raise
# SettingWithCopyWarning.

#tabular data alone
tabular_train_df = combined_train_df[tabular_columns].copy()
tabular_val_df = combined_val_df[tabular_columns].copy()
tabular_test_df = combined_test_df[tabular_columns].copy()

#notes data alone
notes_train_df = combined_train_df[notes_columns].copy()
notes_val_df = combined_val_df[notes_columns].copy()
notes_test_df = combined_test_df[notes_columns].copy()

#image data alone (only dicom_id - use for data generator at embeddings step)
img_train_df = combined_train_df[image_columns].copy()
img_val_df = combined_val_df[image_columns].copy()
img_test_df = combined_test_df[image_columns].copy()

print("Split combined data into images, notes, and tabular datasets")
print(f"Imaging data has columns {img_train_df.columns}")
print(f"Notes data has columns {notes_train_df.columns}")
print(f"Tabular data has columns {tabular_train_df.columns}")
print("\n")

Split combined data into images, notes, and tabular datasets
Imaging data has columns Index(['patient_id', 'dicom_id', 'finding_names'], dtype='object')
Notes data has columns Index(['patient_id', 'chief_complaint', 'history_of_present_illness',
       'past_medical_history', 'family_history', 'finding_names'],
      dtype='object')
Tabular data has columns Index(['patient_id', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp',
       'dbp', 'pain', 'acuity', 'finding_names'],
      dtype='object')


In [20]:
# Replace every "___" with "[UNK]" in each free-text column of the
# train, validation and test notes frames.
for notes_frame in (notes_train_df, notes_val_df, notes_test_df):
    for notes_column in (
        "history_of_present_illness",
        "chief_complaint",
        "past_medical_history",
        "family_history",
    ):
        notes_frame.loc[:, notes_column] = notes_frame[notes_column].str.replace("___", "[UNK]")

print("Processed all notes data to have [UNK] in place of ____")
print("\n")

Processed notes data to have [UNK] in place of ____


In [25]:
# Report the (rows, columns) shape of every per-modality split in one go.
shape_report = [
    f"Created file img_train_df with size: {img_train_df.shape}",
    f"Created file img_val_df with size: {img_val_df.shape}",
    f"Created file img_test_df with size: {img_test_df.shape}",
    f"Created file notes_train_df with size: {notes_train_df.shape}",
    f"Created file notes_val_df with size: {notes_val_df.shape}",
    f"Created file notes_test_df with size: {notes_test_df.shape}",
    f"Created file tabular_train_df with size: {tabular_train_df.shape}",
    f"Created file tabular_val_df with size: {tabular_val_df.shape}",
    f"Created file tabular_test_df with size: {tabular_test_df.shape}",
]
print("\n".join(shape_report))



Created file img_train_df with size: (2086, 3)
Created file img_val_df with size: (1924, 3)
Created file img_test_df with size: (1920, 3)
Created file notes_train_df with size: (2086, 6)
Created file notes_val_df with size: (1924, 6)
Created file notes_test_df with size: (1920, 6)
Created file tabular_train_df with size: (2086, 10)
Created file tabular_val_df with size: (1924, 10)
Created file tabular_test_df with size: (1920, 10)


In [None]:
#TODO: save output df as files for others to use as needed