# Contrastive Learning - Multimodality

**Objective:** The goal of this project is to use contrastive learning to find the nearest embeddings for multimodal healthcare data.

Simon Lee (simonlee711@g.ucla.edu)

# Data Preprocessing

In [36]:
import pandas as pd
from datetime import datetime

### Diagnosis

In [60]:
# Load the per-admission diagnosis codes and the ICD-9 code dictionary.
# ICD9_CODE is read as text in both files so the lookup below compares
# string to string; this guards against pandas inferring a numeric dtype for
# all-digit values, which would drop leading zeros and break the key match.
diagnoses = pd.read_csv(
    "/data2/mimic/mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv.gz",
    dtype={"ICD9_CODE": str},
)
diagnoses_table = pd.read_csv(
    "/data2/mimic/mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz",
    dtype={"ICD9_CODE": str},
)

# Translate each ICD-9 code into its long-form title; codes missing from the
# dictionary become NaN.
icd_dict = dict(zip(diagnoses_table['ICD9_CODE'], diagnoses_table['LONG_TITLE']))
diagnoses['Diagnoses'] = diagnoses['ICD9_CODE'].map(icd_dict)

# Attach each patient's sex and date of birth, keyed by SUBJECT_ID.
patient_info = pd.read_csv("/data2/mimic/mimic-iii-clinical-database-1.4/PATIENTS.csv.gz")

# SUBJECT_ID -> (GENDER, DOB). Kept under its original name in case later cells
# use it, but built with zip instead of an iterrows() loop.
demo_dict = dict(
    zip(patient_info['SUBJECT_ID'], zip(patient_info['GENDER'], patient_info['DOB']))
)

# Two vectorised lookups replace the row-wise apply + apply(pd.Series), which
# is very slow on a frame of this size. Unknown subjects still yield NaN.
diagnoses['Sex'] = diagnoses['SUBJECT_ID'].map(
    dict(zip(patient_info['SUBJECT_ID'], patient_info['GENDER']))
)
diagnoses['DoB'] = diagnoses['SUBJECT_ID'].map(
    dict(zip(patient_info['SUBJECT_ID'], patient_info['DOB']))
)
diagnoses

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,Diagnoses,Sex,DoB
0,1297,109,172335,1.0,40301,"Hypertensive chronic kidney disease, malignant...",F,2117-08-07 00:00:00
1,1298,109,172335,2.0,486,"Pneumonia, organism unspecified",F,2117-08-07 00:00:00
2,1299,109,172335,3.0,58281,Chronic glomerulonephritis in diseases classif...,F,2117-08-07 00:00:00
3,1300,109,172335,4.0,5855,"Chronic kidney disease, Stage V",F,2117-08-07 00:00:00
4,1301,109,172335,5.0,4254,Other primary cardiomyopathies,F,2117-08-07 00:00:00
...,...,...,...,...,...,...,...,...
651042,639798,97503,188195,2.0,20280,"Other malignant lymphomas, unspecified site, e...",M,2106-04-10 00:00:00
651043,639799,97503,188195,3.0,V5869,Long-term (current) use of other medications,M,2106-04-10 00:00:00
651044,639800,97503,188195,4.0,V1279,Personal history of other diseases of digestiv...,M,2106-04-10 00:00:00
651045,639801,97503,188195,5.0,5275,Sialolithiasis,M,2106-04-10 00:00:00


In [61]:
# Look up every admission's timestamp by its hospital-admission id (HADM_ID).
admissions_info = pd.read_csv("/data2/mimic/mimic-iii-clinical-database-1.4/ADMISSIONS.csv.gz")
admissions_dict = dict(zip(admissions_info['HADM_ID'], admissions_info['ADMITTIME']))

# Parse both dates once. normalize() drops the time of day, which is what the
# former .dt.date -> pd.to_datetime round trip did, without materialising
# Python date objects.
dob = pd.to_datetime(diagnoses['DoB'])
admit_date = pd.to_datetime(diagnoses['HADM_ID'].map(admissions_dict)).dt.normalize()

# Age in completed years: a plain year difference overstates the age by one
# whenever the admission falls before that year's birthday, so subtract one
# in that case. Year/month/day fields are compared instead of subtracting the
# timestamps, so very large date gaps cannot overflow a nanosecond timedelta.
# NOTE(review): MIMIC-III reportedly shifts the DOB of patients older than 89,
# which would give ages around 300 here - confirm whether those need capping.
birthday_pending = (admit_date.dt.month < dob.dt.month) | (
    (admit_date.dt.month == dob.dt.month) & (admit_date.dt.day < dob.dt.day)
)
diagnoses['Age'] = admit_date.dt.year - dob.dt.year - birthday_pending.astype(int)

# The raw date of birth is no longer needed once Age exists.
diagnoses = diagnoses.drop(columns=["DoB"])

# Spell out the sex codes so they read naturally in generated text; any code
# other than 'F'/'M' becomes NaN.
gender_mapping = {'F': 'female', 'M': 'male'}
diagnoses["Sex"] = diagnoses["Sex"].map(gender_mapping)
diagnoses

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,Diagnoses,Sex,Age
0,1297,109,172335,1.0,40301,"Hypertensive chronic kidney disease, malignant...",female,24
1,1298,109,172335,2.0,486,"Pneumonia, organism unspecified",female,24
2,1299,109,172335,3.0,58281,Chronic glomerulonephritis in diseases classif...,female,24
3,1300,109,172335,4.0,5855,"Chronic kidney disease, Stage V",female,24
4,1301,109,172335,5.0,4254,Other primary cardiomyopathies,female,24
...,...,...,...,...,...,...,...,...
651042,639798,97503,188195,2.0,20280,"Other malignant lymphomas, unspecified site, e...",male,40
651043,639799,97503,188195,3.0,V5869,Long-term (current) use of other medications,male,40
651044,639800,97503,188195,4.0,V1279,Personal history of other diseases of digestiv...,male,40
651045,639801,97503,188195,5.0,5275,Sialolithiasis,male,40


In [62]:
# Fill-in-the-blank sentence: every {placeholder} is replaced by the value of
# the same-named column of a row.
template = "Patient {SUBJECT_ID} is a {Age} year old {Sex} with the following diagnostic: {Diagnoses}"


def _summarize_row(row):
    """Render one row of `diagnoses` as a sentence using `template`."""
    return template.format(**row)


# One summary sentence per diagnosis row.
diagnoses['Diagnostic_Summary'] = diagnoses.apply(_summarize_row, axis=1)

In [63]:
# Show the frame with the new Diagnostic_Summary column. `df` was never
# defined in this notebook, so the cell raised NameError on a fresh kernel.
diagnoses

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,Diagnoses,Sex,Age,Diagnostic_Summary
0,1297,109,172335,1.0,40301,"Hypertensive chronic kidney disease, malignant...",female,24,Patient 109 is a 24 year old female with the f...
1,1298,109,172335,2.0,486,"Pneumonia, organism unspecified",female,24,Patient 109 is a 24 year old female with the f...
2,1299,109,172335,3.0,58281,Chronic glomerulonephritis in diseases classif...,female,24,Patient 109 is a 24 year old female with the f...
3,1300,109,172335,4.0,5855,"Chronic kidney disease, Stage V",female,24,Patient 109 is a 24 year old female with the f...
4,1301,109,172335,5.0,4254,Other primary cardiomyopathies,female,24,Patient 109 is a 24 year old female with the f...
...,...,...,...,...,...,...,...,...,...
651042,639798,97503,188195,2.0,20280,"Other malignant lymphomas, unspecified site, e...",male,40,Patient 97503 is a 40 year old male with the f...
651043,639799,97503,188195,3.0,V5869,Long-term (current) use of other medications,male,40,Patient 97503 is a 40 year old male with the f...
651044,639800,97503,188195,4.0,V1279,Personal history of other diseases of digestiv...,male,40,Patient 97503 is a 40 year old male with the f...
651045,639801,97503,188195,5.0,5275,Sialolithiasis,male,40,Patient 97503 is a 40 year old male with the f...
