## 0. Setup

In [1]:
!pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 89.4/89.4 kB 2.5 MB/s eta 0:00:00
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [2]:
import json
from copy import deepcopy

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the processed-data directory used by this
# notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Show (up to 999) columns when a dataframe is displayed instead of eliding the middle ones
pd.set_option("display.max_columns", 999)

In [5]:
# # current directory
# current_dir = os.getcwd()

# # two folders up
# parent_dir = os.path.dirname(os.path.dirname(current_dir))

# # data directory
# data_dir_processed = os.path.join(parent_dir, "data/processed")

# Data directory on the mounted Google Drive.
# Keep the trailing slash: file names are appended to this string with plain `+`
# concatenation when the dataset is loaded and when the splits are exported.
data_dir_processed = '/content/drive/MyDrive/DATASCI_210/data/processed/'

In [6]:
def return_column_values_sum_and_percentage(dataframe_input, column_input):
    """Tabulate a numeric column next to each value's share of the column total.

    Args:
        dataframe_input: DataFrame holding the column to summarise. Row order matters,
            because the cumulative share is accumulated top to bottom.
        column_input: Name of the numeric column in ``dataframe_input``.

    Returns:
        A DataFrame on the same index as ``dataframe_input`` with three columns:
        ``sum`` (the original values), ``percentage`` (value / column total) and
        ``cumsum_percentage`` (running total of that share). Both percentage columns
        are strings with one decimal place and a trailing ``%``, e.g. ``'59.4%'``.
    """
    values = dataframe_input[column_input]
    share_of_total = values / values.sum()

    def as_percent_text(fractions):
        # Fractions -> percent, rounded to one decimal, rendered as text with a '%' suffix.
        return fractions.mul(100).round(1).astype(str) + '%'

    return pd.DataFrame({
        'sum': values,
        'percentage': as_percent_text(share_of_total),
        # Accumulate the raw fractions first so rounding is applied only once, at the end.
        'cumsum_percentage': as_percent_text(share_of_total.cumsum()),
    })

## 1. Data Acquisition

In [7]:
# Name of the processed JSON file to load
chexpert_final_df_filename = 'chexpert_selected_6_findings.json'

# Parse the JSON file into a plain Python dictionary
with open(data_dir_processed + chexpert_final_df_filename) as chexpert_json_file:
    chexpert_dict_file = json.load(chexpert_json_file)

# Build the dataframe and swap the index read from the file for a fresh 0..n-1 index
chexpert_final_df = pd.DataFrame.from_dict(chexpert_dict_file).reset_index(drop=True)

## 2. Data Preprocessing

In [8]:
# Preview the first rows to check which columns were loaded
chexpert_final_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia,positive_label_total,finding_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history
0,10000032.0,22595853,50414267.0,98.4,70.0,16.0,97.0,106.0,63.0,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,no_finding,FINAL REPORT\...,\nName: ___ Unit No: _...,Worsening ABD distension and pain,Paracentesis,"___ HCV cirrhosis c/b ascites, hiv on ART, h/o...",1. HCV Cirrhosis \n2. No history of abnormal ...,"She a total of five siblings, but she is not ..."
1,10000980.0,29654838,59988438.0,97.8,57.0,18.0,100.0,180.0,88.0,0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,pleural_effusion,FINAL REPORT\...,\nName: ___ Unit No: ___\n \nAdmi...,Shortness of breath,,"___ yo woman with h/o hypertension, hyperlipid...","1. CAD RISK FACTORS: +Diabetes, +Dyslipidemia,...",Denies cardiac family history. Family hx of DM...
2,10001176.0,23334588,53186264.0,101.3,97.0,18.0,93.0,168.0,58.0,6,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,lung_opacity,FINAL REPORT\...,\nName: ___ Unit No: ___...,fever,none,"___ with history of morbid obesity, coronary a...",MYOCARDIAL INFARCT - INFEROPOSTERIOR \nHYPERC...,Non contributory
3,10001217.0,24597018,52067803.0,99.0,81.0,16.0,97.0,160.0,102.0,0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n \n...,"Left hand and face numbness, left hand weaknes...",Right parietal craniotomy for abscess incision...,Mrs. ___ is a ___ y/o F from ___ with history ...,Multiple sclerosis,"Mother with pancreatic cancer, brother-lung ca..."
4,10001401.0,26840593,51065211.0,97.8,67.0,20.0,95.0,90.0,43.0,8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n \n...,"Abdominal pain, distention, nausea",Interventional radiology placement of abdomina...,"___ F with h/o muscle invasive bladder cancer,...","Hypertension, laparoscopic cholecystectomy, le...",Negative for bladder CA.


In [9]:
# Target variables: the finding columns used as labels throughout the notebook
target_variables = [
    'atelectasis',
    'cardiomegaly',
    'edema',
    'lung_opacity',
    'pleural_effusion',
    'pneumonia',
]

# One entry per finding: [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
    value_zero = (chexpert_final_df[col] == 0).sum()
    value_positive = (chexpert_final_df[col] == 1).sum()
    pathology_dict[col] = [col, value_zero, value_positive, value_zero + value_positive]

# "no_finding" has no label column of its own; it is identified through `finding_names`.
# Its `not_mention` entry is a placeholder 0 and its total is the full row count.
no_finding_rows = chexpert_final_df[chexpert_final_df["finding_names"] == 'no_finding'].shape[0]
pathology_dict["no_finding"] = ["no_finding", 0, no_finding_rows, chexpert_final_df.shape[0]]

df_distribution = (
    pd.DataFrame.from_dict(
        pathology_dict,
        orient="index",
        columns=["finding", "not_mention", "positive_mention", "total_studies"],
    )
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)

# Scale to percent first and round afterwards. Rounding the ratio and then multiplying
# by 100 leaves floating-point noise in the stored value (0.28 * 100 == 28.000000000000004).
df_distribution["percentage"] = (
    df_distribution["positive_mention"] / df_distribution["total_studies"] * 100
).round(0)
df_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,7863,13235,59.0
1,lung_opacity,11143,2092,13235,16.0
2,atelectasis,11864,1371,13235,10.0
3,pleural_effusion,12104,1131,13235,9.0
4,cardiomegaly,12129,1106,13235,8.0
5,edema,12493,742,13235,6.0
6,pneumonia,12515,720,13235,5.0


In [10]:
# Number of studies per exact label combination, most frequent combination first
studies_per_label_combination = (
    chexpert_final_df
    .groupby("finding_names")
    .agg({"study_id": "count"})
    .sort_values("study_id", ascending=False)
)

# Add each combination's share of all studies and the running (cumulative) share
return_column_values_sum_and_percentage(studies_per_label_combination, "study_id")

Unnamed: 0_level_0,sum,percentage,cumsum_percentage
finding_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no_finding,7863,59.4%,59.4%
lung_opacity,1178,8.9%,68.3%
cardiomegaly,689,5.2%,73.5%
atelectasis,644,4.9%,78.4%
pleural_effusion,418,3.2%,81.5%
pneumonia,358,2.7%,84.2%
"atelectasis, lung_opacity",326,2.5%,86.7%
edema,295,2.2%,88.9%
"atelectasis, pleural_effusion",268,2.0%,91.0%
"lung_opacity, pneumonia",232,1.8%,92.7%


# 3. Data Split

## 3.1 `train`, `val`, `test`: unbalanced (every split keeps the original label distribution)

In [11]:
# Seed shared by every random step below, so that Restart & Run All rebuilds the same
# splits (and therefore the same exported files).
SPLIT_SEED = 42

# Separate features and target variables
X = chexpert_final_df.drop(target_variables, axis=1)  # Features
y = chexpert_final_df[target_variables]  # Target variables

# First, split the data into a training set and a temporary set (test + validation combined).
# Only the first of the n_splits=3 folds is used: it holds out about one third of the rows
# as the temporary set and leaves about two thirds for training.
# NOTE(review): IterativeStratification is assumed to draw from NumPy's global RNG
# (scikit-multilearn 0.2.0), which is why the global seed is set here — confirm.
np.random.seed(SPLIT_SEED)
stratifier = IterativeStratification(n_splits=3, order=1)
train_indexes, temp_indexes = next(stratifier.split(X.values, y.values))

X_train, y_train = X.iloc[train_indexes], y.iloc[train_indexes]
X_temp, y_temp = X.iloc[temp_indexes], y.iloc[temp_indexes]

# Split the temporary set in half into the actual test and validation sets.
# A fixed random_state keeps this shuffle identical between runs.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp.values,
    random_state=SPLIT_SEED,
)

# Recombine features and target for each set
train_df = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
val_df = pd.concat([X_val.reset_index(drop=True), y_val.reset_index(drop=True)], axis=1)
test_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

# Sanity check: every row of the source dataframe ends up in one of the three splits
assert len(train_df) + len(val_df) + len(test_df) == len(chexpert_final_df)

# NOTE(review): rows are split individually, so the same `patient_id` may appear in more
# than one split — confirm whether patient-level grouping is required.

In [12]:
# Label distribution of the training split.
# One entry per finding: [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
    value_zero = (train_df[col] == 0).sum()
    value_positive = (train_df[col] == 1).sum()
    pathology_dict[col] = [col, value_zero, value_positive, value_zero + value_positive]

# "no_finding" is identified through `finding_names`; its `not_mention` entry is a
# placeholder 0 and its total is the full row count of the split.
no_finding_rows = train_df[train_df["finding_names"] == 'no_finding'].shape[0]
pathology_dict["no_finding"] = ["no_finding", 0, no_finding_rows, train_df.shape[0]]

train_distribution = (
    pd.DataFrame.from_dict(
        pathology_dict,
        orient="index",
        columns=["finding", "not_mention", "positive_mention", "total_studies"],
    )
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)

# Scale to percent first and round afterwards. Rounding the ratio and then multiplying
# by 100 leaves floating-point noise in the stored value (0.28 * 100 == 28.000000000000004).
train_distribution["percentage"] = (
    train_distribution["positive_mention"] / train_distribution["total_studies"] * 100
).round(0)
train_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,5257,8824,60.0
1,lung_opacity,7430,1394,8824,16.0
2,atelectasis,7910,914,8824,10.0
3,pleural_effusion,8070,754,8824,9.0
4,cardiomegaly,8087,737,8824,8.0
5,edema,8329,495,8824,6.0
6,pneumonia,8344,480,8824,5.0


In [13]:
# Label distribution of the validation split.
# One entry per finding: [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
    value_zero = (val_df[col] == 0).sum()
    value_positive = (val_df[col] == 1).sum()
    pathology_dict[col] = [col, value_zero, value_positive, value_zero + value_positive]

# "no_finding" is identified through `finding_names`; its `not_mention` entry is a
# placeholder 0 and its total is the full row count of the split.
no_finding_rows = val_df[val_df["finding_names"] == 'no_finding'].shape[0]
pathology_dict["no_finding"] = ["no_finding", 0, no_finding_rows, val_df.shape[0]]

val_distribution = (
    pd.DataFrame.from_dict(
        pathology_dict,
        orient="index",
        columns=["finding", "not_mention", "positive_mention", "total_studies"],
    )
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)

# Scale to percent first and round afterwards. Rounding the ratio and then multiplying
# by 100 leaves floating-point noise in the stored value (0.28 * 100 == 28.000000000000004).
val_distribution["percentage"] = (
    val_distribution["positive_mention"] / val_distribution["total_studies"] * 100
).round(0)
val_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1303,2206,59.0
1,lung_opacity,1857,349,2206,16.0
2,atelectasis,1977,229,2206,10.0
3,pleural_effusion,2018,188,2206,9.0
4,cardiomegaly,2022,184,2206,8.0
5,edema,2082,124,2206,6.0
6,pneumonia,2087,119,2206,5.0


In [14]:
# Label distribution of the test split.
# One entry per finding: [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
    value_zero = (test_df[col] == 0).sum()
    value_positive = (test_df[col] == 1).sum()
    pathology_dict[col] = [col, value_zero, value_positive, value_zero + value_positive]

# "no_finding" is identified through `finding_names`; its `not_mention` entry is a
# placeholder 0 and its total is the full row count of the split.
no_finding_rows = test_df[test_df["finding_names"] == 'no_finding'].shape[0]
pathology_dict["no_finding"] = ["no_finding", 0, no_finding_rows, test_df.shape[0]]

test_distribution = (
    pd.DataFrame.from_dict(
        pathology_dict,
        orient="index",
        columns=["finding", "not_mention", "positive_mention", "total_studies"],
    )
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)

# Scale to percent first and round afterwards. Rounding the ratio and then multiplying
# by 100 leaves floating-point noise in the stored value (0.28 * 100 == 28.000000000000004).
test_distribution["percentage"] = (
    test_distribution["positive_mention"] / test_distribution["total_studies"] * 100
).round(0)
test_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1303,2205,59.0
1,lung_opacity,1856,349,2205,16.0
2,atelectasis,1977,228,2205,10.0
3,pleural_effusion,2016,189,2205,9.0
4,cardiomegaly,2020,185,2205,8.0
5,edema,2082,123,2205,6.0
6,pneumonia,2084,121,2205,5.0


In [None]:
# Export each unbalanced split to its own JSON file in the processed-data directory
unbalanced_exports = [
    (train_df, 'train_set__chexpert__6_findings__unbalanced.json'),
    (val_df, 'validation_set__chexpert__6_findings__unbalanced.json'),
    (test_df, 'test_set__chexpert__6_findings__unbalanced.json'),
]

for split_df, split_filename in unbalanced_exports:
    split_df.to_json(data_dir_processed + split_filename)

## 3.2 `train`: balanced / `val`, `test`: unbalanced (original label distribution)

In [None]:
# # Separate features and target variables
# X = train_df.drop(target_variables, axis=1)  # Features
# y = train_df[target_variables]  # Target variables

# # Initialize the RandomUnderSampler
# rus = RandomUnderSampler(random_state=42)

# # Apply undersampling to the training set
# X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# # Recombine the resampled features and targets into a single dataframe
# train_balanced_df = pd.concat([X_train_resampled.reset_index(drop=True), y_train_resampled.reset_index(drop=True)], axis=1)

In [15]:
# Smallest positive count in the training distribution table; the balancing step below
# uses it as the number of rows to sample per finding.
minimum_positive_mention = train_distribution["positive_mention"].min()
minimum_positive_mention

480

In [16]:
# Draw the same number of positive rows for every finding. A row carrying several
# findings can be drawn once per finding, so the concatenated frame may repeat rows.
dataframes_list = [
    train_df[train_df[finding] == 1].sample(n=minimum_positive_mention, random_state=42)
    for finding in target_variables
]

# "no_finding" rows are sampled at twice the per-finding size
no_finding_rows = train_df[train_df["finding_names"] == 'no_finding']
no_finding_balanced = no_finding_rows.sample(n=2*minimum_positive_mention, random_state=42)
dataframes_list.append(no_finding_balanced)

# Stack all samples; the original row labels of train_df are kept as the index
train_balanced_df = pd.concat(dataframes_list)

In [17]:
# Preview the first rows of the balanced training sample
train_balanced_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,positive_label_total,finding_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
3469,14043257.0,20906582,56794763.0,97.5,107.0,19.0,98.0,140.0,87.0,0,2.0,2.0,"atelectasis, pleural_effusion",FINAL REPORT\...,\nName: ___ Unit No: ___\...,Dyspnea with exertion.,Cardiac catheterization - ___.,"In brief this is a ___ y/o F with PMHx DM2, HL...","1. CARDIAC RISK FACTORS: +Diabetes, +Dyslipide...","No family history of early MI, arrhythmia, car...",1.0,0.0,0.0,0.0,1.0,0.0
8333,19410285.0,21413973,56756767.0,101.0,91.0,18.0,97.0,131.0,75.0,8,3.0,2.0,"atelectasis, cardiomegaly",FINAL REPORT\...,\nName: ___ Unit No: ___\n \...,Headaches and fever,none,Ms. ___ is a ___ year old female with history ...,1) PCKD-related renal transplant on ___ \n2) ...,DM: maternal GM. HTN: maternal GM. Father's si...,1.0,1.0,0.0,0.0,0.0,0.0
3237,13770510.0,29250526,56190948.0,97.7,117.0,22.0,95.0,143.0,107.0,0,3.0,2.0,"atelectasis, pleural_effusion",WET READ: ___ ___ ___ 11:08 AM\n \n 1. No e...,\nName: ___ Unit No: __...,"dyspnea on exertion, new leukocytosis",,___ speaking ___ F w/ 7cm enlarging thyroid no...,Large thyroid mass\nHyperlipidemia\nCOPD\nChol...,Sister died of pancreatic cancer,1.0,0.0,0.0,0.0,1.0,0.0
3250,13789129.0,26873497,59504884.0,97.2,84.0,18.0,100.0,212.0,108.0,7,2.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: __...,shortness of breath,,"Mr. ___ is a ___ yo M w/ PMH of IV drug abuse,...","# DMII, A1C 6.4 on metformin \n# Hypertension...",His mother has diabetes and died of a stroke a...,1.0,0.0,0.0,0.0,0.0,0.0
3270,13806563.0,26672357,57134801.0,97.4,71.0,18.0,100.0,111.0,71.0,8,3.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___...,Chest Pain,,"___ y/o female with sickle cell, HIV + (last C...",- Sickle Cell/HB-C disease\n- HIV\n- GERD\n- C...,Dad passed away from stroke at age ___. Mom is...,1.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Label distribution of the balanced training sample as concatenated,
# i.e. before duplicate rows are dropped.
# One entry per finding: [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
    value_zero = (train_balanced_df[col] == 0).sum()
    value_positive = (train_balanced_df[col] == 1).sum()
    pathology_dict[col] = [col, value_zero, value_positive, value_zero + value_positive]

# "no_finding" is identified through `finding_names`; its `not_mention` entry is a
# placeholder 0 and its total is the full row count of the sample.
no_finding_rows = train_balanced_df[train_balanced_df["finding_names"] == 'no_finding'].shape[0]
pathology_dict["no_finding"] = ["no_finding", 0, no_finding_rows, train_balanced_df.shape[0]]

train_balanced_distribution = (
    pd.DataFrame.from_dict(
        pathology_dict,
        orient="index",
        columns=["finding", "not_mention", "positive_mention", "total_studies"],
    )
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)

# Scale to percent first and round afterwards. Rounding the ratio and then multiplying
# by 100 leaves floating-point noise in the stored value (0.28 * 100 == 28.000000000000004).
train_balanced_distribution["percentage"] = (
    train_balanced_distribution["positive_mention"] / train_balanced_distribution["total_studies"] * 100
).round(0)
train_balanced_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,960,3840,25.0
1,lung_opacity,2901,939,3840,24.0
2,pleural_effusion,3080,760,3840,20.0
3,atelectasis,3091,749,3840,20.0
4,cardiomegaly,3154,686,3840,18.0
5,edema,3186,654,3840,17.0
6,pneumonia,3254,586,3840,15.0


In [19]:
# Keep a single copy of rows that were sampled more than once (first occurrence wins)
train_balanced_df = train_balanced_df.drop_duplicates()

In [20]:
# Label distribution of the balanced training sample in its current state
# (this cell follows the duplicate-removal step).
# One entry per finding: [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
    value_zero = (train_balanced_df[col] == 0).sum()
    value_positive = (train_balanced_df[col] == 1).sum()
    pathology_dict[col] = [col, value_zero, value_positive, value_zero + value_positive]

# "no_finding" is identified through `finding_names`; its `not_mention` entry is a
# placeholder 0 and its total is the full row count of the sample.
no_finding_rows = train_balanced_df[train_balanced_df["finding_names"] == 'no_finding'].shape[0]
pathology_dict["no_finding"] = ["no_finding", 0, no_finding_rows, train_balanced_df.shape[0]]

train_balanced_distribution = (
    pd.DataFrame.from_dict(
        pathology_dict,
        orient="index",
        columns=["finding", "not_mention", "positive_mention", "total_studies"],
    )
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)

# Scale to percent first and round afterwards. Rounding the ratio and then multiplying
# by 100 leaves floating-point noise in the stored value (0.28 * 100 == 28.000000000000004).
train_balanced_distribution["percentage"] = (
    train_balanced_distribution["positive_mention"] / train_balanced_distribution["total_studies"] * 100
).round(0)
train_balanced_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,960,3389,28.0
1,lung_opacity,2617,772,3389,23.0
2,atelectasis,2782,607,3389,18.0
3,pleural_effusion,2812,577,3389,17.0
4,cardiomegaly,2838,551,3389,16.0
5,edema,2904,485,3389,14.0
6,pneumonia,2909,480,3389,14.0


In [21]:
# Label distribution of the full (unbalanced) training split, recomputed here for
# comparison with the balanced sample.
# One entry per finding: [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
    value_zero = (train_df[col] == 0).sum()
    value_positive = (train_df[col] == 1).sum()
    pathology_dict[col] = [col, value_zero, value_positive, value_zero + value_positive]

# "no_finding" is identified through `finding_names`; its `not_mention` entry is a
# placeholder 0 and its total is the full row count of the split.
no_finding_rows = train_df[train_df["finding_names"] == 'no_finding'].shape[0]
pathology_dict["no_finding"] = ["no_finding", 0, no_finding_rows, train_df.shape[0]]

train_distribution = (
    pd.DataFrame.from_dict(
        pathology_dict,
        orient="index",
        columns=["finding", "not_mention", "positive_mention", "total_studies"],
    )
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)

# Scale to percent first and round afterwards. Rounding the ratio and then multiplying
# by 100 leaves floating-point noise in the stored value (0.28 * 100 == 28.000000000000004).
train_distribution["percentage"] = (
    train_distribution["positive_mention"] / train_distribution["total_studies"] * 100
).round(0)
train_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,5257,8824,60.0
1,lung_opacity,7430,1394,8824,16.0
2,atelectasis,7910,914,8824,10.0
3,pleural_effusion,8070,754,8824,9.0
4,cardiomegaly,8087,737,8824,8.0
5,edema,8329,495,8824,6.0
6,pneumonia,8344,480,8824,5.0


In [22]:
# Export the balanced training set to a JSON file in the processed-data directory.
# Only the training set is written here; this cell does not export validation or test data.
train_balanced_df.to_json(data_dir_processed + 'train_set__chexpert__6_findings__balanced.json')