## 0. Setup

In [1]:
!pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [2]:
import json
from copy import deepcopy

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification

In [3]:
# Mount Google Drive into the Colab runtime; the processed-data directory used
# by this notebook is a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Let wide frames (the dataset has ~27 columns) render without column truncation.
pd.set_option("display.max_columns", 999)

In [5]:
# Disabled alternative: derive the data directory from the current working
# directory (presumably for runs outside Colab — confirm). Re-enabling it would
# also need `import os`, which is not among the imports at the top of this notebook.
# # current directory
# current_dir = os.getcwd()

# # two folders up
# parent_dir = os.path.dirname(os.path.dirname(current_dir))

# # data directory
# data_dir_processed = os.path.join(parent_dir, "data/processed")

# data directory (GDrive)
# The trailing slash is required: file names are appended to this value with
# plain string concatenation when reading and writing the JSON files.
data_dir_processed = '/content/drive/MyDrive/DATASCI_210/data/processed/'

In [6]:
def return_column_values_sum_and_percentage(dataframe_input, column_input):
    """Summarise one numeric column as values, share of total and running share.

    Parameters
    ----------
    dataframe_input : pandas.DataFrame
        Frame holding the column to summarise; it is not modified.
    column_input : str
        Name of the numeric column in ``dataframe_input``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``dataframe_input`` with three columns:
        ``sum`` (the original values), ``percentage`` (each value divided by
        the column total) and ``cumsum_percentage`` (running total of those
        shares, in row order). Both percentage columns are strings with one
        decimal and a trailing ``%``.
    """
    values = dataframe_input[column_input]
    share_of_total = values / values.sum()

    def _as_percent_text(fractions):
        # 0.668 -> "66.8%"
        return fractions.mul(100).round(1).astype(str) + '%'

    summary = pd.DataFrame({'sum': values, 'percentage': share_of_total})
    # The running share must be taken from the numeric fractions, i.e. before
    # the 'percentage' column is replaced by its text representation.
    summary['cumsum_percentage'] = _as_percent_text(summary['percentage'].cumsum())
    summary['percentage'] = _as_percent_text(summary['percentage'])
    return summary

## 1. Data Acquisition

In [7]:
# File (inside data_dir_processed) that holds the single-label CheXpert selection
chexpert_final_df_filename = 'chexpert_selected_6_findings__single_label.json'

# Parse the JSON export into a plain dictionary
with open(data_dir_processed + chexpert_final_df_filename) as chexpert_json_file:
    chexpert_dict_file = json.load(chexpert_json_file)

# Build the DataFrame and replace the index coming from the JSON keys with a
# fresh 0..n-1 range
chexpert_final_df = pd.DataFrame.from_dict(chexpert_dict_file).reset_index(drop=True)

## 2. Data Preprocessing

In [8]:
# Preview the first rows (head() defaults to 5) to check the loaded columns.
chexpert_final_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia,positive_label_total,finding_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history
0,10000032.0,22595853,50414267.0,98.4,70.0,16.0,97.0,106.0,63.0,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,no_finding,FINAL REPORT\...,\nName: ___ Unit No: _...,Worsening ABD distension and pain,Paracentesis,"___ HCV cirrhosis c/b ascites, hiv on ART, h/o...",1. HCV Cirrhosis \n2. No history of abnormal ...,"She a total of five siblings, but she is not ..."
1,10000935.0,26381316,51178377.0,97.6,117.0,18.0,95.0,128.0,74.0,10,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,lung_opacity,FINAL REPORT\...,\nName: ___ Unit No: ___...,"Weakness, nausea/vomiting",none,This is a ___ yo f with h/o recently diagnosed...,PMH: \n# high grade SBO ___ s/p exploratory la...,Mother - ___ cancer d. at ___ \nYoungest of _...
2,10000980.0,29654838,59988438.0,97.8,57.0,18.0,100.0,180.0,88.0,0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,pleural_effusion,FINAL REPORT\...,\nName: ___ Unit No: ___\n \nAdmi...,Shortness of breath,,"___ yo woman with h/o hypertension, hyperlipid...","1. CAD RISK FACTORS: +Diabetes, +Dyslipidemia,...",Denies cardiac family history. Family hx of DM...
3,10001176.0,23334588,53186264.0,101.3,97.0,18.0,93.0,168.0,58.0,6,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,lung_opacity,FINAL REPORT\...,\nName: ___ Unit No: ___...,fever,none,"___ with history of morbid obesity, coronary a...",MYOCARDIAL INFARCT - INFEROPOSTERIOR \nHYPERC...,Non contributory
4,10001217.0,24597018,52067803.0,99.0,81.0,16.0,97.0,160.0,102.0,0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n \n...,"Left hand and face numbness, left hand weaknes...",Right parietal craniotomy for abscess incision...,Mrs. ___ is a ___ y/o F from ___ with history ...,Multiple sclerosis,"Mother with pancreatic cancer, brother-lung ca..."


In [9]:
# Target variables: the six pathology label columns of chexpert_final_df
target_variables = [
    'atelectasis',
    'cardiomegaly',
    'edema',
    'lung_opacity',
    'pleural_effusion',
    'pneumonia',
]

# finding -> [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1]
pathology_dict = {}

for col in target_variables:
  value_zero = (chexpert_final_df[col] == 0).sum()
  value_positive = (chexpert_final_df[col] == 1).sum()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

# "no_finding" is identified through finding_names instead of a label column.
# NOTE(review): its not_mention count is a hard-coded 0 placeholder, so this is the
# only row where not_mention + positive_mention != total_studies.
pathology_dict["no_finding"] = ["no_finding", 0, chexpert_final_df[chexpert_final_df["finding_names"] == 'no_finding'].shape[0], chexpert_final_df.shape[0]]

# One row per finding, most frequent first
df_distribution = (
    pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"])
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)
# Scale to percent first and round afterwards: rounding the ratio and then
# multiplying by 100 leaves float artefacts (e.g. 0.07 * 100 == 7.000000000000001).
df_distribution["percentage"] = (df_distribution["positive_mention"] / df_distribution["total_studies"] * 100).round(0)
df_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,8293,12412,67.0
1,lung_opacity,11073,1339,12412,11.0
2,cardiomegaly,11638,774,12412,6.0
3,atelectasis,11713,699,12412,6.0
4,pleural_effusion,11882,530,12412,4.0
5,pneumonia,12020,392,12412,3.0
6,edema,12027,385,12412,3.0


In [10]:
# Number of studies per finding_names value, most frequent first
studies_per_finding = (
    chexpert_final_df
    .groupby("finding_names")
    .agg({"study_id": "count"})
    .sort_values("study_id", ascending=False)
)
# Show counts together with their share and running share of all studies
return_column_values_sum_and_percentage(studies_per_finding, "study_id")

Unnamed: 0_level_0,sum,percentage,cumsum_percentage
finding_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no_finding,8293,66.8%,66.8%
lung_opacity,1339,10.8%,77.6%
cardiomegaly,774,6.2%,83.8%
atelectasis,699,5.6%,89.5%
pleural_effusion,530,4.3%,93.7%
pneumonia,392,3.2%,96.9%
edema,385,3.1%,100.0%


## 3.1 `train`, `test`, `valid`: unbalanced (same real data distribution)

In [11]:
# Separate features and target variables
X = chexpert_final_df.drop(target_variables, axis=1)  # Features
y = chexpert_final_df[target_variables]  # Target variables

# NOTE(review): rows are split per study; the same patient_id can presumably end up
# in more than one split — confirm whether patient-level grouping is required.

# Single seed for every random choice in this cell, so that Restart & Run All
# reproduces the same partition (and therefore the same exported files).
split_seed = 42

# scikit-multilearn 0.2.0's IterativeStratification draws from numpy's global RNG,
# and its own random_state argument is rejected by recent scikit-learn because the
# class fixes shuffle=False. Seeding the global generator is the way to make it
# deterministic here.
np.random.seed(split_seed)

# First, split the data into a training set and a temporary set (combining test and validation).
# With n_splits=3 the first fold gives roughly 2/3 of the rows to train and 1/3 to temp.
stratifier = IterativeStratification(n_splits=3, order=1)
train_indexes, temp_indexes = next(stratifier.split(X.values, y.values))

# Select the rows of each side of that first split by position
X_train, y_train = X.iloc[train_indexes], y.iloc[train_indexes]
X_temp, y_temp = X.iloc[temp_indexes], y.iloc[temp_indexes]

# Split the temporary set into actual test and validation sets (half each),
# stratified on the label rows and seeded for reproducibility
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp.values,
    random_state=split_seed,
)

# Recombine features and target for each set
train_df = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
val_df = pd.concat([X_val.reset_index(drop=True), y_val.reset_index(drop=True)], axis=1)
test_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

# The three sets must partition the source frame: no row lost, none duplicated
assert len(train_df) + len(val_df) + len(test_df) == len(chexpert_final_df)

In [12]:
# finding -> [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1] for the training split
pathology_dict = {}

for col in target_variables:
  value_zero = (train_df[col] == 0).sum()
  value_positive = (train_df[col] == 1).sum()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

# "no_finding" is identified through finding_names instead of a label column.
# NOTE(review): its not_mention count is a hard-coded 0 placeholder, so this is the
# only row where not_mention + positive_mention != total_studies.
pathology_dict["no_finding"] = ["no_finding", 0, train_df[train_df["finding_names"] == 'no_finding'].shape[0], train_df.shape[0]]

# One row per finding, most frequent first
train_distribution = (
    pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"])
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)
# Scale to percent first and round afterwards: rounding the ratio and then
# multiplying by 100 leaves float artefacts (e.g. 0.07 * 100 == 7.000000000000001).
train_distribution["percentage"] = (train_distribution["positive_mention"] / train_distribution["total_studies"] * 100).round(0)
train_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,5529,8274,67.0
1,lung_opacity,7382,892,8274,11.0
2,cardiomegaly,7758,516,8274,6.0
3,atelectasis,7808,466,8274,6.0
4,pleural_effusion,7921,353,8274,4.0
5,pneumonia,8013,261,8274,3.0
6,edema,8017,257,8274,3.0


In [13]:
# finding -> [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1] for the validation split
pathology_dict = {}

for col in target_variables:
  value_zero = (val_df[col] == 0).sum()
  value_positive = (val_df[col] == 1).sum()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

# "no_finding" is identified through finding_names instead of a label column.
# NOTE(review): its not_mention count is a hard-coded 0 placeholder, so this is the
# only row where not_mention + positive_mention != total_studies.
pathology_dict["no_finding"] = ["no_finding", 0, val_df[val_df["finding_names"] == 'no_finding'].shape[0], val_df.shape[0]]

# One row per finding, most frequent first
val_distribution = (
    pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"])
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)
# Scale to percent first and round afterwards: rounding the ratio and then
# multiplying by 100 leaves float artefacts (e.g. 0.07 * 100 == 7.000000000000001).
val_distribution["percentage"] = (val_distribution["positive_mention"] / val_distribution["total_studies"] * 100).round(0)
val_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1382,2069,67.0
1,lung_opacity,1846,223,2069,11.0
2,cardiomegaly,1940,129,2069,6.0
3,atelectasis,1952,117,2069,6.0
4,pleural_effusion,1981,88,2069,4.0
5,pneumonia,2003,66,2069,3.0
6,edema,2005,64,2069,3.0


In [14]:
# finding -> [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1] for the test split
pathology_dict = {}

for col in target_variables:
  value_zero = (test_df[col] == 0).sum()
  value_positive = (test_df[col] == 1).sum()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

# "no_finding" is identified through finding_names instead of a label column.
# NOTE(review): its not_mention count is a hard-coded 0 placeholder, so this is the
# only row where not_mention + positive_mention != total_studies.
pathology_dict["no_finding"] = ["no_finding", 0, test_df[test_df["finding_names"] == 'no_finding'].shape[0], test_df.shape[0]]

# One row per finding, most frequent first
test_distribution = (
    pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"])
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)
# Scale to percent first and round afterwards: rounding the ratio and then
# multiplying by 100 leaves float artefacts (e.g. 0.07 * 100 == 7.000000000000001).
test_distribution["percentage"] = (test_distribution["positive_mention"] / test_distribution["total_studies"] * 100).round(0)
test_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1382,2069,67.0
1,lung_opacity,1845,224,2069,11.0
2,cardiomegaly,1940,129,2069,6.0
3,atelectasis,1953,116,2069,6.0
4,pleural_effusion,1980,89,2069,4.0
5,pneumonia,2004,65,2069,3.0
6,edema,2005,64,2069,3.0


In [15]:
# Write each unbalanced split to its own json file inside data_dir_processed
unbalanced_exports = {
    'train_set__chexpert__6_findings__single_label__unbalanced.json': train_df,
    'validation_set__chexpert__6_findings__single_label__unbalanced.json': val_df,
    'test_set__chexpert__6_findings__single_label__unbalanced.json': test_df,
}

for export_filename, split_df in unbalanced_exports.items():
    split_df.to_json(data_dir_processed + export_filename)

## 3.2 `train`: balanced / `test`, `valid`: unbalanced (same real data distribution)

In [None]:
# Disabled alternative: balance the training set with imblearn's RandomUnderSampler.
# Nothing in this cell runs; the balanced set is built by explicit per-finding
# sampling in the cells that follow.
# NOTE(review): if this is ever re-enabled, fit_resample below is called on
# X_train / y_train rather than on the X / y derived from train_df just above it.
# # Separate features and target variables
# X = train_df.drop(target_variables, axis=1)  # Features
# y = train_df[target_variables]  # Target variables

# # Initialize the RandomUnderSampler
# rus = RandomUnderSampler(random_state=42)

# # Apply undersampling to the training set
# X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# #
# train_balanced_df = pd.concat([X_train_resampled.reset_index(drop=True), y_train_resampled.reset_index(drop=True)], axis=1)

In [16]:
# Smallest positive_mention count in the training distribution table; used as the
# per-finding sample size when the balanced training set is built.
minimum_positive_mention = train_distribution["positive_mention"].min()
minimum_positive_mention

257

In [21]:
# One random draw of minimum_positive_mention positive rows per pathology
# (fixed seed so the same rows are picked on every run)
dataframes_list = [
    train_df[train_df[finding] == 1].sample(n=minimum_positive_mention, random_state=42)
    for finding in target_variables
]

# The no_finding group gets twice the per-pathology sample size
no_finding_balanced = (
    train_df[train_df["finding_names"] == 'no_finding']
    .sample(n=2*minimum_positive_mention, random_state=42)
)
dataframes_list.append(no_finding_balanced)

# Stack the groups; row labels from train_df are kept as they are
train_balanced_df = pd.concat(dataframes_list)

In [22]:
# Preview the first rows of the balanced training set; the row labels shown are
# the original train_df ones because the index is not reset after sampling.
train_balanced_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,positive_label_total,finding_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
1166,11362587.0,23607298,52098273.0,97.3,67.0,18.0,99.0,138.0,59.0,6,2.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n...,Chest Pain,Cardiac catheterization with balloon angioplasty,"Ms. ___ is a ___ w/ PMH hypertension, arthriti...","PMH: HTN, BCC, CAD, gastritis, LBP, OA, RA\nPS...",No valvular heart disease,1.0,0.0,0.0,0.0,0.0,0.0
1295,11539363.0,23558226,55922046.0,99.1,80.0,16.0,97.0,162.0,67.0,0,3.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\...,Chest pain,___ Cardiac Catheterization by Dr. ___,"Mr ___ is a ___ yo M w/IDDM, HTN, HLD, has not...",- IDDM \n - HTN \n - HLD,Noncontributory,1.0,0.0,0.0,0.0,0.0,0.0
735,10837103.0,26991020,50708867.0,98.4,82.0,10.0,97.0,88.0,49.0,0,1.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: __...,Altered mental status,,Mr. ___ is a ___ year old male with PMH notabl...,"1. Right frontal mass, s/p craniotomy for rese...",Non-contributory to presentation.,1.0,0.0,0.0,0.0,0.0,0.0
8143,19849119.0,27397159,58419216.0,98.6,92.0,20.0,98.0,127.0,70.0,0,2.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___...,Confusion,,"___ PMHx Hep C Cirrhosis, HTN, DM II presents ...","PMH: DMII, Cirrhosis, grade 1 esophageal varic...",Brothers x3 both with MI's. Father ___ arthrit...,1.0,0.0,0.0,0.0,0.0,0.0
1451,11738050.0,21323353,59693387.0,98.9,86.0,20.0,97.0,132.0,85.0,9,3.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: __...,wound breakdown,___ Lumbar wound washout and re-closure with p...,___ y/o female s/p L3-S1 laminectomy on ___ pr...,Schizophrenia. Epilepsy since ___.\nAtrial fi...,Seizures: Her mother started getting seizures ...,1.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# finding -> [finding, rows labelled 0, rows labelled 1, rows labelled 0 or 1] for the balanced training set
pathology_dict = {}

for col in target_variables:
  value_zero = (train_balanced_df[col] == 0).sum()
  value_positive = (train_balanced_df[col] == 1).sum()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

# "no_finding" is identified through finding_names instead of a label column.
# NOTE(review): its not_mention count is a hard-coded 0 placeholder, so this is the
# only row where not_mention + positive_mention != total_studies.
pathology_dict["no_finding"] = ["no_finding", 0, train_balanced_df[train_balanced_df["finding_names"] == 'no_finding'].shape[0], train_balanced_df.shape[0]]

# One row per finding, most frequent first
train_balanced_distribution = (
    pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"])
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)
# Scale to percent first and round afterwards: rounding the ratio and then
# multiplying by 100 leaves float artefacts (e.g. 0.07 * 100 == 7.000000000000001).
train_balanced_distribution["percentage"] = (train_balanced_distribution["positive_mention"] / train_balanced_distribution["total_studies"] * 100).round(0)
train_balanced_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,514,2056,25.0
1,atelectasis,1799,257,2056,12.0
2,cardiomegaly,1799,257,2056,12.0
3,edema,1799,257,2056,12.0
4,lung_opacity,1799,257,2056,12.0
5,pleural_effusion,1799,257,2056,12.0
6,pneumonia,1799,257,2056,12.0


In [24]:
# Recompute the distribution of the original (unbalanced) training split; this is
# the same computation as the earlier train_df summary and rebinds train_distribution.
pathology_dict = {}

for col in target_variables:
  value_zero = (train_df[col] == 0).sum()
  value_positive = (train_df[col] == 1).sum()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

# "no_finding" is identified through finding_names instead of a label column.
# NOTE(review): its not_mention count is a hard-coded 0 placeholder, so this is the
# only row where not_mention + positive_mention != total_studies.
pathology_dict["no_finding"] = ["no_finding", 0, train_df[train_df["finding_names"] == 'no_finding'].shape[0], train_df.shape[0]]

# One row per finding, most frequent first
train_distribution = (
    pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"])
    .sort_values("positive_mention", ascending=False)
    .reset_index(drop=True)
)
# Scale to percent first and round afterwards: rounding the ratio and then
# multiplying by 100 leaves float artefacts (e.g. 0.07 * 100 == 7.000000000000001).
train_distribution["percentage"] = (train_distribution["positive_mention"] / train_distribution["total_studies"] * 100).round(0)
train_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,5529,8274,67.0
1,lung_opacity,7382,892,8274,11.0
2,cardiomegaly,7758,516,8274,6.0
3,atelectasis,7808,466,8274,6.0
4,pleural_effusion,7921,353,8274,4.0
5,pneumonia,8013,261,8274,3.0
6,edema,8017,257,8274,3.0


In [25]:
# Write the balanced training set to its json file inside data_dir_processed
train_balanced_filename = 'train_set__chexpert__6_findings__single_label__balanced.json'
train_balanced_df.to_json(data_dir_processed + train_balanced_filename)