## 0. Setup

In [1]:
!pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [2]:
import pandas as pd
import json

from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification
from imblearn.under_sampling import RandomUnderSampler
from copy import deepcopy

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
pd.options.display.max_columns = 999

In [5]:
# # current directory
# current_dir = os.getcwd()

# # two folders u
# parent_dir = os.path.dirname(os.path.dirname(current_dir))

# # data directory
# data_dir_processed = os.path.join(parent_dir, "data/processed")

# data directory (GDrive)
data_dir_processed = '/content/drive/MyDrive/DATASCI_210/data/processed/'

In [6]:
def return_column_values_sum_and_percentage(dataframe_input, column_input):
    total_sum = dataframe_input[column_input].sum()
    percentages = dataframe_input[column_input] / total_sum
    sums_percentages = pd.DataFrame({
        'sum': dataframe_input[column_input],
        'percentage': percentages
    })
    sums_percentages['cumsum_percentage'] = sums_percentages['percentage'].cumsum()
    # sums_percentages['sum'] = sums_percentages['sum'].apply(lambda x: "{:,}".format(x))
    sums_percentages['percentage'] = sums_percentages['percentage'].mul(100).round(1).astype(str) + '%'
    sums_percentages['cumsum_percentage'] = sums_percentages['cumsum_percentage'].mul(100).round(1).astype(str) + '%'
    return sums_percentages

## 1. Data Acquisition

In [7]:
# Define the names of the JSON files
chexpert_final_df_filename = 'chexpert_selected_4_findings.json'

# Load the JSON files into a DataFrame
with open(data_dir_processed + chexpert_final_df_filename) as chexpert_json_file:
    chexpert_dict_file = json.load(chexpert_json_file)

# Converting json dataset from dictionary to dataframe
chexpert_final_df = pd.DataFrame.from_dict(chexpert_dict_file)
chexpert_final_df.reset_index(drop=True, inplace=True)

## 2. Data Preprocessing

In [8]:
chexpert_final_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,atelectasis,cardiomegaly,lung_opacity,pleural_effusion,positive_label_total,finding_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history
0,10000032.0,22595853,50414267.0,98.4,70.0,16.0,97.0,106.0,63.0,0,3.0,0.0,0.0,0.0,0.0,1.0,no_finding,FINAL REPORT\...,\nName: ___ Unit No: _...,Worsening ABD distension and pain,Paracentesis,"___ HCV cirrhosis c/b ascites, hiv on ART, h/o...",1. HCV Cirrhosis \n2. No history of abnormal ...,"She a total of five siblings, but she is not ..."
1,10000980.0,29654838,59988438.0,97.8,57.0,18.0,100.0,180.0,88.0,0,2.0,0.0,0.0,0.0,1.0,1.0,pleural_effusion,FINAL REPORT\...,\nName: ___ Unit No: ___\n \nAdmi...,Shortness of breath,,"___ yo woman with h/o hypertension, hyperlipid...","1. CAD RISK FACTORS: +Diabetes, +Dyslipidemia,...",Denies cardiac family history. Family hx of DM...
2,10001176.0,23334588,53186264.0,101.3,97.0,18.0,93.0,168.0,58.0,6,3.0,0.0,0.0,1.0,0.0,1.0,lung_opacity,FINAL REPORT\...,\nName: ___ Unit No: ___...,fever,none,"___ with history of morbid obesity, coronary a...",MYOCARDIAL INFARCT - INFEROPOSTERIOR \nHYPERC...,Non contributory
3,10001217.0,24597018,52067803.0,99.0,81.0,16.0,97.0,160.0,102.0,0,3.0,1.0,0.0,0.0,0.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n \n...,"Left hand and face numbness, left hand weaknes...",Right parietal craniotomy for abscess incision...,Mrs. ___ is a ___ y/o F from ___ with history ...,Multiple sclerosis,"Mother with pancreatic cancer, brother-lung ca..."
4,10001401.0,26840593,51065211.0,97.8,67.0,20.0,95.0,90.0,43.0,8,2.0,1.0,0.0,0.0,0.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___\n \n...,"Abdominal pain, distention, nausea",Interventional radiology placement of abdomina...,"___ F with h/o muscle invasive bladder cancer,...","Hypertension, laparoscopic cholecystectomy, le...",Negative for bladder CA.


In [9]:
# Target variables
target_variables = [
    'atelectasis',
    'cardiomegaly',
    'lung_opacity',
    'pleural_effusion',
]

pathology_dict = {}

for col in target_variables:
  value_zero = chexpert_final_df[chexpert_final_df[col] == 0][col].count()
  value_positive = chexpert_final_df[chexpert_final_df[col] == 1][col].count()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

pathology_dict["no_finding"] = ["no_finding", 0, chexpert_final_df[chexpert_final_df["finding_names"] == 'no_finding'].shape[0], chexpert_final_df.shape[0]]

df_distribution = pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"]).sort_values("positive_mention", ascending=False).reset_index(drop=True)
df_distribution["percentage"] = round(df_distribution["positive_mention"] / df_distribution["total_studies"], 2)*100
df_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,7863,11793,67.0
1,lung_opacity,10027,1766,11793,15.0
2,atelectasis,10488,1305,11793,11.0
3,pleural_effusion,10861,932,11793,8.0
4,cardiomegaly,10865,928,11793,8.0


In [10]:
return_column_values_sum_and_percentage(chexpert_final_df.groupby("finding_names").agg({"study_id": "count"}).sort_values("study_id", ascending=False), "study_id")

Unnamed: 0_level_0,sum,percentage,cumsum_percentage
finding_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no_finding,7863,66.7%,66.7%
lung_opacity,1178,10.0%,76.7%
cardiomegaly,689,5.8%,82.5%
atelectasis,644,5.5%,88.0%
pleural_effusion,418,3.5%,91.5%
"atelectasis, lung_opacity",326,2.8%,94.3%
"atelectasis, pleural_effusion",268,2.3%,96.5%
"lung_opacity, pleural_effusion",168,1.4%,98.0%
"cardiomegaly, lung_opacity",94,0.8%,98.8%
"cardiomegaly, pleural_effusion",78,0.7%,99.4%


# 3. Data Split

## 3.1 `train`, `test`, `valid`: unbalanced (same real data distribution)

In [11]:
# Separate features and target variables
X = chexpert_final_df.drop(target_variables, axis=1)  # Features
y = chexpert_final_df[target_variables]  # Target variables

# First, split the data into a training set and a temporary set (combining test and validation)
stratifier = IterativeStratification(n_splits=3, order=1)
train_indexes, temp_indexes = next(stratifier.split(X.values, y.values))

# First, split the data into a training set and a temporary set (combining test and validation)
X_train, y_train = X.iloc[train_indexes], y.iloc[train_indexes]
X_temp, y_temp = X.iloc[temp_indexes], y.iloc[temp_indexes]

# Split the temporary set into actual test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp.values)

# Recombine features and target for each set
train_df = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
val_df = pd.concat([X_val.reset_index(drop=True), y_val.reset_index(drop=True)], axis=1)
test_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

In [12]:
pathology_dict = {}

for col in target_variables:
  value_zero = train_df[train_df[col] == 0][col].count()
  value_positive = train_df[train_df[col] == 1][col].count()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

pathology_dict["no_finding"] = ["no_finding", 0, train_df[train_df["finding_names"] == 'no_finding'].shape[0], train_df.shape[0]]

train_distribution = pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"]).sort_values("positive_mention", ascending=False).reset_index(drop=True)
train_distribution["percentage"] = round(train_distribution["positive_mention"] / train_distribution["total_studies"], 2)*100
train_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,5247,7862,67.0
1,lung_opacity,6684,1178,7862,15.0
2,atelectasis,6992,870,7862,11.0
3,pleural_effusion,7241,621,7862,8.0
4,cardiomegaly,7243,619,7862,8.0


In [13]:
pathology_dict = {}

for col in target_variables:
  value_zero = val_df[val_df[col] == 0][col].count()
  value_positive = val_df[val_df[col] == 1][col].count()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

pathology_dict["no_finding"] = ["no_finding", 0, val_df[val_df["finding_names"] == 'no_finding'].shape[0], val_df.shape[0]]

val_distribution = pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"]).sort_values("positive_mention", ascending=False).reset_index(drop=True)
val_distribution["percentage"] = round(val_distribution["positive_mention"] / val_distribution["total_studies"], 2)*100
val_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1308,1966,67.0
1,lung_opacity,1671,295,1966,15.0
2,atelectasis,1749,217,1966,11.0
3,pleural_effusion,1810,156,1966,8.0
4,cardiomegaly,1812,154,1966,8.0


In [14]:
pathology_dict = {}

for col in target_variables:
  value_zero = test_df[test_df[col] == 0][col].count()
  value_positive = test_df[test_df[col] == 1][col].count()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

pathology_dict["no_finding"] = ["no_finding", 0, test_df[test_df["finding_names"] == 'no_finding'].shape[0], test_df.shape[0]]

test_distribution = pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"]).sort_values("positive_mention", ascending=False).reset_index(drop=True)
test_distribution["percentage"] = round(test_distribution["positive_mention"] / test_distribution["total_studies"], 2)*100
test_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1308,1965,67.0
1,lung_opacity,1672,293,1965,15.0
2,atelectasis,1747,218,1965,11.0
3,cardiomegaly,1810,155,1965,8.0
4,pleural_effusion,1810,155,1965,8.0


In [15]:
# Export dataframes to json files
train_df.to_json(data_dir_processed + 'train_set__chexpert__4_findings__unbalanced.json')
val_df.to_json(data_dir_processed + 'validation_set__chexpert__4_findings__unbalanced.json')
test_df.to_json(data_dir_processed + 'test_set__chexpert__4_findings__unbalanced.json')

## 3.2 `train`: balanced / `test`, `valid`: unbalanced (same real data distribution)

In [None]:
# # Separate features and target variables
# X = train_df.drop(target_variables, axis=1)  # Features
# y = train_df[target_variables]  # Target variables

# # Initialize the RandomUnderSampler
# rus = RandomUnderSampler(random_state=42)

# # Apply undersampling to the training set
# X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# #
# train_balanced_df = pd.concat([X_train_resampled.reset_index(drop=True), y_train_resampled.reset_index(drop=True)], axis=1)

In [16]:
minimum_positive_mention = train_distribution.positive_mention.min()
minimum_positive_mention

619

In [17]:
dataframes_list = []

for finding in target_variables:
    finding_balanced = train_df[train_df[finding] == 1].sample(n=minimum_positive_mention, random_state=42)
    dataframes_list.append(finding_balanced)

no_finding_balanced = train_df[train_df["finding_names"] == 'no_finding'].sample(n=2*minimum_positive_mention, random_state=42)
dataframes_list.append(no_finding_balanced)

train_balanced_df = pd.concat(dataframes_list)

In [18]:
train_balanced_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,positive_label_total,finding_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history,atelectasis,cardiomegaly,lung_opacity,pleural_effusion
3493,14522824.0,21480350,58258218.0,97.4,59.0,18.0,100.0,156.0,70.0,8,2.0,2.0,"atelectasis, cardiomegaly",FINAL REPORT\...,\nName: ___ Unit No: ___\n ...,lower extremity weakness,,"___ pmh HTN, HLD, IDDM presenting with subacut...","Diabetes Mellitus, type II, diagnosed before _...",Unknown,1.0,1.0,0.0,0.0
452,10669036.0,27332252,57435376.0,97.9,95.0,16.0,97.0,133.0,79.0,10,3.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: ___...,Epigastric pain w/ nausea and vomiting,laparoscopic cholecystectomy,Mrs. ___ is a ___ year old woman with a histor...,"PMH: anxiety, agoraphobia, osteoporosis, breas...",non-contributory,1.0,0.0,0.0,0.0
4507,15819830.0,27187288,51425175.0,104.4,148.0,22.0,98.0,95.0,61.0,0,1.0,1.0,atelectasis,FINAL REPORT\...,\nName: ___ Unit No: __...,Fever,none,The patient is a ___ y/o F with PMHx of metast...,Ooncologic History:\nInitial presentation with...,Significant for cancer in her mother (colon) a...,1.0,0.0,0.0,0.0
462,10682162.0,26844965,54146638.0,98.7,67.0,16.0,100.0,158.0,70.0,0,3.0,2.0,"atelectasis, lung_opacity",FINAL REPORT\...,\nName: ___ Unit No: ___\n ...,"failure to thrive, lethargy",none,"___ man w/ ___ disease, AF on Coumadin, prior\...",# ___ disease\n - orhtostatic hypotension\n# H...,"- DM, heart disease, arthritis",1.0,0.0,1.0,0.0
7720,19831368.0,20556494,53009539.0,98.0,78.0,18.0,96.0,113.0,78.0,3,3.0,1.0,atelectasis,WET READ: ___ ___ ___ 6:43 PM\n No free air ...,\nName: ___ Unit No: ...,abdominal pain,none,___ w/o significant past medical and surgical ...,none,none,1.0,0.0,0.0,0.0


In [19]:
pathology_dict = {}

for col in target_variables:
  value_zero = train_balanced_df[train_balanced_df[col] == 0][col].count()
  value_positive = train_balanced_df[train_balanced_df[col] == 1][col].count()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

pathology_dict["no_finding"] = ["no_finding", 0, train_balanced_df[train_balanced_df["finding_names"] == 'no_finding'].shape[0], train_balanced_df.shape[0]]

train_balanced_distribution = pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"]).sort_values("positive_mention", ascending=False).reset_index(drop=True)
train_balanced_distribution["percentage"] = round(train_balanced_distribution["positive_mention"] / train_balanced_distribution["total_studies"], 2)*100
train_balanced_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1238,3714,33.0
1,atelectasis,2745,969,3714,26.0
2,lung_opacity,2779,935,3714,25.0
3,pleural_effusion,2856,858,3714,23.0
4,cardiomegaly,2971,743,3714,20.0


In [20]:
train_balanced_df.drop_duplicates(inplace=True)

In [21]:
pathology_dict = {}

for col in target_variables:
  value_zero = train_balanced_df[train_balanced_df[col] == 0][col].count()
  value_positive = train_balanced_df[train_balanced_df[col] == 1][col].count()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

pathology_dict["no_finding"] = ["no_finding", 0, train_balanced_df[train_balanced_df["finding_names"] == 'no_finding'].shape[0], train_balanced_df.shape[0]]

train_balanced_distribution = pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"]).sort_values("positive_mention", ascending=False).reset_index(drop=True)
train_balanced_distribution["percentage"] = round(train_balanced_distribution["positive_mention"] / train_balanced_distribution["total_studies"], 2)*100
train_balanced_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,1238,3320,37.0
1,lung_opacity,2562,758,3320,23.0
2,atelectasis,2599,721,3320,22.0
3,cardiomegaly,2701,619,3320,19.0
4,pleural_effusion,2701,619,3320,19.0


In [22]:
pathology_dict = {}

for col in target_variables:
  value_zero = train_df[train_df[col] == 0][col].count()
  value_positive = train_df[train_df[col] == 1][col].count()
  total_studies = value_zero + value_positive
  pathology_dict[col] = [col, value_zero, value_positive, total_studies]

pathology_dict["no_finding"] = ["no_finding", 0, train_df[train_df["finding_names"] == 'no_finding'].shape[0], train_df.shape[0]]

train_distribution = pd.DataFrame.from_dict(pathology_dict, orient="index", columns=["finding", "not_mention", "positive_mention", "total_studies"]).sort_values("positive_mention", ascending=False).reset_index(drop=True)
train_distribution["percentage"] = round(train_distribution["positive_mention"] / train_distribution["total_studies"], 2)*100
train_distribution

Unnamed: 0,finding,not_mention,positive_mention,total_studies,percentage
0,no_finding,0,5247,7862,67.0
1,lung_opacity,6684,1178,7862,15.0
2,atelectasis,6992,870,7862,11.0
3,pleural_effusion,7241,621,7862,8.0
4,cardiomegaly,7243,619,7862,8.0


In [23]:
# Export dataframes to json files
train_balanced_df.to_json(data_dir_processed + 'train_set__chexpert__4_findings__balanced.json')