# Data Preprocessing and Splitting
---

In [1]:
# import libraries
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): this silences every warning category for the rest of the kernel
# session, including deprecation notices from pandas/sklearn; consider a narrower filter.
warnings.filterwarnings("ignore")

import sys
# Make the project-local scripts folder importable. The path is relative, so it is
# resolved against the kernel's working directory and only works when that directory
# sits next to `scripts/`.
sys.path.append('../scripts')
from create_labels import create_labels

In [2]:
# NOTE: when splitting the data, do an 80/20 split of the images listed in the train_val_list file; use the images in the test_list file only for model evaluation.

In [3]:
# Load the interim pickles. The names on the left are bound in the same order as the
# paths listed below.
# NOTE: unpickling can execute arbitrary code, so only load files this project produced.
(
    bbox_df,
    data_entry_df,
    train_val_list,
    test_list,
    patient_data,
) = map(
    pd.read_pickle,
    (
        '../data/interim/bbox_df.pkl',
        '../data/interim/data_entry_df.pkl',
        '../data/interim/train_val_list.pkl',
        '../data/interim/test_list.pkl',
        '../data/interim/patient_data.pkl',
    ),
)

In [4]:
# Split each '|'-delimited label string into a list, then give every label its own row.
label_lists = data_entry_df["finding_labels"].str.split('|')
labels_exploded = label_lists.explode()

# Distinct finding names in sorted order (displayed as the cell output).
sorted(labels_exploded.unique())

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'No Finding',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

In [5]:
# Small sample (the first rows returned by head()) for trying things out. Despite the
# name, this is not the held-out evaluation split.
test_df = data_entry_df.head()

# Target column on its own; every remaining column is kept as a feature.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
y = test_df["finding_labels"]
X = test_df.drop(columns="finding_labels")


In [6]:
# Run the project helper on the sample. Judging by the displayed result, it returns a
# frame with one 0/1 indicator column per finding label alongside the metadata
# columns — TODO confirm against scripts/create_labels.py.
output = create_labels(X,y)
output

Unnamed: 0,image_index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,follow_up_number,patient_id,patient_age,patient_gender,view_position
0,00000001_000.png,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,58,M,PA
1,00000001_001.png,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,1,58,M,PA
2,00000001_002.png,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,2,1,58,M,PA
3,00000002_000.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,2,81,M,PA
4,00000003_000.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,3,81,F,PA


In [11]:
# NOTE(review): the only column is named '00000001_000.png', which looks like the first
# filename was consumed as a header when the list was originally read. Verify upstream;
# otherwise that image is absent from the rows used for the split.
train_val_list

Unnamed: 0,00000001_000.png
0,00000001_001.png
1,00000001_002.png
2,00000002_000.png
3,00000004_000.png
4,00000005_000.png
...,...
86518,00030789_000.png
86519,00030793_000.png
86520,00030795_000.png
86521,00030801_000.png


In [15]:
# NOTE(review): the only column is named '00000003_000.png', which looks like the first
# filename was consumed as a header when the list was originally read. Verify upstream;
# otherwise that image is absent from the rows used for evaluation.
test_list

Unnamed: 0,00000003_000.png
0,00000003_001.png
1,00000003_002.png
2,00000003_003.png
3,00000003_004.png
4,00000003_005.png
...,...
25590,00030800_000.png
25591,00030802_000.png
25592,00030803_000.png
25593,00030804_000.png


In [13]:
# split multi-labeled data up
# NOTE: the snippet below is disabled by wrapping it in a string literal, so this cell
# only echoes that string. It repeats the split/explode step already run in an earlier
# cell and would count the distinct labels instead of listing them.

"""
labels_exploded = data_entry_df["finding_labels"].str.split('|').explode()
len(labels_exploded.unique())
"""

'\nlabels_exploded = data_entry_df["finding_labels"].str.split(\'|\').explode()\nlen(labels_exploded.unique())\n'

In [14]:
# [add folder numbers to the dataset]
# NOTE: disabled draft. The code is wrapped in a string literal, so running this cell
# only echoes the text. Points to review before re-enabling it:
#   * `os` is not among the notebook's imports, and `path` is not defined in any
#     visible cell;
#   * `os.listdir(folder)` is re-evaluated for every image/folder pair;
#   * `parent_folders` only grows when an image is found, while `targeted_imgs` keeps
#     every image, so the two DataFrame columns have unequal lengths if any image
#     is not found.

"""

# extracted the targeted images
targeted_imgs = list(data_entry_df["image_index"])

# extract only the images folders paths from kaggle and store it in a list
folder_paths = []

for folder in os.listdir(path):

    if folder.startswith("images") == True:
        f_path = os.path.join(path, folder, "images")
        folder_paths.append(f_path)


# search folders based on targeted images
parent_folders = []

for img in targeted_imgs:
    found = False
    for folder in folder_paths:
        if img in os.listdir(folder):
            parent_folders.append(os.path.basename(os.path.dirname(folder)))
            found = True
            break  
    if not found:
        print(f"Not found: {img}")

# create a dataframe 
parent_folders_df = pd.DataFrame({
    "image_index": targeted_imgs,
    "folders": parent_folders
})



"""

'\n\n# extracted the targeted images\ntargeted_imgs = list(data_entry_df["image_index"])\n\n# extract only the images folders paths from kaggle and store it in a list\nfolder_paths = []\n\nfor folder in os.listdir(path):\n\n    if folder.startswith("images") == True:\n        f_path = os.path.join(path, folder, "images")\n        folder_paths.append(f_path)\n\n\n# search folders based on targeted images\nparent_folders = []\n\nfor img in targeted_imgs:\n    found = False\n    for folder in folder_paths:\n        if img in os.listdir(folder):\n            parent_folders.append(os.path.basename(os.path.dirname(folder)))\n            found = True\n            break  \n    if not found:\n        print(f"Not found: {img}")\n\n# create a dataframe \nparent_folders_df = pd.DataFrame({\n    "image_index": targeted_imgs,\n    "folders": parent_folders\n})\n\n\n\n'