# Data Extraction
---

In [56]:
# import libraries
import pandas as pd 
import kagglehub # import data via Kaggle API
import os

import warnings
warnings.filterwarnings("ignore")

In [57]:
# Download data from Kaggle API to local machine
path = kagglehub.dataset_download("nih-chest-xrays/data")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\reala\.cache\kagglehub\datasets\nih-chest-xrays\data\versions\3


In [58]:
# extract bounding box coordinates
bbox_df = pd.read_csv(os.path.join(path, 'BBox_List_2017.csv'))
bbox_df.head()

Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h],Unnamed: 6,Unnamed: 7,Unnamed: 8
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441,,,
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525,,,
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153,,,
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034,,,
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695,,,


In [59]:
# extract data entry file
data_entry_df = pd.read_csv(os.path.join(path,'Data_Entry_2017.csv'))
data_entry_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


# Data Cleaning
---

In [60]:
# removed unnesscary columns and reformated column names

bbox_df = bbox_df.drop(["Unnamed: 6", "Unnamed: 7", "Unnamed: 8"], axis=1) # removed NaN columns

# reformate columns (lowercase, no spaces)
bbox_df.columns = bbox_df.columns.tolist()

# renamed bbox coordinate columns
bbox_df.columns = ["image_index", "finding_label","x_min", "y_min", "x_max", "y_max"]

bbox_df.head()

Unnamed: 0,image_index,finding_label,x_min,y_min,x_max,y_max
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695


In [61]:
# checking basic characteristic of the dataset (na values, null values, duplicated values, and checked data type of each column)

print(f'Checking for NA values: \n {bbox_df.isna().sum()}')
print(f'Checking for Null values: \n {bbox_df.isnull().sum()}')
print(f'Checking for duplicated values: \n {bbox_df.duplicated().sum()}')
print(f'Checking data type of each column: \n {bbox_df.info()}')

Checking for NA values: 
 image_index      0
finding_label    0
x_min            0
y_min            0
x_max            0
y_max            0
dtype: int64
Checking for Null values: 
 image_index      0
finding_label    0
x_min            0
y_min            0
x_max            0
y_max            0
dtype: int64
Checking for duplicated values: 
 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   image_index    984 non-null    object 
 1   finding_label  984 non-null    object 
 2   x_min          984 non-null    float64
 3   y_min          984 non-null    float64
 4   x_max          984 non-null    float64
 5   y_max          984 non-null    float64
dtypes: float64(4), object(2)
memory usage: 46.3+ KB
Checking data type of each column: 
 None


In [62]:
# removed unnamed column

data_entry_df = data_entry_df.drop(columns=["Unnamed: 11"],axis=1)

# rename columns 
data_entry_df.columns = data_entry_df.columns.tolist() 
data_entry_df.columns = ["image_index", "finding_labels", "follow_up_number", "patient_id", "patient_age", "patient_gender", "view_position", "original_img_width", "original_img_height", "img_pixel_spacing_x", "img_pixel_spacing_y"]
data_entry_df.head()

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143


In [63]:
# checking basic characteristic of the dataset (na values, null values, duplicated values, and checked data type of each column)

print(f'Checking for NA values: \n {data_entry_df.isna().sum()}')
print(f'Checking for Null values: \n {data_entry_df.isnull().sum()}')
print(f'Checking for duplicated values: \n {data_entry_df.duplicated().sum()}')
print(f'Checking data type of each column: \n {data_entry_df.info()}')

Checking for NA values: 
 image_index            0
finding_labels         0
follow_up_number       0
patient_id             0
patient_age            0
patient_gender         0
view_position          0
original_img_width     0
original_img_height    0
img_pixel_spacing_x    0
img_pixel_spacing_y    0
dtype: int64
Checking for Null values: 
 image_index            0
finding_labels         0
follow_up_number       0
patient_id             0
patient_age            0
patient_gender         0
view_position          0
original_img_width     0
original_img_height    0
img_pixel_spacing_x    0
img_pixel_spacing_y    0
dtype: int64
Checking for duplicated values: 
 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112120 entries, 0 to 112119
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   image_index          112120 non-null  object 
 1   finding_labels       112120 non-null  object 
 2   follow_up_numb

In [64]:
data_entry_df

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168


# Data Splitting
---

In [65]:
labeled_df = pd.merge(bbox_df, data_entry_df, on="image_index", how="inner")
labeled_df.head()

Unnamed: 0,image_index,finding_label,x_min,y_min,x_max,y_max,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441,Atelectasis,8,13118,69,M,PA,2992,2991,0.143,0.143
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525,Atelectasis|Effusion|Mass,7,14716,53,M,AP,3056,2544,0.139,0.139
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153,Atelectasis,9,29817,67,F,AP,3056,2544,0.139,0.139
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034,Atelectasis|Cardiomegaly|Consolidation,1,14687,51,M,AP,2500,2048,0.168,0.168
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695,Atelectasis,1,17877,75,F,AP,2500,2048,0.168,0.168


In [None]:
# extracted the targeted images
targeted_imgs = list(data_entry_df["image_index"])

# extract only the images folders paths from kaggle and store it in a list
folder_paths = []

for folder in os.listdir(path):

    if folder.startswith("images") == True:
        f_path = os.path.join(path, folder, "images")
        folder_paths.append(f_path)


# search folders based on targeted images
parent_folders = []

for img in targeted_imgs:
    found = False
    for folder in folder_paths:
        if img in os.listdir(folder):
            parent_folders.append(os.path.basename(os.path.dirname(folder)))
            found = True
            break  
    if not found:
        print(f"Not found: {img}")

# create a dataframe 
parent_folders_df = pd.DataFrame({
    "image_index": targeted_imgs,
    "folders": parent_folders
})

KeyboardInterrupt: 

In [71]:
parent_folders_df.duplicated().sum()

0

In [72]:
image_data_df = pd.merge(data_entry_df, parent_folders_df, on="image_index", how="inner")
image_data_df = labeled_df.drop_duplicates(subset="image_index")
image_data_df.head()

Unnamed: 0,image_index,finding_label,x_min,y_min,x_max,y_max,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441,Atelectasis,8,13118,69,M,PA,2992,2991,0.143,0.143
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525,Atelectasis|Effusion|Mass,7,14716,53,M,AP,3056,2544,0.139,0.139
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153,Atelectasis,9,29817,67,F,AP,3056,2544,0.139,0.139
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034,Atelectasis|Cardiomegaly|Consolidation,1,14687,51,M,AP,2500,2048,0.168,0.168
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695,Atelectasis,1,17877,75,F,AP,2500,2048,0.168,0.168
