# Data collection
We use the NIH Chest X-rays dataset (ChestX-ray14) for this project:
- Dataset of 14 Common Thorax Disease Categories.
- Publisher: National Institutes of Health Chest X-Ray Dataset and Chris Crawford
- Task type: multi-label classification (one image can carry several findings, separated by `|` in the label column)


In [3]:
import pandas as pd
from glob import glob
import os

In [4]:
# Root folder of the extracted dataset, relative to the notebook's working
# directory. It is built with os.path.join instead of a hard-coded backslash
# so the separator matches the host OS; on Windows the resulting string is
# still '.\chestXray14'.
chest_Xray_path = os.path.join(os.curdir, 'chestXray14')

# List the top-level entries of the dataset folder as a quick sanity check
# that the download is where the notebook expects it.
chest_Xray_dir_list = os.listdir(chest_Xray_path)
print(chest_Xray_dir_list)

['ARXIV_V5_CHESTXRAY.pdf', 'BBox_List_2017.csv', 'Data_Entry_2017.csv', 'FAQ_CHESTXRAY.pdf', 'images_001', 'images_002', 'images_003', 'images_004', 'images_005', 'images_006', 'images_007', 'images_008', 'images_009', 'images_010', 'images_011', 'images_012', 'LOG_CHESTXRAY.pdf', 'README_CHESTXRAY.pdf', 'test_list.txt', 'train_val_list.txt']


In [5]:
# Load the per-image metadata table shipped with the dataset and preview
# its first five rows.
data_entry_path = os.path.join(chest_Xray_path, 'Data_Entry_2017.csv')
data_entry_df = pd.read_csv(filepath_or_buffer=data_entry_path)
print(data_entry_df.head(n=5))

        Image Index          Finding Labels  Follow-up #  Patient ID  \
0  00000001_000.png            Cardiomegaly            0           1   
1  00000001_001.png  Cardiomegaly|Emphysema            1           1   
2  00000001_002.png   Cardiomegaly|Effusion            2           1   
3  00000002_000.png              No Finding            0           2   
4  00000003_000.png                  Hernia            0           3   

   Patient Age Patient Gender View Position  OriginalImage[Width  Height]  \
0           58              M            PA                 2682     2749   
1           58              M            PA                 2894     2729   
2           58              M            PA                 2500     2048   
3           81              M            PA                 2500     2048   
4           81              F            PA                 2582     2991   

   OriginalImagePixelSpacing[x     y]  Unnamed: 11  
0                        0.143  0.143          NaN 

In [6]:
# Collect the path of every PNG scan under the dataset root.
pattern = os.path.join(
    chest_Xray_path,
    'images*',  # top-level folders whose names start with "images"
    '*',        # any subfolder inside each of them
    '*.png',    # the scans themselves
)
all_image_paths = glob(pattern)

# Compare the number of files on disk with the number of metadata rows;
# equal counts are a first hint (not a proof) that nothing is missing.
print('Scans found:', len(all_image_paths), ', Total Headers', len(data_entry_df.index))

Scans found: 112120 , Total Headers 112120


In [7]:
# Lookup table: bare file name -> path found on disk. If the same file name
# occurs more than once, the path seen last wins.
image_path_dict = dict(zip(map(os.path.basename, all_image_paths), all_image_paths))

# Attach the on-disk path to each metadata row by matching on the file name.
# NOTE: this adds a column to data_entry_df in place.
data_entry_df['path'] = data_entry_df['Image Index'].map(image_path_dict)

# Show three random rows to eyeball the new column.
data_entry_df.sample(n=3)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
54996,00013774_023.png,Atelectasis|Infiltration,23,13774,77,F,AP,2500,2048,0.168,0.168,,.\chestXray14\images_006\images\00013774_023.png
4265,00001162_000.png,Pneumonia,0,1162,82,M,AP,2500,2048,0.168,0.168,,.\chestXray14\images_001\images\00001162_000.png
88428,00021890_000.png,No Finding,0,21890,50,M,PA,2992,2991,0.143,0.143,,.\chestXray14\images_010\images\00021890_000.png


In [11]:
# Keep only rows that have both a label string and a resolved image path,
# and expose the label string under the shorter column name 'labels'.
# .assign returns a new frame, so no column is written into the result of
# dropna by item assignment (the pattern that can trigger pandas'
# SettingWithCopyWarning).
xray14_df = (
    data_entry_df
    .dropna(subset=['Finding Labels', 'path'])
    .assign(labels=lambda frame: frame['Finding Labels'])
)

# Final two-column table (image path, label string) with a fresh 0..n-1 index.
dataset_df = xray14_df[['path', 'labels']].reset_index(drop=True)

# Persist the table; index=False keeps the row index out of the CSV.
dataset_df.to_csv('dataset.csv', index=False)

print(dataset_df.shape)
print(dataset_df.head())

(112120, 2)
                                               path                  labels
0  .\chestXray14\images_001\images\00000001_000.png            Cardiomegaly
1  .\chestXray14\images_001\images\00000001_001.png  Cardiomegaly|Emphysema
2  .\chestXray14\images_001\images\00000001_002.png   Cardiomegaly|Effusion
3  .\chestXray14\images_001\images\00000002_000.png              No Finding
4  .\chestXray14\images_001\images\00000003_000.png                  Hernia
