In this notebook we create a train folder divided into two classes: `No_finding` and `Lung_disease`. Since the dataset contains many "No Finding" images, we first want to build a model that can recognize whether a lung is healthy or not, and only then try to detect the specific disease.

In [1]:
import pandas as pd

In [2]:
# Load the Data_Entry_2017 metadata CSV into a dataframe.
Data_entry = pd.read_csv(
    r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\0_Chest_X_rays_data\Data_Entry_2017.csv'
)
# Display the ten most frequent values of the 'Finding Labels' column.
Data_entry['Finding Labels'].value_counts().head(10)

Finding Labels
No Finding                  60361
Infiltration                 9547
Atelectasis                  4215
Effusion                     3955
Nodule                       2705
Pneumothorax                 2194
Mass                         2139
Effusion|Infiltration        1603
Atelectasis|Infiltration     1350
Consolidation                1310
Name: count, dtype: int64

In [3]:
# Labels that are grouped together as the "lung disease" class
Disease_labels = ['Infiltration', 'Effusion', 'Atelectasis', 'Nodule', 'Pneumothorax', 'Mass']

# Split the metadata by exact match on 'Finding Labels': rows whose label is one of
# the diseases above, and rows labelled 'No Finding'. Combined labels such as
# 'Effusion|Infiltration' match neither mask.
is_lung_disease = Data_entry['Finding Labels'].isin(Disease_labels)
is_no_finding = Data_entry['Finding Labels'].eq('No Finding')
Lung_diseases = Data_entry.loc[is_lung_disease]
No_findings = Data_entry.loc[is_no_finding]

# Creating train dataset

In [4]:
import os 
test_path = r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Test'
valid_path = r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Validation'

# File names of the images already placed in the Test and Validation folders.
# They are collected here so that they can be excluded from the train data.
test_valid_disease_images = []
for disease in Disease_labels:
    # Each disease has its own sub-folder under both the test and the validation folder.
    images_for_disease = (
        os.listdir(os.path.join(test_path, disease))
        + os.listdir(os.path.join(valid_path, disease))
    )
    # extend() grows the list in place instead of rebuilding it on every iteration
    test_valid_disease_images.extend(images_for_disease)
    print(f'found {len(images_for_disease)} {disease} images')

test_valid_no_finding_images = (
    os.listdir(os.path.join(test_path, 'No_finding'))
    + os.listdir(os.path.join(valid_path, 'No_finding'))
)
print(f'found a total of {len(test_valid_no_finding_images)} no finding images')
print(f'found a total of {len(test_valid_disease_images)} for lung diseases')

found 580 Infiltration images
found 580 Effusion images
found 580 Atelectasis images
found 580 Nodule images
found 580 Pneumothorax images
found 580 Mass images
found a total of 580 no finding images
found a total of 3480 for lung diseases


In [5]:
# Remember how many rows each dataframe has at this point, so that a later cell
# can report how many rows were filtered out.
total_num_lung_diseases, total_num_no_findings = Lung_diseases.shape[0], No_findings.shape[0]
print(f'there is a total of {total_num_lung_diseases} lung diseases images')
print(f'there is a total of {total_num_no_findings} no finding images')

there is a total of 24755 lung diseases images
there is a total of 60361 no finding images


In [6]:
# Keep only the rows whose image is not already in the test or validation folders
held_out_disease = Lung_diseases['Image Index'].isin(test_valid_disease_images)
held_out_no_finding = No_findings['Image Index'].isin(test_valid_no_finding_images)
Lung_diseases = Lung_diseases.loc[~held_out_disease]
No_findings = No_findings.loc[~held_out_no_finding]

# Report how many rows were dropped, relative to the counts recorded earlier
removed_disease = total_num_lung_diseases - len(Lung_diseases)
removed_no_finding = total_num_no_findings - len(No_findings)
print(f'{removed_disease} have been removed from the lung diseases dataframe')
print(f'{removed_no_finding} have been removed from the no finding dataframe')

3480 have been removed from the lung diseases dataframe
580 have been removed from the no finding dataframe


In [7]:
# Print the number of rows currently in each dataframe
for frame, description in ((Lung_diseases, 'lung diseases'), (No_findings, 'no finding')):
    print(f'there is a total of {len(frame)} {description} images')

there is a total of 21275 lung diseases images
there is a total of 59781 no finding images


In [47]:
import shutil

origin_directory = r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\0_Chest_X_rays_data\images'
destination_directory_train = r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Train_no_finding_disease'

# Balance the two classes: use as many 'No_finding' images as there are lung-disease
# images, instead of a hard-coded count that silently goes stale when the data changes.
n_train_per_class = len(Lung_diseases)

# Image file names to copy into each class sub-folder of the train directory.
# .iloc makes the positional slice explicit (take the first n rows).
train_images_by_label = {
    'No_finding': No_findings['Image Index'].iloc[:n_train_per_class],
    'Lung_disease': Lung_diseases['Image Index'],
}

# creating the train folder
for label, image_names in train_images_by_label.items():
    # The trailing '' keeps a trailing separator on the directory, as in the printed message
    destination_directory = os.path.join(destination_directory_train, label, '')
    # Creates destination directory if it doesn't exist
    os.makedirs(destination_directory, exist_ok=True)
    # Copying train images to the label's train directory
    for image_name in image_names:
        shutil.copy(
            os.path.join(origin_directory, image_name),
            os.path.join(destination_directory, image_name),
        )

    # The count comes from the folder content, so it includes any file already present there
    print(f"Copiati {len(os.listdir(destination_directory))} file della classe " + label + " nella cartella " + destination_directory)

Copiati 21275 file della classe No_finding nella cartella C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Train_no_finding_disease\No_finding\
Copiati 21275 file della classe Lung_disease nella cartella C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Train_no_finding_disease\Lung_disease\


# Creating validation dataset

In [None]:
# Sanity check: number of files in each class sub-folder of the train directory.
# os.path.join replaces the hand-written '//' separator used nowhere else in the notebook.
for label, description in (('No_finding', 'no finding'), ('Lung_disease', 'lung disease')):
    n_files = len(os.listdir(os.path.join(destination_directory_train, label)))
    print(f"There are {n_files} in the {description} folder")

In [12]:
valid_path = r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Validation'

# finding all the images in the validation folders (one sub-folder per disease)
valid_disease_images = []
for disease in Disease_labels:
    images_for_disease = os.listdir(os.path.join(valid_path, disease))
    valid_disease_images.extend(images_for_disease)
    print(f'found {len(images_for_disease)} {disease} images')

# Skip the 'No_finding' rows already used for the train folder (one per lung-disease
# image) and take as many of the following rows as there are validation disease
# images, so that the two validation classes have the same size.
n_train_no_finding = len(Lung_diseases)
n_valid_no_finding = len(valid_disease_images)
valid_no_finding_images = list(
    No_findings['Image Index'].iloc[n_train_no_finding:n_train_no_finding + n_valid_no_finding]
)

# Each message now reports the list it names (the two labels were swapped before)
print(f'found a total of {len(valid_no_finding_images)} no finding images')
print(f'found a total of {len(valid_disease_images)} for lung diseases')


found 290 Infiltration images
found 290 Effusion images
found 290 Atelectasis images
found 290 Nodule images
found 290 Pneumothorax images
found 290 Mass images
found a total of 1740 no finding images
found a total of 1740 for lung diseases


In [13]:
import shutil

origin_directory = r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\0_Chest_X_rays_data\images'
destination_directory_valid = r'C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Valid_no_finding_disease'

# Image file names to copy into each class sub-folder of the validation directory
valid_images_by_label = {
    'No_finding': valid_no_finding_images,
    'Lung_disease': valid_disease_images,
}

# creating the valid folder
for label, image_names in valid_images_by_label.items():
    destination_directory = destination_directory_valid + '\\' + label + '\\'
    # Make sure the class folder exists before copying into it
    os.makedirs(destination_directory, exist_ok=True)
    # Copy every image of this class from the source image folder
    for image_name in image_names:
        shutil.copy(
            os.path.join(origin_directory, image_name),
            os.path.join(destination_directory, image_name),
        )

    print(f"Copiati {len(os.listdir(destination_directory))} file della classe " + label + " nella cartella " + destination_directory)

Copiati 1740 file della classe No_finding nella cartella C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Valid_no_finding_disease\No_finding\
Copiati 1740 file della classe Lung_disease nella cartella C:\Users\Hp\Documents\Machine_learning_projects\Chest_X_rays\1_Datasets\Valid_no_finding_disease\Lung_disease\
