In [2]:
pip install opencv-python-headless

Collecting opencv-python-headless
  Downloading opencv_python_headless-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl (55.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.9.0.80
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install albumentations

Collecting albumentations
  Downloading albumentations-1.4.2-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.24.4
  Downloading numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl (20.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.6/20.6 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting typing-extensions>=4.9.0
  Downloading typing_extensions-4.10.0-py3-none-any.whl (33 kB)
Collecting scipy>=1.10.0
  Downloading scipy-1.12.0-cp39-cp39-macosx_10_9_x86_64.whl (38.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.9/38.9 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scikit-learn>=1.3.2
  Downloading scikit_learn-1.4.1.post1-cp39-cp39-macosx_10_9_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m7.2 MB/s[0m 

In [15]:
        
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Pneumonia: 1431 images
Fibrosis: 1686 images
Subcutaneous Emphysema: 1991 images
Edema: 2303 images
Emphysema: 2516 images
Cardiomegaly: 2776 images
Pleural Thickening: 3385 images
Consolidation: 4667 images
Pneumothorax: 5302 images
Mass: 5782 images
Nodule: 6331 images
Atelectasis: 11559 images
Effusion: 13317 images
Infiltration: 19894 images
No Finding: 59406 images


In [16]:
import os
import cv2
from albumentations import (Compose, HorizontalFlip, RandomBrightnessContrast, ShiftScaleRotate, RandomGamma, GaussNoise, PadIfNeeded)
import numpy as np
from tqdm import tqdm

# Define the augmentation pipeline with safe transformations for chest X-rays
aug_pipeline = Compose([
    HorizontalFlip(p=0.5),
    RandomBrightnessContrast(p=0.1, brightness_limit=0.1, contrast_limit=0.1),
    ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.05, rotate_limit=10, p=0.5),
    RandomGamma(gamma_limit=(80, 120), p=0.1),
    GaussNoise(var_limit=(10, 50), p=0.1),
    PadIfNeeded(min_height=224, min_width=224, p=0.5)  # Padding if needed
])


def augment_image(file_path, folder_path, augmentation_index):
    # Read the image file
    image = cv2.imread(file_path)
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    
    # Augment the image
    augmented_image = aug_pipeline(image=image)['image']
    
    # Construct new filename
    new_filename = f"{base_filename}_{augmentation_index}.png"  # Change the extension if necessary
    new_image_path = os.path.join(folder_path, new_filename)
    
    # Write the augmented image to the disk
    cv2.imwrite(new_image_path, augmented_image)
    return augmented_image


                                
def augment_to_target(folder_path, target_count=10000):
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    current_count = len(image_files)
    augments_needed = target_count - current_count
    
    # If we don't need any augmentations, we can return early
    if augments_needed <= 0:
        return

    augmentation_index = 0
    while len(os.listdir(folder_path)) < target_count:
        # Randomly pick an original image to augment
        img_file = np.random.choice(image_files)
        file_path = os.path.join(folder_path, img_file)
        augment_image(file_path, folder_path, augmentation_index)
        augmentation_index += 1
                
                
                
                
                

main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment'  # Update this to your path

# Loop through each subfolder and augment images to reach 10,000 images
for subfolder in tqdm(os.listdir(main_folder_path), desc="Processing Folders"):
    subfolder_path = os.path.join(main_folder_path, subfolder)
    if os.path.isdir(subfolder_path):
        print(f"Processing: {subfolder}")
        augment_to_target(subfolder_path)


Processing Folders:   0%|                                | 0/16 [00:00<?, ?it/s]

Processing: Mass


Processing Folders:   6%|█▌                      | 1/16 [00:46<11:34, 46.32s/it]

Processing: Cardiomegaly


Processing Folders:  19%|████▌                   | 3/16 [01:54<08:03, 37.17s/it]

Processing: Atelectasis
Processing: Effusion
Processing: Pneumothorax


Processing Folders:  44%|██████████▌             | 7/16 [02:45<02:52, 19.21s/it]

Processing: No Finding
Processing: Subcutaneous Emphysema


Processing Folders:  50%|████████████            | 8/16 [03:57<04:16, 32.11s/it]

Processing: Nodule


Processing Folders:  56%|█████████████▌          | 9/16 [04:38<04:01, 34.47s/it]

Processing: Pleural Thickening


Processing Folders:  62%|██████████████▍        | 10/16 [05:43<04:16, 42.75s/it]

Processing: Edema


Processing Folders:  69%|███████████████▊       | 11/16 [06:53<04:11, 50.26s/it]

Processing: Pneumonia


Processing Folders:  75%|█████████████████▎     | 12/16 [08:08<03:48, 57.21s/it]

Processing: Emphysema


Processing Folders:  81%|██████████████████▋    | 13/16 [09:17<03:02, 60.81s/it]

Processing: Infiltration
Processing: Consolidation


Processing Folders:  94%|█████████████████████▌ | 15/16 [10:13<00:45, 45.80s/it]

Processing: Fibrosis


Processing Folders: 100%|███████████████████████| 16/16 [11:25<00:00, 42.87s/it]


In [17]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Mass: 10000 images
Cardiomegaly: 10000 images
Pneumothorax: 10000 images
Subcutaneous Emphysema: 10000 images
Nodule: 10000 images
Pleural Thickening: 10000 images
Edema: 10000 images
Pneumonia: 10000 images
Emphysema: 10000 images
Consolidation: 10000 images
Fibrosis: 10000 images
Atelectasis: 11559 images
Effusion: 13317 images
Infiltration: 19894 images
No Finding: 59406 images


In [18]:
import os
import random
from pathlib import Path
from tqdm import tqdm

# Function to reduce the number of images in a folder to 10,000
def sample_images(folder_path, target_count=10000):
    # List all image files in the folder
    images = list(Path(folder_path).glob('*.*'))
    
    # Shuffle the list of images to ensure random selection
    random.shuffle(images)
    
    # Keep the first 10,000 images, delete the rest
    for image in images[target_count:]:
        os.remove(image)

main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment' 

# Loop through each subfolder and sample images
for subfolder in tqdm(os.listdir(main_folder_path), desc="Processing Folders"):
    subfolder_path = os.path.join(main_folder_path, subfolder)
    if os.path.isdir(subfolder_path):
        sample_images(subfolder_path)


Processing Folders: 100%|███████████████████████| 16/16 [00:14<00:00,  1.08it/s]


In [19]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Mass: 10000 images
Cardiomegaly: 10000 images
Atelectasis: 10000 images
Effusion: 10000 images
Pneumothorax: 10000 images
No Finding: 10000 images
Subcutaneous Emphysema: 10000 images
Nodule: 10000 images
Pleural Thickening: 10000 images
Edema: 10000 images
Pneumonia: 10000 images
Emphysema: 10000 images
Infiltration: 10000 images
Consolidation: 10000 images
Fibrosis: 10000 images


In [1]:
pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pandas as pd
from pathlib import Path


main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment'


# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'Folder Name': subfolder_name, 'Image File Name': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'ImageData.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)


In [4]:
import pandas as pd

df = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/ImageData.xlsx')


In [6]:
df

Unnamed: 0,Folder Name,Image File Name
0,Mass,00014105_001.png
1,Mass,00013285_013_3581.png
2,Mass,00005785_000_2597.png
3,Mass,00016778_022.png
4,Mass,00019369_002.png
...,...,...
149995,Fibrosis,00006043_000_3247.png
149996,Fibrosis,00003098_012_5153.png
149997,Fibrosis,00000149_007_5731.png
149998,Fibrosis,00003834_001_6861.png


In [8]:
duplicates = df[df.duplicated(['Folder Name', 'Image File Name'], keep=False)]
print(duplicates)

Empty DataFrame
Columns: [Folder Name, Image File Name]
Index: []


In [11]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Define the paths to the main directory and the train and test directories
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment'
train_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment'
test_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment'

# Ensure the train and test folders exist
os.makedirs(train_folder_path, exist_ok=True)
os.makedirs(test_folder_path, exist_ok=True)

# Go through each folder and randomly select 80% for training and the rest for testing
for folder_name in os.listdir(main_folder_path):
    folder_path = os.path.join(main_folder_path, folder_name)
    if os.path.isdir(folder_path):  # Check if it's a directory
        # List all image files
        images = os.listdir(folder_path)
        # Split the images into train and test sets
        train_images, test_images = train_test_split(images, train_size=0.8, random_state=42)
        
        # Create and populate the train folder
        train_subfolder_path = os.path.join(train_folder_path, f'{folder_name}_train')
        os.makedirs(train_subfolder_path, exist_ok=True)
        for image_file in train_images:
            src_path = os.path.join(folder_path, image_file)
            dst_path = os.path.join(train_subfolder_path, image_file)
            shutil.copy(src_path, dst_path)
        
        # Create and populate the test folder
        test_subfolder_path = os.path.join(test_folder_path, f'{folder_name}_test')
        os.makedirs(test_subfolder_path, exist_ok=True)
        for image_file in test_images:
            src_path = os.path.join(folder_path, image_file)
            dst_path = os.path.join(test_subfolder_path, image_file)
            shutil.copy(src_path, dst_path)



In [16]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Base directory where the '_train' folders are located
base_train_dir = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/Train'

# New base directory for '_validation' folders
base_validation_dir = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/Train'
os.makedirs(base_validation_dir, exist_ok=True)

# Split ratio for validation set
validation_split = 0.2

# Process each '_train' folder to split off a validation set
for folder_name in os.listdir(base_train_dir):
    if folder_name.endswith('_train'):
        folder_path = os.path.join(base_train_dir, folder_name)
        images = os.listdir(folder_path)
        
        # Split the images into train and validation sets
        _, validation_images = train_test_split(images, test_size=validation_split, random_state=42)
        
        # Create a new corresponding '_validation' folder
        validation_folder_name = folder_name.replace('_train', '_validation')
        validation_folder_path = os.path.join(base_validation_dir, validation_folder_name)
        os.makedirs(validation_folder_path, exist_ok=True)
        
        # Move the selected validation images to the new validation folder
        for image in validation_images:
            src_path = os.path.join(folder_path, image)
            dst_path = os.path.join(validation_folder_path, image)
            shutil.move(src_path, dst_path)  # Use move to transfer files


In [19]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/Test')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Pneumothorax_test: 2000 images
Consolidation_test: 2000 images
Fibrosis_test: 2000 images
Effusion_test: 2000 images
Mass_test: 2000 images
Pneumonia_test: 2000 images
Atelectasis_test: 2000 images
Infiltration_test: 2000 images
Subcutaneous Emphysema_test: 2000 images
Edema_test: 2000 images
Pleural Thickening_test: 2000 images
Nodule_test: 2000 images
Emphysema_test: 2000 images
No Finding_test: 2000 images
Cardiomegaly_test: 2000 images


In [20]:
import os
import pandas as pd
from pathlib import Path

# Path to the main folder containing all subfolders
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/Train'  # Change this to your main folder path

# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'class': subfolder_name, 'image': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'train_data.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)


In [21]:
import os
import pandas as pd
from pathlib import Path

# Path to the main folder containing all subfolders
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/Test'  # Change this to your main folder path

# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'class': subfolder_name, 'image': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'test_data.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)

In [22]:
import os
import pandas as pd
from pathlib import Path

# Path to the main folder containing all subfolders
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/Validation'  # Change this to your main folder path

# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'class': subfolder_name, 'image': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'validation_data.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)

In [24]:
import pandas as pd

train = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/train_data.xlsx')

duplicates = train[train.duplicated(['class', 'image'], keep=False)]
print(duplicates)


Empty DataFrame
Columns: [class, image]
Index: []


In [25]:
import pandas as pd

test = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/test_data.xlsx')

duplicates = test[test.duplicated(['class', 'image'], keep=False)]
print(duplicates)

Empty DataFrame
Columns: [class, image]
Index: []


In [27]:
import pandas as pd

val = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Files-to-augment/validation_data.xlsx')

duplicates = val[val.duplicated(['class', 'image'], keep=False)]
print(duplicates)

Empty DataFrame
Columns: [class, image]
Index: []


In [31]:
image_counts = val['class'].value_counts()
print(image_counts) 

Mass                      1600
Cardiomegaly              1600
Atelectasis               1600
Effusion                  1600
Pneumothorax              1600
No Finding                1600
Subcutaneous Emphysema    1600
Nodule                    1600
Pleural Thickening        1600
Edema                     1600
Pneumonia                 1600
Emphysema                 1600
Infiltration              1600
Consolidation             1600
Fibrosis                  1600
Name: class, dtype: int64
