In [1]:
pip install opencv-python-headless

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install -U albumentations

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd

# Path to your CSV file
csv_file_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/filtered_images.csv'  # Replace with the path to your CSV file

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)
df

In [None]:
import shutil
import os

# List of disease folders to delete
diseases_to_delete = [
    'Emphysema', 'Edema', 'Subcutaneous Emphysema', 'Fibrosis',
    'Pneumonia', 'Tortuous Aorta', 'Calcification of the Aorta',
    'Pneumoperitoneum', 'Pneumomediastinum', 'Hernia'
]

# Root directory where the disease folders are located
root_directory = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images'  # Replace with the path to your folders

# Delete the specified folders and their contents
for disease in diseases_to_delete:
    folder_path = os.path.join(root_directory, disease)
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        print(f"Deleted folder: {folder_path}")
    else:
        print(f"Folder not found, skipped: {folder_path}")

# Assuming df is your DataFrame and it contains a column 'disease' that matches the folder names
# If the disease names are not an exact match, you will need to adjust the filtering accordingly


# Filter out the rows for each of the diseases to delete
for disease in diseases_to_delete:
    df = df[df[disease] != 1]  # Filtering rows where disease column is not equal to 1


In [None]:
# Drop the columns corresponding to the deleted diseases
df_dropped = df.drop(columns=diseases_to_delete, errors='ignore')

In [None]:
df_dropped

In [None]:
# Now save the updated DataFrame to a CSV file
updated_csv_file_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/final10.csv'  # Replace with your desired path
df_dropped.to_csv(updated_csv_file_path, index=False)

print(f"Updated DataFrame saved to {updated_csv_file_path}")

In [None]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

In [None]:
import os

root_directory = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images'  # Replace with the path to your folders

# Supported image file extensions
image_extensions = {'.jpg', '.jpeg', '.png', '.gif'}

# Initialize a counter
total_images = 0

# Walk through all directories and files in the root directory
for root, dirs, files in os.walk(root_directory):
    # Count only files with supported image extensions
    total_images += sum(file.endswith(ext) for file in files for ext in image_extensions)

print(f"Total number of images in all folders: {total_images}")

In [3]:
import numpy as np

In [4]:
import os
import cv2
from tqdm import tqdm

from albumentations import Compose, HorizontalFlip, RandomBrightnessContrast, ShiftScaleRotate, RandomGamma, GaussNoise, PadIfNeeded, ElasticTransform, CLAHE, RandomCrop
          
aug_pipeline = Compose([
    HorizontalFlip(p=0.5),  # Flips the image horizontally with a 50% chance, useful for simulating patient orientation variations.
    RandomBrightnessContrast(p=0.1, brightness_limit=0.1, contrast_limit=0.1),  # Subtly adjusts brightness/contrast to mimic exposure variations without obscuring details.
    ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.05, rotate_limit=10, p=0.5),  # Slightly shifts, scales, and rotates images to simulate patient movement and positioning, avoiding extreme distortions.
    RandomGamma(gamma_limit=(80, 120), p=0.1),  # Adjusts the gamma to introduce variations in image brightness akin to different acquisition settings, ensuring the changes remain realistic.
    GaussNoise(var_limit=(10, 50), p=0.1),  # Adds Gaussian noise to simulate electronic noise in X-ray acquisition, with a careful limit to prevent artificial appearance.
    CLAHE(clip_limit=2, tile_grid_size=(8, 8), p=0.2),  # Enhances local contrast in a way that is adaptive to different regions of the image, improving visibility of important features without overemphasizing noise.
])

def augment_image(file_path, folder_path, augmentation_index):
    # Read the image file
    image = cv2.imread(file_path)
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    
    # Augment the image
    augmented_image = aug_pipeline(image=image)['image']
    
    # Construct new filename
    new_filename = f"{base_filename}_{augmentation_index}.png"  # Change the extension if necessary
    new_image_path = os.path.join(folder_path, new_filename)
    
    # Write the augmented image to the disk
    cv2.imwrite(new_image_path, augmented_image)
    return augmented_image


                                
def augment_to_target(folder_path, target_count=10000):
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    current_count = len(image_files)
    augments_needed = target_count - current_count
    
    # If we don't need any augmentations, we can return early
    if augments_needed <= 0:
        return

    augmentation_index = 0
    while len(os.listdir(folder_path)) < target_count:
        # Randomly pick an original image to augment
        img_file = np.random.choice(image_files)
        file_path = os.path.join(folder_path, img_file)
        augment_image(file_path, folder_path, augmentation_index)
        augmentation_index += 1
                

main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images'  # Update this to your path

# Loop through each subfolder and augment images to reach 10,000 images
for subfolder in tqdm(os.listdir(main_folder_path), desc="Processing Folders"):
    subfolder_path = os.path.join(main_folder_path, subfolder)
    if os.path.isdir(subfolder_path):
        print(f"Processing: {subfolder}")
        augment_to_target(subfolder_path)


Processing Folders:   0%|                                | 0/11 [00:00<?, ?it/s]

Processing: Mass


Processing Folders:   9%|██▏                     | 1/11 [01:12<12:02, 72.22s/it]

Processing: Cardiomegaly


Processing Folders:  27%|██████▌                 | 3/11 [02:27<06:11, 46.46s/it]

Processing: Atelectasis


Processing Folders:  36%|████████▋               | 4/11 [03:27<05:59, 51.33s/it]

Processing: Effusion


Processing Folders:  45%|██████████▉             | 5/11 [04:29<05:28, 54.76s/it]

Processing: Pneumothorax


Processing Folders:  55%|█████████████           | 6/11 [05:41<05:01, 60.31s/it]

Processing: No Finding


Processing Folders:  64%|███████████████▎        | 7/11 [05:41<02:45, 41.46s/it]

Processing: Nodule


Processing Folders:  73%|█████████████████▍      | 8/11 [06:51<02:31, 50.34s/it]

Processing: Pleural Thickening


Processing Folders:  82%|███████████████████▋    | 9/11 [08:09<01:57, 58.63s/it]

Processing: Infiltration


Processing Folders:  91%|████████████████████▉  | 10/11 [08:16<00:42, 42.94s/it]

Processing: Consolidation


Processing Folders: 100%|███████████████████████| 11/11 [09:31<00:00, 51.93s/it]


In [5]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Mass: 10000 images
Cardiomegaly: 10000 images
Atelectasis: 10000 images
Effusion: 10000 images
Pneumothorax: 10000 images
Nodule: 10000 images
Pleural Thickening: 10000 images
Infiltration: 10000 images
Consolidation: 10000 images
No Finding: 59406 images


In [6]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Mass: 10000 images
Cardiomegaly: 10000 images
Atelectasis: 10000 images
Effusion: 10000 images
Pneumothorax: 10000 images
Nodule: 10000 images
Pleural Thickening: 10000 images
Infiltration: 10000 images
Consolidation: 10000 images
No Finding: 59406 images


In [8]:
import os
import random
from pathlib import Path
from tqdm import tqdm

# Function to reduce the number of images in a folder to 10,000
def sample_images(folder_path, target_count=10000):
    # List all image files in the folder
    images = list(Path(folder_path).glob('*.*'))
    
    # Shuffle the list of images to ensure random selection
    random.shuffle(images)
    
    # Keep the first 10,000 images, delete the rest
    for image in images[target_count:]:
        os.remove(image)

main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images' 

# Loop through each subfolder and sample images
for subfolder in tqdm(os.listdir(main_folder_path), desc="Processing Folders"):
    subfolder_path = os.path.join(main_folder_path, subfolder)
    if os.path.isdir(subfolder_path):
        sample_images(subfolder_path)


Processing Folders: 100%|███████████████████████| 11/11 [00:12<00:00,  1.17s/it]


In [9]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Mass: 10000 images
Cardiomegaly: 10000 images
Atelectasis: 10000 images
Effusion: 10000 images
Pneumothorax: 10000 images
No Finding: 10000 images
Nodule: 10000 images
Pleural Thickening: 10000 images
Infiltration: 10000 images
Consolidation: 10000 images


In [10]:
pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import pandas as pd
from pathlib import Path


main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images'


# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'Folder Name': subfolder_name, 'Image File Name': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'AugmentExcel.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)


In [14]:
import pandas as pd

df = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/AugmentExcel.xlsx')


In [15]:
df

Unnamed: 0,Folder Name,Image File Name
0,Mass,00014105_001.png
1,Mass,00016778_022.png
2,Mass,00019369_002.png
3,Mass,00012342_003.png
4,Mass,00018663_002.png
...,...,...
99995,Consolidation,00008397_003_723.png
99996,Consolidation,00010805_002_8557.png
99997,Consolidation,00016729_002_1396.png
99998,Consolidation,00013310_011_7883.png


In [16]:
duplicates = df[df.duplicated(['Folder Name', 'Image File Name'], keep=False)]
print(duplicates)

Empty DataFrame
Columns: [Folder Name, Image File Name]
Index: []


In [17]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Define the paths to the main directory and the train and test directories
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images'
train_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images'
test_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images'

# Ensure the train and test folders exist
os.makedirs(train_folder_path, exist_ok=True)
os.makedirs(test_folder_path, exist_ok=True)

# Go through each folder and randomly select 80% for training and the rest for testing
for folder_name in os.listdir(main_folder_path):
    folder_path = os.path.join(main_folder_path, folder_name)
    if os.path.isdir(folder_path):  # Check if it's a directory
        # List all image files
        images = os.listdir(folder_path)
        # Split the images into train and test sets
        train_images, test_images = train_test_split(images, train_size=0.8, random_state=42)
        
        # Create and populate the train folder
        train_subfolder_path = os.path.join(train_folder_path, f'{folder_name}_train')
        os.makedirs(train_subfolder_path, exist_ok=True)
        for image_file in train_images:
            src_path = os.path.join(folder_path, image_file)
            dst_path = os.path.join(train_subfolder_path, image_file)
            shutil.copy(src_path, dst_path)
        
        # Create and populate the test folder
        test_subfolder_path = os.path.join(test_folder_path, f'{folder_name}_test')
        os.makedirs(test_subfolder_path, exist_ok=True)
        for image_file in test_images:
            src_path = os.path.join(folder_path, image_file)
            dst_path = os.path.join(test_subfolder_path, image_file)
            shutil.copy(src_path, dst_path)



In [18]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Base directory where the '_train' folders are located
base_train_dir = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/Train'

# New base directory for '_validation' folders
base_validation_dir = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/Train'
os.makedirs(base_validation_dir, exist_ok=True)

# Split ratio for validation set
validation_split = 0.2

# Process each '_train' folder to split off a validation set
for folder_name in os.listdir(base_train_dir):
    if folder_name.endswith('_train'):
        folder_path = os.path.join(base_train_dir, folder_name)
        images = os.listdir(folder_path)
        
        # Split the images into train and validation sets
        _, validation_images = train_test_split(images, test_size=validation_split, random_state=42)
        
        # Create a new corresponding '_validation' folder
        validation_folder_name = folder_name.replace('_train', '_validation')
        validation_folder_path = os.path.join(base_validation_dir, validation_folder_name)
        os.makedirs(validation_folder_path, exist_ok=True)
        
        # Move the selected validation images to the new validation folder
        for image in validation_images:
            src_path = os.path.join(folder_path, image)
            dst_path = os.path.join(validation_folder_path, image)
            shutil.move(src_path, dst_path)  # Use move to transfer files


In [19]:
import os
from pathlib import Path

# Path to the directory where the class-specific folders are saved
output_directory = Path('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/Test')

# Dictionary to hold the count of images in each class directory
class_image_counts = {}

# Iterate over each class directory in the output directory
for class_directory in output_directory.iterdir():
    if class_directory.is_dir():  # Check if it is a directory
        # Count the number of files in the directory
        class_image_count = len(list(class_directory.glob('*.*')))
        # Add the count to the dictionary
        class_image_counts[class_directory.name] = class_image_count

# Sort the classes by count
sorted_class_image_counts = dict(sorted(class_image_counts.items(), key=lambda item: item[1]))

# Print the sorted counts for each class
for class_name, count in sorted_class_image_counts.items():
    print(f'{class_name}: {count} images')

Pneumothorax_test: 2000 images
Consolidation_test: 2000 images
Effusion_test: 2000 images
Mass_test: 2000 images
Atelectasis_test: 2000 images
Infiltration_test: 2000 images
Pleural Thickening_test: 2000 images
Nodule_test: 2000 images
No Finding_test: 2000 images
Cardiomegaly_test: 2000 images


In [20]:
import os
import pandas as pd
from pathlib import Path

# Path to the main folder containing all subfolders
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/Train'  # Change this to your main folder path

# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'class': subfolder_name, 'image': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'train_data.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)


In [21]:
import os
import pandas as pd
from pathlib import Path

# Path to the main folder containing all subfolders
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/Test'  # Change this to your main folder path

# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'class': subfolder_name, 'image': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'test_data.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)

In [22]:
import os
import pandas as pd
from pathlib import Path

# Path to the main folder containing all subfolders
main_folder_path = '/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/Validation'  # Change this to your main folder path

# Prepare a list to store the data
data = []

# Iterate through each subfolder and collect image names
for subfolder_name in os.listdir(main_folder_path):
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    if os.path.isdir(subfolder_path):
        # Get all image files in the folder
        image_files = os.listdir(subfolder_path)
        # Add the folder name and image file name to the data list
        for image_file in image_files:
            data.append({'class': subfolder_name, 'image': image_file})

# Create a DataFrame with the data
df = pd.DataFrame(data)

# Define the path for the Excel file to be saved
excel_path = os.path.join(main_folder_path, 'validation_data.xlsx')

# Write the DataFrame to an Excel file
df.to_excel(excel_path, index=False)

In [23]:
import pandas as pd

train = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/train_data.xlsx')

duplicates = train[train.duplicated(['class', 'image'], keep=False)]
print(duplicates)


Empty DataFrame
Columns: [class, image]
Index: []


In [24]:
import pandas as pd

test = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/test_data.xlsx')

duplicates = test[test.duplicated(['class', 'image'], keep=False)]
print(duplicates)

Empty DataFrame
Columns: [class, image]
Index: []


In [25]:
import pandas as pd

val = pd.read_excel('/Users/saikiranreddyvellanki/Documents/CapstoneProject/Images/validation_data.xlsx')

duplicates = val[val.duplicated(['class', 'image'], keep=False)]
print(duplicates)

Empty DataFrame
Columns: [class, image]
Index: []


In [28]:
image_counts = train['class'].value_counts()
print(image_counts) 

Pneumothorax_test          2000
Consolidation_test         2000
Effusion_test              2000
Mass_test                  2000
Atelectasis_test           2000
Infiltration_test          2000
Pleural Thickening_test    2000
Nodule_test                2000
No Finding_test            2000
Cardiomegaly_test          2000
Name: class, dtype: int64


In [None]:
image_counts = test['class'].value_counts()
print(image_counts) 

In [29]:
image_counts = val['class'].value_counts()
print(image_counts) 

Consolidation_validation         1600
Effusion_validation              1600
Infiltration_validation          1600
Nodule_validation                1600
Cardiomegaly_validation          1600
Pleural Thickening_validation    1600
No Finding_validation            1600
Mass_validation                  1600
Atelectasis_validation           1600
Pneumothorax_validation          1600
Name: class, dtype: int64
