Step.1: Copying and renaming the DICOM files

In [1]:
import os
import shutil

# Define the paths
input_folder = r"C:\Users\User\Desktop\compitission\competition_dataset"  # Main folder containing one sub-folder per patient
output_folder = r"C:\Users\User\Desktop\compitission\1"  # Flat folder that receives the renamed copies

# Create the output folder; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Loop through each patient folder (sorted so the copy order is reproducible)
for patient_id in sorted(os.listdir(input_folder)):
    patient_folder = os.path.join(input_folder, patient_id)

    # Ignore stray files that sit next to the patient folders
    if not os.path.isdir(patient_folder):
        continue

    # Loop through each image file inside the patient folder
    for filename in sorted(os.listdir(patient_folder)):
        # Match the extension case-insensitively so that ".DCM" files
        # are not silently left out of the dataset
        if not filename.lower().endswith(".dcm"):
            continue

        # Mammogram type (RCC, LCC, RMLO, LMLO) is the part of the
        # filename before the first dot
        mammogram_type = filename.split('.')[0]

        # New name is "<patient folder name>_<mammogram type>.dcm"
        new_filename = f"{patient_id}_{mammogram_type}.dcm"

        # Source and destination paths
        src = os.path.join(patient_folder, filename)
        dst = os.path.join(output_folder, new_filename)

        # Copy the file under its new name (an existing destination is overwritten)
        shutil.copy(src, dst)

print("Files have been copied and renamed successfully!")


Files have been copied and renamed successfully!


Step.2: Converting DICOM to PNG

In [2]:
import pydicom
from PIL import Image
import numpy as np
import os
import pandas as pd

def dicom_to_png_conversion(dicom_folder, output_folder):
    """Convert every DICOM file under ``dicom_folder`` to an 8-bit PNG.

    The folder tree is walked recursively and its sub-folder layout is
    mirrored under ``output_folder``; each ``<name>.dcm`` is written as
    ``<name>.png``. Pixel values are min-max scaled per image to 0-255.
    A file that cannot be read or converted is reported with ``print`` and
    skipped, so one bad file does not stop the batch.

    Args:
        dicom_folder: Root folder that is searched for ``.dcm`` files
            (extension matched case-insensitively).
        output_folder: Root folder for the PNG files; created if missing.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through all files in the dicom folder
    for root, _dirs, files in os.walk(dicom_folder):
        # relpath yields "." for the top-level folder; normpath collapses it
        # so the saved paths do not contain a stray "\.\" segment.
        relative_path = os.path.relpath(root, dicom_folder)
        output_subfolder = os.path.normpath(os.path.join(output_folder, relative_path))

        for file in files:
            if not file.lower().endswith('.dcm'):
                continue

            dicom_path = os.path.join(root, file)
            print(f"Processing file: {dicom_path}")  # Debugging log
            try:
                # Read DICOM file
                dicom_data = pydicom.dcmread(dicom_path)

                # Work in float64: subtracting the minimum in the native
                # integer dtype can wrap around for signed pixel data.
                pixel_array = dicom_data.pixel_array.astype(np.float64)
                pixel_array -= pixel_array.min()

                # Scale to 0-255. A constant image has a peak of 0 after the
                # shift; it is left all-zero instead of dividing by zero.
                peak = pixel_array.max()
                if peak > 0:
                    pixel_array = pixel_array / peak * 255

                # Convert to 8-bit image
                image = Image.fromarray(pixel_array.astype(np.uint8))

                # Create corresponding folder structure in output folder
                os.makedirs(output_subfolder, exist_ok=True)

                # Save the image with the same name but PNG extension
                png_filename = os.path.splitext(file)[0] + ".png"
                png_path = os.path.join(output_subfolder, png_filename)
                image.save(png_path)
                print(f"Saved PNG: {png_path}")  # Debugging log
            except Exception as e:
                # Best-effort batch: report the failure and move on
                print(f"Error processing file {dicom_path}: {e}")

# Folder holding the renamed DICOM files written by Step 1. The path is spelled
# out here instead of reusing the global `output_folder`, because Step 4 rebinds
# that name and re-running this cell afterwards would convert the wrong folder.
dicom_folder = r'C:\Users\User\Desktop\compitission\1'

# Output folder for the converted PNG files
png_output = r'C:\Users\User\Desktop\compitission\2'

# Convert every DICOM file found under dicom_folder to PNG
dicom_to_png_conversion(dicom_folder, png_output)

print("DICOM to PNG conversion completed.")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Processing file: C:\Users\User\Desktop\compitission\1\20191_RCC.dcm
Saved PNG: C:\Users\User\Desktop\compitission\2\.\20191_RCC.png
Processing file: C:\Users\User\Desktop\compitission\1\20191_RMLO.dcm
Saved PNG: C:\Users\User\Desktop\compitission\2\.\20191_RMLO.png
Processing file: C:\Users\User\Desktop\compitission\1\20215_RCC.dcm
Saved PNG: C:\Users\User\Desktop\compitission\2\.\20215_RCC.png
Processing file: C:\Users\User\Desktop\compitission\1\20215_RMLO.dcm
Saved PNG: C:\Users\User\Desktop\compitission\2\.\20215_RMLO.png
Processing file: C:\Users\User\Desktop\compitission\1\20465_RCC.dcm
Saved PNG: C:\Users\User\Desktop\compitission\2\.\20465_RCC.png
Processing file: C:\Users\User\Desktop\compitission\1\20465_RMLO.dcm
Saved PNG: C:\Users\User\Desktop\compitission\2\.\20465_RMLO.png
Processing file: C:\Users\User\Desktop\compitission\1\20472_RCC.dcm
Saved PNG: C:\Users\User\Desktop\compitission\2\.\20472_RCC.png
Processing file: C:\Users\User\Desktop\compitission\1\20472_RMLO.dcm
S

Step.3: Create the mammogram data CSV file

In [3]:
import os
import pandas as pd
from PIL import Image

def create_csv_from_images(image_folder, output_csv):
    """Write a CSV that lists every PNG under ``image_folder``.

    Each PNG named ``<patient id>_<mammogram type>.png`` produces one row with
    the patient ID, the mammogram type and the image size as reported by
    Pillow (``(width, height)``). Folders are walked recursively in sorted
    order so the row order is reproducible.

    Args:
        image_folder: Root folder searched for ``.png`` files (extension
            matched case-insensitively).
        output_csv: Path of the CSV file to write; an existing file is
            overwritten.

    Returns:
        None. The CSV is written to disk and a confirmation is printed.
    """
    columns = ['PatientID', 'Mammogram Type', 'The size of the image']

    # List to store data for each image
    data = []

    # Iterate through all files in the image folder
    for root, dirs, files in os.walk(image_folder):
        dirs.sort()  # in-place sort makes os.walk descend in a fixed order
        for file in sorted(files):
            if not file.lower().endswith('.png'):
                continue

            # Split on the LAST underscore so a patient ID that itself contains
            # underscores still parses (the mammogram types used in this
            # notebook - RCC, LCC, RMLO, LMLO - contain none).
            stem = os.path.splitext(file)[0]
            patient_id, separator, mammogram_type = stem.rpartition('_')
            if not separator:
                # Not a "<patient>_<type>" name: skip it instead of aborting the run
                print(f"Skipping file with unexpected name: {file}")
                continue

            # Get the image size
            image_path = os.path.join(root, file)
            with Image.open(image_path) as img:
                image_size = img.size  # This gives (width, height)

            # Append the information to the list
            data.append({
                'PatientID': patient_id,
                'Mammogram Type': mammogram_type,
                'The size of the image': image_size
            })

    # Explicit columns keep the header row even when no image was found
    df = pd.DataFrame(data, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)

    print(f"CSV file has been created successfully: {output_csv}")

# Input: the folder of PNG files (same path Step 2 writes its PNGs to).
# Output: the CSV that will list patient ID, mammogram type and image size.
image_folder = r'C:\Users\User\Desktop\compitission\2'
output_csv = r'C:\Users\User\Desktop\compitission\CSV_mammogram_data.csv'

# Build the CSV; an existing file at output_csv is overwritten
create_csv_from_images(image_folder, output_csv)


CSV file has been created successfully: C:\Users\User\Desktop\compitission\CSV_mammogram_data.csv


Step.4: Negative to Positive

In [4]:
from PIL import Image
import numpy as np
import os

# Function to determine if an image is negative (white background)
def is_negative(image_array, threshold=127):
    """Tell whether an image looks like a negative (bright background).

    The decision is a simple heuristic: the image counts as negative when its
    mean pixel value is strictly greater than ``threshold``.

    Args:
        image_array: Array of pixel values (the callers in this notebook pass
            8-bit grayscale data in the range 0-255).
        threshold: Mean value above which the image is treated as negative.
            The default of 127 is the midpoint of the 8-bit range; tune it
            for the dataset at hand.

    Returns:
        A truthy value when the mean exceeds ``threshold``, falsy otherwise.
    """
    mean_pixel_value = np.mean(image_array)
    return mean_pixel_value > threshold

# Function to convert negative images to positive
def invert_image(image_array):
    """Return the photographic inverse of ``image_array``.

    Every pixel value ``v`` is replaced by ``255 - v``; the input array is
    not modified.
    """
    inverted = 255 - image_array
    return inverted

# Function to process and save the images
def process_images(input_folder, output_folder):
    """Normalise image polarity for every JPG/PNG in ``input_folder``.

    Each image is loaded as 8-bit grayscale. Images that ``is_negative``
    flags are inverted with ``invert_image``; the rest are kept unchanged.
    Every processed image is saved under the same filename in
    ``output_folder`` and the decision is printed.

    Args:
        input_folder: Folder whose direct children are scanned (no recursion).
            ``.jpg`` and ``.png`` are matched case-insensitively.
        output_folder: Destination folder; created if missing.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the log order is stable across file systems
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(('.jpg', '.png')):
            continue

        # Open inside a context manager so the file handle is released
        # as soon as the pixel data has been copied into the array
        image_path = os.path.join(input_folder, filename)
        with Image.open(image_path) as image:
            image_array = np.array(image.convert('L'))  # Convert to grayscale

        # Invert only the images detected as negatives
        if is_negative(image_array):
            image_array = invert_image(image_array)
            print(f"Inverted {filename}")
        else:
            print(f"Kept {filename} as is (positive)")

        # Save the processed image
        output_image = Image.fromarray(image_array)
        output_image.save(os.path.join(output_folder, filename))

# Specify input and output folder paths.
# Input is the folder of PNG files; output receives the polarity-corrected copies.
# NOTE(review): this rebinds the globals `input_folder` and `output_folder` that
# Step 1 defined with different values; any cell that reads those names and is
# re-run after this one will see the paths below instead.
input_folder = r'C:\Users\User\Desktop\compitission\2'
output_folder = r'C:\Users\User\Desktop\compitission\3'


process_images(input_folder, output_folder)


Kept 20191_RCC.png as is (positive)
Kept 20191_RMLO.png as is (positive)
Inverted 20215_RCC.png
Inverted 20215_RMLO.png
Inverted 20465_RCC.png
Inverted 20465_RMLO.png
Inverted 20472_RCC.png
Inverted 20472_RMLO.png
Inverted 20565_RCC.png
Inverted 20565_RMLO.png
Inverted 20667_RCC.png
Inverted 20667_RMLO.png
Inverted 21047_LCC.png
Inverted 21047_LMLO.png
Inverted 21075_LCC.png
Inverted 21075_LMLO.png
Inverted 21123_LCC.png
Inverted 21123_LMLO.png
Inverted 21173_LCC.png
Inverted 21173_LMLO.png
Inverted 21201_RCC.png
Inverted 21201_RMLO.png
Inverted 21229_RCC.png
Inverted 21229_RMLO.png
Inverted 21320_LCC.png
Inverted 21320_LMLO.png
Kept 21332_LCC.png as is (positive)
Kept 21332_LMLO.png as is (positive)
Inverted 21753_LCC.png
Inverted 21753_LMLO.png
Inverted 21811_LCC.png
Inverted 21811_LMLO.png
Inverted 21996_RCC.png
Inverted 21996_RMLO.png
Inverted 22038_RCC.png
Inverted 22038_RMLO.png
Kept 22106_RCC.png as is (positive)
Kept 22106_RMLO.png as is (positive)
Kept 22124_RCC.png as is (pos

Step.5: Resizing to 512x512

In [5]:
import os
from PIL import Image

# Paths: read the polarity-corrected images, write the resized copies
input_dir = r"C:\Users\User\Desktop\compitission\3"
output_dir = r"C:\Users\User\Desktop\compitission\4"

# Create the output directory; exist_ok makes this safe to re-run and
# avoids the race between checking for the folder and creating it
os.makedirs(output_dir, exist_ok=True)

# Function to resize image
def resize_image(image_path, output_path, size=(512, 512)):
    """Scale one image to ``size`` and write it to ``output_path``.

    The image is resized to exactly ``size`` (width, height) with Lanczos
    resampling; the aspect ratio is not preserved. The output format is
    chosen by Pillow from the extension of ``output_path``.
    """
    with Image.open(image_path) as source:
        resized = source.resize(size, resample=Image.Resampling.LANCZOS)
        resized.save(output_path)

# Pick out the image files in the input directory (extension matched
# case-insensitively), then resize each one into the output directory
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
image_names = [
    entry for entry in os.listdir(input_dir)
    if entry.lower().endswith(image_extensions)
]

for image_name in image_names:
    # Same filename in both folders; only the directory changes
    resize_image(
        os.path.join(input_dir, image_name),
        os.path.join(output_dir, image_name),
    )

print("All images have been resized and saved to the output directory.")


All images have been resized and saved to the output directory.
