# 3-class mammography image dataset with abnormalities for YOLO-NAS-based detection models

The following notebook contains the code necessary for: 
* Rescaling the images to 1080 x 1080 pixels without distortion by zero-padding the shorter side until the image is square
* Computing the Region Of Interest coordinates for the new size
* Generating a training dataset with labels and normalized coordinates in .txt format [label centerx centery width height].

The abnormality classes addressed in this experiment are: 1 Architectural distortion, 2 Mass and 3 Calcification

The public datasets used are: 

* MIAS
* CBIS-DDSM
* CDD-CESM
* INbreast
* BMCD
* VinDr

Links to the documentation of each dataset are provided in README.md. This repository also includes the .xlsx file containing the coordinates of every ROI. There are 5191 images and 6678 abnormalities in total.

Don't forget to modify the image paths in the 'AbsPath' column according to your own storage.

In [1]:
from utils import *
import os
from processing import *
import torch
#import cudf
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Default to the CPU and switch to the GPU only when torch reports a usable CUDA device
this_device = 'cpu'
if torch.cuda.is_available():
    this_device = 'cuda'
this_device

'cuda'

## Coordinates file 

This code computes the necessary data to obtain both COCO and YOLO format labels. Larger dataframes will take a long time to process. Progress can be tracked with the tqdm module.

In [4]:
# Coordinates table read and written by the cells below.
# Built with os.path.join so the relative path resolves on every OS; on
# Windows this yields exactly '..\\DetectorDatasetMG\\Templatedetfile.csv'.
csv_file = os.path.join('..', 'DetectorDatasetMG', 'Templatedetfile.csv')

# State that update_row() reads and rebinds (via `global`) on every row
prev_row = None
current_num = 1
current_idx = 1

def update_row(row):
    """Run one row of the coordinates table through the three processing steps.

    The row is handed, in order, to ``assign_image_numbers``,
    ``assign_labels_and_names`` and ``resized_normalized_coordinates``
    (helpers brought in by the star-imports at the top of the notebook).
    The first helper also returns the new values of the module-level
    ``prev_row``, ``current_num`` and ``current_idx``, which are rebound here
    so the next call sees them.

    Args:
        row: One row of the DataFrame, as passed by ``apply(..., axis=1)``.

    Returns:
        The row returned by ``resized_normalized_coordinates``.
    """
    global prev_row, current_num, current_idx

    numbered = assign_image_numbers(row, prev_row, current_num, current_idx)
    row, prev_row, current_num, current_idx = numbered

    return resized_normalized_coordinates(assign_labels_and_names(row))

# Register tqdm's progress_apply on pandas objects before it is used below
tqdm.pandas()

# Load the coordinates table, run update_row over every row (axis=1) while
# showing a progress bar, then save the result back to the same file
df = pd.read_csv(csv_file)
df_progress = df.progress_apply(update_row, axis=1)
df_progress.to_csv(csv_file, index=False)

100%|██████████| 6678/6678 [57:31<00:00,  1.93it/s]  


In [None]:
# Randomly assign every row of the coordinates file to the train, valid or test split
def assign_dl_folder(csv_file, ratio, seed=None):
    """Add (or overwrite) a ``set`` column with a random train/valid/test label.

    The CSV is read, each row independently receives one of ``'train'``,
    ``'valid'`` or ``'test'`` drawn with the probabilities in ``ratio``, and
    the table is written back to the same path.

    NOTE(review): the draw is per row. The notebook reports more rows
    (abnormalities) than images, so rows belonging to one image may end up in
    different splits -- confirm this is intended.

    Args:
        csv_file: Path of the CSV file to update in place.
        ratio: Three probabilities ``(train, valid, test)`` that sum to 1.
        seed: Optional seed for a reproducible split. When ``None`` the global
            NumPy random state is used, exactly as before.

    Raises:
        ValueError: If ``ratio`` does not have three elements or does not sum
            to 1 (the sampler rejects anything else).
    """
    # Validate before touching the file so a bad ratio fails fast
    if len(ratio) != 3:
        raise ValueError("Ratio must be a tuple of three elements")
    # Tolerance stays below NumPy's own check so float round-off such as
    # 0.7 + 0.2 + 0.1 is accepted but a genuinely short ratio is not
    if not np.isclose(sum(ratio), 1.0, rtol=0.0, atol=1e-8):
        raise ValueError("Sum of ratio must be equal to 1")

    df = pd.read_csv(csv_file)

    categories = ['train', 'valid', 'test']
    rng = np.random if seed is None else np.random.default_rng(seed)
    df['set'] = rng.choice(categories, size=len(df), p=ratio)

    df.to_csv(csv_file, index=False)

# 70 % train, 20 % valid, 10 % test
assign_dl_folder(csv_file=csv_file, ratio=(0.7, 0.2, 0.1))

Now the coordinates file contains all the information needed to generate the dataset. Please review the comments in utils.py and processing.py for further information.

## Image processing
#### Run the version below if you don't have a GPU. It will take a very long time to process.

The result is a dataset of normalized and denoised mammography images for better model performance

In [None]:
def dataset_for_detector(file, root):
    """Write the detector dataset (images plus YOLO label files) to disk.

    For every row of the coordinates table the source image is loaded, passed
    through ``reduce_poisson_noise`` and ``add_zeros_for_square``, resized to
    1080 x 1080 and saved as ``<root>/<set>/images/<ImageName>_<Index>.png``.
    A matching ``<root>/<set>/labels/<ImageName>_<Index>.txt`` receives the
    single line ``Label cx cy nw nh``.

    Columns read from the table: ``AbsPath``, ``pixy``, ``pixx``, ``set``,
    ``ImageName``, ``Index``, ``Label``, ``cx``, ``cy``, ``nw``, ``nh``, ``id``.

    Args:
        file: Path of the coordinates table. A ``.csv`` path is read with
            ``pandas.read_csv``; any other path is read with
            ``pandas.read_excel``.
        root: Directory under which the ``<set>/images`` and ``<set>/labels``
            folders are created.

    Raises:
        FileNotFoundError: If OpenCV cannot read a non-DICOM image.
    """
    # The earlier cells save the table as CSV, which read_excel cannot parse,
    # so choose the reader from the file extension.
    if str(file).lower().endswith('.csv'):
        df = pd.read_csv(file)
    else:
        df = pd.read_excel(file)

    det_size = (1080, 1080)  # output size expected by the detector
    dicom_suffixes = ('.dcm', '.dicom')  # compared case-insensitively

    for _, row in df.iterrows():
        src_path = row['AbsPath']

        if src_path.lower().endswith(dicom_suffixes):
            image = dicom_preprocessing(src_path, row['pixy'], row['pixx'])
        else:
            image = cv2.imread(src_path)
            if image is None:
                # cv2.imread reports a missing/unreadable file by returning None
                raise FileNotFoundError(f"Could not read image: {src_path}")
            # Min-max rescale to the 0-255 range and store as 8-bit
            image = cv2.normalize(image.astype(float), None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        image = reduce_poisson_noise(image)

        squared = add_zeros_for_square(image, row['pixy'], row['pixx'])
        resized_squared = cv2.resize(squared, det_size, interpolation=cv2.INTER_LINEAR)

        # Image and label share the same file stem
        stem = f"{row['ImageName']}_{row['Index']}"

        # Save the image
        dest_folder = os.path.join(root, row['set'], 'images')
        os.makedirs(dest_folder, exist_ok=True)
        cv2.imwrite(os.path.join(dest_folder, f"{stem}.png"), resized_squared)

        # Write label information to text file
        # NOTE(review): mode 'w' replaces an existing file, so rows that share
        # ImageName and Index would overwrite each other -- confirm uniqueness.
        label_folder = os.path.join(root, row['set'], 'labels')
        os.makedirs(label_folder, exist_ok=True)
        with open(os.path.join(label_folder, f"{stem}.txt"), 'w') as f:
            f.write(f"{row['Label']} {row['cx']} {row['cy']} {row['nw']} {row['nh']}")

        # Progress trace
        print(row['id'])


In [None]:
# `root` is the folder that receives the <set>/images and <set>/labels output
dataset_for_detector(file=csv_file, root=" ")

### Run this version below if you have a GPU

In [None]:
#This is still under construction
def dataset_for_detector(file, root):
    """Write the detector dataset to disk (variant intended for GPU helpers).

    Same output layout as the CPU version: for every row of the coordinates
    table a 1080 x 1080 PNG goes to ``<root>/<set>/images`` and a one-line
    label file ``Label cx cy nw nh`` goes to ``<root>/<set>/labels``, both
    named ``<ImageName>_<Index>``.

    The table is iterated as a pandas DataFrame on the host: cuDF does not
    support row iteration, and the ``cudf`` import is commented out in this
    notebook. Any GPU speed-up therefore has to come from
    ``reduce_poisson_noise`` / ``add_zeros_for_square`` themselves
    (NOTE(review): confirm whether GPU implementations of these exist).

    Args:
        file: Path of the coordinates table. A ``.csv`` path is read with
            ``pandas.read_csv``; any other path is read with
            ``pandas.read_excel``.
        root: Directory under which the ``<set>/images`` and ``<set>/labels``
            folders are created.

    Raises:
        FileNotFoundError: If OpenCV cannot read a non-DICOM image.
    """
    # The earlier cells save the table as CSV, which read_excel cannot parse,
    # so choose the reader from the file extension.
    if str(file).lower().endswith('.csv'):
        df = pd.read_csv(file)
    else:
        df = pd.read_excel(file)

    det_size = (1080, 1080)  # output size expected by the detector
    dicom_suffixes = ('.dcm', '.dicom')  # compared case-insensitively

    for _, row in df.iterrows():
        abs_path = row['AbsPath']
        pixy = row['pixy']
        pixx = row['pixx']

        if abs_path.lower().endswith(dicom_suffixes):
            image = dicom_preprocessing(abs_path, pixy, pixx)
        else:
            image = cv2.imread(abs_path)
            if image is None:
                # cv2.imread reports a missing/unreadable file by returning None
                raise FileNotFoundError(f"Could not read image: {abs_path}")
            # Min-max rescale to the 0-255 range and store as 8-bit
            image = cv2.normalize(image.astype(float), None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # GPU-accelerated reduce_poisson_noise and add_zeros_for_square functions if possible
        image = reduce_poisson_noise(image)  # Ensure this function is optimized for GPU
        squared = add_zeros_for_square(image, pixy, pixx)  # Ensure this function is optimized for GPU

        # OpenCV needs host memory. A device array (e.g. CuPy) exposes .get()
        # to copy itself back to NumPy; a NumPy array has no such method and
        # is used as is.
        if hasattr(squared, 'get'):
            squared = squared.get()
        resized_squared = cv2.resize(squared, det_size, interpolation=cv2.INTER_LINEAR)

        # Image and label share the same file stem
        stem = f"{row['ImageName']}_{row['Index']}"

        # Save the image
        dest_folder = os.path.join(root, row['set'], 'images')
        os.makedirs(dest_folder, exist_ok=True)
        cv2.imwrite(os.path.join(dest_folder, f"{stem}.png"), resized_squared)

        # Write label information to text file
        label_folder = os.path.join(root, row['set'], 'labels')
        os.makedirs(label_folder, exist_ok=True)
        with open(os.path.join(label_folder, f"{stem}.txt"), 'w') as f:
            f.write(f"{row['Label']} {row['cx']} {row['cy']} {row['nw']} {row['nh']}")

        # Progress trace
        print(row['id'])

