# Train Dataset Creator

- This notebook creates a numpy representation of image names and labels.
- The source directory should contain one folder per class of images.
- An empty destination directory must be given.
- Once the notebook has run completely, 3 items will be present in the destination directory.
    - all_images.zip: a zip archive of all images to be used for training (the intermediate all_images folder is deleted after zipping).
    - train: contains 2 npy files. one containing train image names and one containing labels.
    - val: contains 2 npy files. one containing validation image names and one containing labels.
- All these folders/files will be processed later by a custom generator which feeds images into an image classification model.

In [1]:
# Widen the notebook container to the full browser width.
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import shutil
import numpy as np

from sklearn.utils import shuffle

# from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [4]:
# Input location: the folder holding the raw images. It is expected to contain
# one sub-folder per class, each holding only that class's images.
src_dir = r'C:\Users\subrahm\Documents\breast-cancer-detection\data\DDSM\train_data\final_train_new_ddsm\images'

# Output location: the prepared dataset is written here. It has to be empty.
dest_dir = r'C:\Users\subrahm\Documents\breast-cancer-detection\data\DDSM\train_data\final_train_new_ddsm\train_data'

# Running tally, incremented once per image by the copy step.
counter = 0

# Refuse to continue if the destination already holds anything.
if os.listdir(dest_dir):
    raise ValueError(f"destination directory '{dest_dir}' is not empty")

In [5]:
# Lay out the destination: one folder pooling every image, plus one folder
# each for the train and validation splits.
all_img_dir, train_dir, val_dir = (
    os.path.join(dest_dir, folder) for folder in ('all_images', 'train', 'val')
)

for new_dir in (all_img_dir, train_dir, val_dir):
    os.makedirs(new_dir)

In [6]:
# Copy every image found under src_dir into the single all_images directory.
#
# The source tree is flattened here, so two images sharing a file name (for
# example one in each class folder) would silently overwrite each other and
# leave fewer pooled images than labelled file names. Fail loudly instead.
for subdir, dirs, files in os.walk(src_dir):
    for file in files:
        full_path = os.path.join(subdir, file)
        target_path = os.path.join(all_img_dir, file)

        # all_images starts out empty, so an existing target can only come
        # from an earlier file with the same name.
        if os.path.exists(target_path):
            raise ValueError(
                f"duplicate image name '{file}' found at '{full_path}'; "
                "file names must be unique across all class folders"
            )

        shutil.copy(full_path, target_path)
        counter += 1

In [7]:
# One label row per pooled image; the rows are filled in by the labelling loop.
len_files = len(os.listdir(all_img_dir))
filenames = []
labels = np.zeros(shape=(len_files, 1))

# Write position into `labels`, and the integer label of the current class.
filenames_counter = labels_counter = 0

## Labelling

In [8]:
# Give each class folder an integer label and record every image name
# alongside the label of the folder it came from.
#
# os.listdir returns entries in arbitrary order, so the folders are sorted to
# keep the class -> label mapping identical across runs and machines. Entries
# that are not directories (stray files such as Thumbs.db) are skipped, since
# listing them as a class folder would raise NotADirectoryError.
class_dirs = sorted(
    entry for entry in os.listdir(src_dir)
    if os.path.isdir(os.path.join(src_dir, entry))
)

for labels_counter, img_dir in enumerate(class_dirs):
    files = os.listdir(os.path.join(src_dir, img_dir))
    print(f"for {img_dir}, label is {labels_counter}")

    for file in files:
        filenames.append(file)
        labels[filenames_counter, 0] = labels_counter
        filenames_counter = filenames_counter + 1

# Leave the counter at the number of classes, as the original loop did.
labels_counter = len(class_dirs)

filenames = np.array(filenames)

print(filenames.shape)
print(labels.shape)

# Every label row must have been filled; a shortfall means the pooled image
# count and the per-class listings disagree, and the trailing rows would
# otherwise keep their default label 0.
if filenames_counter != len(labels):
    raise ValueError(
        f"labelled {filenames_counter} file names but expected {len(labels)}"
    )

for Benign, label is 0
for Malignant, label is 1
(13050,)
(13050, 1)


In [9]:
# Shuffle file names and labels together so each name keeps its label.
# The shuffle is seeded: the later train/validation split uses a fixed
# random_state, but an unseeded shuffle ahead of it would still change the
# split on every run.
filenames_shuffled, labels_shuffled = shuffle(filenames, labels, random_state=1)

In [10]:
# Hold out 20% of the shuffled samples for validation, using a fixed seed.
split_arrays = train_test_split(
    filenames_shuffled,
    labels_shuffled,
    test_size=0.2,
    random_state=1,
)
X_train_filenames, X_val_filenames, y_train, y_val = split_arrays

In [11]:
# Report the shape of each split: train names, train labels, then the
# validation names and labels.
for split_array in (X_train_filenames, y_train, X_val_filenames, y_val):
    print(split_array.shape)


(10440,)
(10440, 1)
(2610,)
(2610, 1)


In [12]:
# Persist both splits as .npy files; these are the files read during training
# and validation.
for out_dir, npy_name, array in (
    (train_dir, 'X_train_filenames.npy', X_train_filenames),
    (train_dir, 'y_train.npy', y_train),
    (val_dir, 'X_val_filenames.npy', X_val_filenames),
    (val_dir, 'y_val.npy', y_val),
):
    np.save(os.path.join(out_dir, npy_name), array)

In [13]:
# Pack the pooled images into all_images.zip, written next to the all_images
# folder. Left as the cell's final expression so the archive path is shown.
shutil.make_archive(base_name=all_img_dir, format="zip", root_dir=all_img_dir)

'C:\\Users\\subrahm\\Documents\\breast-cancer-detection\\data\\DDSM\\train_data\\final_train_new_ddsm\\train_data\\all_images.zip'

In [14]:
# The zip archive now holds every image, so drop the uncompressed folder.
shutil.rmtree(path=all_img_dir)