In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
%matplotlib inline

In [7]:
# Input locations of the CBIS-DDSM data, relative to the working directory.
images_path = 'data/CBIS-DDSM/jpeg'  # folder holding the JPEG images
train_labels_file, test_labels_file = (
    'data/CBIS-DDSM/csv/train_labels.csv',  # label table for the train split
    'data/CBIS-DDSM/csv/test_labels.csv',   # label table for the test split
)

In [6]:
# Root folder that receives the processed images.
dest_folder = 'data/CBIS-DDSM/processed/cropped/roi'

# Nested directory structure, keyed by nesting level:
#   level 0 -> dataset split, level 1 -> class label.
# NOTE(review): the pathology column of the training labels has six distinct
# values (including the two BENIGN_WITHOUT_CALLBACK_* ones), but only four are
# listed here, and getDestinationPath inserts an extra image-kind folder
# between dest_folder and the split. Confirm which layout is intended.
dir_structure = {
    0: ['train', 'test'],
    1: ['BENIGN_Calc', 'BENIGN_Mass', 'MALIGNANT_Calc', 'MALIGNANT_Mass'],
}

In [7]:
# Create one folder per (level-0, level-1) pair underneath dest_folder.
for split_name in dir_structure[0]:        # directory at level 0
    for class_name in dir_structure[1]:    # directory at level 1
        # exist_ok=True means already existing folders do not raise.
        os.makedirs(os.path.join(dest_folder, split_name, class_name), exist_ok=True)

In [10]:
# Load the label tables for both splits from their CSV files.
df_train = pd.read_csv(filepath_or_buffer=train_labels_file)
df_test = pd.read_csv(filepath_or_buffer=test_labels_file)

In [11]:
# List the distinct pathology labels found in the training table.
df_train.groupby('pathology').size().index

Index(['BENIGN_Calc', 'BENIGN_Mass', 'BENIGN_WITHOUT_CALLBACK_Calc',
       'BENIGN_WITHOUT_CALLBACK_Mass', 'MALIGNANT_Calc', 'MALIGNANT_Mass'],
      dtype='object', name='pathology')

In [12]:
# Summarize the columns, non-null counts and dtypes of the training labels.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8184 entries, 0 to 8183
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   image_path         8184 non-null   object
 1   Laterality         8184 non-null   object
 2   PatientID          8184 non-null   object
 3   SeriesDescription  8184 non-null   object
 4   SeriesInstanceUID  8184 non-null   object
 5   StudyInstanceUID   8184 non-null   object
 6   patient_id         8184 non-null   int64 
 7   image view         8184 non-null   object
 8   train              8184 non-null   int64 
 9   type               8184 non-null   object
 10  key                8184 non-null   object
 11  pathology          8184 non-null   object
dtypes: int64(2), object(10)
memory usage: 767.4+ KB


In [13]:
# The destination location of each image depends on these attributes:
# 1. SeriesDescription: 'full mammogram images', 'cropped images', 'ROI mask images'
# 2. train: 1 (train split), anything else (test split)
# 3. pathology: the class label, used verbatim as a folder name (e.g. 'BENIGN_Mass')
# The file name itself is built from patient_id, Laterality and image view.
def getDestinationPath(ser, dest_root=None):
    """Build the destination file path for one image.

    Args:
        ser: pandas Series whose values are read by position, in this order:
            SeriesDescription, train, pathology, patient_id, Laterality,
            image view.
        dest_root: Folder under which the path is built. When None (the
            default) the notebook-level ``dest_folder`` is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        str: ``<root>/<image kind>/<split>/<pathology>/<patient>_<side>_<view>.jpeg``
        joined with ``os.path.join``.

    NOTE(review): the file name uses only patient_id, Laterality and view, so
    two rows that share those values as well as image kind, split and
    pathology map to the same path. Presumably this can happen when a patient
    has several abnormalities in one view; verify against the label files.
    """
    root = dest_folder if dest_root is None else dest_root

    # convert the series to a list; the values are addressed by position
    vals = ser.to_list()
    series_description, train_flag, pathology = vals[0], vals[1], vals[2]

    # SeriesDescription -> image-kind folder; any other value is treated as ROI
    if series_description == 'full mammogram images':
        kind_dir = 'whole_images'
    elif series_description == 'cropped images':
        kind_dir = 'cropped_images'
    else:
        kind_dir = 'roi_images'

    # train info: 1 selects the train split, every other value the test split
    split_dir = 'train' if train_flag == 1 else 'test'

    # class label
    class_dir = str(pathology)

    patient_id = str(vals[3])
    laterality = str(vals[4])
    view = str(vals[5])
    image_name = patient_id + '_' + laterality + '_' + view + '.jpeg'

    return os.path.join(root, kind_dir, split_dir, class_dir, image_name)

In [14]:
# create destination file path
# The columns are listed in the positional order getDestinationPath reads them.
dest_path_columns = ['SeriesDescription', 'train', 'pathology',
                     'patient_id', 'Laterality', 'image view']
for labels_df in (df_train, df_test):
    # Select the input columns once per table instead of once per row.
    path_inputs = labels_df[dest_path_columns]
    labels_df['dest_file_path'] = [getDestinationPath(row) for _, row in path_inputs.iterrows()]

In [15]:
# Preview the first rows of the training labels, including dest_file_path.
df_train.head()

Unnamed: 0,image_path,Laterality,PatientID,SeriesDescription,SeriesInstanceUID,StudyInstanceUID,patient_id,image view,train,type,key,pathology,dest_file_path
0,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.129308...,R,Mass-Training_P_01265_RIGHT_MLO_1,cropped images,1.3.6.1.4.1.9590.100.1.2.129308726812851964007...,1.3.6.1.4.1.9590.100.1.2.271867287611061855725...,1265,MLO,1,Mass,01265_R,BENIGN_Mass,data/processed\cropped_images\train\BENIGN_Mas...
1,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.248386...,R,Mass-Training_P_01754_RIGHT_CC,full mammogram images,1.3.6.1.4.1.9590.100.1.2.248386742010678582309...,1.3.6.1.4.1.9590.100.1.2.161516517311681906612...,1754,CC,1,Mass,01754_R,MALIGNANT_Mass,data/processed\whole_images\train\MALIGNANT_Ma...
2,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.267213...,R,Calc-Training_P_00232_RIGHT_CC,full mammogram images,1.3.6.1.4.1.9590.100.1.2.267213171011171858918...,1.3.6.1.4.1.9590.100.1.2.291043622711253836701...,232,CC,1,Calc,00232_R,MALIGNANT_Calc,data/processed\whole_images\train\MALIGNANT_Ca...
3,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.153339...,R,Calc-Training_P_00181_RIGHT_CC_1,ROI mask images,1.3.6.1.4.1.9590.100.1.2.153339052913121382622...,1.3.6.1.4.1.9590.100.1.2.157384031212566921514...,181,CC,1,Calc,00181_R,BENIGN_Calc,data/processed\roi_images\train\BENIGN_Calc\18...
4,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.153339...,R,Calc-Training_P_00181_RIGHT_CC_1,cropped images,1.3.6.1.4.1.9590.100.1.2.153339052913121382622...,1.3.6.1.4.1.9590.100.1.2.157384031212566921514...,181,CC,1,Calc,00181_R,BENIGN_Calc,data/processed\cropped_images\train\BENIGN_Cal...


In [16]:
# replace old path with current image path in your machine
# The pattern is anchored to the start of the path so that re-running this
# cell leaves already rewritten paths untouched (no 'data/data/CBIS-DDSM').
# The .str accessor also passes missing values through instead of raising.
for labels_df in (df_train, df_test):
    labels_df['image_path'] = labels_df['image_path'].str.replace(
        r'^CBIS-DDSM', 'data/CBIS-DDSM', regex=True)

In [17]:
# Spot-check the first image path of the training table.
df_train['image_path'].iloc[0]

'data/CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.129308726812851964007517874181459556304/1-172.jpg'

In [18]:
# NOTE(review): SEARCH_BUFFER is not referenced anywhere in the visible part
# of this notebook; this looks like an accidental auto-import. Confirm and remove.
from IPython.terminal.shortcuts import SEARCH_BUFFER
# Finally, copy files from source to destination
import shutil
# Running total of the values returned by copy_image (1 when the source file
# is missing, 0 after a successful copy).
invalid_files = 0

def copy_image(src, dst):
    """Copy one image file from ``src`` to ``dst``.

    The parent directory of ``dst`` is created when it does not exist yet, so
    the copy does not depend on the destination tree having been prepared.

    Args:
        src: Path of the source image.
        dst: Path of the destination file.

    Returns:
        int: 0 when the file was copied; 1 when ``src`` is not an existing
        file or the copy failed. The result is always an int, so callers can
        add it to a running counter.
    """
    try:
        # Check if source file exists before copying
        if not os.path.isfile(src):
            return 1
        parent_dir = os.path.dirname(dst)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        shutil.copy(src, dst)
        return 0
    except Exception as e:
        # Best-effort copy: report the problem and count it as a failure
        # rather than falling through and returning None.
        print(f"An error occurred: {e}")
        return 1

In [19]:
# Copy every training image to its destination and count the failures.
# The counter is reset here so that re-running this cell does not keep
# adding to a total left over from an earlier run.
invalid_files = 0
for src_path, dst_path in zip(df_train['image_path'], df_train['dest_file_path']):
    # copy_image returns 0 on success; any other result (including None)
    # is counted as one invalid file.
    if copy_image(src_path, dst_path) != 0:
        invalid_files += 1

print(f'Total invalid files {invalid_files}')

Total invalid files 0


In [20]:
# Copy every test image to its destination and count the failures.
invalid_files = 0
for src_path, dst_path in zip(df_test['image_path'], df_test['dest_file_path']):
    # copy_image returns 0 on success; any other result (including None)
    # is counted as one invalid file.
    if copy_image(src_path, dst_path) != 0:
        invalid_files += 1

print(f'Total invalid files for test {invalid_files}')

Total invalid files for test 0
