In [7]:
import os
import cv2
import pickle
import pydicom
import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm

In [3]:
# Root directory of the raw scans: one sub-folder per patient, each holding that patient's scan files.
train_folder = 'train_images'

In [4]:
def _normalize_to_unit_range(pixels):
    """Min-max scale ``pixels`` to floats in [0, 1]; a constant image maps to all zeros."""
    # Cast first: subtracting in the stored integer dtype can wrap around when
    # the value range does not fit into that dtype (e.g. signed 16-bit data).
    pixels = np.asarray(pixels, dtype=np.float64)
    lowest = pixels.min()
    value_range = pixels.max() - lowest
    if value_range == 0:
        # A blank scan would otherwise yield 0/0 = NaN in every pixel.
        return np.zeros_like(pixels)
    return (pixels - lowest) / value_range


def create_preprcessed_images(folder, output_folder='Preprocessed_Scans', size=(512, 512)):
    """Create the preprocessed images and save them with pickle to a separate folder.

    Every scan found under ``folder/<patient_id>/<image_id>.<ext>`` is read with
    pydicom, scaled to the range 0-1, resized, converted to a float32 tensor of
    shape ``(1, height, width)`` and pickled as
    ``<output_folder>/<patient_id>_<image_id>.pickle``. The file names contain
    the patient_id as well as the image_id, so a tensor can be loaded again
    from these two values.

    The function is resumable: tensors that already exist are skipped, so an
    interrupted run can simply be started again and only the missing scans are
    processed.

    Args:
        folder: Directory with one sub-folder per patient.
        output_folder: Directory the pickled tensors are written to. It is
            created if it does not exist yet.
        size: Target size handed to ``cv2.resize`` as ``(width, height)``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for patient_id in tqdm(sorted(os.listdir(folder))):  # loop over all patient_ids
        path_to_patient = os.path.join(folder, patient_id)

        if not os.path.isdir(path_to_patient):
            continue  # ignore stray files lying next to the patient folders

        for file_name in sorted(os.listdir(path_to_patient)):  # loop over all scans
            # splitext only strips the final extension, so names that contain
            # additional dots cannot collapse onto the same output file.
            image_id = os.path.splitext(file_name)[0]

            target_path = os.path.join(output_folder, f'{patient_id}_{image_id}.pickle')
            if os.path.exists(target_path):
                continue  # already preprocessed in an earlier run

            img = pydicom.dcmread(os.path.join(path_to_patient, file_name)).pixel_array
            img = _normalize_to_unit_range(img)  # values in 0-1
            img = cv2.resize(img, size)  # correct shape

            # Single-channel tensor; repeating it to 3 channels is better done
            # at load time to keep the disk usage low.
            img_tensor = torch.tensor(img, dtype=torch.float32).unsqueeze(0)

            # Write to a temporary name and rename afterwards, so an interrupted
            # run never leaves a truncated pickle under the final name.
            partial_path = target_path + '.tmp'
            with open(partial_path, 'wb') as f:
                pickle.dump(img_tensor, f)
            os.replace(partial_path, target_path)

In [5]:
# Run the preprocessing for the training scans; the tensors are written to disk.
create_preprcessed_images(train_folder)

  0%|          | 0/11913 [00:00<?, ?it/s]

In [6]:
# Path of the CSV file that is loaded as the training metadata table.
train_metadata = 'train.csv'

In [8]:
# Load the training metadata into a DataFrame.
train_df = pd.read_csv(train_metadata)

In [9]:
# Number of rows in the metadata table; compared with the number of preprocessed files further down.
len(train_df)

54706

In [10]:
# Sanity check: every metadata row should have exactly one preprocessed tensor on disk.
# Only '.pickle' files are counted so that unrelated entries in the folder cannot
# mask (or fake) a mismatch, and a mismatch fails loudly instead of printing nothing.
n_preprocessed = sum(
    name.endswith('.pickle') for name in os.listdir('Preprocessed_Scans')
)
assert len(train_df) == n_preprocessed, (
    f'{len(train_df)} rows in the metadata but {n_preprocessed} preprocessed tensors on disk'
)
print('Same length of newly created tensors and all patient scans')

Same length of newly created tensors and all patient scans
