In [24]:
# create a dataset: 

# start with raw images and masks
# no filtering !!!
# convert images to grayscle
# normalization
# applied masks
# resized to 128 x 128 ()
# do data augmentation

# input folder: 
# data\raw\COVID-19_Radiography_Dataset raw-data (299 * 299)

# additional input are saved data-frames of train & test data, in order to use the same test/ train split:
# - ../data/processed/df_xray_processed_normed_enc_train.cvs        (this is output from train_test_split.ipynb)
# - ../data/processed/df_xray_processed_normed_enc_test.csv         (this is output from train_test_split.ipynb)


In [25]:
import cv2
import os
import numpy as np
#import matplotlib.pyplot as plt
import albumentations as A
import pandas as pd
from tqdm import tqdm

In [33]:
#############################
# Define paths 
#############################
base_path = r"..\\data\\"
base_path_out = os.path.join(base_path, "processed")   # path to read input csv-file from

out_path_norm_mask = os.path.join(base_path_out, "normalized_with_masks_NOfilter")              # path to write the nromalized images with masks converted to grayscale
# if option resize is selected, "_resize_SIZE_SIZE" is added to the path

output_path_augmented = os.path.join(base_path_out, "augmented_normalized_with_masks_NOfilter") # path to write the augmented images
# if option resize is selected, "_resize_SIZE_SIZE" is added to the path


# Define classes which need augmentation: 
classes_aug = ["COVID", "Viral Pneumonia", "Lung_Opacity"]


#############################
# Optional resize iamges:
#############################

flag_resize = True   # True or False
size = 128           # result image size is (size x size)  pixel

## create train data set

In [34]:
# read csv with data frame which contains infos to train and test-split data
df_train = pd.read_csv(os.path.join(base_path_out,"df_xray_processed_normed_enc_train.csv"), sep=',', index_col=0)

# change folder name for imput images ( we want a df with paths to raw-data)
df_train['path'] = df_train['path'].apply(lambda x: x.replace(r'processed\normalized_xrays', r'raw\COVID-19_Radiography_Dataset')) 

### convert to grayscale, normalize images, add masks, optional resize

In [39]:
# Initialize lists to store images and labels
X = []
y = []

# loop through rows of df_train
for index, row in df_train.iterrows():
    
    image_dir = os.path.join(row['path'] ,"images", row['file']) # create path to image
    #print(image_dir)
    mask_dir = os.path.join(row['path'], "masks", row['file'])  # create path to corresponding mask
    print

    # file = os.path.join(row['path'], row['file'])  # create path to file
    img = cv2.imread(image_dir, cv2.IMREAD_GRAYSCALE)   # convert to grayscla while reading image
    mask = cv2.imread(mask_dir, cv2.IMREAD_GRAYSCALE)   # convert to grayscla while reading mask
    
    if img is None or mask is None:
        print(f"Skipping {row['file']}: missing image or mask")
        continue
    
    # Normalize pixel values to [0, 1] (divide by 255)
    img_normalized = img.astype(np.float32) / 255.0

    # Resize mask to match image size
    mask_resized = cv2.resize(mask, (img.shape[1], img.shape[0]))

    # Ensure binary mask (0, 255)
    _, mask_binary = cv2.threshold(mask_resized, 127, 255, cv2.THRESH_BINARY)

    # Keep processed pixels inside lung mask, zero elsewhere = add mask to iamges
    processed_img = cv2.bitwise_and(img_normalized, img_normalized, mask=mask_binary)

    # Save the processed image (convert back to 8-bit for saving)
    save_img = (processed_img * 255).astype(np.uint8)

    # optional resize of precessed images:
    if flag_resize == True:
        img_resized = cv2.resize(save_img, (size, size))

        # Save processed and resized images
        out_dir = os.path.join(out_path_norm_mask + "_resized_"+str(size)+"_"+str(size), row['label'])
        os.makedirs(out_dir, exist_ok=True) 
        save_path = os.path.join(out_dir ,row['file'])
        cv2.imwrite(save_path, img_resized)
    else:
        # Save processed image
        out_dir = os.path.join(out_path_norm_mask, row['label'])
        os.makedirs(out_dir, exist_ok=True) 
        save_path = os.path.join(out_dir, row['file'])
        cv2.imwrite(save_path, save_img)




### do data augmentatio & create a new DF with orig + augmented data

'..\\\\data\\\\processed\\normalized_with_masks_NOfilter'

In [40]:
# get infos which is the majority class
counts = df_train['label'].value_counts()

# assume that the normal class is the majority class

max_num = counts['Normal']

iteration_per_class = {}

for class_name in classes_aug:
    class_num = counts[class_name]

    div_res = round(max_num/class_num,0)    # calculate how many augmentation iteration we need
    iteration = div_res - 1                 # it needs one iteration less, because we add the unaugmented data
    
    iteration_per_class.update({class_name: iteration})

In [42]:
# Define augmentation pipeline

# define probabilities of applying the different augmentation methods
prob_rotate = 0.5    # probaility of using rotation
prob_shift = 0.5     # probaility of using shifting
prob_pipeline = 1.0  # The entire pipeline has a 100% chance to be applied. We want every image to be changed


# Define the augmentation pipeline
augmentation_pipeline = A.Compose([
    A.OneOf([
        A.Rotate(limit=15, p=prob_rotate),                                                 # Rotate with a limit of ±15 degrees
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0, rotate_limit=0, p=prob_shift)  # Translation (shifting)
    ], p=prob_pipeline)

], p=1.0,                    # probability of the entire pipeline to be applied
#seed = 137,                  # added seed for reproducibility  
save_applied_params=True)    # save applied transformation parameters

  original_init(self, **validated_kwargs)


In [44]:
out_path_norm_mask

out_path_norm_mask + "_resized_"+str(size)+"_"+str(size)

'..\\\\data\\\\processed\\normalized_with_masks_NOfilter_resized_128_128'

In [45]:
out_path_norm_mask

'..\\\\data\\\\processed\\normalized_with_masks_NOfilter'

In [None]:
# create a DF with paths to just created files: 
df_train_processed = df_train
# change folder name for imput images ( we want a df with paths to raw-data)
df_train_processed['path'] = df_train['path'].apply(lambda x: os.path.join(out_path_norm_mask, df_train['label'])) 

# hier muss ins feld path der pfad zu den gerade prozessierten bildern

# und alternative falls resized mit of programmieren



TypeError: join() argument must be str, bytes, or os.PathLike object, not 'Series'

In [None]:


# create data_frame which is filled with the infos to the augmented iamges:
df_train_augmented = pd.DataFrame(columns=df_train.columns)
#df_train_augmented.columns


# loop through rows of df_train
for index, row in df_train.iterrows():
    # only do augmentation if class is not 'Normal'
    if row['label'] in classes_aug:

        # define output path for augmented image
        augmented_dir = os.path.join(output_path, row['label'])
        os.makedirs(augmented_dir, exist_ok=True) 


        # how many iterations of augmentation do we need?
        num_iter = iteration_per_class[row['label']]

        file = os.path.join(row['path'], row['file'])
        
        iter = 1
        # for each file in the data frame to as many augmentation, as needed for this class
        
        img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
        if img is not None:
            while iter <= num_iter: 
                      
                augmented = augmentation_pipeline(image=img)
                augmented_img = augmented['image']
                
                # save used transformation to dict
                #used_transformation.update({img_name:augmented['applied_transforms']})
                        
                # define name for augmented image
                split_name, split_ending = row['file'].split('.') 
                img_name_aug = split_name + "_ag" + str(iter) + "." + split_ending

                # save augmented image
                save_path = os.path.join(augmented_dir, img_name_aug)
                cv2.imwrite(save_path, augmented_img)

                # build a new row for the new dataframe df_train_augmented:
                new_row_label = row['label']
                new_row_file = img_name_aug
                new_row_label_enc = row['label_enc']
                new_row_path = augmented_dir
                
                new_row = {'label': new_row_label, 
                        'file': new_row_file, 
                        'label_enc': new_row_label_enc, 
                        'path': new_row_path }

                #df_train_augmented= df_train_augmented.append(new_row, ignore_index = True)
                df_train_augmented = pd.concat([df_train_augmented,pd.DataFrame([new_row])], ignore_index = True, axis = 0)

                iter +=1

In [None]:

    #     # Append normalized and resized image to the list
    #     X.append(img_normalized)

    #     # Extract label from filename 
    #     label = filename.split('_')[0]
    #     y.append(label)


    #     if flag_resize == True:
    #         # Resize image to 128x128
    #         img_resized = cv2.resize(img, (128, 128))


    # # only do augmentation if class is not 'Normal'
    # if row['label'] in classes_aug:

    #     # define output path for augmented image
    #     augmented_dir = os.path.join(output_path, row['label'])
    #     os.makedirs(augmented_dir, exist_ok=True) 


    #     # how many iterations of augmentation do we need?
    #     num_iter = iteration_per_class[row['label']]

    #     file = os.path.join(row['path'], row['file'])
        
    #     iter = 1
    #     # for each file in the data frame to as many augmentation, as needed for this class
        
    #     img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
    #     if img is not None:
    #         while iter <= num_iter: 
                      
    #             augmented = augmentation_pipeline(image=img)
    #             augmented_img = augmented['image']
                
    #             # save used transformation to dict
    #             #used_transformation.update({img_name:augmented['applied_transforms']})
                        
    #             # define name for augmented image
    #             split_name, split_ending = row['file'].split('.') 
    #             img_name_aug = split_name + "_ag" + str(iter) + "." + split_ending

    #             # save augmented image
    #             save_path = os.path.join(augmented_dir, img_name_aug)
    #             cv2.imwrite(save_path, augmented_img)

    #             # build a new row for the new dataframe df_train_augmented:
    #             new_row_label = row['label']
    #             new_row_file = img_name_aug
    #             new_row_label_enc = row['label_enc']
    #             new_row_path = augmented_dir
                
    #             new_row = {'label': new_row_label, 
    #                     'file': new_row_file, 
    #                     'label_enc': new_row_label_enc, 
    #                     'path': new_row_path }

    #             #df_train_augmented= df_train_augmented.append(new_row, ignore_index = True)
    #             df_train_augmented = pd.concat([df_train_augmented,pd.DataFrame([new_row])], ignore_index = True, axis = 0)

    #             iter +=1

### save as npz

## test data: do the same as with train data but NO data augmentation