In [1]:
import cv2
import os
import numpy as np
#import matplotlib.pyplot as plt
import albumentations as A
import pandas as pd
from tqdm import tqdm

  check_for_updates()


In [2]:
# Folder layout used by this notebook
# (raw string: the doubled backslashes are kept literally in the value)
base_path = r"..\\data\\"
# folder the input csv files are read from; csv/npz results are written here as well
base_path_out = os.path.join(base_path, "processed")

# folder the augmented images are written to
output_path = os.path.join(base_path_out, "augmented_without_masks_resized")


# Classes that need augmentation
classes_aug = ["COVID", "Viral Pneumonia", "Lung_Opacity"]

## create train data set

In [3]:
# Load the training data frame (labels, encoded labels, file names and paths of the
# preprocessed and normalized images); the first csv column is used as the index.
train_csv_path = os.path.join(base_path_out, "df_xray_processed_normed_enc_train.csv")
df_train = pd.read_csv(train_csv_path, sep=',', index_col=0)

# Point the paths at the folder holding the resized input images
df_train['path'] = df_train['path'].map(
    lambda folder: folder.replace('normalized_xrays', 'resized_and_normalized_images_without_masks')
)

In [4]:
# Workaround for the mixed-up folder structure: there are no subfolders for the 4 classes.
# NOTE: this cell rewrites df_train in place, so running it twice prefixes the
# file names twice and strips one more folder level from the paths.

# prepend everything before the first "-" of the file name (the class name) as a prefix
df_train['file'] = [f'{name.partition("-")[0]}_{name}' for name in df_train['file']]

# drop the last path component (the class subfolder)
df_train['path'] = [folder.rpartition(os.sep)[0] for folder in df_train['path']]



In [6]:
# Work out how many augmentation passes each class in classes_aug needs.
counts = df_train['label'].value_counts()

# The 'Normal' class is assumed to be the majority class; its size is the target.
max_num = counts['Normal']

iteration_per_class = {}

for class_name in classes_aug:
    class_num = counts[class_name]

    # size ratio relative to 'Normal', rounded to a whole factor
    div_res = round(max_num/class_num, 0)

    # One pass fewer than the factor, because the unaugmented images are kept too.
    # round(x, 0) yields a float, so convert to int, and clamp at 0 so a class that
    # is much larger than 'Normal' never ends up with a negative pass count.
    iteration = max(int(div_res) - 1, 0)

    iteration_per_class[class_name] = iteration


    






In [8]:
# Augmentation pipeline: one geometric transform (rotation or shift) per image

# p values handed to the individual transforms and to the OneOf group.
# Per the albumentations OneOf documentation, the p values of the members act as
# relative selection weights inside OneOf.
prob_rotate = 0.5    # p of the rotation transform
prob_shift = 0.5     # p of the shift transform
prob_pipeline = 1.0  # p of the OneOf group: 100%, because every image should be changed

# candidate transforms, one of which is picked by OneOf
geometric_transforms = [
    # rotation with a limit of ±15 degrees
    A.Rotate(limit=15, p=prob_rotate),
    # translation only: scale and rotation limits are set to 0
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0, rotate_limit=0, p=prob_shift),
]

augmentation_pipeline = A.Compose(
    [A.OneOf(geometric_transforms, p=prob_pipeline)],
    p=1.0,                       # probability of the entire pipeline to be applied
    # seed=137 is intentionally left disabled here (would make the run reproducible)
    save_applied_params=True,    # save applied transformation parameters
)


  original_init(self, **validated_kwargs)


In [9]:
# Create the augmented images and a data frame (df_train_augmented) describing them.
#
# For every row of df_train whose label is in classes_aug, the source image is read
# once and iteration_per_class[label] augmented copies are written to
# <output_path>/<label>/<original name>_ag<k>.<ext>.
#
# The rows are collected in a list and converted to a DataFrame once at the end:
# growing a DataFrame with pd.concat inside the loop copies the whole frame on
# every iteration, and starting from an empty object-dtype frame leaves the
# resulting column dtypes up to pandas' concat rules.
augmented_records = []
unreadable_files = []   # source images cv2 could not read
failed_writes = []      # augmented images cv2 could not write

# only the classes listed in classes_aug are augmented ('Normal' is left as is)
rows_to_augment = df_train[df_train['label'].isin(classes_aug)]

for _, row in rows_to_augment.iterrows():
    label = row['label']

    # output folder for the augmented images of this class
    augmented_dir = os.path.join(output_path, label)
    os.makedirs(augmented_dir, exist_ok=True)

    # number of augmented copies needed per image of this class
    num_iter = int(iteration_per_class[label])

    file = os.path.join(row['path'], row['file'])
    img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None
        unreadable_files.append(file)
        continue

    # splitext keeps working for file names that contain more than one dot
    split_name, split_ending = os.path.splitext(row['file'])

    for aug_idx in range(1, num_iter + 1):
        augmented_img = augmentation_pipeline(image=img)['image']

        # name and location of the augmented image
        img_name_aug = f"{split_name}_ag{aug_idx}{split_ending}"
        save_path = os.path.join(augmented_dir, img_name_aug)

        # only register images that were really written, so that the data frame
        # never points at a file that does not exist
        if not cv2.imwrite(save_path, augmented_img):
            failed_writes.append(save_path)
            continue

        augmented_records.append({'label': label,
                                  'file': img_name_aug,
                                  'label_enc': row['label_enc'],
                                  'path': augmented_dir})

# same column layout as df_train, so both frames can be concatenated afterwards
df_train_augmented = pd.DataFrame(augmented_records, columns=df_train.columns)

if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable source images, e.g. {unreadable_files[0]}")
if failed_writes:
    print(f"Could not write {len(failed_writes)} augmented images, e.g. {failed_writes[0]}")

In [10]:
# number of augmented images created per class
df_train_augmented.loc[:, 'label'].value_counts()

label
Viral Pneumonia    7532
COVID              5786
Lung_Opacity       4809
Name: count, dtype: int64

In [12]:
# Stack the original (non augmented) rows and the augmented rows into one frame;
# the combined frame gets a fresh 0..n-1 index.
frames_to_stack = [df_train, df_train_augmented]
df_train_combined = pd.concat(frames_to_stack, axis=0, ignore_index=True)

In [13]:
# class balance after adding the augmented images
df_train_combined.loc[:, 'label'].value_counts()

label
Lung_Opacity       9618
COVID              8679
Viral Pneumonia    8608
Normal             8154
Name: count, dtype: int64

In [14]:
# last five rows of the training frame without the augmented rows
df_train.tail(5)

Unnamed: 0_level_0,label,file,label_enc,path
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19581,Lung_Opacity,Lung_Opacity_Lung_Opacity-5774.png,1,..\\data\\processed\resized_and_normalized_ima...
6967,Normal,NORMAL_NORMAL-3352.png,2,..\\data\\processed\resized_and_normalized_ima...
12317,Normal,NORMAL_NORMAL-8702.png,2,..\\data\\processed\resized_and_normalized_ima...
21004,Viral Pneumonia,Viral Pneumonia_Viral Pneumonia-1185.png,3,..\\data\\processed\resized_and_normalized_ima...
14804,Lung_Opacity,Lung_Opacity_Lung_Opacity-997.png,1,..\\data\\processed\resized_and_normalized_ima...


In [15]:
# Save the data frame with the combined (original + augmented) training data to csv.
# The combined frame has to be written here: df_train alone does not contain the
# augmented rows that the file name promises.
combined_csv_path = os.path.join(base_path_out, 'df_xray_train_norm_plus_augmented_without_masks_resized.csv')
df_train_combined.to_csv(combined_csv_path, index_label='index')

In [16]:
# Convert the training images to flattened vectors and save them together with
# the encoded labels as a compressed npz file.

df_train_combined['image_path'] = df_train_combined.apply(lambda row: os.path.join(row['path'], row['file']), axis=1)

image_data = []
loaded_flags = []   # one entry per row: True if the image could be read
for path in tqdm(df_train_combined['image_path'], desc="Loading and processing train images"):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for unreadable files, so the check has to happen
    # before the image is reshaped
    if img is None:
        loaded_flags.append(False)
        continue
    image_data.append(img.reshape(-1))  # Flatten image to 1D vector
    loaded_flags.append(True)

loaded_mask = np.array(loaded_flags, dtype=bool)
num_skipped = int((~loaded_mask).sum())
if num_skipped:
    print(f"Skipped {num_skipped} unreadable train images")

# Convert to NumPy array
X_train = np.array(image_data, dtype=np.uint8)
# keep only the labels of the images that were loaded, so X and y stay aligned
y_train = df_train_combined['label_enc'].to_numpy()[loaded_mask]
assert len(X_train) == len(y_train), "images and labels are out of sync"

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Save the resized and flattened training data
np.savez_compressed(os.path.join(base_path_out, 'train_data_resized_without_masks.npz'), X_train=X_train, y_train=y_train)
print("Resized and flattened train images without masks have been saved!")

Loading and processing train images: 100%|██████████| 35059/35059 [07:16<00:00, 80.25it/s] 


X_train shape: (35059, 400)
y_train shape: (35059,)
Resized and flattened train images without masks have been saved!


## create test data set

In [17]:
# Load the test data frame (labels, encoded labels, file names and paths of the
# preprocessed and normalized images); the first csv column is used as the index.
test_csv_path = os.path.join(base_path_out, "df_xray_processed_normed_enc_test.csv")
df_test = pd.read_csv(test_csv_path, sep=',', index_col=0)

# Point the paths at the folder holding the resized input images
df_test['path'] = df_test['path'].map(
    lambda folder: folder.replace('normalized_xrays', 'resized_and_normalized_images_without_masks')
)

In [18]:
# Workaround for the mixed-up folder structure: there are no subfolders for the 4 classes.
# NOTE: this cell rewrites df_test in place, so running it twice prefixes the
# file names twice and strips one more folder level from the paths.

# prepend everything before the first "-" of the file name (the class name) as a prefix
df_test['file'] = [f'{name.partition("-")[0]}_{name}' for name in df_test['file']]

# drop the last path component (the class subfolder)
df_test['path'] = [folder.rpartition(os.sep)[0] for folder in df_test['path']]


In [19]:
# Write the test data frame to csv; the index column is labelled 'index'
test_csv_out = os.path.join(base_path_out, 'df_xray_test_norm_without_masks_resized.csv')
df_test.to_csv(test_csv_out, index_label='index')

In [20]:
# Convert the test images to flattened vectors and save them together with
# the encoded labels as a compressed npz file.

df_test['image_path'] = df_test.apply(lambda row: os.path.join(row['path'], row['file']), axis=1)

image_data = []
loaded_flags = []   # one entry per row: True if the image could be read
for path in tqdm(df_test['image_path'], desc="Loading and processing test images"):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for unreadable files, so the check has to happen
    # before the image is reshaped
    if img is None:
        loaded_flags.append(False)
        continue
    image_data.append(img.reshape(-1))  # Flatten image to 1D vector
    loaded_flags.append(True)

loaded_mask = np.array(loaded_flags, dtype=bool)
num_skipped = int((~loaded_mask).sum())
if num_skipped:
    print(f"Skipped {num_skipped} unreadable test images")

# Convert to NumPy array
X_test = np.array(image_data, dtype=np.uint8)
# keep only the labels of the images that were loaded, so X and y stay aligned
y_test = df_test['label_enc'].to_numpy()[loaded_mask]
assert len(X_test) == len(y_test), "images and labels are out of sync"

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Save the resized and flattened test data
np.savez_compressed(os.path.join(base_path_out, 'test_data_resized_without_masks.npz'), X_test=X_test, y_test=y_test)
print("Resized and flattened test images without masks have been saved!")

Loading and processing test images: 100%|██████████| 4233/4233 [01:24<00:00, 50.12it/s]


X_test shape: (4233, 400)
y_test shape: (4233,)
Resized and flattened test images without masks have been saved!
