In [None]:
import cv2
import os
#import numpy as np
#import matplotlib.pyplot as plt
import albumentations as A
import pandas as pd

In [None]:
# Define paths 
base_path = r"..\\data\\"
base_path_out = os.path.join(base_path, "processed")   # path to read input csv-file from

output_path = os.path.join(base_path_out, "augmented") # path to write the


# Define classes which need augmentation: 
classes_aug = ["COVID", "Viral Pneumonia", "Lung_Opacity"]

In [None]:
# read csv with data frame which contains infos to preprocessed  and normalized images and labels and encoded labels
df_train = pd.read_csv(os.path.join(base_path_out,"df_xray_processed_normed_enc_train.csv"), sep=',', index_col=0)

In [None]:
# get infos which is the majority class
counts = df_train['label'].value_counts()

# assume that the normal class is the majority class

max_num = counts['Normal']

iteration_per_class = {}

for class_name in classes_aug:
    class_num = counts[class_name]

    div_res = round(max_num/class_num,0)    # calculate how many augmentation iteration we need
    iteration = div_res - 1                 # it needs one iteration less, because we add the unaugmented data
    
    iteration_per_class.update({class_name: iteration})


    






In [39]:
# Define augmentation pipeline

# define probabilities of applying the different augmentation methods
prob_rotate = 0.5    # probaility of using rotation
prob_shift = 0.5     # probaility of using shifting
prob_pipeline = 1.0  # The entire pipeline has a 100% chance to be applied. We want every image to be changed


# Define the augmentation pipeline
augmentation_pipeline = A.Compose([
    A.OneOf([
        A.Rotate(limit=15, p=prob_rotate),                                                 # Rotate with a limit of ±15 degrees
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0, rotate_limit=0, p=prob_shift)  # Translation (shifting)
    ], p=prob_pipeline)

], p=1.0,                    # probability of the entire pipeline to be applied
#seed = 137,                  # added seed for reproducibility  
save_applied_params=True)    # save applied transformation parameters


  original_init(self, **validated_kwargs)


In [60]:
# create data_frame which is filled with the infos to the augmented iamges:
df_train_augmented = pd.DataFrame(columns=df_train.columns)
#df_train_augmented.columns


# loop through rows of df_train
for index, row in df_train.iterrows():
    # only do augmentation if class is not 'Normal'
    if row['label'] in classes_aug:

        # define output path for augmented image
        augmented_dir = os.path.join(output_path, row['label'])
        os.makedirs(augmented_dir, exist_ok=True) 


        # how many iterations of augmentation do we need?
        num_iter = iteration_per_class[row['label']]

        file = os.path.join(row['path'], row['file'])
        
        iter = 1
        # for each file in the data frame to as many augmentation, as needed for this class
        
        img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
        if img is not None:
            while iter <= num_iter: 
                      
                augmented = augmentation_pipeline(image=img)
                augmented_img = augmented['image']
                
                # save used transformation to dict
                #used_transformation.update({img_name:augmented['applied_transforms']})
                        
                # define name for augmented image
                split_name, split_ending = row['file'].split('.') 
                img_name_aug = split_name + "_ag" + str(iter) + "." + split_ending

                # save augmented image
                save_path = os.path.join(augmented_dir, img_name_aug)
                cv2.imwrite(save_path, augmented_img)

                # build a new row for the new dataframe df_train_augmented:
                new_row_label = row['label']
                new_row_file = img_name_aug
                new_row_label_enc = row['label_enc']
                new_row_path = augmented_dir
                
                new_row = {'label': new_row_label, 
                        'file': new_row_file, 
                        'label_enc': new_row_label_enc, 
                        'path': new_row_path }

                #df_train_augmented= df_train_augmented.append(new_row, ignore_index = True)
                df_train_augmented = pd.concat([df_train_augmented,pd.DataFrame([new_row])], ignore_index = True, axis = 0)

                iter +=1

In [61]:
df_train_augmented['label'].value_counts()

label
Viral Pneumonia    7532
COVID              5786
Lung_Opacity       4809
Name: count, dtype: int64

In [63]:
# concat df with non augmented and augmented data: 
 
df_train_combined = pd.concat([df_train, df_train_augmented], ignore_index = True, axis = 0)

In [64]:
df_train_combined['label'].value_counts()

label
Lung_Opacity       9618
COVID              8679
Viral Pneumonia    8608
Normal             8154
Name: count, dtype: int64

In [65]:
df_train_combined.tail()

Unnamed: 0,label,file,label_enc,path
35054,Viral Pneumonia,Viral Pneumonia-1185_ag4.png,3,..\\data\\processed\augmented\Viral Pneumonia
35055,Viral Pneumonia,Viral Pneumonia-1185_ag5.png,3,..\\data\\processed\augmented\Viral Pneumonia
35056,Viral Pneumonia,Viral Pneumonia-1185_ag6.png,3,..\\data\\processed\augmented\Viral Pneumonia
35057,Viral Pneumonia,Viral Pneumonia-1185_ag7.png,3,..\\data\\processed\augmented\Viral Pneumonia
35058,Lung_Opacity,Lung_Opacity-997_ag1.png,1,..\\data\\processed\augmented\Lung_Opacity


In [None]:
# save Df with combined data to csv
df_train_combined.to_csv(os.path.join(base_path_out, 'df_xray_train_norm_plus_augmented.csv'), index_label='index')