In [None]:
# create_small_dataset_nonaugmented.ipynb

# This notebook creates a dataset which contains the resized images 128 x 128 pixel and has no augmented data.

**input:**    
../data/processed/df_xray_processed_normed_enc_test.cvs   (is created by train_test_split.ipynb)  
../data/processed/df_xray_processed_normed_enc_train.cvs (is created by train_test_split.ipynb)

It takes the filtered and normalized images which have masks included from folder \normalized_xrays

In [1]:
import cv2
import os
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd

## Some definitions which have to be set by the user:

In [None]:
# Define paths 
base_path = r"..\\data\\"
base_path_out = os.path.join(base_path, "processed")   # path to read input csv-file from

#output_path = os.path.join(base_path_out, "non_augmented_with_masks_resized_128_128") # path to write the


# define number of pixel to which the images should be resized: 
num_pixel = 128

# define number of images per class which the new smaller dataset should contain: 
num_subset_train = 1000

# Define classes which need augmentation: 
#classes_aug = ["COVID", "Viral Pneumonia", "Lung_Opacity"]

## images with masks

In [3]:
# read csv with data frame which contains infos to preprocessed  and normalized images and labels and encoded labels
df_train = pd.read_csv(os.path.join(base_path_out,"df_xray_processed_normed_enc_train.csv"), sep=',', index_col=0)
df_test = pd.read_csv(os.path.join(base_path_out,"df_xray_processed_normed_enc_test.csv"), sep=',', index_col=0)

In [4]:
# get num_subset images for each class

num_subset_test  = df_test['label'].value_counts()['Viral Pneumonia'] # for test set take as many images as the smallest class has

#print(df_y_train.columns)

df_train_0 = df_train[(df_train['label_enc']==0)].sample(n=num_subset_train, replace = False, random_state = 42, axis = 0)

In [5]:
# get num_subset images for each class

num_subset_test  = df_test['label'].value_counts()['Viral Pneumonia'] # for test set take as many images as the smallest class has

# train
df_train_0 = df_train[(df_train['label_enc']==0)].sample(n=num_subset_train, replace = False, random_state = 42, axis = 0)
df_train_1 = df_train[(df_train['label_enc']==1)].sample(n=num_subset_train, replace = False, random_state = 42, axis = 0)
df_train_2 = df_train[(df_train['label_enc']==2)].sample(n=num_subset_train, replace = False, random_state = 42, axis = 0)
df_train_3 = df_train[(df_train['label_enc']==3)].sample(n=num_subset_train, replace = False, random_state = 42, axis = 0)

df_train_subset = pd.concat([df_train_0, df_train_1, df_train_2, df_train_3], axis=0)

# test
df_test_0 = df_test[(df_test['label_enc']==0)].sample(n=num_subset_test, replace = False, random_state = 42, axis = 0)
df_test_1 = df_test[(df_test['label_enc']==1)].sample(n=num_subset_test, replace = False, random_state = 42, axis = 0)
df_test_2 = df_test[(df_test['label_enc']==2)].sample(n=num_subset_test, replace = False, random_state = 42, axis = 0)
df_test_3 = df_test[(df_test['label_enc']==3)].sample(n=num_subset_test, replace = False, random_state = 42, axis = 0)

df_test_subset = pd.concat([df_test_0, df_test_1, df_test_2, df_test_3], axis=0)


ValueError: Cannot take a larger sample than population when 'replace=False'

In [11]:
# Build image paths for train and test data
df_train_subset['image_path'] = df_train_subset.apply(lambda row: os.path.normpath(os.path.join(os.getcwd(), row['path'], row['file'])), axis=1)
df_test_subset['image_path'] = df_test_subset.apply(lambda row: os.path.normpath(os.path.join(os.getcwd(), row['path'], row['file'])), axis=1)


# Test

# Load and preprocess the test images (resize and flatten)
test_image_data = []
for index, row in df_test_subset.iterrows():
    img = cv2.imread(row['image_path'], cv2.IMREAD_GRAYSCALE)
    img_resized = cv2.resize(img, (num_pixel, num_pixel))  # Resize image
    img_flattened = img_resized.reshape(-1)  # Flatten image to 1D vector
    if img_flattened is not None:
        test_image_data.append(img_flattened) 

# Convert to NumPy array
X_test = np.array(test_image_data, dtype=np.uint8)
y_test = df_test_subset['label_enc'].to_numpy()

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Save the resized and flattened test data
np.savez_compressed(os.path.join(base_path_out, 'test_non_augmented_with_masks_resized_128_128.npz'), X_test=X_test, y_test=y_test)
print("Resized and flattened test images have been saved!")


# Train

# Load and preprocess the train images (resize and flatten)
train_image_data = []
for index, row in df_train_subset.iterrows():
    img = cv2.imread(row['image_path'], cv2.IMREAD_GRAYSCALE)
    img_resized = cv2.resize(img, (num_pixel, num_pixel))  # Resize image
    img_flattened = img_resized.reshape(-1)  # Flatten image to 1D vector
    if img_flattened is not None:
        train_image_data.append(img_flattened) 

# Convert to NumPy array
X_train = np.array(train_image_data, dtype=np.uint8)
y_train = df_train_subset['label_enc'].to_numpy()

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Save the resized and flattened train data
np.savez_compressed(os.path.join(base_path_out, 'train_non_augmented_with_masks_resized_128_128.npz'), X_train=X_train, y_train=y_train)
print("Resized and flattened train images have been saved!")



X_test shape: (1076, 16384)
y_test shape: (1076,)
Resized and flattened test images have been saved!
X_train shape: (4000, 16384)
y_train shape: (4000,)
Resized and flattened train images have been saved!
