# Data preprocessing

In [None]:
# Reduce TensorFlow's console noise. This cell runs before the cell that
# imports TensorFlow/Keras, so the variables are already in the environment
# when the library is loaded.
import os

os.environ.update({
    'TF_XLA_FLAGS': '--tf_xla_enable_xla_devices',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

In [None]:
import numpy as np

from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.image import img_to_array, load_img
from keras.utils import to_categorical

## Configuration

In [None]:
# Root folder for everything this notebook reads or writes.
DATA_DIR = "data"

# Source images and the folder holding the cached .npy arrays.
# Note the trailing slash on DATA_NPY_DIR; it is part of the value.
DATA_JPG_DIR = f"{DATA_DIR}/augmented/jpg_v02"
DATA_NPY_DIR = f"{DATA_DIR}/npy/"

# Size every image is resized to when it is loaded.
IMAGE_WIDTH, IMAGE_HEIGHT = 64, 64

# Fraction of the samples held out as the test set.
TEST_SIZE = 0.2

# Width of the one-hot label vectors.
NUM_CLASSES = 52

# True  -> rebuild the arrays from the JPG files and save them as .npy.
# False -> load the previously saved .npy files instead.
CREATE_DATASET = True

## Create dataset

In [None]:
# Module-level lists of image file paths and their matching labels.
image_paths = []
labels = []

def preprocess_image(image_path):
    """Load one image, resize it and scale its pixel values by 1/255.

    Args:
        image_path: Path of the image file to load.

    Returns:
        NumPy array of the image resized to IMAGE_HEIGHT x IMAGE_WIDTH,
        with every value divided by 255.0.
    """
    # Keras' load_img takes target_size as (height, width); passing
    # (width, height) would transpose the resize for non-square sizes.
    img = load_img(image_path, target_size=(IMAGE_HEIGHT, IMAGE_WIDTH))
    img_array = img_to_array(img) / 255.0  # Normalize pixel values
    return img_array

In [None]:
if CREATE_DATASET:

    # Start from empty lists so re-running this cell does not append the
    # same files a second time.
    image_paths = []
    labels = []

    # Each sub-directory of DATA_JPG_DIR is one class; its name is the label.
    # Listings are sorted because os.listdir() returns entries in arbitrary
    # order, and the seeded split below is only repeatable for a fixed
    # input order.
    for label in sorted(os.listdir(DATA_JPG_DIR)):
        label_dir = os.path.join(DATA_JPG_DIR, label)
        # Ignore stray files and hidden entries (e.g. .DS_Store,
        # .ipynb_checkpoints) that are not class folders.
        if label.startswith('.') or not os.path.isdir(label_dir):
            continue
        for image_file in sorted(os.listdir(label_dir)):
            image_path = os.path.join(label_dir, image_file)
            if image_file.startswith('.') or not os.path.isfile(image_path):
                continue
            image_paths.append(image_path)
            labels.append(label)

    if not image_paths:
        raise ValueError('No images found under ' + DATA_JPG_DIR)

    # Stratified split of the file paths, so images are only decoded once
    # it is known which set they belong to.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        image_paths, labels, test_size=TEST_SIZE, random_state=42, stratify=labels)

    # Encode string labels as integers
    label_encoder = LabelEncoder()
    label_encoder.fit(y_train_val)
    y_train_val_encoded = label_encoder.transform(y_train_val)
    y_test_encoded = label_encoder.transform(y_test)

    # Convert integer labels to one-hot vectors of width NUM_CLASSES.
    # NUM_CLASSES must be at least the number of label folders found.
    y_train_val = to_categorical(y_train_val_encoded, NUM_CLASSES)
    y_test = to_categorical(y_test_encoded, NUM_CLASSES)

    # Replace the path lists with the decoded, normalized image arrays
    X_train_val = np.array([preprocess_image(image_path) for image_path in X_train_val])
    X_test = np.array([preprocess_image(image_path) for image_path in X_test])

    classes = label_encoder.classes_

    # Save the preprocessed data; create the target folder on first run
    # because np.save does not create missing directories.
    os.makedirs(DATA_NPY_DIR, exist_ok=True)
    np.save(os.path.join(DATA_NPY_DIR, 'X_train_val.npy'), X_train_val)
    np.save(os.path.join(DATA_NPY_DIR, 'X_test.npy'), X_test)
    np.save(os.path.join(DATA_NPY_DIR, 'y_train_val.npy'), y_train_val)
    np.save(os.path.join(DATA_NPY_DIR, 'y_test.npy'), y_test)
    np.save(os.path.join(DATA_NPY_DIR, 'classes.npy'), classes)

else:
    # Reuse the arrays written by a previous run with CREATE_DATASET = True
    X_train_val = np.load(os.path.join(DATA_NPY_DIR, 'X_train_val.npy'))
    X_test = np.load(os.path.join(DATA_NPY_DIR, 'X_test.npy'))
    y_train_val = np.load(os.path.join(DATA_NPY_DIR, 'y_train_val.npy'))
    y_test = np.load(os.path.join(DATA_NPY_DIR, 'y_test.npy'))
    # NOTE(review): allow_pickle=True can execute arbitrary code when the
    # file is untrusted. Only load a classes.npy produced by this notebook;
    # consider dropping the flag if the saved array is a plain string array.
    classes = np.load(os.path.join(DATA_NPY_DIR, 'classes.npy'), allow_pickle=True)

In [None]:
# Report how many samples ended up in each set and how many classes exist.
for caption, array in (
    ('Train and validation set:', X_train_val),
    ('Test set:                ', X_test),
    ('Classes:                 ', classes),
):
    print(caption, array.shape[0])