In [None]:
#Import the cleaned dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

file_path = 'final_filtered_training_set.npz'

# Read both arrays while the archive is open; the context manager closes it.
with np.load(file_path) as data:
    training_images, training_labels = data['images'], data['labels']

# Sanity check: shape and container type of what was just loaded.
print(
    f"Shape of images: {training_images.shape}",
    f"type of images: {type(training_images)}",
    f"shape of labels: {training_labels.shape}",
    f"type of labels: {type(training_labels)}",
    sep="\n",
)

# One-column DataFrame holding the flattened labels, used for the class statistics.
labels = training_labels.flatten()
df_labels = pd.DataFrame({'label': labels})
print(df_labels.head())

In [None]:
# Absolute count and percentage share of every class in the full label set.
class_counts = df_labels['label'].value_counts()
class_percent = class_counts.mul(100).div(len(df_labels))
print(class_percent.values)

# Bar chart of the percentage per class (class labels rendered as strings).
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=class_percent.index.astype(str),
    y=class_percent.values,
    palette='magma',
)
ax.set_title('Class distributions')
ax.set_xlabel('Class')
ax.set_ylabel('Percentage (%)')
plt.xticks(rotation=45)
plt.show()

### Train, validation, test split


In [None]:
# train test val split

from sklearn.model_selection import train_test_split

# Split fractions
test_size = 0.2        # hold out 20% of all samples for testing
validation_size = 0.2  # 20% of the remaining 80%, i.e. 16% of all samples

# Flatten the labels to 1-D before splitting.
training_labels = training_labels.flatten()

# Stage 1: separate the test set from the (training + validation) pool,
# keeping the class proportions of the full label set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    training_images, training_labels,
    test_size=test_size, random_state=42, stratify=training_labels,
)

# Stage 2: carve the validation set out of the remaining pool, again stratified.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=validation_size, random_state=42, stratify=y_train_val,
)

# Pixel range of the raw data, needed to decide how to normalise it.
print("Training Images - Min pixel value:", X_train.min())
print("Training Images - Max pixel value:", X_train.max())
print("Test Images - Min pixel value:", X_test.min())
print("Test Images - Max pixel value:", X_test.max())

# TODO: normalise the pixel values here.
# NOTE(review): InceptionResNetV2 reportedly expects inputs scaled to [-1, 1];
# `tf.keras.applications.inception_resnet_v2.preprocess_input` looks like the
# intended helper for that - confirm against the Keras docs before using it.

# Sizes of the three resulting subsets.
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")


In [None]:
def plot_class_distribution(labels, subset_name):
    """Show a count plot of the class labels of one data subset.

    Args:
        labels: Sequence of class labels, one entry per sample.
        subset_name: Name of the subset, inserted into the figure title.
    """
    frame = pd.DataFrame({'Label': labels})
    plt.figure(figsize=(10, 6))
    ax = sns.countplot(data=frame, x='Label', palette='viridis')
    ax.set_title(f'Class Distribution in {subset_name} Set')
    ax.set_xlabel('Class')
    ax.set_ylabel('Number of Images')
    plt.xticks(rotation=45)
    plt.show()


# One figure per subset, in training / validation / test order.
for subset_labels, subset_title in (
    (y_train, 'Training'),
    (y_val, 'Validation'),
    (y_test, 'Test'),
):
    plot_class_distribution(subset_labels, subset_title)


# Balancing the training set by oversampling

In [None]:
# OVERSAMPLING
# Balance the training set by duplicating samples of the smaller classes until
# every class has as many samples as the largest one. Only the training split
# is touched; validation and test data are left as they are.

import numpy as np

# Seeded generator so the balanced set is identical on every run
# (same seed value as the random_state used for the train/val/test split).
oversampling_rng = np.random.default_rng(42)

# Samples per class, computed on y_train only (not on the full label set).
# `class_counts[j]` is the number of training samples with class label `j`.
class_counts = pd.Series(y_train).value_counts()

max_count = class_counts.max()
print(f"Maximum class count: {max_count}")

# Extra samples collected per class; concatenated once after the loop.
X_train_oversampled = []
y_train_oversampled = []

for class_label in class_counts.index:
    # Select the samples of the current class (mask computed once, used twice).
    class_mask = y_train == class_label
    X_class = X_train[class_mask]
    y_class = y_train[class_mask]

    # Number of additional samples required to reach the largest class.
    samples_needed = int(max_count) - len(X_class)
    if samples_needed <= 0:
        continue

    # Whole copies of the class first, then a partial top-up.
    duplicates, remainder = divmod(samples_needed, len(X_class))
    for _ in range(duplicates):
        X_train_oversampled.append(X_class)
        y_train_oversampled.append(y_class)

    if remainder > 0:
        # remainder < len(X_class) by construction, so the top-up can be drawn
        # without replacement: no image is repeated more often than necessary.
        indices = oversampling_rng.choice(len(X_class), size=remainder, replace=False)
        X_train_oversampled.append(X_class[indices])
        y_train_oversampled.append(y_class[indices])

if X_train_oversampled:
    # Merge the per-class extras and append them to the original training set.
    X_train_oversampled = np.vstack(X_train_oversampled)
    y_train_oversampled = np.hstack(y_train_oversampled)

    X_train_balanced = np.vstack((X_train, X_train_oversampled))
    y_train_balanced = np.hstack((y_train, y_train_oversampled))
else:
    # All classes already have the same size: nothing to add.
    X_train_balanced = X_train
    y_train_balanced = y_train

print(f"Training set shape after oversampling: {X_train_balanced.shape}, {y_train_balanced.shape}")



In [None]:
# Samples per class in the balanced training labels, ordered by class label.
class_counts = pd.Series(y_train_balanced).value_counts().sort_index()
print("Class distribution in Training set after oversampling:", class_counts, sep="\n")

# The same counts as a bar chart.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
    palette='viridis',
)
ax.set_title('Class Distribution in Training Set after Oversampling')
ax.set_xlabel('Class')
ax.set_ylabel('Number of Images')
plt.xticks(rotation=45)
plt.show()


#AUGMENTATION template

In [None]:
import albumentations as A
from albumentations.core.composition import Compose
import cv2
import tensorflow as tf
from tensorflow.keras import backend as K



In [None]:
# Albumentations pipeline applied to every training image.
# It is intentionally empty: this cell is a template to be filled in.
augmentation_pipeline = Compose(
    [
        # Add the augmentations you want here, for example:
        # A.CoarseDropout(max_holes=8, max_height=16, max_width=16, min_holes=1, min_height=16, min_width=16, fill_value=0, p=0.5)
        # (this one places black squares at random positions in the image)
    ]
)

# Make the balanced labels a flat int32 vector; print the dtype before and after.
print(y_train_balanced.dtype)
y_train_balanced = y_train_balanced.ravel().astype(np.int32)
print(y_train_balanced.dtype)

In [None]:
import cv2
import tensorflow as tf

# Per-channel statistics used for the final normalisation. Stored as float32 so
# the normalised image stays float32 instead of being promoted to float64.
_NORM_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
_NORM_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)


def augment_image(image, label):
    """Augment one image with `augmentation_pipeline` and normalise it.

    Written to be called through `tf.py_function`, so both arguments arrive as
    eager tensors and are converted with `.numpy()`.

    Args:
        image: Image tensor. uint8 input is used as is. Floating-point input
            whose maximum is <= 1.0 is treated as being on the [0, 1] scale and
            multiplied by 255. Any other input is assumed to be on the 0-255
            scale already and is only rounded and clipped.
        label: Scalar label tensor.

    Returns:
        Tuple `(image, label)`: the augmented image as float32, scaled to
        [0, 1] and then standardised per channel with `_NORM_MEAN` /
        `_NORM_STD`, and the label cast to int32.
    """
    # Convert TensorFlow tensors to NumPy arrays
    image = image.numpy()
    label = label.numpy()

    # The augmentation pipeline is fed uint8 images.
    if image.dtype != np.uint8:
        # Only rescale data that is on the [0, 1] scale. Multiplying values
        # that are already on the 0-255 scale by 255 would wrap around in the
        # uint8 cast and corrupt the image.
        if np.issubdtype(image.dtype, np.floating) and image.max() <= 1.0:
            image = image * 255.0
        image = np.clip(np.rint(image), 0, 255).astype(np.uint8)

    # Apply augmentation
    image = augmentation_pipeline(image=image)['image']

    # Back to float32 in [0, 1], then per-channel standardisation.
    image = image.astype(np.float32) / 255.0
    image = (image - _NORM_MEAN) / _NORM_STD

    # Ensure label is int32
    label = label.astype(np.int32)

    return image, label


In [None]:
def tf_augment_image(image, label):
    """Graph-compatible wrapper around `augment_image`, for `Dataset.map`.

    Runs the NumPy-based `augment_image` through `tf.py_function` and restores
    the static shapes, which `tf.py_function` does not propagate.

    Args:
        image: Image tensor of one dataset element.
        label: Label tensor of the same element.

    Returns:
        Tuple `(image, label)`: a float32 image with static shape (96, 96, 3)
        and a scalar int32 label.
    """
    out_image, out_label = tf.py_function(
        augment_image,
        [image, label],
        [tf.float32, tf.int32],  # image dtype, label dtype
    )

    # Re-attach the static shape information.
    out_image.set_shape([96, 96, 3])
    out_label.set_shape([])

    return out_image, out_label



In [None]:
import tensorflow as tf

# Create TensorFlow Dataset from balanced training data
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_balanced, y_train_balanced))

# Shuffle first, with a buffer that covers the whole training set.
# The oversampled duplicates are appended class by class at the end of the
# arrays, so a small buffer (e.g. 1000) would produce batches dominated by a
# single class. Shuffling before the map also keeps the buffer filled with the
# raw elements rather than the augmented ones.
train_dataset = train_dataset.shuffle(buffer_size=len(X_train_balanced))

# Apply the augmentation to the (already shuffled) training elements
train_dataset = train_dataset.map(
    tf_augment_image,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Batch and prefetch for performance
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def display_augmented_images(dataset, num_images=15, rows=3, cols=5):
    """Show images from the first batch of `dataset` in a rows x cols grid.

    Each image is de-normalised with the per-channel mean/std constants below
    and clipped to [0, 1] before display.
    NOTE(review): this assumes the dataset images were normalised with the
    same constants - confirm against the preprocessing step.

    Args:
        dataset: Batched dataset yielding `(images, labels)` tensor pairs.
        num_images: Maximum number of images to show. It is capped at the
            batch size and at `rows * cols`, so oversized requests no longer
            raise an IndexError.
        rows: Number of grid rows.
        cols: Number of grid columns.
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    for images, labels in dataset.take(1):
        # Never index past the end of the batch or of the grid.
        shown = max(0, min(num_images, int(images.shape[0]), rows * cols))

        # squeeze=False keeps `axes` two-dimensional even when rows or cols
        # is 1, so the grid can always be flattened and indexed uniformly.
        fig, axes = plt.subplots(rows, cols, figsize=(10, 6), squeeze=False)
        fig.tight_layout(pad=2.0)  # Adjust spacing between images
        flat_axes = axes.ravel()  # row-major: index i <-> divmod(i, cols)

        for i in range(shown):
            # Denormalize the image and clip to [0, 1] for display
            img = images[i].numpy() * channel_std + channel_mean
            img = np.clip(img, 0, 1)

            ax = flat_axes[i]
            ax.imshow(img)
            ax.set_title(f'Label: {labels[i].numpy()}')
            ax.axis('off')

        # Remove the grid cells that received no image
        for unused_ax in flat_axes[shown:]:
            fig.delaxes(unused_ax)

        plt.show()

# Display augmented images from the training set in a 3x5 grid
display_augmented_images(train_dataset, num_images=15, rows=3, cols=5)

