In [1]:
import os
import cv2
import numpy as np
from pathlib import Path
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Youngest age included in the dataset (first age of the first bin).
lower_age_limit = 1
# Oldest age included in the dataset.
upper_age_limit = 30
# Number of consecutive ages grouped into a single class (bin).
bin_size = 6

# Ages above upper_age_limit (30) are not kept.

In [2]:
# First age of each bin -> list of image paths that fall into that bin.
age_images_dict = {}
# First age of each bin -> integer class label of that bin.
ages_labels_dict = {}
# Images are read from ./cropped_images, relative to the current working directory.
cropped_folder = Path.cwd() / 'cropped_images'

def get_sort_key(x):
    """Build a sort key from the leading numeric fields of a file name.

    The stem of ``x`` is split on underscores and up to the first three
    fields are converted to integers.

    Args:
        x: A path-like object exposing a ``stem`` attribute.

    Returns:
        ``(0, (n1, n2, n3))`` when every leading field is an integer, so such
        files sort numerically; ``(1,)`` otherwise, which places files with
        non-numeric fields after all numeric ones.
    """
    leading_fields = x.stem.split('_')[:3]
    try:
        numeric_key = tuple(int(field) for field in leading_fields)
    except ValueError:
        # At least one of the leading fields is not an integer.
        return (1,)
    return (0, numeric_key)

# Walk over the first age of every bin and collect that bin's images and class label.
for i in range(lower_age_limit, upper_age_limit + 1, bin_size):
    # Ages covered by this bin. Clamp the end so the last bin never reaches past
    # upper_age_limit when the age span is not an exact multiple of bin_size.
    bin_ages = range(i, min(i + bin_size, upper_age_limit + 1))

    # File names start with "<age>_"; within each age, files are ordered by get_sort_key.
    age_bin = [
        image
        for age in bin_ages
        for image in sorted(cropped_folder.glob(f'{age}_*'), key=get_sort_key)
    ]
    age_images_dict[i] = age_bin
    # Consecutive bins get consecutive labels 0, 1, 2, ...
    ages_labels_dict[i] = (i - lower_age_limit) // bin_size

# Note: some ages in this range have no image files.

In [3]:
def process_image(image, age_number):
    """Read one image from disk and pair it with the class label of its age bin.

    Args:
        image: Path (or path-like) of the image file to read.
        age_number: Key into ``ages_labels_dict`` identifying the image's bin.

    Returns:
        ``(img, label)`` where ``img`` is the array returned by ``cv2.imread``
        and ``label`` is ``ages_labels_dict[age_number]``; ``None`` if the
        image could not be read.
    """
    try:
        img = cv2.imread(str(image))
    except cv2.error as e:
        print(f"Error processing {image}: {str(e)}")
        return None

    # cv2.imread reports a missing or undecodable file by returning None
    # rather than raising, so that case has to be checked explicitly.
    if img is None:
        print(f"Error processing {image}: image could not be read")
        return None

    return img, ages_labels_dict[age_number]



# Load every image in parallel (n_jobs=-1 lets joblib use all available CPUs),
# pairing each image with the label of the bin it was listed under.
results = Parallel(n_jobs=-1)(
    delayed(process_image)(image, age_number)
    for age_number, images in age_images_dict.items()
    for image in images
)

# A failed read shows up either as a None result or as a (None, label) pair;
# drop both so that only real (image, label) pairs reach zip() and np.array().
results = [r for r in results if r is not None and r[0] is not None]
if not results:
    raise ValueError("No images could be loaded; check the contents of the image folder.")

x, y = zip(*results)
# NOTE(review): assumes every image has the same shape, otherwise np.array
# cannot build a regular 4-D array — confirm for the cropped images.
x = np.array(x)
# Labels are kept as a column vector of shape (n_samples, 1).
y = np.array(y).reshape(-1, 1)
del results

In [4]:
# Split into train and test sets. No test_size is given, so scikit-learn's default
# applies; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified (no `stratify=` argument) — confirm this
# is acceptable given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, shuffle = True)
# Release the full dataset and the lookup dicts. After this, re-running this cell
# requires re-running the earlier cells first.
del x, y, age_images_dict, ages_labels_dict
# The arrays are passed positionally, so np.savez stores them as 'arr_0' and 'arr_1'.
# NOTE(review): the file name literal begins with a space (' test_age_image_data.npz')
# — confirm this is intended before relying on the path elsewhere.
np.savez( os.getcwd() + '/ test_age_image_data.npz', X_test, y_test)
del X_test, y_test

In [5]:
# Show which class labels occur in the training split and how many samples each has.
unique_elements, counts = np.unique(y_train, return_counts=True)
print(f'Unique elements: {unique_elements}', f'Counts: {counts}', sep='\n')

Unique elements: [0 1 2 3 4]
Counts: [1611  635  792 1789 4282]


In [6]:
# Per-image dimensions, kept so the flattened SMOTE output can be reshaped back into images.
dim1, dim2, dim3 = X_train.shape[1:4]

# Class that is undersampled, and the number of samples every class should end up with.
majority_class = 4
samples_per_class = 2000

# y_train is a column vector; work on a flat view when locating class members.
labels_flat = y_train.ravel()

# Undersample class 4: keep a random subset of samples_per_class images.
class_4_indices = np.flatnonzero(labels_flat == majority_class)
np.random.seed(0)  # fixed seed so the same subset is drawn on every run

undersampled_indices = np.random.choice(class_4_indices, size=samples_per_class, replace=False)
other_indices = np.flatnonzero(labels_flat != majority_class)
selected_indices = np.concatenate([undersampled_indices, other_indices])

X_train_undersampled = X_train[selected_indices]
y_train_undersampled = y_train[selected_indices]


# Oversample classes 0, 1, 2 and 3 with SMOTE so that each has samples_per_class images.
sampling_strategy = {label: samples_per_class for label in (0, 1, 2, 3)}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=0)

# SMOTE expects 2-D input, so flatten each image into a single feature vector.
# NOTE(review): cv2.imread normally yields integer (uint8) pixels — confirm SMOTE's
# interpolation behaves as intended on integer input (consider casting to float first).
X_train_reshaped = X_train_undersampled.reshape(X_train_undersampled.shape[0], -1)
del X_train_undersampled

# fit_resample is documented to take a 1-D target, so pass the labels flattened
# instead of as a column vector.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_reshaped, y_train_undersampled.ravel())
del X_train_reshaped

# Restore the image shape and the column-vector label layout used for the test set.
X_train_resampled = X_train_resampled.reshape(-1, dim1, dim2, dim3)
y_train_resampled = y_train_resampled.reshape(-1, 1)

# The arrays are passed positionally, so np.savez stores them as 'arr_0' and 'arr_1'.
# NOTE(review): the file name literal begins with a space (' train_age_image_data.npz')
# — confirm this is intended before relying on the path elsewhere.
np.savez( os.getcwd() + '/ train_age_image_data.npz', X_train_resampled, y_train_resampled)
del X_train_resampled, y_train_resampled