In [1]:
import os
import cv2
import numpy as np
from pathlib import Path
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split

lower_age_limit = 1
upper_age_limit = 30
bin_size = 6

# dont keep beyond 30

In [2]:
combined_dict = {}
cropped_folder = Path(os.getcwd()) / 'cropped_images'

def get_sort_key(x):
    parts = x.stem.split('_')
    try:
        return (0, tuple(map(int, parts[:3])))
    except ValueError:
        return (1,)

for i in range(lower_age_limit, upper_age_limit+1, bin_size):
    age_bin = []
    for j in range(bin_size):
        age_bin.extend(sorted(cropped_folder.glob(f'{i+j}_*'), key=get_sort_key))

    combined_dict[(i - lower_age_limit) // bin_size] = age_bin

In [3]:
image_paths = []
bins = []

for key, value in combined_dict.items():
    image_paths.extend(value)
    bins.extend([key] * len(value))

image_paths = np.array(image_paths)
bins = np.array(bins).reshape(-1, 1)

In [4]:
X_train_str, X_test_str, y_train, y_test = train_test_split(image_paths, bins, random_state=42, shuffle = True, stratify=bins)

In [5]:
unique_elements, counts = np.unique(y_train, return_counts=True)
print(f'Unique elements: {unique_elements}')
print(f'Counts: {counts}')

Unique elements: [0 1 2 3 4]
Counts: [1619  649  798 1803 4318]


In [6]:
np.random.seed(0)  # Set the seed for the random number generator to ensure reproducibility

# Get the indices of samples in class 4
class_4_indices = np.where(y_train == 4)[0]

# Initialize lists to hold gender and race values
gender_values = []
race_values = []

# Iterate over each file path in X_train_str
for file_path in X_train_str:
    # Extract the filename from the file path
    filename = os.path.basename(file_path)
    
    # Split the filename into its components [age, gender, race, date&time]
    components = filename.split("_")
    
    # Append the gender and race values (converted to integers) to their respective lists
    gender_values.append(int(components[1]))
    race_values.append(int(components[2]))

# Convert lists to numpy arrays
gender_values = np.array(gender_values)
race_values = np.array(race_values)

# Count the number of each gender and race within class 4
gender_counts = np.bincount(gender_values[class_4_indices])
race_counts = np.bincount(race_values[class_4_indices])

# Determine the minimum number of samples to take from each gender-race combination
min_samples = min(np.min(gender_counts), np.min(race_counts), 1500)

# Initialize a list to hold the indices of the undersampled data
undersampled_indices = []

# Iterate over each gender-race combination
for i in range(2):  # Gender values (0, 1)
    for j in range(4):  # Race values (0, 1, 2, 3)
        # Get the indices of samples that match the current gender-race combination and are in class 4
        indices = np.where((gender_values == i) & (race_values == j) & (y_train == 4))[0]
        
        if len(indices) > min_samples:
            # If there are more samples than min_samples, randomly select min_samples samples without replacement
            selected_indices = np.random.choice(indices, size=min_samples, replace=False)
            undersampled_indices.extend(selected_indices)
        else:
            # If there are less or equal samples than min_samples, take all samples
            undersampled_indices.extend(indices)

# Convert the list of undersampled indices to a numpy array
undersampled_indices = np.array(undersampled_indices)

In [7]:
other_indices = np.where(y_train != 4)[0]
selected_indices = np.concatenate([undersampled_indices, other_indices])

X_train_str_undersampled = X_train_str[np.array(selected_indices)]
y_train_undersampled = y_train[np.array(selected_indices)]

In [8]:
train_results = Parallel(n_jobs=-1)(delayed(lambda image: cv2.imread(str(image)))(image) for image in X_train_str_undersampled)
X_train_undersampled = np.array(train_results)
del train_results

  X_train_undersampled = np.array(train_results)


In [9]:
np.savez( os.getcwd() + '/ train_age_image_data.npz', X_train_undersampled, y_train_undersampled)
del X_train_undersampled, y_train_undersampled

In [10]:
test_results = Parallel(n_jobs=-1)(delayed(lambda image: cv2.imread(str(image)))(image) for image in X_test_str)
X_test = np.array(test_results)
del test_results

  X_test = np.array(test_results)


In [11]:
np.savez( os.getcwd() + '/ test_age_image_data.npz', X_test, y_test)
del X_test, y_test