<a href="https://colab.research.google.com/github/VladimirVladetic/AgeAndGenderPredictor/blob/main/EDAandModelTrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
import cv2
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D, Input, BatchNormalization
from PIL import Image
from keras.preprocessing.image import load_img, img_to_array
import zipfile
from keras.utils import to_categorical
from tensorflow.keras.regularizers import l1_l2, l1, l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, array_to_img
import shutil

In [2]:
from google.colab import files

uploaded = files.upload()

source_path = '/content/kaggle.json'
destination_path = '/root/.kaggle/kaggle.json'
kaggle_dir = '/root/.kaggle/'
if not os.path.exists(kaggle_dir):
    os.makedirs(kaggle_dir)
os.rename(source_path, destination_path)

Saving kaggle.json to kaggle.json


In [2]:
folder = '/content/output'
shutil.rmtree(folder)

In [3]:
folder = '/content/output'
os.makedirs(folder)

In [3]:
! kaggle datasets download -d jangedoo/utkface-new

with zipfile.ZipFile('/content/utkface-new.zip', 'r') as zip_ref:
  zip_ref.extractall('/content')

Downloading utkface-new.zip to /content
 98% 323M/331M [00:01<00:00, 206MB/s]
100% 331M/331M [00:01<00:00, 195MB/s]


In [4]:
## Function returns paths to images and their associated age and gender labels
def get_image_paths_age_gender_labels(image_directory):
    image_paths = []
    age_labels = []
    gender_labels = []
    for filename in tqdm(os.listdir(image_directory )):
        image_path = os.path.join(image_directory,filename)
        temporary_data = filename.split("_")
        age = int(temporary_data[0])
        gender = int(temporary_data[1])
        image_paths.append(image_path)
        age_labels.append(age)
        gender_labels.append(gender)
    return image_paths, age_labels, gender_labels

In [5]:
def map_age_to_category(age):
  age_ranges = [(0, 2), (3, 9), (10, 20), (21, 29), (30, 45), (46, 60), (61, 80), (81, 120)]
  for category, (min_age, max_age) in enumerate(age_ranges, start=0):
          if min_age <= age <= max_age:
              return category
  return 0

In [6]:
def create_model(input_shape,num_age_classes):
    inputs = Input(input_shape)

    conv_1 = Conv2D(32, kernel_size=(3,3), activation='relu')(inputs)
    batch_1 = BatchNormalization()(conv_1)

    conv_2 = Conv2D(64, kernel_size=(3,3), activation='relu')(batch_1)
    batch_2 = BatchNormalization()(conv_2)
    maxp_2 = MaxPooling2D(pool_size=(2,2))(batch_2)

    conv_3 = Conv2D(128, kernel_size=(3,3), activation='relu')(maxp_2)
    batch_3 = BatchNormalization()(conv_3)

    conv_4 = Conv2D(256, kernel_size=(3,3), activation='relu')(batch_3)
    batch_4 = BatchNormalization()(conv_4)
    maxp_4 = MaxPooling2D(pool_size=(2,2))(batch_4)

    conv_5 = Conv2D(512, kernel_size=(3,3), activation='relu')(maxp_4)
    batch_5 = BatchNormalization()(conv_5)

    flatten = Flatten()(batch_5)

    dense_1_1 = Dense(256, activation='relu')(flatten)
    dense_2_1 = Dense(256, activation='relu')(flatten)

    dropout_1 = Dropout(0.3)(dense_1_1)
    dropout_2 = Dropout(0.3)(dense_2_1)

    dense_1_2 = Dense(256, activation='relu')(dropout_1)
    dense_2_2 = Dense(256, activation='relu')(dropout_2)

    output_1 = Dense(1, activation='sigmoid', name="gender_out")(dense_1_2)
    output_2 = Dense(num_age_classes, activation='softmax', name="age_out")(dense_2_2)

    model = Model(inputs=[inputs], outputs=[output_1, output_2])
    return model

In [7]:
def extract_features(images, height, width, output_folder=None):
    features = []
    no_face_indices = []  # Keep track of indices for images with no detected faces

    # Load the pre-trained Haarcascades face classifier
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Create the output folder if it doesn't exist
    if output_folder is not None:
        os.makedirs(output_folder, exist_ok=True)

    for idx, image in tqdm(enumerate(images)):
        # Load the image using PIL
        img = load_img(image, grayscale=True)

        # Convert PIL image to a numpy array
        img_array = np.array(img)

        # Detect faces in the image
        faces = face_cascade.detectMultiScale(img_array, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

        # If faces are detected, extract features from the first detected face
        if len(faces) > 0:
            x, y, w, h = faces[0]
            face_img = img_array[y:y+h, x:x+w]

            # Resize the face image to the specified height and width
            face_img = cv2.resize(face_img, (height, width))

            # Normalize the face image
            face_img = face_img / 255.0

            # Add the face features to the list
            features.append(face_img)

            # Save the augmented face image if output folder is provided
            if output_folder is not None:
                img_filename = f"augmented_face_{idx}.jpg"
                img_path = os.path.join(output_folder, img_filename)
                array_to_img(face_img.reshape(height, width, 1)).save(img_path)
        else:
            # If no faces are detected, add the index to the no_face_indices list
            no_face_indices.append(idx)

    features = np.array(features)
    features = features.reshape(len(features), height, width, 1)
    features = np.array(features, dtype=np.float32)

    return features, no_face_indices


In [8]:
def augment_data(images, age_labels, gender_labels, augmentation_factor=2):
    datagen = ImageDataGenerator(
        rotation_range=3,
        zoom_range=0.03,
        width_shift_range=0.03,
        height_shift_range=0.03,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    augmented_images = []
    augmented_age_labels = []
    augmented_gender_labels = []

    for image, age_label, gender_label in zip(images, age_labels, gender_labels):
        img = img_to_array(image)
        img = np.expand_dims(img, axis=0)

        # Apply data augmentation and get augmented images
        augmented_images.extend(list(datagen.flow(img, batch_size=1))[0])

        # Copy age and gender labels for each augmented image
        for _ in range(augmentation_factor):
            augmented_age_labels.append(age_label)
            augmented_gender_labels.append(gender_label)

    return np.array(augmented_images), np.array(augmented_age_labels), np.array(augmented_gender_labels)

In [9]:
image_directory = "/content/utkface_aligned_cropped/UTKFace/"

image_paths, age_labels, gender_labels = get_image_paths_age_gender_labels(image_directory)

age_categories = [map_age_to_category(age) for age in age_labels]

gender_dictionary = {0:"Male",1:"Female"}

  0%|          | 0/23708 [00:00<?, ?it/s]

In [10]:
## Structure of df: INDEX, PICTURE FILEPATH, AGE LABEL, GENDER LABEL
df = pd.DataFrame()
df["image"], df["age"], df["gender"] = image_paths, age_categories, gender_labels
##print(df)

In [11]:
image_height = 128
image_width = 128
num_age_classes = 8

In [12]:
X, no_face_indices = extract_features(df["image"],image_height,image_width, "/content/output/")

0it [00:00, ?it/s]

In [None]:
"""
folder_path = '/path/to/your/images'

# Initialize an empty list to store the images
X = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is an image (you can add more image file extensions if needed)
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)

        # Read the image using OpenCV
        image = cv2.imread(file_path)

        # Optional: Convert the image to grayscale or perform any other preprocessing

        # Append the image to the list
        X.append(image)

# Convert the list of images to a NumPy array
X = np.array(X)

In [13]:
file_path = '/content/no_face_indices.txt'

# Open the file in write mode
with open(file_path, 'w') as file:
    # Write each number to the file, separated by a newline character
    for number in no_face_indices:
        file.write(f"{number}\n")

In [14]:
print(no_face_indices[5])

15


In [15]:
print(len(no_face_indices))

10419


In [16]:
df_filtered = df.drop(no_face_indices)

df_filtered.reset_index(drop=True, inplace=True)

y_gender = np.array(df_filtered["gender"])
y_age = np.array(df_filtered["age"])

y_age = np.array(list(df_filtered["age"].apply(lambda x: to_categorical(x, num_classes=num_age_classes))))

y_gender_tensor = tf.convert_to_tensor(y_gender, dtype=tf.float32)
y_age_tensor = tf.convert_to_tensor(y_age, dtype=tf.float32)

input_shape = (image_height,image_width,1)

In [17]:
print(len(y_gender))
print(len(y_age))
print(len(X))

13289
13289
13289


In [None]:
X, y_age, y_gender = augment_data(X, y_age, y_gender)

In [22]:
print(len(y_gender))
print(len(y_age))
print(len(X))

39867
39867
13289


In [20]:
opt = Adam(learning_rate=0.001)

model = create_model(input_shape, num_age_classes)

model.compile(loss=["binary_crossentropy", "categorical_crossentropy"], optimizer = opt, metrics=["accuracy"])

lr_scheduler = ReduceLROnPlateau(factor=0.5, patience=5)

history = model.fit(x=X, y=[y_gender, y_age], batch_size=32, epochs=50, validation_split=0.2, callbacks=[lr_scheduler])

Epoch 1/50

KeyboardInterrupt: 