In [1]:
import cv2
import os
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
# Function to load and process a single image
def load_image(image_path, color_mode='rgb', target_size=(256, 256)):
    # Load the image
    if color_mode == 'grayscale':
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    else:  # Default to RGB
        img = cv2.imread(image_path)
    
    # Load the face cascade and detect faces in the image
    face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
    faces = face_cascade.detectMultiScale(img, 1.1, 4)

    if len(faces) > 0:
        # Assume the first detected face is the target
        x, y, w, h = faces[0]
        # Determine the center of the face
        center_x, center_y = x + w // 2, y + h // 2
        # Make the crop area square
        size = max(w, h) // 2
        # Calculate new x, y, w, h for the square crop
        x_new = max(center_x - size, 0)
        y_new = max(center_y - size, 0)
        w_new = h_new = size * 2
        # Crop the image to the new square bounds, making sure not to go out of the image's boundaries
        crop_img = img[max(0, y_new):min(y_new + h_new, img.shape[0]), max(0, x_new):min(x_new + w_new, img.shape[1])]
    else:
        # If no face is detected, you can choose to return the original image or handle differently
        crop_img = img

    # Resize the cropped image (or the whole image if no face was detected) to the target size
    resized_img = cv2.resize(crop_img, target_size)

    return resized_img

In [3]:
# Function to display an image using OpenCV
def show_image_with_cv2(image, title='Image'):
    cv2.imshow(title, image)
    cv2.waitKey(0)  # Wait indefinitely for a key press
    cv2.destroyAllWindows()  # Close the window when a key is pressed

In [4]:
# Function to process a folder of images
def process_folder(folder_path, output_folder=None, color_mode='rgb', target_size=(256, 256)):
    # Create the output folder if it doesn't exist
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')):
            image_path = os.path.join(folder_path, filename)
            processed_img = load_image(image_path, color_mode=color_mode, target_size=target_size)
            
            if output_folder:
                output_path = os.path.join(output_folder, filename)
                # If the color mode is grayscale, we need to save it correctly
                if color_mode == 'grayscale':
                    cv2.imwrite(output_path, processed_img, grayscale=True)
                else:
                    cv2.imwrite(output_path, processed_img)
            else:
                # Show the image for verification, comment this out for production use
                cv2.imshow('Processed Image', processed_img)
                cv2.waitKey(0)
                cv2.destroyAllWindows()

In [8]:
input_folder = '/Users/rutgergeerlings/PycharmProjects/Age Prediction/Images part 1'
output_folder = '/Users/rutgergeerlings/PycharmProjects/Age Prediction/Part 1 processed rgb'
process_folder(input_folder, output_folder, color_mode='rgb', target_size=(256, 256))

Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: bad Huffman code


In [5]:
def load_and_normalize_image(image_path, color_mode='rgb'):
    # Load the image
    if color_mode == 'grayscale':
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    else:  # Default to RGB
        img = cv2.imread(image_path)
    
    # Normalize the image
    img_normalized = img.astype('float32') / 255.0
    
    return img_normalized

In [6]:
def process_images(folder_path):
    data_rgb = []
    data_gray = []
    
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')):
            parts = filename.split('_')
            if len(parts) < 4:  # Check if filename format is correct
                print(f"Skipping file with unexpected format: {filename}")
                continue
            
            # Extract labels from filename
            age = int(parts[0])
            gender = int(parts[1])
            race = int(parts[2])
            
            # Full path to the image file
            image_path = os.path.join(folder_path, filename)
            
            # Load and normalize the image in both color modes
            img_rgb = load_and_normalize_image(image_path, 'rgb')
            img_gray = load_and_normalize_image(image_path, 'grayscale')
            
            # Append to the respective lists
            data_rgb.append([age, gender, race, img_rgb])
            data_gray.append([age, gender, race, img_gray])
    
    # Convert lists into pandas DataFrames
    df_rgb = pd.DataFrame(data_rgb, columns=['Age', 'Gender', 'Race', 'Image'])
    df_gray = pd.DataFrame(data_gray, columns=['Age', 'Gender', 'Race', 'Image'])
    
    return df_rgb, df_gray

In [7]:
folder_path = '/Users/rutgergeerlings/PycharmProjects/Age Prediction/Part 1 processed rgb'
df_rgb, df_gray = process_images(folder_path)

Skipping file with unexpected format: 61_1_20170109142408075.jpg
Skipping file with unexpected format: 61_3_20170109150557335.jpg


In [10]:
df_gray.head()

Unnamed: 0,Age,Gender,Race,Image
0,16,1,3,"[[0.38431373, 0.37254903, 0.35686275, 0.341176..."
1,40,1,0,"[[0.67058825, 0.67058825, 0.6666667, 0.6666667..."
2,71,1,0,"[[0.6156863, 0.6392157, 0.7019608, 0.7607843, ..."
3,80,1,0,"[[0.57254905, 0.57254905, 0.57254905, 0.572549..."
4,21,0,4,"[[0.06666667, 0.05882353, 0.050980393, 0.05098..."


In [12]:
df_rgb.to_pickle('Part1_Processed_RGB.pkl')
df_gray.to_pickle('Part1_Processed_Grey.pkl')