In [1]:
# One-time setup: install the prebuilt dlib wheel (its tags target CPython 3.10 on 64-bit Windows).
# %pip install "dlib-19.22.99-cp310-cp310-win_amd64.whl"

In [1]:
import os
import tarfile
import dlib
import matplotlib.pyplot as plt
import cv2
from pathlib import Path
from joblib import Parallel, delayed
import time
import numpy as np
from scipy import stats
import pickle

# Inclusive bounds applied to the leading integer field of each image file name
# when selecting which images to process.
lower_age, upper_age = 1, 30

In [3]:
# Archives to unpack; the relative names are resolved against the working directory.
tar_files = [
    'part1.tar.gz',
    'part2.tar.gz',
    'part3.tar.gz',
]

# Single flat folder that receives every extracted .jpg file.
output_folder = os.path.join(os.getcwd(), 'all_images')
os.makedirs(output_folder, exist_ok=True)

def extract_jpg(tar_file, output_folder):
    """Extract the regular ``.jpg`` files of a gzip-compressed tar archive into one flat folder.

    Parameters
    ----------
    tar_file : str or path-like
        Path of the ``.tar.gz`` archive to read.
    output_folder : str or path-like
        Directory that receives the extracted files.

    Notes
    -----
    Directory components are stripped from every member name, so files that
    share a basename end up at the same destination path.
    """
    with tarfile.open(tar_file, 'r:gz') as tar:
        # Keep regular files only: a symlink, hardlink or directory entry whose
        # name happens to end in '.jpg' is not an image and must not be
        # materialised inside the output folder.
        jpg_members = [
            member for member in tar.getmembers()
            if member.isfile() and member.name.endswith('.jpg')
        ]
        for member in jpg_members:
            # Remove the leading directory name from the member's name
            member.name = os.path.basename(member.name)
        tar.extractall(path=output_folder, members=jpg_members)

# One extraction task per archive, dispatched through joblib with n_jobs=-1.
extraction_tasks = (
    delayed(extract_jpg)(tar_file, output_folder)
    for tar_file in tar_files
)
Parallel(n_jobs=-1)(extraction_tasks)

[None, None, None]

In [2]:
def hogDetectFaces(image, image_path, return_dims,
                   height_range=(71, 643), width_range=(68, 642)):
    """Locate a face with dlib's HOG frontal-face detector and measure or crop it.

    Parameters
    ----------
    image : numpy.ndarray or None
        Three-dimensional BGR image (e.g. the result of ``cv2.imread``).
        ``None`` is handled like an image in which no face was found.
    image_path : str
        Identifier handed back to the caller when no crop is produced.
    return_dims : bool
        ``True`` to return only the size of the detected box, ``False`` to
        return the cropped pixels.
    height_range, width_range : tuple of (low, high)
        Exclusive bounds the crop's height / width must lie between for the
        crop to be returned when ``return_dims`` is false. The defaults are
        the thresholds previously hard-coded in this function.

    Returns
    -------
    tuple of (int, int)
        When ``return_dims`` is true: ``(height, width)`` of the detected box
        clipped to the image, or ``(0, 0)`` unless exactly one face was found.
    numpy.ndarray or str
        When ``return_dims`` is false: an independent copy of the cropped
        region if a face was found and its size lies within both ranges,
        otherwise ``image_path``.
    """
    # An unreadable file reaches us as None; report it like a miss instead of
    # failing on the attribute access below.
    if image is None:
        return (0, 0) if return_dims else image_path

    height, width, _ = image.shape

    # OpenCV reads images in BGR format by default
    imgRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    hog_face_detector = dlib.get_frontal_face_detector()
    detections = list(hog_face_detector(imgRGB, 0))

    # Size measurements are only meaningful when exactly one face is present.
    if return_dims and len(detections) != 1:
        return 0, 0
    if not detections:
        return image_path

    # NOTE(review): with several detections in crop mode the last box is used,
    # exactly as before -- confirm that this is the intended choice.
    bbox = detections[-1]

    # Detected boxes can extend past the image borders; clip every edge to the
    # image so the reported size always equals the size of the actual crop.
    x1 = min(max(0, bbox.left()), width)
    y1 = min(max(0, bbox.top()), height)
    x2 = min(max(0, bbox.right()), width)
    y2 = min(max(0, bbox.bottom()), height)

    cropped_image_height = y2 - y1
    cropped_image_width = x2 - x1

    if return_dims:
        return cropped_image_height, cropped_image_width

    min_height, max_height = height_range
    min_width, max_width = width_range
    if (min_height < cropped_image_height < max_height
            and min_width < cropped_image_width < max_width):
        # Copy only the crop so the caller gets pixels independent of `image`.
        return image[y1:y2, x1:x2].copy()
    return image_path

In [3]:
# Bind both dataset folders as pathlib.Path objects and make sure the
# destination for the cropped faces exists.
output_folder = Path.cwd() / 'all_images'
cropped_folder = Path.cwd() / 'cropped_images'
cropped_folder.mkdir(parents=True, exist_ok=True)

In [4]:
def get_sort_key(x):
    """Build the sort key for an image path from its underscore-separated stem.

    The first field of ``x.stem`` is parsed as an integer and compared with
    the module-level ``lower_age`` / ``upper_age`` bounds (both inclusive).

    Returns
    -------
    tuple
        ``(0, fields)`` when the first field is within the bounds, where
        ``fields`` holds up to the first three stem fields as integers;
        ``(1,)`` when it is out of bounds or any of those fields is not an
        integer.
    """
    fields = x.stem.split('_')
    rejected = (1,)
    try:
        leading_value = int(fields[0])
        if not (lower_age <= leading_value <= upper_age):
            return rejected
        return (0, tuple(int(field) for field in fields[:3]))
    except ValueError:
        return rejected

# Paths whose sort key is flagged 0 (leading field within the configured
# bounds), in sorted-key order.
age_image_list = [
    candidate
    for candidate in sorted(output_folder.glob('*'), key=get_sort_key)
    if get_sort_key(candidate)[0] == 0
]

In [16]:
# Wall-clock start time; the elapsed seconds are computed at the end of this cell.
s = time.time()
def process_image(image_path):
    """Crop the detected face out of one image and save it in ``cropped_folder``.

    The image is read with OpenCV and handed to ``hogDetectFaces`` in crop
    mode. A returned crop is written, at its detected size (no resizing),
    to ``cropped_folder`` under the source file's name; the source file is
    left untouched. Every failure is reported with ``print`` rather than
    raised.

    Parameters
    ----------
    image_path : pathlib.Path
        Location of the source image.

    Returns
    -------
    None
    """
    image_path_str = str(image_path)
    image = cv2.imread(image_path_str)

    # cv2.imread signals an unreadable or missing file by returning None.
    if image is None:
        print(f"Image at {image_path_str} could not be read.")
        return

    result = hogDetectFaces(image, image_path_str, return_dims=False)

    # A string result is the image path handed back: no crop was produced.
    if isinstance(result, str):
        print(f"Image at {result} does not meet dimension requirements.")
        return

    cropped_path = cropped_folder / image_path.name
    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(str(cropped_path), result):
        print(f"Cropped image could not be written to {cropped_path}.")

# Crop every selected image through joblib, then show the elapsed wall-clock
# seconds. With a negative n_jobs joblib uses (n_cpus + 1 + n_jobs) workers,
# so -5 leaves four cores free.
crop_tasks = (delayed(process_image)(image_path) for image_path in age_image_list)
Parallel(n_jobs=-5)(crop_tasks)
time.time() - s

1319.0318975448608

In [None]:
# def process_image(image_path):
#     image = cv2.imread(str(image_path))
#     height, width = hogDetectFaces(image, str(image_path), return_dims = True)
#     if not height == 0:
#         return height, width, image_path

# dims = Parallel(n_jobs=-4)(delayed(process_image)(path) for path in age_image_list )
# dims = [dim for dim in dims if dim is not None]

# heights, widths, image_paths = map(list, zip(*dims))
# heights_array = np.array(heights)
# widths_array = np.array(widths)

# del heights, widths, dims

# np.save('heights.npy', heights_array)
# np.save('widths.npy', widths_array)

# del heights_array, widths_array

# # Save list to a pickle file
# with open('image_paths.pkl', 'wb') as f:
#     pickle.dump(image_paths, f)

# del image_paths

In [None]:
# heights_array = np.load('heights.npy')
# widths_array = np.load('widths.npy')

# # Load list from a pickle file
# with open('image_paths.pkl', 'rb') as f:
#     image_paths = pickle.load(f)

In [None]:
# data = [heights_array, widths_array]
# labels = ['Heights', 'Widths']

# plt.boxplot(data, labels=labels)
# plt.title('Box Plot of Cropped Image Dimensions')
# plt.show()

# # Calculate the z-scores of the heights and widths
# heights_zscores = stats.zscore(heights_array)
# widths_zscores = stats.zscore(widths_array)

# # Define the threshold for outliers
# threshold = 3

# # Find the indices of the outliers
# height_outlier_condition = np.abs(heights_zscores) > threshold
# width_outlier_condition = np.abs(widths_zscores) > threshold
# outlier_indices = np.where(height_outlier_condition | width_outlier_condition )

# crop_image_paths = [image_path for i, image_path in enumerate(image_paths) if i not in outlier_indices[0]]

# heights_array = np.delete(heights_array, outlier_indices[0])
# widths_array = np.delete(widths_array, outlier_indices[0])

# print(np.sort(heights_array)[::-1])
# print(np.sort(widths_array)[::-1])

# # Calculate the densities
# density = stats.gaussian_kde(heights_array)
# # Calculate the weighted mean
# center_point = np.average(heights_array, weights=density(heights_array))
# print('Center Point:', center_point)

# # Calculate the densities
# density = stats.gaussian_kde(widths_array)
# # Calculate the weighted mean
# center_point = np.average(widths_array, weights=density(widths_array))
# print('Center Point:', center_point)


# bins = np.arange(50, 700, 50)  # creates an array [50, 100, ..., 650]

# # compute histogram
# hist, bin_edges = np.histogram(heights_array, bins)

# # print histogram frequencies
# print(f'Frequencies: {hist}')

# # plot histogram
# plt.figure(figsize=[6,6])
# plt.bar(bin_edges[:-1], hist, width = 50, color='#0504aa',alpha=0.7)
# plt.xlim(min(bin_edges), max(bin_edges))
# plt.grid(axis='y', alpha=0.75)
# plt.xlabel('Height',fontsize=15)
# plt.ylabel('Frequency',fontsize=15)
# plt.xticks(fontsize=15)
# plt.yticks(fontsize=15)
# plt.title('Histogram of Heights',fontsize=15)
# plt.show()

# # compute histogram
# hist, bin_edges = np.histogram(widths_array, bins)

# # print histogram frequencies
# print(f'Frequencies: {hist}')

# # plot histogram
# plt.figure(figsize=[6,6])
# plt.bar(bin_edges[:-1], hist, width = 50, color='#0504aa',alpha=0.7)
# plt.xlim(min(bin_edges), max(bin_edges))
# plt.grid(axis='y', alpha=0.75)
# plt.xlabel('Width',fontsize=15)
# plt.ylabel('Frequency',fontsize=15)
# plt.xticks(fontsize=15)
# plt.yticks(fontsize=15)
# plt.title('Histogram of Widths',fontsize=15)
# plt.show()