In [1]:
import cv2
import os
import numpy as np
import itertools
from matplotlib import pyplot as plt

In [2]:
import tensorflow as tf
from tensorflow import keras

2023-11-20 16:36:26.236243: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 1. Dataset

In [3]:
# Root folder of the face dataset, relative to the notebook's working directory.
data_path = 'face_dataset'

In [4]:
# Side length (pixels) that every image is resized to; also fixes the
# (IMG_SIZE, IMG_SIZE, 3) tensor shape declared for the dataset elements.
IMG_SIZE = 128
# Number of elements held in the tf.data shuffle buffer for train/dev.
BUFFER_SIZE = 123
# Number of image pairs per batch.
BATCH_SIZE = 32

In [5]:
def create_data_pairs(dataset_path):
    """Build a shuffled array of positive and negative image pairs.

    Every row has the form ``(person_1, img_1, person_2, img_2)``. The person
    entries are sub-folder names of ``dataset_path`` and the image entries are
    file names found inside those folders. A positive pair holds two different
    images of the same person; for each positive pair one negative pair is
    generated that matches its first image with a randomly chosen image of a
    different person.

    Folder and file names are taken from the directory listing, so the number
    of people and the naming scheme are not hard-coded, and every generated
    row refers to a file that exists. Entries whose name starts with a dot
    (for example ``.ipynb_checkpoints``) and non-directory entries in the
    dataset root are ignored.

    Args:
        dataset_path: Directory containing one sub-folder per person.

    Returns:
        np.ndarray of strings with shape ``(n_pairs, 4)``, with rows shuffled
        by ``np.random.shuffle``. The shape is ``(0, 4)`` when no folder
        contains at least two images.

    Raises:
        ValueError: If positive pairs exist but fewer than two person folders
            contain images, so no negative pair can be formed.
    """
    def visible_entries(path):
        # Sorted so that results are reproducible for a fixed NumPy seed;
        # os.listdir returns entries in arbitrary order.
        return sorted(name for name in os.listdir(path) if not name.startswith('.'))

    def list_images_by_person(dataset_path):
        # Map each person folder to the list of image file names it contains.
        images_by_person = {}
        for person_folder in visible_entries(dataset_path):
            person_dir = os.path.join(dataset_path, person_folder)
            if os.path.isdir(person_dir):
                images_by_person[person_folder] = visible_entries(person_dir)
        return images_by_person

    def create_positive_pairs(images_by_person):
        # All unordered combinations of two images of the same person.
        positive_pairs = []
        for person_folder, images in images_by_person.items():
            positive_pairs.extend(
                (person_folder, first, person_folder, second)
                for first, second in itertools.combinations(images, 2)
            )
        return np.array(positive_pairs, dtype=str).reshape(-1, 4)

    def create_negative_pairs(images_by_person, positive_pairs):
        # Only folders that actually hold images can supply the second image.
        candidates = [person for person, images in images_by_person.items() if images]
        if len(positive_pairs) > 0 and len(candidates) < 2:
            raise ValueError(
                'At least two person folders with images are required to build negative pairs'
            )

        negative_pairs = []
        for person_1, img_1, _, _ in positive_pairs:
            # Rejection-sample a different person, then one of their real images.
            person_2 = candidates[np.random.randint(len(candidates))]
            while person_2 == person_1:
                person_2 = candidates[np.random.randint(len(candidates))]

            other_images = images_by_person[person_2]
            img_2 = other_images[np.random.randint(len(other_images))]
            negative_pairs.append((person_1, img_1, person_2, img_2))
        return np.array(negative_pairs, dtype=str).reshape(-1, 4)

    images_by_person = list_images_by_person(dataset_path)
    positive = create_positive_pairs(images_by_person)
    negative = create_negative_pairs(images_by_person, positive)
    data_pairs = np.concatenate((positive, negative), axis=0)
    np.random.shuffle(data_pairs)
    return data_pairs

In [6]:
def load_and_preprocess_image(image_path):
    """Read a JPEG file and return it as a normalized RGB float tensor.

    Args:
        image_path: Path of the JPEG file to load.

    Returns:
        A tensor of shape ``(IMG_SIZE, IMG_SIZE, 3)`` with values scaled
        to the range [0, 1].
    """
    byte_img = tf.io.read_file(image_path)
    # Force three channels so grayscale JPEGs still match the
    # (IMG_SIZE, IMG_SIZE, 3) element shape declared for the dataset.
    img = tf.io.decode_jpeg(byte_img, channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    # Scale 8-bit pixel values from [0, 255] down to [0, 1].
    img = img / 255.0
    return img

In [7]:
def preprocess_and_create_example(person_1, img_1, person_2, img_2, dataset_path):
    """Turn one pair description into ``(image_1, image_2, label)``.

    Args:
        person_1: Folder name of the first person.
        img_1: Image file name inside ``person_1``'s folder.
        person_2: Folder name of the second person.
        img_2: Image file name inside ``person_2``'s folder.
        dataset_path: Root directory that contains the person folders.

    Returns:
        The two preprocessed images followed by an integer label that is 1
        when both folder names are equal and 0 otherwise.
    """
    first_path = os.path.join(dataset_path, person_1, img_1)
    second_path = os.path.join(dataset_path, person_2, img_2)
    first_image = load_and_preprocess_image(first_path)
    second_image = load_and_preprocess_image(second_path)
    return first_image, second_image, int(person_1 == person_2)

In [8]:
def create_dataset(dataset_path):
    """Create a generator-backed ``tf.data.Dataset`` of labelled image pairs.

    The pair list is built once, when this function is called; the images
    themselves are loaded only when the dataset is iterated.

    Args:
        dataset_path: Root directory that contains one folder per person.

    Returns:
        A tuple ``(tf_dataset, len_dataset)``: the dataset yielding
        ``(image_1, image_2, label)`` elements and the number of pairs in it.
    """
    # Use the argument rather than the notebook-level `data_path`, so the
    # pairs and the image paths always refer to the same directory.
    data_pairs = create_data_pairs(dataset_path)
    len_dataset = len(data_pairs)

    def generator():
        # Load each pair on demand, in the order fixed by `data_pairs`.
        for person_1, img_1, person_2, img_2 in data_pairs:
            yield preprocess_and_create_example(person_1, img_1, person_2, img_2, dataset_path)

    output_signature = (
        tf.TensorSpec(shape=(IMG_SIZE, IMG_SIZE, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(IMG_SIZE, IMG_SIZE, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32)
    )

    tf_dataset = tf.data.Dataset.from_generator(generator, output_signature=output_signature)
    return tf_dataset, len_dataset

In [9]:
# Build the pair dataset; dataset_size (the number of pairs) is used below
# to compute the train/dev/test split sizes.
dataset, dataset_size = create_dataset(data_path)

2023-11-20 16:37:32.391551: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-20 16:37:32.425543: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-20 16:37:32.425891: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [10]:
# Sanity check: pull a single (image_1, image_2, label) element and print it.
for x in dataset.take(1):
    print(x)

(<tf.Tensor: shape=(128, 128, 3), dtype=float32, numpy=
array([[[0.97340685, 0.92242646, 0.9577206 ],
        [0.9687069 , 0.9177265 , 0.95302063],
        [0.97793067, 0.9269503 , 0.9622444 ],
        ...,
        [0.19522059, 0.11839288, 0.110499  ],
        [0.12360888, 0.04895163, 0.05626819],
        [0.10823951, 0.0337297 , 0.04941598]],

       [[0.9688907 , 0.91791034, 0.95320445],
        [0.9588848 , 0.9079044 , 0.9431985 ],
        [0.9689654 , 0.917985  , 0.95327914],
        ...,
        [0.24471028, 0.17032112, 0.1441885 ],
        [0.19966586, 0.12559839, 0.13870443],
        [0.12422832, 0.05946117, 0.07189989]],

       [[0.976327  , 0.9253466 , 0.9606407 ],
        [0.9803309 , 0.9293505 , 0.9646446 ],
        [0.9801911 , 0.9292107 , 0.96450484],
        ...,
        [0.19326842, 0.11822438, 0.09615694],
        [0.21691942, 0.12806948, 0.15539981],
        [0.16164024, 0.07673483, 0.09761891]],

       ...,

       [[0.8443206 , 0.4600069 , 0.6525314 ],
        [0.8

In [11]:
# 90% / 5% / remainder split, taken in order from the pair dataset.
train_count = int(0.9 * dataset_size)
dev_count = int(0.05 * dataset_size)

train = dataset.take(train_count)
dev = dataset.skip(train_count).take(dev_count)
# Everything after the train and dev portions becomes the test split.
test = dataset.skip(train_count + dev_count)

In [12]:
train = train.shuffle(BUFFER_SIZE)
dev = dev.shuffle(BUFFER_SIZE)

In [13]:
train = train.batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)
dev = dev.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test = test.batch(BATCH_SIZE)