## Create cropped datasets

This code is designed to process and extract data from two biometric datasets, the NUAA Photograph Imposter Database and MSU-MFSD, which are used for tasks like face anti-spoofing (distinguishing between real and fake faces). The code involves three main stages:

**1. Preprocessing images from videos using MTCNN:**
It uses the MTCNN (Multi-task Cascaded Convolutional Networks) model to detect faces in video frames, extract them, and save the cropped faces to new directories. This process is repeated for both the NUAA and MSU datasets, storing cropped faces from the "real" and "attack" categories.

**2. Loading images and generating numpy arrays:**
For the NUAA dataset, the code reads image paths from text files, loads the images from disk, resizes them, and stores them in numpy arrays alongside their corresponding labels. A similar process is applied to the MSU-MFSD dataset, where images are loaded from directories, their corresponding identity is extracted from the filenames, and the real/fake labels are assigned based on the folder structure.

**3. Dataset combination and shuffling:**
After processing both datasets, they are combined into a larger dataset by concatenating the image data and labels from both datasets. For the MSU dataset, a subset of 19,000 images from each category (real and attack) is selected at random, and all the data is shuffled to ensure randomization in the final dataset.

In [104]:
import os
import numpy as np
import torch
import warnings
from facenet_pytorch import MTCNN, training
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import cv2
import re

In [90]:
# Root folders holding the raw (uncropped) images of each dataset
data_dir_nuaa = r'C:\Users\Asus\Desktop\Biometric Systems\Datasets\NUAA Photograph Imposter Database\raw'
data_dir_msu = r'C:\Users\Asus\Desktop\Biometric Systems\Datasets\MSU-MFSD Photos\raw'

# DataLoader settings: no worker processes on Windows ('nt'), 8 elsewhere
batch_size = 16
workers = 8 if os.name != 'nt' else 0

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Running on device: {}'.format(device))

# NOTE(review): presumably works around duplicate OpenMP runtime errors - confirm
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
warnings.filterwarnings("ignore")  # Suppress all warnings

Running on device: cuda:0


In [52]:
# Face detector (MTCNN) used by the cropping loops to find and save faces
mtcnn = MTCNN(
    device=device,
    image_size=224,  # Resize detected face to 224x224 pixels
    margin=14,  # Add a 14-pixel margin around the detected face
    selection_method='center_weighted_size',  # Choose the best face based on center and size
)

In [54]:
# Define the dataset for the input set of images.
# `data_dir` is not assigned anywhere else in this notebook (only
# `data_dir_nuaa` and `data_dir_msu` are), so bind it to the NUAA root here:
# the first cropping loop below writes its crops to `data_dir_nuaa + '_cropped'`.
data_dir = data_dir_nuaa
orig_img_ds = datasets.ImageFolder(data_dir, transform=None)

In [56]:
# Pair every image path with itself: the second element of each sample then
# carries the file path (unpacked as the batch paths in the cropping loops)
# rather than the value ImageFolder stored there.
orig_img_ds.samples = [(img_path, img_path) for img_path, _target in orig_img_ds.samples]

In [58]:
# Batch the image dataset for the cropping loops
loader = DataLoader(
    orig_img_ds,
    batch_size=batch_size,
    collate_fn=training.collate_pil,
    num_workers=workers,
)

In [62]:
# Destination paths of the cropped NUAA faces; box_probs_nuaa is only
# initialised here and is not filled by this loop
crop_paths_nuaa = []
box_probs_nuaa = []

# Walk over the loader one batch at a time
for batch_no, (images, src_paths) in enumerate(loader, start=1):
    # Mirror every source path into the '<raw>_cropped' directory tree
    dst_paths = [src.replace(data_dir, data_dir_nuaa + '_cropped') for src in src_paths]
    # Run MTCNN on the batch, handing it the destination paths for the crops
    mtcnn(images, save_path=dst_paths)
    crop_paths_nuaa += dst_paths
    print('\rBatch {} of {}'.format(batch_no, len(loader)), end='')
    torch.cuda.empty_cache()

Batch 789 of 789


In [92]:
# Repeat the same process for the MSU dataset
crop_paths_msu = []
box_probs_msu = []  # initialised only; not filled by this loop

# Build a dataset/loader rooted at the MSU directory. Reusing `loader` from the
# NUAA step would iterate the NUAA images again and write them into the MSU
# output folder, so the MSU images need their own loader.
msu_img_ds = datasets.ImageFolder(data_dir_msu, transform=None)
# Carry each image's file path as the second element of its sample
msu_img_ds.samples = [(p, p) for p, _ in msu_img_ds.samples]
msu_loader = DataLoader(
    msu_img_ds,
    num_workers=workers,
    batch_size=batch_size,
    collate_fn=training.collate_pil
)

for i, (x, b_paths) in enumerate(msu_loader):
    # Mirror every source path into the '<raw>_cropped' directory tree
    crops = [p.replace(data_dir_msu, data_dir_msu + '_cropped') for p in b_paths]
    # Detect faces in the batch, handing MTCNN the destination paths for the crops
    mtcnn(x, save_path=crops)
    crop_paths_msu.extend(crops)
    print('\rBatch {} of {}'.format(i + 1, len(msu_loader)), end='')
    torch.cuda.empty_cache()

Batch 4855 of 4855


## Create numpy arrays from NUAA dataset

In [68]:
## Create numpy arrays from NUAA dataset

# Root of the cropped NUAA images and its two sub-folders (clients and imposters)
raw_dir = r'C:\Users\Asus\Desktop\Biometric Systems\Datasets\NUAA Photograph Imposter Database\raw_cropped'
client_raw_dir, imposter_raw_dir = (
    os.path.join(raw_dir, folder) for folder in ('ClientRaw', 'ImposterRaw')
)

# Text files (inside raw_dir) that list the image file names of each split
client_train_file, client_test_file, imposter_train_file, imposter_test_file = (
    os.path.join(raw_dir, list_name)
    for list_name in (
        'client_train_raw.txt',
        'client_test_raw.txt',
        'imposter_train_raw.txt',
        'imposter_test_raw.txt',
    )
)

In [70]:
# Function to read image paths from a text file
def read_image_paths(file_path):
    """Return the lines of the text file at ``file_path`` as a list of strings.

    The file is read in one go and split on line boundaries, so the returned
    entries carry no trailing newline characters.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()

# Function to load images from paths
def load_images(image_paths, base_dir, label, img_size=(224, 224)):
    """Load, resize and label the images listed in ``image_paths``.

    Args:
        image_paths: Iterable of paths relative to ``base_dir``. The first
            path component must be an integer identity folder. Both
            backslash and forward-slash separators are accepted.
        base_dir: Directory the relative paths are resolved against.
        label: Source label stored for every loaded image
            (0 for client, 1 for imposter).
        img_size: Size passed to ``cv2.resize`` for every image.

    Returns:
        Tuple ``(images, labels, source_labels)`` of equally long lists:
        the resized images, the integer identity parsed from each path, and
        ``label`` repeated once per loaded image. Paths that ``cv2.imread``
        cannot read are skipped and appear in none of the lists.

    Raises:
        ValueError: If the first component of a readable path is not an
            integer.
    """
    images = []
    labels = []
    source_labels = []  # Used to differentiate between client (0) and imposter (1)

    for path in tqdm(image_paths):
        # The list files may use Windows '\' separators. Map them to the local
        # separator so lookup and ID parsing also work where os.sep is '/'
        # (on Windows this replacement is a no-op).
        local_path = path.replace('\\', os.sep)
        image = cv2.imread(os.path.join(base_dir, local_path))
        if image is None:
            # Unreadable or missing file: skip it
            continue

        images.append(cv2.resize(image, img_size))
        # Extract client ID from the first component of the file path
        id_str = os.path.normpath(local_path).split(os.sep)[0]
        labels.append(int(id_str))
        source_labels.append(label)  # Append the label (0 for client, 1 for imposter)

    return images, labels, source_labels

# Function to create the dataset for NUAA (returns image data, labels and source labels)
def create_dataset(client_train_file, client_test_file, imposter_train_file, imposter_test_file, client_raw_dir, imposter_raw_dir):
    """Build the NUAA arrays from the four list files and two image folders.

    The four list files are read first. Their images are then loaded in the
    order client-train, client-test, imposter-train, imposter-test; client
    images get source label 0 and imposter images get source label 1.

    Returns:
        Tuple ``(X, y, S)`` of numpy arrays: the images, the identity labels
        and the source labels, in loading order.
    """
    # Read every path list up front, paired with its image folder and source label
    sources = [
        (read_image_paths(client_train_file), client_raw_dir, 0),
        (read_image_paths(client_test_file), client_raw_dir, 0),
        (read_image_paths(imposter_train_file), imposter_raw_dir, 1),
        (read_image_paths(imposter_test_file), imposter_raw_dir, 1),
    ]

    all_images, all_ids, all_sources = [], [], []
    for rel_paths, image_dir, source in sources:
        batch_images, batch_ids, batch_sources = load_images(rel_paths, image_dir, source)
        all_images += batch_images
        all_ids += batch_ids
        all_sources += batch_sources

    # Convert the accumulated lists to numpy arrays
    return np.array(all_images), np.array(all_ids), np.array(all_sources)

In [82]:
# Build the NUAA arrays: images (X), identity labels (y) and client/imposter labels (S)
X_nuaa, y_nuaa, S_nuaa = create_dataset(
    client_train_file=client_train_file,
    client_test_file=client_test_file,
    imposter_train_file=imposter_train_file,
    imposter_test_file=imposter_test_file,
    client_raw_dir=client_raw_dir,
    imposter_raw_dir=imposter_raw_dir,
)

# Report the resulting array shapes
print(f'X_nuaa shape: {X_nuaa.shape}')
print(f'y_nuaa shape: {y_nuaa.shape}')
print(f'S_nuaa shape: {S_nuaa.shape}')

100%|█████████████████████████████████████████████████████████████████████████████| 1743/1743 [00:02<00:00, 849.10it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3362/3362 [00:03<00:00, 894.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1748/1748 [00:01<00:00, 962.57it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5761/5761 [00:06<00:00, 833.07it/s]


X_nuaa shape: (12547, 224, 224, 3)
y_nuaa shape: (12547,)
S_nuaa shape: (12547,)


In [84]:
# Write the NUAA arrays to .npy files (paths are relative to the working directory)
for filename, array in (
    ('X_nuaa.npy', X_nuaa),
    ('y_nuaa.npy', y_nuaa),
    ('S_nuaa.npy', S_nuaa),
):
    np.save(filename, array)

print("Numpy arrays saved successfully.")

Numpy arrays saved successfully.


## Create numpy arrays from MSU-MFSD dataset

In [94]:
# Function to load images and labels from the MSU dataset (real and attack images)
def load_images_and_labels(base_dir, label):
    """Load every ``.jpg`` under ``base_dir`` with its identity and real/fake type.

    Args:
        base_dir: Directory that is walked recursively.
        label: Text shown in the progress-bar description only; it does not
            influence the returned labels.

    Returns:
        Tuple ``(images, identities, fake_real_types)`` of equally long lists:
        * ``images``: images resized to 224x224;
        * ``identities``: the three-digit number following ``client`` in the
          file name, or -1 when the file name does not match;
        * ``fake_real_types``: 0 when the directory path contains ``'real'``,
          1 when it contains ``'attack'``, and -1 when it contains neither.

        Files that ``cv2.imread`` cannot read are skipped and appear in none
        of the lists.
    """
    images = []
    identities = []
    fake_real_types = []  # Used to differentiate real (0) and fake (1) faces
    identity_pattern = re.compile(r'client(\d{3})_')

    # Traverse the directory and read all .jpg images
    for root, _, files in os.walk(base_dir):
        # The real/fake type depends only on the directory, so resolve it once
        # per directory. NOTE: this is a substring test on the whole directory
        # path, so any parent folder whose name contains 'real' also matches.
        if 'real' in root:
            fake_real_type = 0
        elif 'attack' in root:
            fake_real_type = 1
        else:
            # Unknown type: still record a value so the three lists stay aligned
            fake_real_type = -1

        for file in tqdm(files, desc=f"Loading images from {label}"):
            if not file.endswith('.jpg'):
                continue

            img = cv2.imread(os.path.join(root, file))
            if img is None:
                # Unreadable file: skip it instead of failing inside cv2.resize
                continue
            images.append(cv2.resize(img, (224, 224)))  # Resize to 224x224

            # Extract identity from the file name (-1 if no match found)
            match = identity_pattern.search(file)
            identities.append(int(match.group(1)) if match else -1)

            fake_real_types.append(fake_real_type)

    return images, identities, fake_real_types

In [96]:
# Function to create numpy arrays for the MSU dataset (real and attack images)
def create_numpy_arrays(real_dir, attack_dir):
    """Load the MSU real and attack images and return them as numpy arrays.

    Real images are loaded first, then attack images. Each returned array
    holds the real entries followed by the attack entries.

    Returns:
        Tuple ``(X, identities, fake_real_types)`` of numpy arrays.
    """
    loaded_real = load_images_and_labels(real_dir, 'real')
    loaded_attack = load_images_and_labels(attack_dir, 'attack')

    # zip pairs images with images, identities with identities and types with
    # types; each pair is concatenated (real first) and converted to an array
    X, identities, fake_real_types = (
        np.array(real_part + attack_part)
        for real_part, attack_part in zip(loaded_real, loaded_attack)
    )

    return X, identities, fake_real_types

In [100]:
# Root of the cropped MSU images and its 'real' / 'attack' sub-folders
new_dataset_dir = r'C:\Users\Asus\Desktop\Biometric Systems\Datasets\MSU-MFSD Photos\raw_cropped'
real_dir, attack_dir = (
    os.path.join(new_dataset_dir, subfolder) for subfolder in ('real', 'attack')
)

In [106]:
# Build the MSU arrays: images (X_msu), identities (y_msu) and real/fake types (S_msu)
X_msu, y_msu, S_msu = create_numpy_arrays(real_dir=real_dir, attack_dir=attack_dir)

Loading images from real: 100%|██████████████████████████████████████████████████| 19746/19746 [03:39<00:00, 90.08it/s]
Loading images from attack: 100%|████████████████████████████████████████████████| 57925/57925 [11:40<00:00, 82.71it/s]


In [108]:
# Report the shape of each MSU array
for caption, array in (
    ("Shape of X_msu:", X_msu),
    ("Shape of y_msu:", y_msu),
    ("Shape of S_msu:", S_msu),
):
    print(caption, array.shape)

Shape of X_msu: (77671, 224, 224, 3)
Shape of y_msu: (77671,)
Shape of S_msu: (77671,)


In [110]:
# Write the MSU arrays to .npy files (paths are relative to the working directory)
for filename, array in (
    ('X_msu.npy', X_msu),
    ('y_msu.npy', y_msu),
    ('S_msu.npy', S_msu),
):
    np.save(filename, array)

print("Numpy arrays saved successfully.")

Numpy arrays saved successfully.


## Create numpy arrays for the combined dataset (NUAA + MSU-MFSD)

In [113]:
# Positions of the real (0) and attack (1) samples in the MSU arrays
real_indices = np.flatnonzero(S_msu == 0)
attack_indices = np.flatnonzero(S_msu == 1)

# Draw 19,000 distinct positions per class; the fixed seed makes the draw reproducible
np.random.seed(42)
selected_real_indices = np.random.choice(real_indices, size=19000, replace=False)
selected_attack_indices = np.random.choice(attack_indices, size=19000, replace=False)

# Join both selections and shuffle them in place so real and attack samples are mixed
selected_indices = np.concatenate([selected_real_indices, selected_attack_indices])
np.random.shuffle(selected_indices)

# Keep only the selected samples; this overwrites the full MSU arrays
X_msu = X_msu[selected_indices]
y_msu = y_msu[selected_indices]
S_msu = S_msu[selected_indices]

In [114]:
# Report the shape of each MSU array after the subset selection
for caption, array in (
    ("Shape of X_msu:", X_msu),
    ("Shape of y_msu:", y_msu),
    ("Shape of S_msu:", S_msu),
):
    print(caption, array.shape)

Shape of X_msu: (38000, 224, 224, 3)
Shape of y_msu: (38000,)
Shape of S_msu: (38000,)


In [117]:
# Stack the NUAA samples and the MSU samples along the first axis
# (NUAA entries come first in every combined array)

X_combined = np.concatenate([X_nuaa, X_msu])
print("Shape of X_combined:", X_combined.shape)

# NOTE(review): y_nuaa and y_msu hold identity numbers parsed separately per
# dataset, so the same number may refer to different people - confirm intended
y_combined = np.concatenate([y_nuaa, y_msu])
print("Shape of y_combined:", y_combined.shape)

S_combined = np.concatenate([S_nuaa, S_msu])
print("Shape of S_combined:", S_combined.shape)

Shape of X_combined: (50547, 224, 224, 3)
Shape of y_combined: (50547,)
Shape of S_combined: (50547,)


In [119]:
# Build one index per combined sample, then permute the indices in place;
# the order is printed before and after the shuffle
indices = np.arange(len(X_combined))
print(indices)
np.random.shuffle(indices)
print(indices)

[    0     1     2 ... 50544 50545 50546]
[31740 26556 31153 ... 34568 33012 41576]


In [121]:
# Reorder all three combined arrays with the same permutation so that
# images, identities and real/fake labels stay aligned
X_combined_shuffled, y_combined_shuffled, S_combined_shuffled = (
    array[indices] for array in (X_combined, y_combined, S_combined)
)

In [123]:
# Report the shape of each shuffled array
for array in (X_combined_shuffled, y_combined_shuffled, S_combined_shuffled):
    print(array.shape)

(50547, 224, 224, 3)
(50547,)
(50547,)


In [125]:
# Write the shuffled combined arrays to .npy files (paths are relative to the working directory)
for filename, array in (
    ('X_combined.npy', X_combined_shuffled),
    ('y_combined.npy', y_combined_shuffled),
    ('S_combined.npy', S_combined_shuffled),
):
    np.save(filename, array)

print("Numpy arrays saved successfully.")

Numpy arrays saved successfully.
