In [2]:
import albumentations as alb
import numpy as np
import os
from PIL import Image
from mtcnn import MTCNN
import dlib
import cv2
import random
import uuid
import shutil
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


In [2]:
# Initialize detectors and predictors (module-level so every helper below shares them).
# `detector` is the MTCNN face detector used by process_dataset to find the face box.
detector = MTCNN()
# `landmark_predictor` is used by align_face to locate the eye landmarks.
# The model file path is relative to the notebook's working directory.
landmark_predictor = dlib.shape_predictor('Dlib_model/shape_predictor_68_face_landmarks.dat')

In [3]:
# Helper function: Align face
def align_face(image):
    """Rotate a face image so that the line between the two eye landmarks is horizontal.

    Args:
        image: PIL image containing a face. Any mode is accepted; the image is
            converted to RGB before processing.

    Returns:
        A new RGB PIL image with the same size as ``image``, rotated about the
        midpoint between the eye landmarks, or ``None`` when dlib's frontal
        face detector finds no face in the image.
    """
    # Building the dlib detector is not free; create it once and cache it on
    # the function object instead of rebuilding it for every image.
    face_detector = getattr(align_face, "_face_detector", None)
    if face_detector is None:
        face_detector = dlib.get_frontal_face_detector()
        align_face._face_detector = face_detector

    # COLOR_RGB2GRAY requires a 3-channel array, so normalise the mode first
    # (grayscale, palette, or RGBA inputs would otherwise raise).
    rgb = np.array(image.convert('RGB'))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    faces = face_detector(gray)
    if len(faces) == 0:
        return None

    # Use the largest detected rectangle rather than whichever came first.
    face = max(faces, key=lambda rect: rect.width() * rect.height())
    landmarks = landmark_predictor(gray, face)

    # Landmarks 36 and 45 serve as the left/right eye reference points.
    left_eye = np.array([landmarks.part(36).x, landmarks.part(36).y])
    right_eye = np.array([landmarks.part(45).x, landmarks.part(45).y])

    # Angle of the eye line relative to the horizontal axis, in degrees.
    delta_x = right_eye[0] - left_eye[0]
    delta_y = right_eye[1] - left_eye[1]
    angle = float(np.degrees(np.arctan2(delta_y, delta_x)))

    # Rotate about the midpoint between the eyes; pass plain Python floats so
    # OpenCV's argument parsing does not depend on NumPy scalar types.
    eye_midpoint = (left_eye + right_eye) / 2.0
    center = (float(eye_midpoint[0]), float(eye_midpoint[1]))
    rotation_matrix = cv2.getRotationMatrix2D(center=center, angle=angle, scale=1.0)
    aligned = cv2.warpAffine(rgb, rotation_matrix, image.size, flags=cv2.INTER_CUBIC)

    return Image.fromarray(aligned)

In [4]:
# Helper function: Resize and normalize
def resize_and_normalize(image, size=(160, 160)):
    """Resize an image, force RGB mode, and scale its pixel values by 1/255.

    Args:
        image: Object exposing PIL's ``resize`` and ``convert`` methods.
        size: Target size passed straight to ``image.resize``.

    Returns:
        NumPy array holding the resized RGB pixels divided by 255.0.
    """
    rgb_resized = image.resize(size).convert('RGB')
    pixels = np.asarray(rgb_resized)
    return pixels / 255.0

# Helper function: Augment images (optional)
def augment_image(image_array):
    """Apply a random augmentation pipeline to one image.

    The pipeline operates on 8-bit pixels. A floating-point input whose values
    all lie in [0, 1] (the output of ``resize_and_normalize``) is scaled to
    0-255 before augmentation and scaled back afterwards. Casting such an
    array straight to ``uint8`` would collapse every pixel to 0 or 1 and
    produce an almost black image.

    Args:
        image_array: Image as a NumPy array, either 8-bit (0-255) or
            floating point normalised to [0, 1].

    Returns:
        The augmented image. Normalised float input yields a float array in
        [0, 1]; any other input yields a ``uint8`` array.
    """
    source = np.asarray(image_array)
    is_unit_float = (
        np.issubdtype(source.dtype, np.floating)
        and source.size > 0
        and float(source.max()) <= 1.0
    )
    if is_unit_float:
        pixels = np.rint(np.clip(source, 0.0, 1.0) * 255.0).astype(np.uint8)
    else:
        pixels = source.astype(np.uint8)

    aug_pipeline = alb.Compose([
        alb.HorizontalFlip(p=0.5),
        alb.Rotate(limit=15, p=0.5),
        # NOTE(review): the notebook output shows a warning that echoes this
        # exact line, so `scale` may not be an accepted argument of
        # AdditiveNoise in the installed albumentations version - confirm
        # against the library docs and switch to the supported parameters.
        alb.AdditiveNoise(scale=(5, 25), p=0.2),  # Reduced noise
        alb.RandomGamma(gamma_limit=(80, 120), p=0.2),  # Narrower gamma range
        alb.RGBShift(r_shift_limit=10, g_shift_limit=10, b_shift_limit=10, p=0.2),  # Reduced RGB shift
        alb.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.2),  # Conservative limits
        alb.CLAHE(clip_limit=2.0, p=0.1)  # Optional: Enhance contrast
    ])
    augmented = aug_pipeline(image=pixels)["image"]

    if is_unit_float:
        # Hand back the same scale the caller supplied.
        return np.asarray(augmented).astype(np.float64) / 255.0
    return augmented

In [5]:
# Main function: Process dataset
def process_dataset(input_dir, output_dir, augment=False):
    """Detect, crop, align, and resize every face image in a class-per-folder dataset.

    ``input_dir`` is expected to contain one sub-directory per person. For each
    readable image the first MTCNN detection is cropped, aligned with
    ``align_face``, resized with ``resize_and_normalize``, and written as a JPEG
    with a random UUID file name under ``output_dir/<person>/``. Images that
    cannot be read, have no detected face, or fail alignment are reported and
    skipped.

    Note: file names are random, so calling this function again on the same
    ``output_dir`` adds new files next to the old ones instead of replacing them.

    Args:
        input_dir: Root directory with one folder of raw images per person.
        output_dir: Root directory for the processed images (created if missing).
        augment: When True, also save one randomly augmented copy of every
            processed face.
    """
    os.makedirs(output_dir, exist_ok=True)

    for person in sorted(os.listdir(input_dir)):
        person_dir = os.path.join(input_dir, person)
        if not os.path.isdir(person_dir):
            continue  # Stray files next to the person folders are not classes

        output_person_dir = os.path.join(output_dir, person)
        os.makedirs(output_person_dir, exist_ok=True)

        for img_name in sorted(os.listdir(person_dir)):
            img_path = os.path.join(person_dir, img_name)

            # Close the file handle promptly and normalise to RGB so the
            # detector and the JPEG writer always see three channels.
            try:
                with Image.open(img_path) as raw_img:
                    img = raw_img.convert('RGB')
            except (OSError, ValueError) as exc:
                print(f"Could not read {img_path}: {exc}")
                continue

            # Detect face
            detections = detector.detect_faces(np.array(img))
            if not detections:
                print(f"No face detected in {img_path}")
                continue

            # Clamp the box to the image: coordinates outside the image would
            # make PIL pad the crop with black pixels.
            x, y, width, height = detections[0]['box']
            left, top = max(x, 0), max(y, 0)
            right = min(x + width, img.width)
            bottom = min(y + height, img.height)
            if right <= left or bottom <= top:
                print(f"No face detected in {img_path}")
                continue
            cropped_face = img.crop((left, top, right, bottom))

            # Align face
            aligned_face = align_face(cropped_face)
            if aligned_face is None:
                print(f"Alignment failed for {img_path}")
                continue

            # Resize and normalize, then convert back to 8-bit for saving.
            # Round instead of truncating so x / 255 * 255 returns to x.
            processed_face = resize_and_normalize(aligned_face)
            face_uint8 = np.rint(processed_face * 255).astype(np.uint8)

            # Save original processed image with a unique name
            unique_filename = f"{uuid.uuid4().hex}.jpg"
            save_path = os.path.join(output_person_dir, unique_filename)
            Image.fromarray(face_uint8).save(save_path)
            print(f"Processed and saved: {save_path}")

            # Augment and save with another unique name
            if augment:
                # Feed the augmentation the 8-bit pixels: casting the
                # normalised [0, 1] floats to uint8 would zero the image.
                augmented_face = np.asarray(augment_image(face_uint8))
                augmented_filename = f"{uuid.uuid4().hex}.jpg"
                augmented_save_path = os.path.join(output_person_dir, augmented_filename)
                Image.fromarray(augmented_face.astype(np.uint8)).save(augmented_save_path)
                print(f"Augmented and saved: {augmented_save_path}")


In [10]:
# Paths (relative to the notebook's working directory).
# NOTE: `input_dir` and `output_dir` are reassigned by the split cell further down.
input_dir = "Dataset_raw"  # Replace with the path to your organized dataset
output_dir = "processed_dataset"  # Path to save the processed dataset

# Process dataset: writes one processed image plus one augmented copy per
# source image that passes detection and alignment.
process_dataset(input_dir, output_dir, augment=True)

Processed and saved: processed_dataset\Akshay Kumar\6c49d13ffb534916b0c26a23a969df9e.jpg
Augmented and saved: processed_dataset\Akshay Kumar\23d3a782e99e4e3fab49116be260599a.jpg


  alb.AdditiveNoise(scale=(5, 25), p=0.2),  # Reduced noise


Processed and saved: processed_dataset\Akshay Kumar\d97e1b1237bf47098ab76d1e38d2dbe7.jpg
Augmented and saved: processed_dataset\Akshay Kumar\c38b21c19106490d84dae1fe681cd978.jpg
Alignment failed for Dataset_raw\Akshay Kumar\Akshay Kumar_10.jpg
Processed and saved: processed_dataset\Akshay Kumar\8fe8f50cb35e407c84280698ac1904f2.jpg
Augmented and saved: processed_dataset\Akshay Kumar\cb6cf390474e42e7b6dbaab44529cd3d.jpg
Alignment failed for Dataset_raw\Akshay Kumar\Akshay Kumar_12.jpg
Processed and saved: processed_dataset\Akshay Kumar\4eced3f918804a90a35cf7fa1fbb667a.jpg
Augmented and saved: processed_dataset\Akshay Kumar\7ce4e2078ef540629a116b2f2524a7dd.jpg
Alignment failed for Dataset_raw\Akshay Kumar\Akshay Kumar_14.jpg
Processed and saved: processed_dataset\Akshay Kumar\cdfa87738188404db2e7966bcc9a1202.jpg
Augmented and saved: processed_dataset\Akshay Kumar\455ae77743e74f4f9fb2246a0855222b.jpg
Processed and saved: processed_dataset\Akshay Kumar\48f8d92514334835b9ec8e423f061ede.jpg
A

## Split into train, validation, and test sets

In [3]:
# Paths
input_dir = './processed_dataset/'
output_dir = './Dataset_split/'

# Define split ratios (can be changed as needed); they must add up to 1.
train_split = 0.6
val_split = 0.1
test_split = 0.3
if abs(train_split + val_split + test_split - 1.0) > 1e-9:
    raise ValueError("train_split + val_split + test_split must equal 1.0")

# Fixed seed + sorted listings make the split reproducible, so re-running the
# cell copies every file into the same split as before.
split_seed = 42
split_rng = random.Random(split_seed)

splits = ['train', 'val', 'test']
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

# Only directories are classes (person1, person2, ...); ignore stray files.
class_names = sorted(
    name for name in os.listdir(input_dir)
    if os.path.isdir(os.path.join(input_dir, name))
)

# Files left over from an earlier run are not removed here; warn so a changed
# input set does not silently leave the same face in two splits.
if os.path.isdir(output_dir) and any(files for _, _, files in os.walk(output_dir)):
    print(f"Warning: {output_dir} already contains files; "
          "copies from an earlier run may overlap with the new split.")

# Ensure the output directories exist
for split in splits:
    for class_name in class_names:
        split_dir = os.path.join(output_dir, split, class_name)
        os.makedirs(split_dir, exist_ok=True)

# NOTE(review): the preprocessing step stores each processed image and its
# augmented copy in the same folder under unrelated random names, so a source
# photo and its augmentation can land in different splits - confirm whether
# that leakage is acceptable for the evaluation.
copy_failures = 0
for class_name in class_names:
    class_path = os.path.join(input_dir, class_name)

    # Get all image files in the current class folder
    images = sorted(
        f for f in os.listdir(class_path)
        if f.lower().endswith(image_extensions)
    )
    split_rng.shuffle(images)

    # Split images into train, val, and test; the test set takes the remainder
    # so rounding never drops an image.
    total_images = len(images)
    train_count = int(total_images * train_split)
    val_count = int(total_images * val_split)

    train_images = images[:train_count]
    val_images = images[train_count:train_count + val_count]
    test_images = images[train_count + val_count:]

    # Copy files to train, val, and test directories
    for image_set, split in zip([train_images, val_images, test_images], splits):
        for image_name in image_set:
            src_path = os.path.join(class_path, image_name)
            dst_path = os.path.join(output_dir, split, class_name, image_name)

            try:
                shutil.copy(src_path, dst_path)
            except OSError as e:
                copy_failures += 1
                print(f"Error copying {src_path} to {dst_path}: {e}")

# Only claim success when every copy actually worked.
if copy_failures:
    print(f"Dataset organized with {copy_failures} copy error(s); see messages above.")
else:
    print("Dataset successfully organized!")

Dataset successfully organized!


## Building the Hugging Face dataset

In [4]:
# Set base directory for dataset (expects `train`, `val`, and `test` sub-folders,
# which are joined onto this path further down in the cell).
base_dir = "Dataset_split"  # Change to your dataset folder path

# 1. Load pre-trained model (EfficientNetB0) as a fixed feature extractor.
# No `weights` argument is passed, so the library default applies.
pretrained_model = tf.keras.applications.EfficientNetB0(
    include_top=False,  # Remove the classification head
    pooling="avg",      # Use Global Average Pooling for embeddings
    input_shape=(224, 224, 3)  # Must match the target_size used in preprocess_image
)

# 2. Helper function to preprocess images
def preprocess_image(image_path):
    """Load one image file and prepare it as EfficientNet input.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image, loaded at 224x224, converted to an array, and passed
        through the EfficientNet ``preprocess_input`` function.
    """
    keras = tf.keras
    loaded = keras.utils.load_img(image_path, target_size=(224, 224))
    pixels = keras.utils.img_to_array(loaded)
    return keras.applications.efficientnet.preprocess_input(pixels)

# 3. Collect image paths, embeddings, and labels for each split (train, val, test)
def collect_data_from_split(split_dir):
    """Embed every image below ``split_dir`` and label it by its class folder.

    Class folders are sorted by name and numbered 0..N-1. Only directories
    count as classes, so stray files and the arbitrary order of
    ``os.listdir`` cannot shift the indices. Splits that contain the same
    class folders therefore always receive the same label mapping.

    Args:
        split_dir: Directory holding one sub-directory of images per class.

    Returns:
        Tuple ``(image_paths, embeddings, labels, label_map)`` where the first
        three are parallel lists (path, embedding array, integer label) and
        ``label_map`` maps each class folder name to its integer label.
        Images that fail to load or embed are reported and left out.
    """
    image_paths = []
    embeddings = []
    labels = []

    class_names = sorted(
        name for name in os.listdir(split_dir)
        if os.path.isdir(os.path.join(split_dir, name))
    )
    # Map label to an integer
    label_map = {name: idx for idx, name in enumerate(class_names)}

    for label_name, label_idx in label_map.items():
        label_dir = os.path.join(split_dir, label_name)

        for img_name in tqdm(sorted(os.listdir(label_dir)), desc=f"Processing {label_name}"):
            img_path = os.path.join(label_dir, img_name)
            try:
                # Preprocess image and add the batch dimension the model expects
                image = np.expand_dims(preprocess_image(img_path), axis=0)

                # Get embedding; squeeze drops the batch dimension again
                embedding = pretrained_model.predict(image, verbose=0).squeeze()
            except Exception as e:
                # Best effort: one unreadable file should not abort the run
                print(f"Error processing {img_path}: {e}")
                continue

            # Append only after both steps succeeded so the lists stay aligned
            image_paths.append(img_path)
            embeddings.append(embedding)
            labels.append(label_idx)

    return image_paths, embeddings, labels, label_map

# 4. Collect data for train, val, and test
print("Processing images and extracting embeddings...")

train_dir = os.path.join(base_dir, 'train')
val_dir = os.path.join(base_dir, 'val')
test_dir = os.path.join(base_dir, 'test')

# Collect data for each split
train_image_paths, train_embeddings, train_labels, label_map = collect_data_from_split(train_dir)
val_image_paths, val_embeddings, val_labels, val_label_map = collect_data_from_split(val_dir)
test_image_paths, test_embeddings, test_labels, test_label_map = collect_data_from_split(test_dir)

# Each split computes its own name -> index mapping. Only the train mapping is
# published, so refuse to continue if another split numbered its classes
# differently - the integer labels would not be comparable.
for split_name, split_label_map in (("val", val_label_map), ("test", test_label_map)):
    if split_label_map != label_map:
        raise ValueError(
            f"Label mapping of the '{split_name}' split differs from the train split; "
            "make sure every split contains the same class folders."
        )

# 5. Concatenate the three splits in train, val, test order. The data is
# already split on disk, so no further splitting is done here; the splits are
# recovered later from contiguous index ranges over these lists.
data_dict = {
    "image_path": train_image_paths + val_image_paths + test_image_paths,
    "embedding": train_embeddings + val_embeddings + test_embeddings,
    "label": train_labels + val_labels + test_labels
}

# Helper function to create a Dataset from indices
def create_split(indices):
    """Build a ``Dataset`` from the rows of the global ``data_dict`` at ``indices``.

    Args:
        indices: Iterable of integer positions into the ``data_dict`` columns.

    Returns:
        ``Dataset`` with the columns ``image_path``, ``embedding``, and
        ``label``, each holding the selected rows in the given order.
    """
    selected = {}
    for column in ("image_path", "embedding", "label"):
        column_values = data_dict[column]
        selected[column] = [column_values[i] for i in indices]
    return Dataset.from_dict(selected)

# Lookup table from class name to integer label, stored as an extra "map" split.
# It reuses the column names of the image splits: "image_path" holds the class
# NAME (not a path) and "embedding" holds a one-element placeholder list.
# (`lable_data` is a misspelling of "label_data"; the name is kept as-is.)
lable_data = Dataset.from_dict({
        "image_path": list(label_map.keys()) ,
        "embedding": [[np.float32(0.0)] for _ in range(len(label_map.keys()))],
        "label": list(label_map.values()),
    })

# Create DatasetDict
# data_dict is the concatenation train + val + test, so each split is a
# contiguous index range delimited by the lengths of the label lists.
dataset_dict = DatasetDict({
    "train": create_split(range(len(train_labels))),
    "val": create_split(range(len(train_labels), len(train_labels) + len(val_labels))),
    "test": create_split(range(len(train_labels) + len(val_labels), len(data_dict["label"]))),
    "map" : lable_data
})

Processing images and extracting embeddings...


Processing Akshay Kumar: 100%|██████████| 24/24 [00:05<00:00,  4.40it/s]
Processing Alexandra Daddario: 100%|██████████| 54/54 [00:06<00:00,  8.65it/s]
Processing Alia Bhatt: 100%|██████████| 49/49 [00:05<00:00,  9.23it/s]
Processing Amitabh Bachchan: 100%|██████████| 27/27 [00:03<00:00,  8.78it/s]
Processing Andy Samberg: 100%|██████████| 52/52 [00:05<00:00,  9.21it/s]
Processing Anushka Sharma: 100%|██████████| 35/35 [00:03<00:00,  9.21it/s]
Processing Billie Eilish: 100%|██████████| 42/42 [00:04<00:00,  9.23it/s]
Processing Brad Pitt: 100%|██████████| 64/64 [00:06<00:00,  9.25it/s]
Processing Camila Cabello: 100%|██████████| 45/45 [00:04<00:00,  9.18it/s]
Processing Charlize Theron: 100%|██████████| 40/40 [00:04<00:00,  9.02it/s]
Processing Claire Holt: 100%|██████████| 51/51 [00:05<00:00,  9.42it/s]
Processing Courtney Cox: 100%|██████████| 44/44 [00:04<00:00,  9.06it/s]
Processing Dwayne Johnson: 100%|██████████| 33/33 [00:03<00:00,  9.29it/s]
Processing Elizabeth Olsen: 100%|████

In [5]:
import sys
from huggingface_hub import login, Repository
# Login to Hugging Face using a token from the environment or an interactive prompt.
# SECURITY: an access token used to be hard-coded in this cell. Treat that token
# as leaked, revoke it in the Hugging Face account settings, and never commit
# tokens to a notebook.
def huggingface_login():
    """Authenticate this session with the Hugging Face Hub.

    The token is read from the ``HF_TOKEN`` environment variable. When the
    variable is unset or empty, the user is prompted for it without echoing
    the input.

    Raises:
        RuntimeError: If no token is supplied or the Hub rejects the login.
            The original error is attached as the cause.
    """
    from getpass import getpass  # local import: only needed for the prompt

    token = os.environ.get('HF_TOKEN')
    if not token:
        token = getpass("Hugging Face token (input hidden): ").strip()
    if not token:
        raise RuntimeError(
            "No Hugging Face token provided; set HF_TOKEN or enter one at the prompt."
        )

    try:
        login(token=token)
    except Exception as e:
        # Raise instead of sys.exit(1) so the notebook shows a normal traceback
        # that carries the underlying cause.
        raise RuntimeError(f"Hugging Face login failed: {e}") from e

In [6]:
# Authenticate with the Hugging Face Hub before pushing the dataset.
huggingface_login()

In [7]:
# Show the splits, their columns, and row counts as a sanity check before uploading.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 2098
    })
    val: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 338
    })
    test: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 1082
    })
    map: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 31
    })
})

In [8]:
# 6. Save DatasetDict: upload every split (train, val, test, map) to the Hub
# repository named below.
dataset_dict.push_to_hub("Tarakeshwaran/sampleface30-Dataset_v2")


Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 23.32ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.13s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 28.63ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 27.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.10s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<?, ?ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/Tarakeshwaran/sampleface30-Dataset_v2/commit/8081377801b237dd5365a545ca19cd24eb6f5b79', commit_message='Upload dataset', commit_description='', oid='8081377801b237dd5365a545ca19cd24eb6f5b79', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Tarakeshwaran/sampleface30-Dataset_v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Tarakeshwaran/sampleface30-Dataset_v2'), pr_revision=None, pr_num=None)