In [1]:
import os
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset ,load_from_disk




In [None]:
# Root folder of the raw dataset; each sub-folder is treated as one class.
base_dir = "Dataset_raw"  # Change to your dataset folder path

# 1. Load pre-trained model (EfficientNetB0)
# The classification head is left off and global average pooling is enabled,
# so the network returns one embedding vector per input image.
pretrained_model = tf.keras.applications.EfficientNetB0(
    input_shape=(224, 224, 3),
    pooling="avg",      # Use Global Average Pooling for embeddings
    include_top=False,  # Remove the classification head
)

# 2. Helper function to preprocess images
def preprocess_image(image_path):
    """Load one image file and prepare it as EfficientNet input.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The output of ``efficientnet.preprocess_input`` for the image after it
        has been loaded at 224x224 and converted to an array. No batch
        dimension is added here.
    """
    resized = tf.keras.utils.load_img(image_path, target_size=(224, 224))
    pixels = tf.keras.utils.img_to_array(resized)
    return tf.keras.applications.efficientnet.preprocess_input(pixels)

# 3. Collect image paths, embeddings, and labels
image_paths = []  # path of every image that was embedded successfully
embeddings = []   # one embedding vector per entry in image_paths
labels = []       # integer class id per entry in image_paths

# Every sub-directory of base_dir is one class. Non-directory entries are
# filtered out *before* numbering so the class ids stay contiguous, and the
# names are sorted because os.listdir() returns entries in arbitrary order,
# which would otherwise make the name -> id mapping differ between runs.
class_names = sorted(
    entry for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
)
label_map = {name: idx for idx, name in enumerate(class_names)}

# Traverse directories and collect data
print("Processing images and extracting embeddings...")
for label_name, label_idx in label_map.items():
    label_dir = os.path.join(base_dir, label_name)

    # Sorted so the sample order, and with it the seeded train/test split,
    # is reproducible.
    for img_name in tqdm(sorted(os.listdir(label_dir)), desc=f"Processing {label_name}"):
        img_path = os.path.join(label_dir, img_name)
        try:
            # Preprocess image
            image = preprocess_image(img_path)
            image = np.expand_dims(image, axis=0)  # Add batch dimension

            # Get embedding
            embedding = pretrained_model.predict(image, verbose=0).squeeze()
        except Exception as e:
            # Best effort: report files that cannot be processed and move on.
            print(f"Error processing {img_path}: {e}")
            continue

        # Append data (the three lists stay index-aligned)
        image_paths.append(img_path)
        embeddings.append(embedding)
        labels.append(label_idx)

# 4. Create a dictionary of data
# Column name -> list of values, one entry per collected image.
data_dict = dict(
    image_path=image_paths,
    embedding=embeddings,
    label=labels,
)

# 5. Train-Test Split (70% / 30%, stratified by label, fixed seed)
# The split is made over sample positions rather than the data itself, so the
# resulting index lists can be used to select rows from every column.
train_idx, test_idx = train_test_split(
    range(len(labels)),
    stratify=labels,
    test_size=0.3,
    random_state=42,
)
# Validation split is currently disabled:
#test_idx, val_idx = train_test_split(test_idx, test_size=0.25, stratify=[labels[i] for i in test_idx], random_state=42)

# Helper function to create a Dataset
def create_split(indices):
    """Build a ``Dataset`` from the rows of ``data_dict`` at ``indices``.

    Args:
        indices: Positions of the samples to include, in the desired order.

    Returns:
        A ``Dataset`` with the columns ``image_path``, ``embedding`` and
        ``label``, each holding the selected rows.
    """
    columns = ("image_path", "embedding", "label")
    return Dataset.from_dict(
        {column: [data_dict[column][i] for i in indices] for column in columns}
    )

# Create DatasetDict
# One Dataset per split; a "validation" split is currently not produced.
dataset_dict = DatasetDict(
    {
        split_name: create_split(split_indices)
        for split_name, split_indices in (("train", train_idx), ("test", test_idx))
    }
)
#    "validation": create_split(val_idx)  -- disabled

# 6. Save DatasetDict
# Uploads the splits to the Hugging Face Hub repository named below.
dataset_dict.push_to_hub("Tarakeshwaran/sampleface30-Dataset")




Processing images and extracting embeddings...


Processing Akshay Kumar: 100%|██████████| 50/50 [00:06<00:00,  7.67it/s]
Processing Alexandra Daddario: 100%|██████████| 92/92 [00:09<00:00,  9.87it/s]
Processing Alia Bhatt: 100%|██████████| 79/79 [00:08<00:00,  9.69it/s]
Processing Amitabh Bachchan: 100%|██████████| 74/74 [00:07<00:00, 10.13it/s]
Processing Andy Samberg: 100%|██████████| 92/92 [00:09<00:00,  9.97it/s]
Processing Anushka Sharma: 100%|██████████| 68/68 [00:06<00:00,  9.80it/s]
Processing Billie Eilish: 100%|██████████| 98/98 [00:10<00:00,  9.43it/s]
Processing Brad Pitt: 100%|██████████| 120/120 [00:12<00:00,  9.71it/s]
Processing Camila Cabello: 100%|██████████| 87/87 [00:09<00:00,  9.49it/s]
Processing Charlize Theron: 100%|██████████| 78/78 [00:08<00:00,  9.48it/s]
Processing Claire Holt: 100%|██████████| 96/96 [00:09<00:00,  9.62it/s]
Processing Courtney Cox: 100%|██████████| 80/80 [00:08<00:00,  9.90it/s]
Processing Dwayne Johnson: 100%|██████████| 61/61 [00:06<00:00,  9.98it/s]
Processing Elizabeth Olsen: 100%|██

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 1537
    })
    test: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 1025
    })
})


In [None]:
# Optional: Print dataset info
# Shows each split with its feature names and row count.
print(dataset_dict)

In [9]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from tqdm import tqdm

# Root folder of the raw dataset; each sub-folder is treated as one class.
base_dir = "Dataset_raw"  # Change to your dataset folder path

# 1. Load pre-trained model (EfficientNetB0)
# The classification head is left off and global average pooling is enabled,
# so the network returns one embedding vector per input image.
pretrained_model = tf.keras.applications.EfficientNetB0(
    input_shape=(224, 224, 3),
    pooling="avg",      # Use Global Average Pooling for embeddings
    include_top=False,  # Remove the classification head
)

# 2. Data augmentation generator
# Random rotation, shift, brightness, zoom and horizontal-flip jitter.
# NOTE(review): images fed to this generator have already gone through the
# same preprocess_input inside preprocess_image(), so it is applied twice per
# sample -- confirm that this is intended/harmless.
datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input,
    horizontal_flip=True,
    zoom_range=0.2,
    brightness_range=(0.8, 1.2),
    height_shift_range=0.1,
    width_shift_range=0.1,
    rotation_range=15,
)

# 3. Helper function to preprocess images
def preprocess_image(image_path):
    """Load one image file and prepare it as EfficientNet input.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The output of ``efficientnet.preprocess_input`` for the image after it
        has been loaded at 224x224 and converted to an array. No batch
        dimension is added here.
    """
    resized = tf.keras.utils.load_img(image_path, target_size=(224, 224))
    pixels = tf.keras.utils.img_to_array(resized)
    return tf.keras.applications.efficientnet.preprocess_input(pixels)

# 4. Collect embeddings and labels
embeddings = []  # one embedding vector per augmented sample
labels = []      # integer class id per augmented sample
source_ids = []  # id of the source image each augmented sample was made from

# Every sub-directory of base_dir is one class. Non-directory entries are
# filtered out *before* numbering so the class ids stay contiguous, and the
# names are sorted because os.listdir() returns entries in arbitrary order,
# which would otherwise make the name -> id mapping differ between runs.
class_names = sorted(
    entry for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
)
label_map = {name: idx for idx, name in enumerate(class_names)}

# Make the random augmentations repeatable, in line with the fixed
# random_state used for the split.
# NOTE(review): this assumes ImageDataGenerator draws its random transforms
# from NumPy's global RNG -- confirm for the installed Keras version.
np.random.seed(42)

print("Processing images and extracting embeddings...")
source_id = 0  # running id of the current source image
for label_name, label_idx in label_map.items():
    label_dir = os.path.join(base_dir, label_name)

    # Sorted so the sample order is reproducible.
    for img_name in tqdm(sorted(os.listdir(label_dir)), desc=f"Processing {label_name}"):
        img_path = os.path.join(label_dir, img_name)
        try:
            # Preprocess image
            image = preprocess_image(img_path)

            # Apply augmentation to the image
            image = np.expand_dims(image, axis=0)  # Add batch dimension
            augmented_images = datagen.flow(image, batch_size=1)

            # Only augmented variants are embedded; the un-augmented original
            # itself is not stored.
            for _ in range(2):  # Generate 2 augmented samples per original
                augmented_image = next(augmented_images)[0]

                # Get embedding
                embedding = pretrained_model.predict(
                    np.expand_dims(augmented_image, axis=0), verbose=0
                ).squeeze()

                # Append data (the three lists stay index-aligned)
                embeddings.append(embedding)
                labels.append(label_idx)
                source_ids.append(source_id)
        except Exception as e:
            # Best effort: report files that cannot be processed and move on.
            print(f"Error processing {img_path}: {e}")
        source_id += 1

# 5. Create a dictionary of data (exclude image paths)
data_dict = {
    "embedding": embeddings,
    "label": labels
}

# 6. Helper function to create a Dataset
def create_split(indices):
    """Build a ``Dataset`` from the rows of ``data_dict`` at ``indices``.

    Args:
        indices: Positions of the samples to include, in the desired order.

    Returns:
        A ``Dataset`` with the columns ``embedding`` and ``label``, each
        holding the selected rows.
    """
    return Dataset.from_dict({
        "embedding": [data_dict["embedding"][i] for i in indices],
        "label": [data_dict["label"][i] for i in indices],
    })

# 7. Train-Test Split
# Split on *source images*, not on augmented samples. Splitting the samples
# directly would let two augmentations of the same photo land in different
# splits, leaking near-duplicates of training data into the test set.
# Note: stratifying per source image needs at least two usable images in
# every class, otherwise train_test_split raises ValueError.
source_label = dict(zip(source_ids, labels))  # source image id -> class id
unique_sources = sorted(source_label)
train_sources, test_sources = train_test_split(
    unique_sources,
    test_size=0.3,
    stratify=[source_label[s] for s in unique_sources],
    random_state=42,
)
train_sources = set(train_sources)
test_sources = set(test_sources)

# Expand the source-level split back to sample positions, then shuffle each
# split with a fixed seed so rows are not grouped by class.
split_rng = np.random.RandomState(42)
train_idx = split_rng.permutation(
    [i for i, s in enumerate(source_ids) if s in train_sources]
).tolist()
test_idx = split_rng.permutation(
    [i for i, s in enumerate(source_ids) if s in test_sources]
).tolist()

# 8. Create DatasetDict
# One Dataset per split, built from the index lists of the train/test split.
dataset_dict = DatasetDict(
    {
        split_name: create_split(split_indices)
        for split_name, split_indices in (("train", train_idx), ("test", test_idx))
    }
)


# 9. Save DatasetDict
# Uploads the splits to the Hugging Face Hub repository named below.
dataset_dict.push_to_hub("Tarakeshwaran/sampleface30-Dataset")

Processing images and extracting embeddings...


Processing Akshay Kumar: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Processing Alexandra Daddario: 100%|██████████| 92/92 [00:24<00:00,  3.81it/s]
Processing Alia Bhatt: 100%|██████████| 79/79 [00:21<00:00,  3.68it/s]
Processing Amitabh Bachchan: 100%|██████████| 74/74 [00:17<00:00,  4.15it/s]
Processing Andy Samberg: 100%|██████████| 92/92 [00:22<00:00,  4.18it/s]
Processing Anushka Sharma: 100%|██████████| 68/68 [00:15<00:00,  4.34it/s]
Processing Billie Eilish: 100%|██████████| 98/98 [00:23<00:00,  4.16it/s]
Processing Brad Pitt: 100%|██████████| 120/120 [00:29<00:00,  4.10it/s]
Processing Camila Cabello: 100%|██████████| 87/87 [00:20<00:00,  4.26it/s]
Processing Charlize Theron: 100%|██████████| 78/78 [00:19<00:00,  4.10it/s]
Processing Claire Holt: 100%|██████████| 96/96 [00:22<00:00,  4.19it/s]
Processing Courtney Cox: 100%|██████████| 80/80 [00:19<00:00,  4.18it/s]
Processing Dwayne Johnson: 100%|██████████| 61/61 [00:14<00:00,  4.34it/s]
Processing Elizabeth Olsen: 100%|██

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Tarakeshwaran/sampleface30-Dataset/commit/ae4a85d6ac8a4a07c09b7c6c3772e0a32eb0a17b', commit_message='Upload dataset', commit_description='', oid='ae4a85d6ac8a4a07c09b7c6c3772e0a32eb0a17b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Tarakeshwaran/sampleface30-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Tarakeshwaran/sampleface30-Dataset'), pr_revision=None, pr_num=None)