In [2]:
import os
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset ,load_from_disk

In [1]:

# Root folder of the dataset; the cell below walks its sub-folders as labels
base_dir = "Dataset_new"  # Change to your dataset folder path

# 1. Pre-trained EfficientNetB0 backbone used to turn images into embeddings.
#    include_top=False drops the classification head, and pooling="avg"
#    applies global average pooling so each image yields one vector.
pretrained_model = tf.keras.applications.EfficientNetB0(
    input_shape=(224, 224, 3),
    include_top=False,
    pooling="avg",
)

# 2. Helper function to preprocess images
def preprocess_image(image_path):
    """Load the image at ``image_path`` and prepare it for EfficientNetB0.

    The file is read with Keras' ``load_img`` (resized to 224x224), converted
    to an array with ``img_to_array`` and then passed through the
    EfficientNet ``preprocess_input`` function.

    Args:
        image_path: Path of the image file to load.

    Returns:
        Whatever ``tf.keras.applications.efficientnet.preprocess_input``
        returns for the loaded image array.
    """
    keras_utils = tf.keras.utils
    resized = keras_utils.load_img(image_path, target_size=(224, 224))
    pixels = keras_utils.img_to_array(resized)
    return tf.keras.applications.efficientnet.preprocess_input(pixels)

# 3. Collect image paths, embeddings, and labels
# Parallel lists: entry i of each list describes the same image.
image_paths = []
embeddings = []
labels = []
label_map = {}  # class folder name -> integer label id

# Keep only class sub-directories and sort them BEFORE numbering.
# Enumerating the raw os.listdir() result also counts stray files (leaving
# gaps in the label ids), and its order is arbitrary, so the name -> id
# mapping could differ between runs or machines.
class_names = sorted(
    entry
    for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
)

# Traverse directories and collect data
print("Processing images and extracting embeddings...")
for label_idx, label_name in enumerate(class_names):
    label_dir = os.path.join(base_dir, label_name)

    # Map label to an integer
    label_map[label_name] = label_idx

    # Sorted so the sample order (and therefore the seeded split that is
    # computed from it) is reproducible.
    for img_name in tqdm(sorted(os.listdir(label_dir)), desc=f"Processing {label_name}"):
        img_path = os.path.join(label_dir, img_name)
        try:
            # Preprocess image
            image = preprocess_image(img_path)
            image = np.expand_dims(image, axis=0)  # Add batch dimension

            # Get embedding. Calling the model directly is the path Keras
            # recommends for single-batch inputs inside a loop; predict()
            # is designed for large arrays and adds per-call overhead.
            embedding = pretrained_model(image, training=False).numpy().squeeze()

            # Append data
            image_paths.append(img_path)
            embeddings.append(embedding)
            labels.append(label_idx)
        except Exception as e:
            # Best effort: report unreadable/non-image files and keep going.
            print(f"Error processing {img_path}: {e}")

# 4. Create a dictionary of data
data_dict = dict(
    image_path=image_paths,
    embedding=embeddings,
    label=labels,
)

# 5. Split sample indices into train / test / validation, stratified by label.
#    First hold out 20% of the indices ...
train_idx, test_idx = train_test_split(
    range(len(labels)),
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
#    ... then halve that hold-out: one half stays test, the other is validation.
test_idx, val_idx = train_test_split(
    test_idx,
    test_size=0.5,
    stratify=[labels[sample] for sample in test_idx],
    random_state=42,
)

# Helper function to create a Dataset
def create_split(indices):
    """Build a ``Dataset`` from the rows of ``data_dict`` selected by ``indices``.

    Args:
        indices: Iterable of integer positions into the lists stored in the
            module-level ``data_dict``.

    Returns:
        The result of ``Dataset.from_dict`` for the columns ``image_path``,
        ``embedding`` and ``label``, with rows in the order given by
        ``indices``.
    """
    selected = {}
    for column in ("image_path", "embedding", "label"):
        selected[column] = [data_dict[column][row] for row in indices]
    return Dataset.from_dict(selected)

# Create DatasetDict
# Build one Dataset per split and bundle them, keyed by split name
dataset_dict = DatasetDict({
    split_name: create_split(split_indices)
    for split_name, split_indices in (
        ("train", train_idx),
        ("test", test_idx),
        ("validation", val_idx),
    )
})

# 6. Persist the bundled splits to a local folder
dataset_dict.save_to_disk("huggingface_dataset")
print("DatasetDict saved to 'huggingface_dataset'.")

# Optional: show the split names, features and row counts
print(dataset_dict)





Processing images and extracting embeddings...


Processing Akshay Kumar: 100%|██████████| 50/50 [00:09<00:00,  5.02it/s]
Processing Alexandra Daddario: 100%|██████████| 92/92 [00:14<00:00,  6.23it/s]
Processing Alia Bhatt: 100%|██████████| 79/79 [00:12<00:00,  6.12it/s]
Processing Amitabh Bachchan: 100%|██████████| 74/74 [00:11<00:00,  6.52it/s]
Processing Andy Samberg: 100%|██████████| 92/92 [00:14<00:00,  6.16it/s]
Processing Anushka Sharma: 100%|██████████| 68/68 [00:10<00:00,  6.41it/s]
Processing Billie Eilish: 100%|██████████| 98/98 [00:16<00:00,  6.00it/s]
Processing Brad Pitt: 100%|██████████| 120/120 [00:19<00:00,  6.09it/s]
Processing Camila Cabello: 100%|██████████| 87/87 [00:14<00:00,  6.12it/s]
Processing Charlize Theron: 100%|██████████| 78/78 [00:12<00:00,  6.21it/s]
Processing Claire Holt: 100%|██████████| 96/96 [00:15<00:00,  6.21it/s]
Processing Courtney Cox: 100%|██████████| 80/80 [00:12<00:00,  6.24it/s]
Processing Dwayne Johnson: 100%|██████████| 61/61 [00:09<00:00,  6.49it/s]
Processing Elizabeth Olsen: 100%|██

Saving the dataset (0/1 shards):   0%|          | 0/2049 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/256 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/257 [00:00<?, ? examples/s]

DatasetDict saved to 'huggingface_dataset'.
DatasetDict({
    train: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 2049
    })
    test: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 256
    })
    validation: Dataset({
        features: ['image_path', 'embedding', 'label'],
        num_rows: 257
    })
})


In [3]:
# Reload what the earlier cell saved to the "huggingface_dataset" folder.
dataset = load_from_disk("huggingface_dataset")

In [4]:
# Upload the reloaded dataset to the Hugging Face Hub under this repo id.
# NOTE(review): presumably requires a logged-in Hub account with write access
# to this namespace — confirm before re-running.
dataset.push_to_hub("Tarakeshwaran/sampleface30-Dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Tarakeshwaran/sampleface30-Dataset/commit/bef9c22e5a37b991355e1401180538c1f01e4dbf', commit_message='Upload dataset', commit_description='', oid='bef9c22e5a37b991355e1401180538c1f01e4dbf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Tarakeshwaran/sampleface30-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Tarakeshwaran/sampleface30-Dataset'), pr_revision=None, pr_num=None)