## 📌 Reference

This notebook was adapted from code available on my GitHub.  
For updates or additional information, please visit:

👉 [https://github.com/your-username](https://github.com/your-username)

### Installing Required Libraries

In [1]:
!pip install fiftyone



### Import Libraries

In [2]:
import os
import cv2
import numpy as np
import fiftyone as fo
import fiftyone.brain as fob
from sklearn.decomposition import PCA

### Required Functions

In [3]:
def track_deleted_images(dataset, original_filepaths):
    """Delete from disk every image that is no longer part of ``dataset``.

    WARNING: this permanently removes files with ``os.remove``.

    Args:
        dataset: FiftyOne dataset (or view) exposing ``values("filepath")``.
        original_filepaths: Iterable of file paths captured before samples
            were removed from ``dataset`` (a set or a list both work).

    Returns:
        None. Progress and a final summary are printed to stdout; the summary
        counts only the files that were actually removed from disk.
    """
    # Read only the filepath field instead of loading every sample object.
    current_filepaths = set(dataset.values("filepath"))
    # set() tolerates list input; sorted() gives a stable report order.
    removed_from_dataset = sorted(set(original_filepaths) - current_filepaths)

    if not removed_from_dataset:
        print("No images were deleted.")
        return

    deleted_count = 0
    for file_path in removed_from_dataset:
        try:
            # Attempt the removal directly rather than checking existence
            # first, so a file vanishing in between cannot raise.
            os.remove(file_path)
        except FileNotFoundError:
            print(f"Skipped (not found): {file_path}")
        else:
            deleted_count += 1
            print(f"Deleted: {file_path}")
    print(f"Deleted {deleted_count} images.")

In [4]:
def load_dataset(dataset_dir, name, dataset_type, overwrite=False):
    """Create a FiftyOne dataset from a directory and compute its metadata.

    Args:
        dataset_dir: Directory passed to ``fo.Dataset.from_dir``.
        name: Name given to the new dataset.
        dataset_type: ``"ImageDirectory"`` (maps to ``fo.types.ImageDirectory``)
            or ``"ImageDirectoryTree"`` (maps to
            ``fo.types.ImageClassificationDirectoryTree``).
        overwrite: When True, an existing dataset called ``name`` is deleted
            before the new one is created, so the call can be repeated in the
            same session. Defaults to False, which keeps the original
            behavior of ``from_dir`` for an already-used name.

    Returns:
        The newly created dataset, after ``compute_metadata()`` has run.

    Raises:
        ValueError: If ``dataset_type`` is not one of the supported keys.
    """
    dataset_types = {
        "ImageDirectory": fo.types.ImageDirectory,
        "ImageDirectoryTree": fo.types.ImageClassificationDirectoryTree,
    }

    if dataset_type not in dataset_types:
        raise ValueError(f"Invalid dataset_type {dataset_type}")

    # Opt-in only: deleting an existing dataset is destructive.
    if overwrite and name is not None and fo.dataset_exists(name):
        fo.delete_dataset(name)

    dataset = fo.Dataset.from_dir(
        dataset_dir=dataset_dir,
        dataset_type=dataset_types[dataset_type],
        name=name,
    )
    dataset.compute_metadata()
    return dataset

In [5]:
def compute_pca_embeddings(dataset_dir, name, dataset_type, n_components=512):
    """Load a dataset and compute PCA embeddings from raw grayscale pixels.

    Each image is read as grayscale, flattened, right-padded with zeros to the
    length of the largest image, and the resulting matrix is reduced with PCA.

    Images that ``cv2.imread`` cannot read are reported and removed from the
    returned dataset, so that row ``i`` of the embeddings always corresponds to
    sample ``i`` of the dataset.

    Args:
        dataset_dir: Directory forwarded to ``load_dataset``.
        name: Dataset name forwarded to ``load_dataset``.
        dataset_type: Dataset type key forwarded to ``load_dataset``.
        n_components: Upper bound on the number of PCA components. The value
            actually used is capped by both the number of readable images and
            the number of pixels per (padded) image.

    Returns:
        Tuple ``(pca_embeddings, dataset)``.

    Raises:
        ValueError: If no image in the dataset could be read.
    """
    dataset = load_dataset(dataset_dir, name, dataset_type)
    sample_ids, filepaths = dataset.values(["id", "filepath"])

    image_arrays = []
    unreadable_ids = []
    for sample_id, filepath in zip(sample_ids, filepaths):
        img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Warning: Skipping unreadable image: {filepath}")
            unreadable_ids.append(sample_id)
            continue
        image_arrays.append(img.ravel())

    if not image_arrays:
        raise ValueError(f"No readable images found in {dataset_dir}")

    # Keep dataset and embedding rows aligned: a skipped image must not remain
    # as a sample, otherwise downstream calls receive mismatched lengths.
    if unreadable_ids:
        dataset.delete_samples(unreadable_ids)

    max_pixels = max(arr.shape[0] for arr in image_arrays)
    padded_embeddings = np.array([
        np.pad(arr, (0, max_pixels - arr.shape[0])) for arr in image_arrays
    ])

    # PCA cannot produce more components than min(n_samples, n_features).
    pca = PCA(n_components=min(n_components, *padded_embeddings.shape))
    pca_embeddings = pca.fit_transform(padded_embeddings)
    return pca_embeddings, dataset

In [6]:
def compute_embeddings(dataset_dir, name, dataset_type, choice):
    """Load a dataset and compute a UMAP visualization plus brain metrics.

    Args:
        dataset_dir: Directory containing the images.
        name: Name for the FiftyOne dataset.
        dataset_type: Dataset type key understood by ``load_dataset``.
        choice: ``"pixel_embeddings"`` (PCA over raw pixels) or one of
            ``"resnet_embeddings"``, ``"clip_embeddings"``,
            ``"inception_embeddings"`` (model-based embeddings). The value is
            also used as the brain key of the visualization.

    Returns:
        The dataset, after visualization, uniqueness and representativeness
        have been computed.

    Raises:
        ValueError: If ``choice`` is not a supported embedding type. This is
            checked before the dataset is loaded.
    """
    zoo_models = {
        "resnet_embeddings": "resnet50-imagenet-torch",
        "clip_embeddings": "clip-vit-base32-torch",
        "inception_embeddings": "inception-v3-imagenet-torch",
    }

    # Fail fast: loading the dataset also computes metadata for every sample.
    if choice != "pixel_embeddings" and choice not in zoo_models:
        valid = ["pixel_embeddings", *zoo_models]
        raise ValueError(f"Invalid choice {choice!r}; expected one of {valid}")

    if choice == "pixel_embeddings":
        embeddings, dataset = compute_pca_embeddings(dataset_dir, name, dataset_type)
        fob.compute_visualization(
            dataset,
            embeddings=embeddings,
            num_dims=2,
            method="umap",
            brain_key=choice,
            verbose=True,
            seed=42,
        )
    else:
        dataset = load_dataset(dataset_dir, name, dataset_type)
        # The visualization call stores the model embeddings in the sample
        # field named `choice`; the metrics below reuse that field by name.
        embeddings = choice
        fob.compute_visualization(
            dataset,
            model=zoo_models[choice],
            embeddings=choice,
            method="umap",
            brain_key=choice,
            batch_size=10,
            seed=42,  # same seed as the pixel branch, for reproducible layouts
        )

    # For pixel embeddings the PCA matrix exists only in memory (it is never
    # written to a sample field), so the array itself must be passed here
    # rather than the field name.
    fob.compute_uniqueness(dataset, embeddings=embeddings)
    fob.compute_representativeness(dataset, embeddings=embeddings)
    return dataset

### Connect to your Google Drive if needed

In [7]:
# Mount Google Drive only when running on Google Colab. Elsewhere the
# google.colab package is not installed, so the mount is skipped and
# dataset_dir should point at a local folder instead.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# --- Run configuration: edit these values before running the cells below ---

# Folder that holds the images, e.g. "/path/to/your/images".
dataset_dir = "/content/drive/MyDrive/Infeed_data"
dataset_name = "my_dataset"
# One of the keys accepted by load_dataset: "ImageDirectory" | "ImageDirectoryTree".
dataset_type = "ImageDirectoryTree"
# One of: pixel | resnet | clip | inception.
embedding_type = "resnet"

# Short embedding name -> the `choice` string expected by compute_embeddings.
embed_dict = {
    kind: f"{kind}_embeddings"
    for kind in ("pixel", "resnet", "clip", "inception")
}

In [None]:
# Run embedding computation for the embedding type selected in the config cell.
dataset = compute_embeddings(dataset_dir, dataset_name, dataset_type, embed_dict[embedding_type])
# Snapshot the file paths now, so samples removed later in the App can be
# detected. values() reads only the filepath field instead of loading every
# sample object.
original_filepaths = set(dataset.values("filepath"))

 100% |███████████████| 2040/2040 [1.3s elapsed, 0s remaining, 1.6K samples/s]         


INFO:eta.core.utils: 100% |███████████████| 2040/2040 [1.3s elapsed, 0s remaining, 1.6K samples/s]         


Computing metadata...


INFO:fiftyone.core.metadata:Computing metadata...


 100% |███████████████| 2040/2040 [4.0m elapsed, 0s remaining, 5.1 samples/s]       


INFO:eta.core.utils: 100% |███████████████| 2040/2040 [4.0m elapsed, 0s remaining, 5.1 samples/s]       


Computing embeddings...


INFO:fiftyone.brain.internal.core.utils:Computing embeddings...


Model does not support batching




   6% |\--------------|  116/2040 [53.8s elapsed, 15.3m remaining, 1.4 samples/s]   

In [None]:
# Launch the FiftyOne App on `dataset`, served on address 0.0.0.0.
# The returned session object is kept in `session`.
session = fo.launch_app(dataset, address="0.0.0.0")

In [None]:
# Run this manually after reviewing (and removing samples) in the FiftyOne App.
# WARNING: track_deleted_images() calls os.remove() on every path in
# original_filepaths that is no longer in the dataset, i.e. the image files
# themselves are deleted from disk.
track_deleted_images(dataset, original_filepaths)