In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Libraries

In [1]:
# Shell escape: install `transformers` (imported below for the CLIP model and
# processor) and `tensorboard` through uv's pip front end.
# NOTE(review): `tensorboard` is not referenced in the visible cells - confirm it is needed.
!uv pip install transformers tensorboard

[2mUsing Python 3.12.11 environment at: /usr[0m
[2mAudited [1m2 packages[0m [2min 287ms[0m[0m


# Import Libraries

In [35]:
import csv
import glob
import math
import os
import pickle

import cv2
import numpy as np
import torch
import tqdm
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Download Data

In [3]:
import kagglehub

# Fetch the latest published version of the natural-images dataset from Kaggle.
# kagglehub hands back the local directory that holds the extracted files.
dataset_handle = "prasunroy/natural-images"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/prasunroy/natural-images?dataset_version_number=1...


100%|██████████| 342M/342M [00:02<00:00, 167MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/prasunroy/natural-images/versions/1


In [5]:
import shutil

# Reuse the directory reported by kagglehub in the download cell instead of
# hard-coding the cache location: the literal path embeds the dataset version
# number and goes stale as soon as a newer version is downloaded.
source_path = path
destination_path = "./natural-images"  # or specify your desired destination

# Copy the directory tree. copytree creates the destination itself, and
# dirs_exist_ok=True keeps the cell safe to re-run, so no separate
# os.makedirs guard is needed.
shutil.copytree(source_path, destination_path, dirs_exist_ok=True)

print(f"Dataset copied from {source_path} to {destination_path}")

# Drop the nested "data" folder from the copy.
# NOTE(review): presumably a duplicate of the images shipped in the archive - confirm.
directory_to_remove = os.path.join(destination_path, "data")

if os.path.exists(directory_to_remove):
    shutil.rmtree(directory_to_remove)
    print(f"Removed directory and its contents: {directory_to_remove}")
else:
    print(f"Directory not found: {directory_to_remove}")

Dataset copied from /root/.cache/kagglehub/datasets/prasunroy/natural-images/versions/1 to ./natural-images


# Load Model

In [4]:
# Load the pretrained CLIP model together with its matching processor.
# Both are created from the same checkpoint identifier.
clip_checkpoint = "openai/clip-vit-base-patch32"

print("start loading model")
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

start loading model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [36]:
def get_clip_embedding(image_path):
    """Compute the CLIP image features for a single image file.

    Relies on the notebook-level ``processor`` and ``model`` objects created
    in the model-loading cell.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The tensor produced by ``model.get_image_features`` for a batch that
        contains only this image. It is computed without gradient tracking.
    """
    # The context manager closes the underlying file once preprocessing is
    # done; otherwise each call leaves its file handle open until garbage
    # collection, which adds up when embedding thousands of images.
    with Image.open(image_path) as image:
        inputs = processor(images=image, return_tensors="pt")

    # Inference only: skip autograd bookkeeping so no graph is built or kept
    # alive for every image.
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    return outputs

# Glob pattern matching every file inside every class folder of the dataset.
base_path = "/content/natural-images/natural_images/*/*.*"
img_names = sorted(glob.glob(base_path))

# Fail loudly instead of silently pickling an empty dict when the pattern
# matches nothing (e.g. the dataset was copied to a different location).
if not img_names:
    raise FileNotFoundError(f"No image files matched pattern: {base_path}")

print("start embedding")
embeddings = {}
for img_name in tqdm.tqdm(img_names):
    # glob.glob already yields the complete path of each match, so it is used
    # directly. Joining it onto the glob pattern only worked by accident:
    # os.path.join discards earlier components when a later one is absolute.
    image_embedding = get_clip_embedding(img_name)
    # Convert to a NumPy array and drop the singleton batch dimension.
    image_embedding = image_embedding.detach().numpy().squeeze()
    embeddings[img_name] = image_embedding

# save data as a pickle file
with open('embeddings.pkl', 'wb') as file:
    pickle.dump(embeddings, file)

start embedding


100%|██████████| 6899/6899 [28:19<00:00,  4.06it/s]


# Visualization

In [55]:
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd
import os

# Requires the 'embeddings' dict (image path -> embedding vector) built by the
# embedding cell. Split it into parallel sequences of paths and vectors.
image_paths = list(embeddings.keys())
embeddings_array = np.array([embeddings[p] for p in image_paths])

# Project the embeddings onto their first three principal components
pca = PCA(n_components=3)
embeddings_pca = pca.fit_transform(embeddings_array)

# The parent folder name is the category; the file name identifies the image
path_parts = [os.path.split(p) for p in image_paths]
image_labels = [os.path.basename(folder) for folder, _ in path_parts]
image_names = [file_name for _, file_name in path_parts]

# Assemble the table Plotly will draw from
pca_columns = ['PCA Component 1', 'PCA Component 2', 'PCA Component 3']
df = pd.DataFrame(embeddings_pca, columns=pca_columns).assign(
    **{'Category': image_labels, 'Image Name': image_names}
)

# Interactive 3D scatter: one point per image, coloured by category
axis_x, axis_y, axis_z = pca_columns
fig = px.scatter_3d(
    df,
    x=axis_x,
    y=axis_y,
    z=axis_z,
    color='Category',
    hover_name='Image Name',
    title='Interactive 3D Visualization of Image Embeddings (PCA)',
)

fig.show()