### Recognition Model: CLIP ViT Model Embedding Pipeline

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

# Accumulators filled by the per-image loop further down
embeddings, labels = [], []

# Folder that the image files named in the CSV are read from
image_dir = 'img_align_celeba'

# Attribute table: one row per image (a 'file_name' column plus the
# eye/hair/eyebrow/skin attribute columns used below)
file_path = 'celeba_features_final.csv'
df = pd.read_csv(file_path)

# Pretrained CLIP checkpoint: the preprocessing pipeline first, then the model
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# Mapping of categories to integer values for classification
def create_label_mapping(column):
    """Map each distinct value of a column to a stable integer class id.

    Ids are assigned in sorted order of the distinct non-missing values, so
    the same column contents always yield the same mapping.

    Args:
        column: pandas Series holding the category values.

    Returns:
        dict mapping every distinct non-missing value to its 0-based index.
        Missing values (NaN/None) get no entry, so a caller that looks values
        up with ``.get(value, -1)`` receives -1 for them.
    """
    # Drop missing values first: sorted() cannot order NaN/None against
    # strings and raises TypeError on a column that has gaps.
    categories = sorted(column.dropna().unique())
    return {label: idx for idx, label in enumerate(categories)}

# One category -> integer mapping per attribute column, built in a single pass
# (order of the targets matches the order of the column names)
eye_color_mapping, hair_color_mapping, eyebrow_color_mapping, skin_tone_mapping = (
    create_label_mapping(df[attribute])
    for attribute in ('eye_color', 'hair_color', 'eyebrow_color', 'skin_tone')
)

# File name of every image that was embedded successfully, in the same order
# as `embeddings`/`labels`. Rows whose image fails are skipped, so without
# this the saved arrays could not be matched back to CSV rows.
file_names = []

# Walk the CSV rows and embed the image each one names
for _, row in tqdm(df.iterrows(), total=len(df)):
    image_path = os.path.join(image_dir, row['file_name'])
    try:
        # Open inside a context manager so the file handle is released once
        # preprocessing is done instead of lingering for every image visited.
        with Image.open(image_path) as image:
            inputs = processor(images=image, return_tensors="pt", padding=True)

        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            image_embeddings = model.get_image_features(**inputs)
        embedding = image_embeddings.cpu().numpy()

        # Convert the four attribute values to integer ids; values without a
        # mapping entry become -1
        row_labels = [
            eye_color_mapping.get(row['eye_color'], -1),
            hair_color_mapping.get(row['hair_color'], -1),
            eyebrow_color_mapping.get(row['eyebrow_color'], -1),
            skin_tone_mapping.get(row['skin_tone'], -1),
        ]
    except Exception as e:
        # Best effort: report the failing image and move on to the next row
        print(f"Error processing {row['file_name']}: {e}")
    else:
        # Append only after the whole row succeeded so the three lists always
        # stay the same length and in the same order.
        embeddings.append(embedding)
        labels.append(row_labels)
        file_names.append(row['file_name'])

# Convert the collected results to numpy arrays.
# NOTE(review): each embedding is stored exactly as returned by the model
# (presumably with a leading batch axis of 1), so the stacked array keeps that
# axis -- confirm with downstream consumers before reshaping.
embeddings = np.array(embeddings)
labels = np.array(labels)
# dtype=str keeps this a plain unicode array (loadable without pickle) even
# when no image was processed.
file_names = np.array(file_names, dtype=str)

# Save embeddings, labels and the matching file names to one NPZ file
np.savez(
    'celeba_embeddings_with_all_labels.npz',
    embeddings=embeddings,
    labels=labels,
    file_names=file_names,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np

# Load the original CSV file the labels were derived from
csv_file_path = 'celeba_features_final.csv'
df = pd.read_csv(csv_file_path)

# Load the NPZ file and pull out the stored arrays
data = np.load('celeba_embeddings_with_all_labels.npz')
embeddings = data['embeddings']
npz_labels = data['labels']

# Rebuild the category -> integer mappings from the CSV columns (ids follow the
# sorted order of each column's distinct values).
# Missing values are dropped before sorting: sorted() cannot order NaN/None
# against strings and would raise TypeError. They then have no mapping entry
# and fall through to the -1 default used in get_csv_labels.
eye_color_mapping, hair_color_mapping, eyebrow_color_mapping, skin_tone_mapping = (
    {label: idx for idx, label in enumerate(sorted(df[attribute].dropna().unique()))}
    for attribute in ('eye_color', 'hair_color', 'eyebrow_color', 'skin_tone')
)

# Function to extract the correct label mappings from the CSV file
def get_csv_labels(row):
    """Return the four integer labels for one CSV row.

    Args:
        row: mapping-like row (e.g. a DataFrame row) with 'eye_color',
            'hair_color', 'eyebrow_color' and 'skin_tone' entries.

    Returns:
        list of four ints in the order [eye, hair, eyebrow, skin]; a value
        absent from its mapping is reported as -1.
    """
    # Each attribute column paired with the module-level mapping that encodes it
    lookups = (
        ('eye_color', eye_color_mapping),
        ('hair_color', hair_color_mapping),
        ('eyebrow_color', eyebrow_color_mapping),
        ('skin_tone', skin_tone_mapping),
    )
    return [mapping.get(row[column], -1) for column, mapping in lookups]

# Check if the labels from the NPZ file match the labels in the CSV file for the first few rows
sample_size = 5  # Number of samples to check
print("Comparing Labels from NPZ and CSV:")

# The check is positional (NPZ row i against CSV row i). The NPZ can hold
# fewer rows than the CSV when images were skipped while the embeddings were
# generated, in which case rows after the first skipped one no longer line up.
if len(npz_labels) != len(df):
    print(f"Warning: NPZ has {len(npz_labels)} label rows but CSV has {len(df)} rows; "
          "positional comparison may be misaligned.")

# Never index past the end of the shorter source
num_to_check = min(sample_size, len(df), len(npz_labels))

for i in range(num_to_check):
    csv_labels = get_csv_labels(df.iloc[i])
    npz_labels_sample = npz_labels[i]

    if np.array_equal(csv_labels, npz_labels_sample):
        print(f"Sample {i+1}: Labels match")
    else:
        print(f"Sample {i+1}: Labels do not match!")
        print(f"  CSV Labels: {csv_labels}")
        print(f"  NPZ Labels: {npz_labels_sample}")
    print("-" * 50)


Comparing Labels from NPZ and CSV:
Sample 1: Labels match
--------------------------------------------------
Sample 2: Labels match
--------------------------------------------------
Sample 3: Labels match
--------------------------------------------------
Sample 4: Labels match
--------------------------------------------------
Sample 5: Labels match
--------------------------------------------------
