# Testing Cuda and installing packages 

In [1]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is correctly set up

True


# Importing packages and downloading/cleaning data

In [2]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import requests
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import torch
from sklearn.metrics.pairwise import cosine_similarity
import open_clip
from sklearn.model_selection import train_test_split
import numpy as np
import  unicom
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from torchvision import transforms
import torch.multiprocessing as mp
import faiss
import gc
from image_utils import worker_function, get_image_embeddings
#from embedding_utils import convert_to_tensors, mean_pool_embeddings, normalize_embeddings, worker_function


## Downloading Image

In [None]:
# Step 1: Download the metadata CSV if it doesn't already exist
def download_metadata_csv(metadata_url, metadata_file_path):
    if not os.path.exists(metadata_file_path):
        response = requests.get(metadata_url)
        if response.status_code == 200:
            with open(metadata_file_path, 'wb') as f:
                f.write(response.content)
            print(f"Metadata CSV file saved as {metadata_file_path}")
        else:
            print(f"Failed to download the metadata file. Status code: {response.status_code}")
            return
    else:
        print(f"Metadata file already exists at {metadata_file_path}")

# Step 2: Convert the metadata CSV file to DataFrame
def convert_metadata_to_dataframe(metadata_file_path):
    df = pd.read_csv(metadata_file_path)
    df.rename(columns={"fileNameAsDelivered": "image_filename", "scientificName": "scientificName"}, inplace=True)
    return df

# Helper function to download a single image
def download_single_image(row, chunk_base_url, chunk_count, output_dir):
    image_filename = row['image_filename']
    image_downloaded = False
    for chunk_index in range(chunk_count):
        image_url = f"{chunk_base_url}chunk_{chunk_index}/{image_filename}"
        response = requests.get(image_url)
        if response.status_code == 200:
            image_path = os.path.join(output_dir, image_filename)
            with open(image_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded and saved: {image_filename} from chunk_{chunk_index}")
            image_downloaded = True
            break
        else:
            print(f"Image not found in chunk_{chunk_index}: {image_filename}")
    if not image_downloaded:
        print(f"Failed to download {image_filename} from all chunks.")

# Step 3: Download images from the specified chunks using multithreading
def download_images(df, chunk_base_url, chunk_count, output_dir, max_threads=5):
    os.makedirs(output_dir, exist_ok=True)
    
    # Create a ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Submit download tasks for each row
        futures = [executor.submit(download_single_image, row, chunk_base_url, chunk_count, output_dir) for _, row in df.iterrows()]
        
        # Wait for all the futures to complete
        for future in futures:
            future.result()  # This will raise an exception if the thread encountered any issues

# Step 4: Filter existing images and remove missing records
def filter_existing_images_with_scientific_name(df, output_dir):
    df['file_exists'] = df['image_filename'].apply(lambda x: os.path.exists(os.path.join(output_dir, x)))
    df_cleaned = df[df['file_exists']].copy()
    df_cleaned.drop(columns=['file_exists'], inplace=True)
    return df_cleaned

# Step 5: Save the DataFrame to CSV
def save_dataframe_to_csv(df, csv_file_path="cleaned_images_with_scientific_names.csv"):
    df.to_csv(csv_file_path, index=False)
    print(f"DataFrame saved to {csv_file_path}")

# Main execution
metadata_url = "https://huggingface.co/datasets/sammarfy/VLM4Bio/resolve/main/datasets/Bird/metadata/metadata_10k.csv"
metadata_file_path = "metadata_10k.csv"
output_dir = "downloaded_images"
chunk_base_url = "https://huggingface.co/datasets/sammarfy/VLM4Bio/resolve/main/datasets/Bird/"
chunk_count = 5  # We have chunk_0 to chunk_4

# Execute steps
download_metadata_csv(metadata_url, metadata_file_path)
df = convert_metadata_to_dataframe(metadata_file_path)
download_images(df, chunk_base_url, chunk_count, output_dir, max_threads=10)  # Set max_threads to the desired number
df_cleaned = filter_existing_images_with_scientific_name(df, output_dir)
save_dataframe_to_csv(df_cleaned, "cleaned_images_with_scientific_names.csv")

# Get unique labels from df["scientific_name"]
labels = df_cleaned["scientificName"].unique().tolist()


#combine chunk 0-4
# multi-threading

# Loading Data from CSV

In [None]:
df_cleaned = pd.read_csv("cleaned_images_with_scientific_names.csv")

# Extract Genus and then Split data on genus column

In [None]:
# Function to extract the genus from the scientific name (first part before space)
def extract_genus(scientific_name):
    return scientific_name.split()[0]

def split_create_genus():
    # Apply genus extraction to df_cleaned
    global train_df, test_df
    df_cleaned['genus'] = df_cleaned['scientificName'].apply(extract_genus)

    # Count the number of samples per genus
    genus_counts = df_cleaned['genus'].value_counts()

    # Splitting the dataframe into training and test sets
    train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])

    # Checking the resulting shapes of the train and test sets
    print(f"Training set size: {train_df.shape}")
    print(f"Test set size: {test_df.shape}")
    
split_create_genus()


In [None]:
train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])


# Zero Shot test, Split data into testing (70:30), Average Pooling (Mean of Embeddings) for Florence Only, Getting highest Score, Checking Accuracy

# Florence-2 Large 

In [7]:
# Set up device and torch dtype
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load Florence-2 Large model and processor
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
model = model.eval()
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [8]:
# Class for handling image loading
class ImageDataset(Dataset):
    def __init__(self, image_filenames, image_folder):
        self.image_filenames = image_filenames
        self.image_folder = image_folder
        
    def __len__(self):
        return len(self.image_filenames)
    
    def __getitem__(self, idx):
        image_filename = self.image_filenames[idx]
        image_path = os.path.join(self.image_folder, image_filename)
        try:
            image = Image.open(image_path).convert('RGB')
            return image, image_filename
        except Exception as e:
            print(f"Error in processing image {image_filename}: {e}")
            return None, image_filename

# Custom collate function for DataLoader
def collate_fn(batch):
    images, filenames = zip(*batch)
    # Filter out None images from the batch
    valid_images = [img for img in images if img is not None]
    valid_filenames = [fname for img, fname in zip(images, filenames) if img is not None]

    if len(valid_images) > 0:
        # Pass the raw images directly to the processor here (on CPU)
        inputs = processor(images=valid_images, return_tensors="pt")
        # Ensure the input tensor is in float16 if the model expects it
        inputs = {k: v.to(dtype=torch.float16) if model.dtype == torch.float16 else v for k, v in inputs.items()}
        return inputs, valid_filenames
    else:
        return None, valid_filenames

# Function to generate image embeddings with tqdm progress bar and dtype correction
def image_embedding(df, batch_size=16, num_workers=3):  # You can use 3 workers now
    image_filenames = df['image_filename'].tolist()
    image_folder = "downloaded_images"
    
    # Create dataset
    dataset = ImageDataset(image_filenames, image_folder)
    
    # Create DataLoader with the custom collate function
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn)
    
    results = []
    filenames = []
    
    # Initialize tqdm progress bar
    with tqdm(total=len(dataloader), desc="Processing Batches", unit="batch") as pbar:
        # Process using DataLoader
        for inputs, batch_filenames in dataloader:
            if inputs is not None:
                with torch.inference_mode():
                    # Move the inputs to the GPU and ensure they match the model's precision
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    # Ensure the embeddings computation uses consistent dtype
                    embeddings = model._encode_image(inputs["pixel_values"]).cpu().numpy()
                    results.append(embeddings)
                    filenames.extend(batch_filenames)
            pbar.update(1)  # Update progress bar after processing each batch
                
    # Flatten the list of results and store in df 
    if results:
        embeddings = np.vstack(results)
        df['image_embeddings'] = list(embeddings)
    return df

 

In [9]:
# Ensure this block is only for execution, after moving class/function definitions out
if __name__ == "__main__":
    df = df_cleaned  # Replace with your actual dataframe
    df = image_embedding(df)

Processing Batches:   0%|          | 0/694 [00:00<?, ?batch/s]You are using Florence-2 without a text prompt.
You are using Florence-2 without a text prompt.
You are using Florence-2 without a text prompt.
Processing Batches: 100%|██████████| 694/694 [03:49<00:00,  3.03batch/s]


In [10]:
train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])

In [11]:
del model

In [5]:
train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])


# Creating embeddings, normalizing L2, Builiding index

In [12]:
df_cleaned.to_pickle("df_cleaned.pkl")

In [3]:
df_cleaned = pd.read_pickle("df_cleaned.pkl")

# Inspiration frread_pickle code

In [None]:


gc.collect()


train_embeddings_tensor = torch.tensor(np.stack(train_df['image_embeddings'].values), device='cuda')
test_embeddings_tensor = torch.tensor(np.stack(test_df['image_embeddings'].values), device='cuda')

# Mean pool 
train_embeddings_mean = train_embeddings_tensor.mean(dim=1)  
test_embeddings_mean = test_embeddings_tensor.mean(dim=1)    

# Normalize 
train_embeddings_normalized = train_embeddings_mean / train_embeddings_mean.norm(dim=1, keepdim=True)
test_embeddings_normalized = test_embeddings_mean / test_embeddings_mean.norm(dim=1, keepdim=True)

def find_best_match(test_embedding, train_embeddings_normalized, train_labels):
    cosine_similarities = torch.matmul(test_embedding, train_embeddings_normalized.T)  # Shape [1, N]

    max_similarity_index = torch.argmax(cosine_similarities).item()

    return train_labels[max_similarity_index], cosine_similarities.max().item()

def apply_best_match(test_embeddings_normalized, train_embeddings_normalized, train_df, label_column):
    train_labels = train_df[label_column].tolist()
    results = []

    for test_embedding in tqdm(test_embeddings_normalized, desc="Processing Test Embeddings"):
        test_embedding = test_embedding.unsqueeze(0)
        result = find_best_match(test_embedding, train_embeddings_normalized, train_labels)
        results.append(result)

    results_df = pd.DataFrame(results, columns=['highest_individual_name', 'highest_individual_score'])

    results_df['true_label'] = test_df[label_column].tolist()

    accuracy_individual = np.mean(results_df['highest_individual_name'] == results_df['true_label'])

    print(f"Accuracy based on highest individual cosine similarity: {accuracy_individual * 100:.2f}%")

    return results_df, accuracy_individual

label_column = 'genus'  
results_df, accuracy_individual = apply_best_match(
    test_embeddings_normalized, train_embeddings_normalized, train_df, label_column
)


### Minor Changes to the code for better float16 conversion, avoiding division with 0 , cuda flushing, 

In [6]:


gc.collect()


print("Step 1: Converting train and test embeddings to torch tensors...")
train_embeddings_tensor = torch.tensor(
    np.stack(train_df['image_embeddings'].values), device='cuda', dtype=torch.float16
)
test_embeddings_tensor = torch.tensor(
    np.stack(test_df['image_embeddings'].values), device='cuda', dtype=torch.float16
)

print("Step 2: Mean pooling the embeddings...")
train_embeddings_mean = train_embeddings_tensor.mean(dim=1) 
test_embeddings_mean = test_embeddings_tensor.mean(dim=1)    

print("Step 3: Normalizing the embeddings...")
epsilon = 1e-8
train_embeddings_normalized = train_embeddings_mean / (train_embeddings_mean.norm(dim=1, keepdim=True) + epsilon)
test_embeddings_normalized = test_embeddings_mean / (test_embeddings_mean.norm(dim=1, keepdim=True) + epsilon)

def apply_best_match_with_batches(test_embeddings_normalized, train_embeddings_normalized, train_df, label_column, batch_size=512):
    train_labels = train_df[label_column].tolist()
    num_test_samples = test_embeddings_normalized.size(0)
    results = []

    for start_idx in tqdm(range(0, num_test_samples, batch_size), desc="Processing Test Embeddings in Batches"):
        print(f"Processing batch from index {start_idx} to {min(start_idx + batch_size, num_test_samples)}...")
        end_idx = min(start_idx + batch_size, num_test_samples)
        batch_embeddings = test_embeddings_normalized[start_idx:end_idx]

        print("Calculating cosine similarities...")
        cosine_similarities = torch.matmul(batch_embeddings, train_embeddings_normalized.T)

        print("Finding highest similarity indices...")
        max_similarity_indices = torch.argmax(cosine_similarities, dim=1)
        max_similarity_values = torch.max(cosine_similarities, dim=1).values

        print("Appending results...")
        for idx, sim_value in zip(max_similarity_indices, max_similarity_values):
            results.append((train_labels[idx.item()], sim_value.item()))

        # Manually clearing CUDA memory cache only
        print("Clearing CUDA memory...")
        del cosine_similarities
        torch.cuda.empty_cache()

    print("Creating results DataFrame...")
    results_df = pd.DataFrame(results, columns=['highest_individual_name', 'highest_individual_score'])
    results_df['true_label'] = test_df[label_column].tolist()

    print("Calculating accuracy...")
    accuracy_individual = (results_df['highest_individual_name'].astype(str) == results_df['true_label'].astype(str)).mean()

    print(f"Accuracy based on highest individual cosine similarity: {accuracy_individual * 100:.2f}%")

    return results_df, accuracy_individual

print("Starting batch processing for predictions...")
label_column = 'scientificName' 
results_df, accuracy_individual = apply_best_match_with_batches(
    test_embeddings_normalized, train_embeddings_normalized, train_df, label_column
)

Step 1: Converting train and test embeddings to torch tensors...
Step 2: Mean pooling the embeddings...
Step 3: Normalizing the embeddings...
Starting batch processing for predictions...


Processing Test Embeddings in Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Processing batch from index 0 to 512...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...


Processing Test Embeddings in Batches: 100%|██████████| 8/8 [00:07<00:00,  1.05it/s]

Clearing CUDA memory...
Processing batch from index 512 to 1024...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 1024 to 1536...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 1536 to 2048...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 2048 to 2560...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 2560 to 3072...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 3072 to 3584...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Proce




tensor([[-0.0030, -0.0104,  0.0348,  ..., -0.0025, -0.0132, -0.0383],
        [-0.0163, -0.0357,  0.0053,  ...,  0.0172, -0.0041,  0.0042],
        [-0.0548,  0.0020,  0.0155,  ...,  0.0040, -0.0031, -0.0182],
        ...,
        [ 0.0287,  0.0106,  0.0475,  ..., -0.0117,  0.0215, -0.0833],
        [ 0.0435,  0.0472, -0.0168,  ..., -0.0033,  0.0178, -0.0201],
        [-0.0067,  0.0365,  0.0133,  ..., -0.0101,  0.0015, -0.0598]],
       device='cuda:0')

tensor(1.0000, device='cuda:0')


# Clip 

In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms('ViT-H-14-378-quickgelu', pretrained='dfn5b')
model.eval()  # Ensure the model is in evaluation mode
# Ensure the model is on the same device as the inputs
model = model.to(device)

tokenizer = open_clip.get_tokenizer('ViT-H-14-378-quickgelu')

In [None]:
def run_multiprocessing(df_cleaned, preprocess, model, device, num_workers=4):
    mp.set_start_method('spawn', force=True)

    image_filenames = df_cleaned['image_filename'].tolist()
    num_images = len(image_filenames)
    result_list = mp.Manager().list([None] * num_images)
    progress_counter = mp.Value('i', 0)  # Shared counter for tracking progress

    batch_size = num_images // num_workers

    processes = []
    for worker_id in range(num_workers):
        start_idx = worker_id * batch_size
        end_idx = (worker_id + 1) * batch_size if worker_id < num_workers - 1 else num_images
        process = mp.Process(target=worker_function, args=(image_filenames, start_idx, end_idx, result_list, preprocess, model, device, progress_counter))
        processes.append(process)
        process.start()

    with tqdm(total=num_images, desc="Getting Image Embeddings") as pbar:
        while any(p.is_alive() for p in processes):
            with progress_counter.get_lock():
                pbar.n = progress_counter.value
            pbar.refresh()

    for process in processes:
        process.join()

    df_cleaned['image_embeddings'] = list(result_list)

In [None]:


# Running the multiprocessing function in Jupyter
device = "cuda" if torch.cuda.is_available() else "cpu"
run_multiprocessing(df_cleaned, preprocess, model, device, num_workers=8)

Getting Image Embeddings:  97%|█████████▋| 10788/11092 [12:07<00:20, 14.83it/s]     

In [32]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import requests
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import torch
from sklearn.metrics.pairwise import cosine_similarity
#import open_clip
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
#import  unicom
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from torchvision import transforms
import torch.multiprocessing as mp
import faiss
# Function to extract the genus from the scientific name (first part before space)
df_cleaned = pd.read_pickle('df_cleaned.pkl')
def extract_genus(scientific_name):
    return scientific_name.split()[0]

def split_create_genus():
    # Apply genus extraction to df_cleaned
    global train_df, test_df
    df_cleaned['genus'] = df_cleaned['scientificName'].apply(extract_genus)

    # Count the number of samples per genus
    genus_counts = df_cleaned['genus'].value_counts()

    # Splitting the dataframe into training and test sets
    train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])

    # Checking the resulting shapes of the train and test sets
    print(f"Training set size: {train_df.shape}")
    print(f"Test set size: {test_df.shape}")
    
split_create_genus()
train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])



Training set size: (7431, 4)
Test set size: (3661, 4)


In [None]:
df_cleaned = df_cleaned.drop(columns=['image_embeddings'])


In [None]:
# Apply the function to each image in your dataframe and store embeddings in a new column
df_cleaned['image_embeddings'] = df_cleaned['image_filename'].apply(lambda x: get_image_embeddings(x))


In [None]:
split_create_genus()

apply_best_match(test_df, df_cleaned, 'genus')

In [1]:
def convert_to_torch_tensors(train_df, test_df, device='cuda', dtype=torch.float16):
    print("Step 1: Converting train and test embeddings to torch tensors...")
    train_embeddings_tensor = torch.tensor(
        np.stack(train_df['image_embeddings'].values), device=device, dtype=dtype
    )
    test_embeddings_tensor = torch.tensor(
        np.stack(test_df['image_embeddings'].values), device=device, dtype=dtype
    )
    return train_embeddings_tensor, test_embeddings_tensor

NameError: name 'torch' is not defined

In [33]:
def reshape_embeddings(train_embeddings_tensor, test_embeddings_tensor):
    print("Step 2: Reshaping the embeddings...")
    train_embeddings_reshaped = train_embeddings_tensor.view(train_embeddings_tensor.size(0), -1)
    test_embeddings_reshaped = test_embeddings_tensor.view(test_embeddings_tensor.size(0), -1)
    return train_embeddings_reshaped, test_embeddings_reshaped

train_embeddings_tensor, test_embeddings_tensor = convert_to_torch_tensors(train_df, test_df)

train_embeddings_reshaped, test_embeddings_reshaped = reshape_embeddings(train_embeddings_tensor, test_embeddings_tensor)

epsilon = 1e-8
train_embeddings_normalized = train_embeddings_reshaped / (train_embeddings_reshaped.norm(dim=1, keepdim=True) + epsilon)
test_embeddings_normalized = test_embeddings_reshaped / (test_embeddings_reshaped.norm(dim=1, keepdim=True) + epsilon)

def apply_best_match_with_batches(test_embeddings_normalized, train_embeddings_normalized, train_df, label_column, batch_size=512):
    train_labels = train_df[label_column].tolist()
    num_test_samples = test_embeddings_normalized.size(0)
    results = []

    for start_idx in tqdm(range(0, num_test_samples, batch_size), desc="Processing Test Embeddings in Batches"):
        print(f"Processing batch from index {start_idx} to {min(start_idx + batch_size, num_test_samples)}...")
        end_idx = min(start_idx + batch_size, num_test_samples)
        batch_embeddings = test_embeddings_normalized[start_idx:end_idx]

        print("Calculating cosine similarities...")
        cosine_similarities = torch.matmul(batch_embeddings, train_embeddings_normalized.T)

        print("Finding highest similarity indices...")
        max_similarity_indices = torch.argmax(cosine_similarities, dim=1)
        max_similarity_values = torch.max(cosine_similarities, dim=1).values

        print("Appending results...")
        for idx, sim_value in zip(max_similarity_indices, max_similarity_values):
            results.append((train_labels[idx.item()], sim_value.item()))

        print("Clearing CUDA memory...")
        del cosine_similarities
        torch.cuda.empty_cache()

    print("Creating results DataFrame...")
    results_df = pd.DataFrame(results, columns=['highest_individual_name', 'highest_individual_score'])
    results_df['true_label'] = test_df[label_column].tolist()

    print("Calculating accuracy...")
    accuracy_individual = (results_df['highest_individual_name'].astype(str) == results_df['true_label'].astype(str)).mean()

    print(f"Accuracy based on highest individual cosine similarity: {accuracy_individual * 100:.2f}%")

    return results_df, accuracy_individual

print("Starting batch processing for predictions...")
label_column = 'scientificName'
results_df, accuracy_individual = apply_best_match_with_batches(
    test_embeddings_normalized, train_embeddings_normalized, train_df, label_column
)

Step 1: Converting train and test embeddings to torch tensors...
Step 2: Reshaping the embeddings...
Starting batch processing for predictions...


Processing Test Embeddings in Batches: 100%|██████████| 8/8 [00:00<00:00, 82.95it/s]

Processing batch from index 0 to 512...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 512 to 1024...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 1024 to 1536...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 1536 to 2048...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 2048 to 2560...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 2560 to 3072...
Calculating cosine similarities...
Finding highest similarity indices...
Appending results...
Clearing CUDA memory...
Processing batch from index 3072 




# Bio Clip

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
model.eval()
model = model.to(device)
tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip')


In [29]:
df_cleaned = df_cleaned.drop(columns=['image_embeddings'])


In [17]:
# Running the multiprocessing function in Jupyter
device = "cuda" if torch.cuda.is_available() else "cpu"
run_multiprocessing(df_cleaned, preprocess, model, device, num_workers=8)

Process Process-6:
Traceback (most recent call last):
  File "/blue/arthur.porto-biocosmos/ahmed.waseem/.conda/envs/bvl/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/blue/arthur.porto-biocosmos/ahmed.waseem/.conda/envs/bvl/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ahmed.waseem/Bio_vision_lab/embedding_utils.py", line 25, in worker_function
    embedding = get_image_embeddings(image_filenames[idx], preprocess, model, device)
                ^^^^^^^^^^^^^^^^^^^^
NameError: name 'get_image_embeddings' is not defined
Process Process-7:
Traceback (most recent call last):
  File "/blue/arthur.porto-biocosmos/ahmed.waseem/.conda/envs/bvl/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/blue/arthur.porto-biocosmos/ahmed.waseem/.conda/envs/bvl/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._ar

In [18]:
#split_create_genus()
train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])


In [19]:
def convert_to_torch_tensors(train_df, test_df, device='cuda', dtype=torch.float16):
    print("Step 1: Converting train and test embeddings to torch tensors...")
    train_embeddings_tensor = torch.tensor(
        np.stack(train_df['image_embeddings'].values), device=device, dtype=dtype
    )
    test_embeddings_tensor = torch.tensor(
        np.stack(test_df['image_embeddings'].values), device=device, dtype=dtype
    )
    return train_embeddings_tensor, test_embeddings_tensor

def reshape_embeddings(train_embeddings_tensor, test_embeddings_tensor):
    print("Step 2: Reshaping the embeddings...")
    train_embeddings_reshaped = train_embeddings_tensor.view(train_embeddings_tensor.size(0), -1)
    test_embeddings_reshaped = test_embeddings_tensor.view(test_embeddings_tensor.size(0), -1)
    return train_embeddings_reshaped, test_embeddings_reshaped

train_embeddings_tensor, test_embeddings_tensor = convert_to_torch_tensors(train_df, test_df)

train_embeddings_reshaped, test_embeddings_reshaped = reshape_embeddings(train_embeddings_tensor, test_embeddings_tensor)

epsilon = 1e-8
train_embeddings_normalized = train_embeddings_reshaped / (train_embeddings_reshaped.norm(dim=1, keepdim=True) + epsilon)
test_embeddings_normalized = test_embeddings_reshaped / (test_embeddings_reshaped.norm(dim=1, keepdim=True) + epsilon)

def apply_best_match_with_batches(test_embeddings_normalized, train_embeddings_normalized, train_df, label_column, batch_size=512):
    train_labels = train_df[label_column].tolist()
    num_test_samples = test_embeddings_normalized.size(0)
    results = []

    for start_idx in tqdm(range(0, num_test_samples, batch_size), desc="Processing Test Embeddings in Batches"):
        print(f"Processing batch from index {start_idx} to {min(start_idx + batch_size, num_test_samples)}...")
        end_idx = min(start_idx + batch_size, num_test_samples)
        batch_embeddings = test_embeddings_normalized[start_idx:end_idx]

        print("Calculating cosine similarities...")
        cosine_similarities = torch.matmul(batch_embeddings, train_embeddings_normalized.T)

        print("Finding highest similarity indices...")
        max_similarity_indices = torch.argmax(cosine_similarities, dim=1)
        max_similarity_values = torch.max(cosine_similarities, dim=1).values

        print("Appending results...")
        for idx, sim_value in zip(max_similarity_indices, max_similarity_values):
            results.append((train_labels[idx.item()], sim_value.item()))

        print("Clearing CUDA memory...")
        del cosine_similarities
        torch.cuda.empty_cache()

    print("Creating results DataFrame...")
    results_df = pd.DataFrame(results, columns=['highest_individual_name', 'highest_individual_score'])
    results_df['true_label'] = test_df[label_column].tolist()

    print("Calculating accuracy...")
    accuracy_individual = (results_df['highest_individual_name'].astype(str) == results_df['true_label'].astype(str)).mean()

    print(f"Accuracy based on highest individual cosine similarity: {accuracy_individual * 100:.2f}%")

    return results_df, accuracy_individual

print("Starting batch processing for predictions...")
label_column = 'scientificName'
results_df, accuracy_individual = apply_best_match_with_batches(
    test_embeddings_normalized, train_embeddings_normalized, train_df, label_column
)

Step 1: Converting train and test embeddings to torch tensors...


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [20]:
test_df

Unnamed: 0,image_filename,scientificName,genus,image_embeddings
8039,Green_Tailed_Towhee_0027_154823.jpg,Pipilo chlorurus,Pipilo,
7275,Bank_Swallow_0053_129501.jpg,Riparia riparia,Riparia,
8632,Bay_Breasted_Warbler_0045_797135.jpg,Setophaga castanea,Setophaga,
9207,Kentucky_Warbler_0027_795917.jpg,Geothlypis formosa,Geothlypis,
629,Yellow_Headed_Blackbird_0083_8300.jpg,Xanthocephalus xanthocephalus,Xanthocephalus,
...,...,...,...,...
9772,Swainson_Warbler_0020_794863.jpg,Limnothlypis swainsonii,Limnothlypis,
1277,Red_Faced_Cormorant_0066_796333.jpg,Phalacrocorax urile,Phalacrocorax,
4007,Pomarine_Jaeger_0062_61351.jpg,Stercorarius pomarinus,Stercorarius,
7223,Cape_Glossy_Starling_0060_129222.jpg,Lamprotornis nitens,Lamprotornis,


# Unicom

In [None]:
import unicom
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Step 1: Load the UNICOM model
model_name = "ViT-L/14@336px"
model, preprocess = unicom.load(model_name)
model.eval()
model = model.to(device)


In [None]:
df_cleaned = df_cleaned.drop(columns=['image_embeddings'])


In [None]:
# Running the multiprocessing function in Jupyter
device = "cuda" if torch.cuda.is_available() else "cpu"
run_multiprocessing(df_cleaned, preprocess, model, device, num_workers=8)

In [23]:
def get_image_embeddings(image_filename):
    # Folder where images are stored
    image_folder = "downloaded_images"
    
    try:
        image_path = os.path.join(image_folder, image_filename)
        
        # Open the image
        image = Image.open(image_path)
        
        # Prepare the image using the CLIP processor (preprocessing)
        inputs = preprocess(image).unsqueeze(0).to(device)
        
        with torch.no_grad():  # Avoid computing gradients since we are not training
            image_embeddings = model(inputs)
        
        # Convert embeddings to a CPU tensor (to avoid keeping them on GPU)
        return image_embeddings.cpu().numpy()
    
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None  # Return None if the image could not be processed

In [24]:
df_cleaned['image_embeddings'] = df_cleaned['image_filename'].apply(lambda x: get_image_embeddings(x))


KeyboardInterrupt: 

In [None]:
split_create_genus()
train_df, test_df = train_test_split(df_cleaned, test_size=0.33, random_state=432, stratify=df_cleaned['genus'])


In [None]:
def convert_to_torch_tensors(train_df, test_df, device='cuda', dtype=torch.float16):
    print("Step 1: Converting train and test embeddings to torch tensors...")
    train_embeddings_tensor = torch.tensor(
        np.stack(train_df['image_embeddings'].values), device=device, dtype=dtype
    )
    test_embeddings_tensor = torch.tensor(
        np.stack(test_df['image_embeddings'].values), device=device, dtype=dtype
    )
    return train_embeddings_tensor, test_embeddings_tensor

def reshape_embeddings(train_embeddings_tensor, test_embeddings_tensor):
    print("Step 2: Reshaping the embeddings...")
    train_embeddings_reshaped = train_embeddings_tensor.view(train_embeddings_tensor.size(0), -1)
    test_embeddings_reshaped = test_embeddings_tensor.view(test_embeddings_tensor.size(0), -1)
    return train_embeddings_reshaped, test_embeddings_reshaped

train_embeddings_tensor, test_embeddings_tensor = convert_to_torch_tensors(train_df, test_df)

train_embeddings_reshaped, test_embeddings_reshaped = reshape_embeddings(train_embeddings_tensor, test_embeddings_tensor)

epsilon = 1e-8
train_embeddings_normalized = train_embeddings_reshaped / (train_embeddings_reshaped.norm(dim=1, keepdim=True) + epsilon)
test_embeddings_normalized = test_embeddings_reshaped / (test_embeddings_reshaped.norm(dim=1, keepdim=True) + epsilon)

def apply_best_match_with_batches(test_embeddings_normalized, train_embeddings_normalized, train_df, label_column, batch_size=512):
    train_labels = train_df[label_column].tolist()
    num_test_samples = test_embeddings_normalized.size(0)
    results = []

    for start_idx in tqdm(range(0, num_test_samples, batch_size), desc="Processing Test Embeddings in Batches"):
        print(f"Processing batch from index {start_idx} to {min(start_idx + batch_size, num_test_samples)}...")
        end_idx = min(start_idx + batch_size, num_test_samples)
        batch_embeddings = test_embeddings_normalized[start_idx:end_idx]

        print("Calculating cosine similarities...")
        cosine_similarities = torch.matmul(batch_embeddings, train_embeddings_normalized.T)

        print("Finding highest similarity indices...")
        max_similarity_indices = torch.argmax(cosine_similarities, dim=1)
        max_similarity_values = torch.max(cosine_similarities, dim=1).values

        print("Appending results...")
        for idx, sim_value in zip(max_similarity_indices, max_similarity_values):
            results.append((train_labels[idx.item()], sim_value.item()))

        print("Clearing CUDA memory...")
        del cosine_similarities
        torch.cuda.empty_cache()

    print("Creating results DataFrame...")
    results_df = pd.DataFrame(results, columns=['highest_individual_name', 'highest_individual_score'])
    results_df['true_label'] = test_df[label_column].tolist()

    print("Calculating accuracy...")
    accuracy_individual = (results_df['highest_individual_name'].astype(str) == results_df['true_label'].astype(str)).mean()

    print(f"Accuracy based on highest individual cosine similarity: {accuracy_individual * 100:.2f}%")

    return results_df, accuracy_individual

print("Starting batch processing for predictions...")
label_column = 'scientificName'
results_df, accuracy_individual = apply_best_match_with_batches(
    test_embeddings_normalized, train_embeddings_normalized, train_df, label_column
)

In [None]:
df = pd.read_csv("cleaned_images_with_scientific_names.csv")

In [None]:
def extract_genus(scientific_name):
    return scientific_name.split()[0]

df['genus'] = df['scientificName'].apply(extract_genus)