# Testing CUDA and installing packages

In [1]:
# Sanity check: confirm PyTorch can see a CUDA device before any model is loaded
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is correctly set up

True


In [2]:
!pip install datasets transformers huggingface_hub supervision timm flash_attn sentence_transformers

Defaulting to user installation because normal site-packages is not writeable


# Importing packages and downloading/cleaning data

In [3]:
import os
import pandas as pd
import requests
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [12]:
# Step 1: Download the metadata CSV if it doesn't already exist
def download_metadata_csv(metadata_url, metadata_file_path, timeout=30):
    """Download the metadata CSV unless it is already present on disk.

    Args:
        metadata_url: URL the CSV is fetched from.
        metadata_file_path: Local path the CSV is written to.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        None. The outcome is reported with ``print``; on a non-200 response
        nothing is written to disk.
    """
    # Guard clause: never re-download a file we already have.
    if os.path.exists(metadata_file_path):
        print(f"Metadata file already exists at {metadata_file_path}")
        return

    response = requests.get(metadata_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download the metadata file. Status code: {response.status_code}")
        return

    with open(metadata_file_path, 'wb') as f:
        f.write(response.content)
    print(f"Metadata CSV file saved as {metadata_file_path}")

# Step 2: Convert the metadata CSV file to DataFrame
def convert_metadata_to_dataframe(metadata_file_path):
    """Load the metadata CSV and expose the delivered file name as ``image_filename``.

    Args:
        metadata_file_path: Path of the CSV to read.

    Returns:
        A DataFrame with ``fileNameAsDelivered`` renamed to ``image_filename``;
        all other columns keep their names.
    """
    column_map = {"fileNameAsDelivered": "image_filename", "scientificName": "scientificName"}
    metadata = pd.read_csv(metadata_file_path)
    return metadata.rename(columns=column_map)

# Step 3: Download images from the specified chunks
def download_images(df, chunk_base_url, chunk_count, output_dir, timeout=30, skip_existing=True):
    """Download every image listed in ``df['image_filename']`` into ``output_dir``.

    Each file is requested from ``{chunk_base_url}chunk_{i}/{image_filename}``
    for ``i`` in ``range(chunk_count)``; the first chunk answering HTTP 200 is
    saved and the remaining chunks are not tried.

    Args:
        df: DataFrame with an ``image_filename`` column.
        chunk_base_url: URL prefix that the ``chunk_{i}/`` directories hang off.
        chunk_count: Number of chunks to probe (``chunk_0`` .. ``chunk_{n-1}``).
        output_dir: Directory the images are written to (created if missing).
        timeout: Seconds to wait per request so a stalled connection cannot
            hang the whole loop.
        skip_existing: When true, files already present in ``output_dir`` are
            not downloaded again, mirroring the metadata download step and
            keeping a full re-run of the notebook cheap. Pass ``False`` to
            force a fresh download.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    os.makedirs(output_dir, exist_ok=True)

    for image_filename in df['image_filename']:
        image_path = os.path.join(output_dir, image_filename)

        if skip_existing and os.path.exists(image_path):
            print(f"Image already exists, skipping: {image_filename}")
            continue

        for chunk_index in range(chunk_count):
            image_url = f"{chunk_base_url}chunk_{chunk_index}/{image_filename}"
            response = requests.get(image_url, timeout=timeout)
            if response.status_code == 200:
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded and saved: {image_filename} from chunk_{chunk_index}")
                break
            print(f"Image not found in chunk_{chunk_index}: {image_filename}")
        else:
            # Loop ended without a break: no chunk served this file.
            print(f"Failed to download {image_filename} from all chunks.")

# Step 4: Filter existing images and remove missing records
def filter_existing_images_with_scientific_name(df, output_dir):
    """Keep only the rows whose image file is present in ``output_dir``.

    Args:
        df: DataFrame with an ``image_filename`` column. It is not modified;
            the existence check is kept in a local mask instead of being
            written into the caller's frame as a temporary column.
        output_dir: Directory the image files are looked up in.

    Returns:
        A new DataFrame (a copy, original index preserved) containing the rows
        of ``df`` whose ``image_filename`` exists under ``output_dir``.
    """
    exists_mask = df['image_filename'].map(
        lambda name: os.path.exists(os.path.join(output_dir, name))
    ).astype(bool)  # explicit bool keeps an empty frame usable as a boolean mask
    return df[exists_mask].copy()

# Step 5: Save the DataFrame to CSV
def save_dataframe_to_csv(df, csv_file_path="cleaned_images_with_scientific_names.csv"):
    """Write ``df`` to ``csv_file_path`` without the index column and report the path.

    Args:
        df: DataFrame to persist.
        csv_file_path: Destination CSV path.
    """
    df.to_csv(path_or_buf=csv_file_path, index=False)
    confirmation = f"DataFrame saved to {csv_file_path}"
    print(confirmation)

# Main execution: configuration for the Bird subset of the VLM4Bio dataset on Hugging Face
metadata_url = "https://huggingface.co/datasets/sammarfy/VLM4Bio/resolve/main/datasets/Bird/metadata/metadata_easy.csv"
metadata_file_path = "metadata_easy.csv"
output_dir = "downloaded_images"
chunk_base_url = "https://huggingface.co/datasets/sammarfy/VLM4Bio/resolve/main/datasets/Bird/"
chunk_count = 5  # We have chunk_0 to chunk_4

# Execute steps: fetch metadata -> load it -> fetch images -> keep only rows
# whose image is actually on disk -> persist the cleaned table
download_metadata_csv(metadata_url, metadata_file_path)
df = convert_metadata_to_dataframe(metadata_file_path)
download_images(df, chunk_base_url, chunk_count, output_dir)
df_cleaned = filter_existing_images_with_scientific_name(df, output_dir)
save_dataframe_to_csv(df_cleaned, "cleaned_images_with_scientific_names.csv")

# Get the unique labels from df_cleaned["scientificName"]
labels = df_cleaned["scientificName"].unique().tolist()



Metadata file already exists at metadata_easy.csv
Image not found in chunk_0: American_Crow_0053_25203.jpg
Downloaded and saved: American_Crow_0053_25203.jpg from chunk_1
Image not found in chunk_0: American_Crow_0101_25118.jpg
Image not found in chunk_1: American_Crow_0101_25118.jpg
Image not found in chunk_2: American_Crow_0101_25118.jpg
Downloaded and saved: American_Crow_0101_25118.jpg from chunk_3
Image not found in chunk_0: Fish_Crow_0058_25999.jpg
Image not found in chunk_1: Fish_Crow_0058_25999.jpg
Downloaded and saved: Fish_Crow_0058_25999.jpg from chunk_2
Downloaded and saved: Fish_Crow_0024_26064.jpg from chunk_0
Downloaded and saved: Fish_Crow_0023_26037.jpg from chunk_0
Image not found in chunk_0: Common_Raven_0095_101831.jpg
Image not found in chunk_1: Common_Raven_0095_101831.jpg
Image not found in chunk_2: Common_Raven_0095_101831.jpg
Downloaded and saved: Common_Raven_0095_101831.jpg from chunk_3
Image not found in chunk_0: White_Necked_Raven_0006_797347.jpg
Image not 

# Loading Model

In [13]:
# Checkpoint shared by the model and its processor
MODEL_ID = "microsoft/Florence-2-large"

# Pick the device once; half precision is only used when CUDA is available
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32

# Load Florence-2 Large in inference mode, plus the matching processor.
# trust_remote_code=True executes code shipped with the checkpoint repository.
model = (
    AutoModelForCausalLM
    .from_pretrained(MODEL_ID, torch_dtype=torch_dtype, trust_remote_code=True)
    .to(device)
    .eval()
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)




In [14]:
def get_image_embeddings(image_filename, image_folder="downloaded_images"):
    """Encode one image with the model's image encoder.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image_filename: File name of the image inside ``image_folder``.
        image_folder: Folder where the images are stored.

    Returns:
        The output of ``model._encode_image`` as a NumPy array on the CPU, or
        ``None`` if anything went wrong (the error is printed, not raised, so
        one bad image does not abort a whole ``DataFrame.apply`` run).
    """
    # Fallback so the error handler can always name the image, even when
    # building the path is itself what failed (e.g. a non-string filename).
    image_path = image_filename

    try:
        image_path = os.path.join(image_folder, image_filename)

        # The context manager releases the file handle; convert() loads the
        # pixel data first and normalises grayscale/RGBA files to 3-channel RGB.
        with Image.open(image_path) as image_file:
            image = image_file.convert("RGB")

        inputs = processor(images=image, return_tensors="pt")

        # Only pixel_values is consumed below, so only that tensor is moved to
        # the model's device and dtype (other processor outputs are left as-is
        # rather than being cast to a floating-point type).
        pixel_values = inputs["pixel_values"].to(device=device, dtype=model.dtype)

        # Get image embeddings using the private method; no gradients needed.
        with torch.no_grad():
            image_embeddings = model._encode_image(pixel_values)

        # Move to CPU so embeddings do not accumulate in GPU memory.
        return image_embeddings.cpu().numpy()

    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None  # Return None for failed image processing

# Embed every image listed in the dataframe and keep the result in a new column
image_filenames = df_cleaned['image_filename']
df_cleaned['image_embeddings'] = image_filenames.apply(get_image_embeddings)

# You can save the dataframe to a file, if needed
#df_cleaned.to_csv('image_embeddings.csv', index=False)

In [15]:
# Inspect the cleaned table, which now carries an image_embeddings column
df_cleaned

Unnamed: 0,image_filename,scientificName,image_embeddings
0,American_Crow_0053_25203.jpg,Corvus brachyrhynchos,"[[[0.3108, -0.769, 0.9146, -0.884, 0.8896, 1.6..."
1,American_Crow_0101_25118.jpg,Corvus brachyrhynchos,"[[[0.8584, -0.3435, 0.7407, -0.6094, 0.6064, 1..."
2,Fish_Crow_0058_25999.jpg,Corvus ossifragus,"[[[-0.01119, 0.0318, 0.7705, -0.8066, 0.441, 0..."
3,Fish_Crow_0024_26064.jpg,Corvus ossifragus,"[[[-0.07837, -0.3308, 0.3035, -0.7007, 0.522, ..."
4,Fish_Crow_0023_26037.jpg,Corvus ossifragus,"[[[0.5483, -0.2413, 1.162, -0.4883, 0.8955, 1...."
...,...,...,...
195,Harris_Sparrow_0074_116539.jpg,Zonotrichia querula,"[[[-0.2347, -0.4468, 1.45, -0.02164, -0.0744, ..."
196,Harris_Sparrow_0039_116409.jpg,Zonotrichia querula,"[[[-0.5034, -0.4377, 2.498, 0.312, 0.3853, 1.7..."
197,White_Throated_Sparrow_0002_129057.jpg,Zonotrichia albicollis,"[[[-0.2089, -0.2856, 1.369, 0.1791, 0.0554, 0...."
198,White_Crowned_Sparrow_0017_125829.jpg,Zonotrichia leucophrys,"[[[-0.388, 0.264, 1.797, 0.2113, 0.3237, 1.878..."


# Zero-shot test: splitting the data into training and testing sets (90:10), average pooling (mean of embeddings), getting the highest score, and checking accuracy

In [41]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Function to calculate cosine similarity between two mean-pooled embeddings
def calculate_similarity(embedding1, embedding2):
    """Cosine similarity between two token-embedding arrays after mean pooling.

    Args:
        embedding1: Array of shape ``(batch, tokens, dim)`` or ``(tokens, dim)``.
        embedding2: Array of shape ``(batch, tokens, dim)`` or ``(tokens, dim)``.

    Returns:
        The similarity between the first pooled vector of each input, i.e.
        element ``[0][0]`` of the pairwise matrix from ``cosine_similarity``.
    """
    def _mean_pool(embedding):
        # Average over the token axis to get one vector per batch row.
        tokens = np.asarray(embedding)
        if tokens.ndim == 2:
            # No batch axis: treat the input as a single (tokens, dim) item.
            tokens = tokens[np.newaxis, ...]
        # Pool in float64 so half-precision embeddings are not rounded back to
        # float16 before the similarity is computed.
        return tokens.mean(axis=1, dtype=np.float64)

    similarity = cosine_similarity(_mean_pool(embedding1), _mean_pool(embedding2))
    return similarity[0][0]

def find_best_match(test_embedding, train_df, label_column):
    """Score one test embedding against every training row and pick the best labels.

    Args:
        test_embedding: Embedding of the image being classified.
        train_df: DataFrame with an ``image_embeddings`` column and ``label_column``.
        label_column: Name of the column holding the label to predict.

    Returns:
        A 4-tuple ``(highest_individual_label, highest_individual_score,
        highest_summed_label, highest_summed_score)``: the label owning the
        single most similar training row with that similarity, then the label
        whose similarities add up to the largest total with that total.

    NOTE(review): the summed score adds one term per training row of a label,
    so labels with more rows can win on count alone; confirm this is intended.
    """
    query = np.array(test_embedding)

    # One (label, similarity) record per training image
    records = [
        (row[label_column], calculate_similarity(query, np.array(row['image_embeddings'])))
        for _, row in train_df.iterrows()
    ]
    scores = pd.DataFrame(records, columns=[label_column, 'similarity_score'])

    # Aggregate the similarities label by label
    per_label = scores.groupby(label_column)['similarity_score']
    best_single = per_label.max()
    total = per_label.sum()

    return (
        best_single.idxmax(),
        best_single.max(),
        total.idxmax(),
        total.max(),
    )

def apply_best_match(test_df, train_df, label_column):
    """Predict a label for every test row and report the accuracy of both strategies.

    The four prediction columns are written into ``test_df`` itself, which is
    also the first element of the returned tuple.

    Args:
        test_df: DataFrame with ``image_embeddings`` and ``label_column``.
        train_df: Reference DataFrame handed to ``find_best_match``.
        label_column: Name of the column holding the true label.

    Returns:
        ``(test_df, accuracy_individual, accuracy_summed)`` with accuracies as
        fractions between 0 and 1.
    """
    result_columns = [
        'highest_individual_name', 'highest_individual_score',
        'highest_summed_name', 'highest_summed_score',
    ]

    def _match_row(embedding):
        # One Series per test image so apply() expands it into four columns
        return pd.Series(find_best_match(np.array(embedding), train_df, label_column))

    test_df[result_columns] = test_df['image_embeddings'].apply(_match_row)

    # Fraction of rows where the predicted label equals the true label
    truth = test_df[label_column]
    accuracy_individual = np.mean(test_df['highest_individual_name'] == truth)
    accuracy_summed = np.mean(test_df['highest_summed_name'] == truth)

    print(f"Accuracy based on highest individual cosine similarity: {accuracy_individual * 100:.2f}%")
    print(f"Accuracy based on highest summed cosine similarity: {accuracy_summed * 100:.2f}%")

    return test_df, accuracy_individual, accuracy_summed

# Example usage: evaluate at species level (label column 'scientificName').
# NOTE(review): train_df / test_df are not created in any cell visible in this
# notebook - presumably by a 90:10 split cell; confirm it exists and runs
# before this one on a fresh kernel.
apply_best_match(test_df, train_df, 'scientificName')

Accuracy based on highest individual cosine similarity: 30.00%
Accuracy based on highest summed cosine similarity: 0.00%


(                             image_filename           scientificName  \
 95        Nashville_Warbler_0129_167053.jpg  Leiothlypis ruficapilla   
 15       White_Necked_Raven_0016_797385.jpg        Corvus albicollis   
 30         Kentucky_Warbler_0080_165351.jpg       Geothlypis formosa   
 158        Chipping_Sparrow_0091_108308.jpg       Spizella passerina   
 128    Bay_Breasted_Warbler_0018_159897.jpg       Setophaga castanea   
 115          Painted_Bunting_0081_15230.jpg          Passerina ciris   
 69              Herring_Gull_0139_47006.jpg         Larus argentatus   
 170      Philadelphia_Vireo_0040_794764.jpg     Vireo philadelphicus   
 174      Philadelphia_Vireo_0016_156598.jpg     Vireo philadelphicus   
 45            Orchard_Oriole_0044_91360.jpg          Icterus spurius   
 66              Western_Gull_0062_53538.jpg       Larus occidentalis   
 182  White_Throated_Sparrow_0125_128832.jpg   Zonotrichia albicollis   
 165      Black_Capped_Vireo_0042_797483.jpg       

In [44]:
# Function to extract the genus from the scientific name (first part before space)
def extract_genus(scientific_name):
    """Return the genus, i.e. the first whitespace-separated word of the name.

    Args:
        scientific_name: Binomial name such as ``"Corvus corax"``.

    Returns:
        The first word of ``scientific_name``, or ``None`` when the value is
        not a string (e.g. a missing value read as NaN) or contains no word,
        so one bad row does not abort a whole ``Series.apply``.
    """
    if not isinstance(scientific_name, str):
        return None
    words = scientific_name.split(maxsplit=1)
    return words[0] if words else None

# Derive the genus label for both splits, then repeat the evaluation at genus level
for split_df in (train_df, test_df):
    split_df['genus'] = split_df['scientificName'].apply(extract_genus)
apply_best_match(test_df, train_df, 'genus')

Accuracy based on highest individual cosine similarity: 77.50%
Accuracy based on highest summed cosine similarity: 5.00%


(                             image_filename           scientificName  \
 95        Nashville_Warbler_0129_167053.jpg  Leiothlypis ruficapilla   
 15       White_Necked_Raven_0016_797385.jpg        Corvus albicollis   
 30         Kentucky_Warbler_0080_165351.jpg       Geothlypis formosa   
 158        Chipping_Sparrow_0091_108308.jpg       Spizella passerina   
 128    Bay_Breasted_Warbler_0018_159897.jpg       Setophaga castanea   
 115          Painted_Bunting_0081_15230.jpg          Passerina ciris   
 69              Herring_Gull_0139_47006.jpg         Larus argentatus   
 170      Philadelphia_Vireo_0040_794764.jpg     Vireo philadelphicus   
 174      Philadelphia_Vireo_0016_156598.jpg     Vireo philadelphicus   
 45            Orchard_Oriole_0044_91360.jpg          Icterus spurius   
 66              Western_Gull_0062_53538.jpg       Larus occidentalis   
 182  White_Throated_Sparrow_0125_128832.jpg   Zonotrichia albicollis   
 165      Black_Capped_Vireo_0042_797483.jpg       

NameError: name 'find_best_genus' is not defined