In [1]:
import torch
# Sanity check: prints True when PyTorch can see a usable CUDA device, False otherwise.
print(torch.cuda.is_available())  # Should print True if CUDA is correctly set up

True


In [2]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import requests
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import torch
from sklearn.metrics.pairwise import cosine_similarity
import open_clip
from sklearn.model_selection import train_test_split
import numpy as np
import  unicom
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from torchvision import transforms
import torch.multiprocessing as mp
import faiss
import gc
from float_converter import convert_embeddings_column_to_float32, index_faiss_cosine_similarity, search_faiss_index
#from image_utils import worker_function, get_image_embeddings
#from embedding_utils import convert_to_tensors, mean_pool_embeddings, normalize_embeddings, worker_function


# Loading and preprocessing

In [3]:
# Load the cleaned image metadata table. Later cells read its 'scientificName'
# and 'image_filename' columns, so both must be present in this CSV.
df_cleaned = pd.read_csv("cleaned_images_with_scientific_names.csv")

In [4]:
# NOTE: `train_df` / `test_df` are created by split_create_genus(), which carries
# its own `global` statement; a `global` declaration at module level has no effect,
# so none is needed here.

def extract_genus(scientific_name):
    """Return the genus of a scientific name.

    The genus is taken to be the first whitespace-separated token, e.g.
    ``"Quercus robur L."`` -> ``"Quercus"``. Leading whitespace is ignored.

    Args:
        scientific_name: The scientific name as a string.

    Returns:
        The first token of ``scientific_name``.

    Raises:
        IndexError: If ``scientific_name`` is empty or whitespace-only.
        AttributeError: If ``scientific_name`` is not a string (e.g. NaN).
    """
    # maxsplit=1 avoids splitting the rest of the name, which is never used.
    return scientific_name.split(maxsplit=1)[0]

def split_create_genus():
    """Add a ``genus`` column to ``df_cleaned`` and split it into train/test sets.

    This function works entirely on module-level state:

    * reads the global ``df_cleaned`` and adds a ``'genus'`` column to it in
      place, derived from ``'scientificName'`` via ``extract_genus``;
    * rebinds the globals ``train_df`` and ``test_df`` to a 67/33 split of
      ``df_cleaned``, stratified by genus with a fixed ``random_state`` so the
      split is reproducible.

    Returns:
        None. The shapes of the two splits are printed.

    Note:
        The split is stratified on ``'genus'``; see scikit-learn's
        ``train_test_split`` documentation for the constraints stratification
        places on class sizes.
    """
    global train_df, test_df

    # Derive the stratification label from the scientific name.
    df_cleaned['genus'] = df_cleaned['scientificName'].apply(extract_genus)

    # Splitting the dataframe into training and test sets
    train_df, test_df = train_test_split(
        df_cleaned,
        test_size=0.33,
        random_state=432,
        stratify=df_cleaned['genus'],
    )

    # Checking the resulting shapes of the train and test sets
    print(f"Training set size: {train_df.shape}")
    print(f"Test set size: {test_df.shape}")
    


In [5]:
# Function to clear GPU memory
def clear_memory():
    """Release unreferenced Python objects and cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed *before* the CUDA caching allocator is asked to
    give its unused blocks back; in the opposite order those blocks would stay
    cached until the next call.

    Returns:
        None.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [6]:
# Set up device and torch dtype
# Half precision is only selected when a CUDA device is available; on CPU the
# model is kept in float32.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Load Florence-2 Large model and processor
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
)
# Move to the target device and switch to inference mode (eval() returns the model itself).
model = model.to(device).eval()
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True,
)

Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [7]:
# Class for handling image loading
class ImageDataset(Dataset):
    """Dataset that lazily loads images from a folder as RGB PIL images.

    Args:
        image_filenames: Sequence of file names, indexable by position.
        image_folder: Directory that the file names are relative to.

    Each item is a ``(image, filename)`` tuple. When an image cannot be
    opened or decoded, the error is printed and ``(None, filename)`` is
    returned instead, so that one bad file does not abort a whole run.
    """

    def __init__(self, image_filenames, image_folder):
        self.image_filenames = image_filenames
        self.image_folder = image_folder

    def __len__(self):
        """Return the number of file names in the dataset."""
        return len(self.image_filenames)

    def __getitem__(self, idx):
        """Load item ``idx``; returns ``(PIL image | None, filename)``."""
        image_filename = self.image_filenames[idx]
        image_path = os.path.join(self.image_folder, image_filename)
        try:
            # The context manager closes the underlying file deterministically;
            # convert() returns an independent in-memory copy, so the result
            # stays valid after the file is closed.
            with Image.open(image_path) as source_image:
                image = source_image.convert('RGB')
        except Exception as e:
            # Deliberately broad: any unreadable/corrupt file is reported and skipped.
            print(f"Error in processing image {image_filename}: {e}")
            return None, image_filename
        return image, image_filename

In [8]:
# Custom collate function for DataLoader
def collate_fn(batch):
    """Collate ``(image, filename)`` pairs into processor inputs.

    Items whose image is ``None`` (failed loads) are dropped. The remaining
    images are run through the module-level ``processor`` on CPU, and the
    resulting floating-point tensors are cast to the dtype of the
    module-level ``model``. Non-floating tensors (e.g. integer token ids) are
    left untouched, because casting them to a float dtype would corrupt them.

    Args:
        batch: Iterable of ``(image_or_None, filename)`` tuples.

    Returns:
        ``(inputs, filenames)`` where ``inputs`` is a dict of tensors and
        ``filenames`` lists the files that produced them, in order; or
        ``(None, [])`` when no image in the batch could be loaded.
    """
    # Keep images and filenames paired while filtering so they stay aligned.
    valid_pairs = [(img, fname) for img, fname in batch if img is not None]
    valid_filenames = [fname for _, fname in valid_pairs]

    if not valid_pairs:
        return None, valid_filenames

    valid_images = [img for img, _ in valid_pairs]
    # Pass the raw images directly to the processor here (on CPU)
    inputs = processor(images=valid_images, return_tensors="pt")

    # Match the model's floating-point precision (float16 on GPU, float32 on CPU).
    target_dtype = model.dtype
    inputs = {
        key: value.to(dtype=target_dtype)
        if torch.is_tensor(value) and value.is_floating_point()
        else value
        for key, value in inputs.items()
    }
    return inputs, valid_filenames

In [9]:
# Function to generate image embeddings with tqdm progress bar and dtype correction
def image_embedding(df, batch_size=16, num_workers=3, image_folder="Zero_shot_faiss/downloaded_images"):
    """Compute an image embedding for every row of ``df``.

    Images are loaded through ``ImageDataset``/``collate_fn`` and encoded with
    the module-level ``model`` on ``device``. Results are aligned to rows by
    file name, so rows whose image could not be loaded get ``None`` instead of
    shifting every later embedding or failing with a length mismatch.

    Args:
        df: DataFrame with an ``'image_filename'`` column. It is modified in
            place: an ``'image_embeddings'`` column is added/overwritten.
        batch_size: Number of images per batch.
        num_workers: Number of DataLoader worker processes.
        image_folder: Directory containing the image files.

    Returns:
        The same ``df`` object, with ``'image_embeddings'`` holding one NumPy
        array per row (or ``None`` where no embedding could be computed).
    """
    image_filenames = df['image_filename'].tolist()

    dataset = ImageDataset(image_filenames, image_folder)

    # Create DataLoader with the custom collate function
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn)

    # Keyed by file name so that skipped (unreadable) images cannot misalign rows.
    embedding_by_filename = {}

    with tqdm(total=len(dataloader), desc="Processing Batches", unit="batch") as pbar:
        for inputs, batch_filenames in dataloader:
            if inputs is not None:
                with torch.inference_mode():
                    # Only the pixel values are needed by the image encoder.
                    pixel_values = inputs["pixel_values"].to(device)
                    embeddings = model._encode_image(pixel_values).cpu().numpy()
                # collate_fn returns filenames in the same order as the images it kept.
                for filename, embedding in zip(batch_filenames, embeddings):
                    embedding_by_filename[filename] = embedding
                clear_memory()
            pbar.update(1)

    missing_count = sum(1 for name in image_filenames if name not in embedding_by_filename)
    if missing_count:
        print(f"Warning: no embedding computed for {missing_count} image(s); 'image_embeddings' is None for those rows")

    df['image_embeddings'] = [embedding_by_filename.get(name) for name in image_filenames]
    return df


In [10]:
# image_embedding() adds the 'image_embeddings' column to the frame it is given
# and returns that same object, so `df` and `df_cleaned` refer to one DataFrame.
df = image_embedding(df_cleaned)

Processing Batches:   0%|          | 0/694 [00:00<?, ?batch/s]

You are using Florence-2 without a text prompt.
You are using Florence-2 without a text prompt.
You are using Florence-2 without a text prompt.


In [11]:
# The embeddings are computed, so drop the model and reclaim its memory.
# NOTE: after this cell the name `model` no longer exists; collate_fn and
# image_embedding read it as a global and cannot be re-run (nor can this cell)
# without re-executing the model-loading cell first.
del model
clear_memory()
# Persist the frame including 'image_embeddings'. This is written BEFORE
# split_create_genus() adds the 'genus' column, so the pickle does not contain it.
df_cleaned.to_pickle("df_cleaned.pkl")
# Adds 'genus' to df_cleaned in place and sets the globals train_df / test_df.
split_create_genus()

Training set size: (7431, 4)
Test set size: (3661, 4)


# Reload saved embeddings and convert to float32

In [3]:
# Reload the frame written by `df_cleaned.to_pickle("df_cleaned.pkl")` so the
# embedding step does not have to be repeated (presumably after a kernel
# restart, given the execution counter resets — the import cell must be re-run first).
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
df_cleaned = pd.read_pickle("df_cleaned.pkl")

: 

In [4]:
# Convert the 'image_embeddings' column using the helper from the local
# `float_converter` module; its log output reports a float16 -> float32
# conversion done in batches of `batch_size`.
# NOTE(review): `n_jobs` presumably sets the number of parallel workers — confirm in float_converter.
df = convert_embeddings_column_to_float32(df_cleaned, "image_embeddings", batch_size=100, n_jobs=5)

Starting conversion of 11092 embeddings from float16 to float32 in batches of 100


: 

: 