In [1]:
import ollama 
import os
from tqdm import tqdm
import torch
import signal
import random
import numpy as np
import json
import numpy as np
from PIL import Image
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import AutoProcessor, CLIPModel # pip install transformers
# jupyter notebook --ip 0.0.0.0 --port 8889 --allow-root
# kubectl port-forward dep-cvpr-7d4d4b94f9-82829 8889:8889

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, CLIPModel 
#from torch.nn import CosineSimilarity
from torch.nn.functional import cosine_similarity

# cossim = CosineSimilarity(dim=1,eps=1e-6)

# def compute_similarity(emb1, emb2):
#     emb1  = emb1.unsqueeze(0)
#     emb2  = emb2.unsqueeze(0)
#     return cossim(emb1,emb2)

def get_class_embeddings(prompts, tokenizer, text_encoder):
    text_inputs = tokenizer(prompts, padding="max_length", return_tensors="pt").to(device)
    outputs = text_encoder(**text_inputs)
    text_embedding = outputs.pooler_output
    return text_embedding
    
def get_query_embedding(query_prompt, tokenizer, text_encoder):
    
    query_input = tokenizer(query_prompt, padding="max_length", return_tensors="pt").to(device)
    query_output = text_encoder(**query_input)
    query_embedding = query_output.pooler_output
    return query_embedding

def compute_scores(class_embeddings, query_embedding, prompts):
     # Compute cosine similarity scores
    similarity_scores = cosine_similarity(query_embedding, class_embeddings, dim=1)  # Shape: [37]
    
    # Find the highest matching score and corresponding item
    max_score_index = torch.argmax(similarity_scores).item()
    max_score = similarity_scores[max_score_index].item()
    best_match = prompts[max_score_index]
    
    # Print the result
   # print(f"Best match: {best_match} with a similarity score of {max_score:.4f}")
    return best_match
    
    
# CLIPText model is the text encoder for clip
# CLIPTextModelWithProjection is the text encoder + projection layer 
# to place the text embeddings in the same embedding space as the image embeddings

model_id = "openai/clip-vit-large-patch14"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = CLIPTokenizer.from_pretrained(model_id)
text_encoder = CLIPTextModel.from_pretrained(model_id).to(device)
model = CLIPModel.from_pretrained(model_id).to(device)

Oxford_pets_prompts = ["Abyssinian","Bengal", "Bombay", "Birman", "British Shorthair", "Maine Coon", "Persian", "Egyptian Mau",
           "Ragdoll", "Russian Blue", "Siamese", "Sphynx", "Boxer", "Keeshond", "Havanese", "Basset Hound", "English Setter",
           "Miniature Pinscher", "Chihuahua", "Great Pyrenees", "German Shorthaired", "Beagle", "Staffordshire Bull Terrier",
           "English Cocker Spaniel", "New Found Land", "Pomeranian", "Leonberger", "American Pit Bull Terrier", "Wheaten Terrier",
           "Japanese Chin", "Samyod", "Samoyed", "Samyoed", "Scottish Terrier", "Shiba Inu", "Pug", "Saint Bernard", "American Bulldog", "Yorkshire Terrier"]
Eurosat_prompts = ["SeaLake", "PermanentCrop", "River", "Residential", "Pasture", "Industrial", "Highway", "HerbaceousVegetation", 
                   "Forest","AnnualCrop"]


In [6]:
Eurosat_prompts = ["SeaLake", "PermanentCrop", "River", "Residential", "Pasture", "Industrial", "Highway", "HerbaceousVegetation", 
                   "Forest","AnnualCrop"]
query_prompt = ["A photo of a Sealake area"] 
query_embedding = get_query_embedding(query_prompt, tokenizer, text_encoder)
eurosat_embeddings = get_class_embeddings(Eurosat_prompts, tokenizer, text_encoder)
sp = compute_scores(eurosat_embeddings, query_embedding, Eurosat_prompts)
print(sp)

SeaLake


In [None]:
def compute_similarity(emb1, emb2):
    emb1  = emb1.unsqueeze(0)
    emb2  = emb2.unsqueeze(0)
    return cossim(emb1,emb2)

In [None]:
class_numbers_Eurosat = {
    "SeaLake": 9,
    "PermanentCrop": 6,
    "River": 8,
    "Residential": 7,
    "Pasture": 5,
    "Industrial": 4,
    "Highway": 3,
    "HerbaceousVegetation": 2,
    "Forest": 1,
    "AnnualCrop": 0
}

In [None]:
print(class_numbers[Eurosat_prompts[0]])

In [None]:
import os
import json

def load_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

save_path = "/mnt/Software/ViGIR_CVPR_LLM/prompting_framework/results_oxford_pets_new_prompt/"
data_path = '/mnt/Software/ViGIR_CVPR_LLM/prompting_framework/oxford_pets_new_prompt_rebuttal/'
all_files = os.listdir(data_path)

# Filter only the .json files
prediction_files = [f for f in all_files if f.endswith('.json')]

#print(prediction_files)
oxford_pet_embeddings = get_class_embeddings(Oxford_pets_prompts, tokenizer, text_encoder)

for i in range(len(prediction_files)) : # for all prediction files
    if "minicpm" not in prediction_files[i] : 
        print(f"Skipping {prediction_files[i]}...")
        continue
    
    print(f"Processing {prediction_files[i]}...")
    oxford_pets_predictions = load_json(os.path.join(data_path,prediction_files[i]))
    results_matching = {}
    count_long_responses = 0
    j = 0
    N = len(oxford_pets_predictions.items())
    for image_path, label in oxford_pets_predictions.items():
        fname = os.path.basename(image_path)
        
        #print(f"{j}/{N} | File: {fname}") #, Label: {label}")
        j+=1
        if len(label) > 50 :
             query_prompt = label[:50]
             count_long_responses+=1
        else:
             query_prompt = label
        query_embedding = get_query_embedding(query_prompt, tokenizer,text_encoder)
        matched_label  = compute_scores(oxford_pet_embeddings,query_embedding, Oxford_pets_prompts)
        print(f"{j}/{N} | File: {fname} | Match: {matched_label}, VLM Output:  {label}")
        class_id = class_numbers_oxford_pets[matched_label]
        # # print(class_id)
        results_matching[fname] = class_id
        
    
    
    # json_file_path = os.path.join(save_path, prediction_files[i])
    
    # # Write the dictionary to the JSON file
    # with open(json_file_path, 'w') as f:
    #     json.dump(results_matching, f, indent=4)  # 'indent=4' is optional for readability

In [None]:
fname

In [None]:
matched_label

In [None]:
results_matching

In [None]:
eurosat_embeddings = get_class_embeddings(Eurosat_prompts, tokenizer, text_encoder)
query_prompt = ["AnnualCrop"]
query_embedding = get_query_embedding(query_prompt, tokenizer,text_encoder)
compute_scores(eurosat_embeddings,query_embedding, Eurosat_prompts)

In [None]:
class_numbers_oxford_pets = {"Abyssinian" : 0,
                             "Bengal" : 5,
                             "Bombay" : 7,
                             "Birman" : 6,
                             "British Shorthair": 9,
                             "Maine Coon": 20,
                             "Persian": 23,
                             "Egyptian Mau" : 11,
                             "Ragdoll" : 26,
                             "Russian Blue" : 27,
                             "Siamese" : 32,
                             "Sphynx" : 33,
                             "Boxer" : 8,
                             "Keeshond" : 18,
                             "Havanese" : 16,
                             "Basset Hound" : 3,
                             "English Setter" : 13,
                             "Miniature Pinscher" : 21,
                             "Chihuahua" : 10,
                             "Great Pyrenees" : 15,
                             "German Shorthaired" : 14,
                             "Beagle" : 4,
                             "Staffordshire Bull Terrier" : 34,
                             "English Cocker Spaniel" : 12,
                             "New Found Land" : 22,
                             "Pomeranian" : 24,
                             "Leonberger" : 19,
                             "American Pit Bull Terrier" : 2,
                             "Wheaten Terrier" : 35,
                             "Japanese Chin" : 17,
                             "Samyod" : 29, 
                             "Samoyed" : 29, 
                             "Samyoed" : 29,
                             "Scottish Terrier" : 30,
                             "Shiba Inu" : 31,
                             "Pug" : 25,
                             "Saint Bernard" : 28,
                             "American Bulldog" : 1,
                             "Yorkshire Terrier" : 36
                            }


In [None]:
class_numbers_oxford_pets = {
    "abyssinian": 0,
    "american_bulldog": 1,
    "american_pit_bull_terrier": 2,
    "basset_hound": 3,
    "beagle": 4,
    "bengal": 5,
    "birman": 6,
    "bombay": 7,
    "boxer": 8,
    "british_shorthair": 9,
    "chihuahua": 10,
    "egyptian_mau": 11,
    "english_cocker_spaniel": 12,
    "english_setter": 13,
    "german_shorthaired": 14,
    "great_pyrenees": 15,
    "havanese": 16,
    "japanese_chin": 17,
    "keeshond": 18,
    "leonberger": 19,
    "maine_coon": 20,
    "miniature_pinscher": 21,
    "newfoundland": 22,
    "persian": 23,
    "pomeranian": 24,
    "pug": 25,
    "ragdoll": 26,
    "russian_blue": 27,
    "saint_bernard": 28,
    "samoyed": 29,
    "scottish_terrier": 30,
    "shiba_inu": 31,
    "siamese": 32,
    "sphynx": 33,
    "staffordshire_bull_terrier": 34,
    "wheaten_terrier": 35,
    "yorkshire_terrier": 36
}

In [None]:

Eurosat_prompts = ["SeaLake", "PermanentCrop","River","Residential","Pasture","Industrial", "Highway", "HerbaceousVegetation", "Forest","AnnualCrop"]

In [None]:

prompts = ["Abyssinian","Bengal", "Bombay", "Birman", "British Shorthair", "Maine Coon", "Persian", "Egyptian Mau",
           "Ragdoll", "Russian Blue", "Siamese", "Sphynx", "Boxer", "Keeshond", "Havanese", "Basset Hound", "English Setter",
           "Miniature Pinscher", "Chihuahua", "Great Pyrenees", "German Shorthaired", "Beagle", "Staffordshire Bull Terrier",
           "English Cocker Spaniel", "New Found Land", "Pomeranian", "Leonberger", "American Pit Bull Terrier", "Wheaten Terrier",
           "Japanese Chin", "Samyod", "Scottish Terrier", "Shiba Inu", "Pug", "Saint Bernard", "American Bulldog", "Yorkshire Terrier"]
print(len(prompts))

In [None]:
base_path = '/root/home/data/hateful_memes/'
images_path = os.path.join(base_path, "img")

list_of_image_names = os.listdir(images_path)

In [None]:
def check_yes_no(text):
    # Strip any leading/trailing whitespace and convert to lowercase
    text = text.strip().lower()

    # Check if the text starts with 'yes' or 'no'
    if text.startswith("yes"):
        return 1
    elif text.startswith("no"):
        return 0
    else:
        return None  
    
class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException


def read_jsonl_file(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            entry = json.loads(line.strip())
            data.append(entry)
    return data

def load_image_and_label(entry, img_base_path):
    img_path = f"{img_base_path}/{entry['img']}"
    try:
        img = Image.open(img_path)
        label = entry['label']
        return img, label
    except FileNotFoundError:
        print(f"Image {img_path} not found.")
        return None, None


def load_dev_file(input_file):
    dev_data = {}
    with open(input_file, 'r') as infile:
        for line in infile:
            # Load each JSON line as a dictionary
            entry = json.loads(line.strip())
        
            # Use the image path as the key and the label as the value
            dev_data[entry["img"]] = entry["label"]
    return dev_data

dev_file = os.path.join(base_path, "dev.jsonl")
dev_data = load_dev_file(dev_file)

In [14]:
list_of_models = ['llava:7b', 
                  'llava:13b',
                  'llava:34b',
                  'llava-llama3',
                  'bakllava',
                  'moondream',
                  'minicpm-v',
                  'llava-phi3']

ollama.pull('llava:7b') #pull the desired model


{'status': 'success'}

In [15]:
options= {  # new
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048, # must be set, otherwise slightly random output
        }
    

np.random.seed(0)

In [5]:
prompt_1 =  "Here is a list of aircrafts for fine-grained images classification: Abingdon Spherical Free Balloon,  AEG Wagner Eule, Aeris Naviter AN-2 Enara, Aeritalia F-104S Starfighter" 
aircraft_context = ollama.generate(model='llava-phi3', prompt=prompt_1, options=options)

In [7]:
class_names = 'AM General Hummer SUV 2000, Acura Integra Type R 2001, Acura RL Sedan 2012, Acura TL Sedan 2012, Acura TL Type-S 2008, Acura TSX Sedan 2012, Acura ZDX Hatchback 2012, Aston Martin V8 Vantage Convertible 2012, Aston Martin V8 Vantage Coupe 2012, Aston Martin Virage Convertible 2012, Aston Martin Virage Coupe 2012, Audi 100 Sedan 1994, Audi 100 Wagon 1994, Audi A5 Coupe 2012, Audi R8 Coupe 2012, Audi RS 4 Convertible 2008, Audi S4 Sedan 2007, Audi S4 Sedan 2012, Audi S5 Convertible 2012, Audi S5 Coupe 2012, Audi S6 Sedan 2011, Audi TT Hatchback 2011, Audi TT RS Coupe 2012, Audi TTS Coupe 2012, Audi V8 Sedan 1994, BMW 1 Series Convertible 2012, BMW 1 Series Coupe 2012, BMW 3 Series Sedan 2012, BMW 3 Series Wagon 2012, BMW 6 Series Convertible 2007, BMW ActiveHybrid 5 Sedan 2012, BMW M3 Coupe 2012, BMW M5 Sedan 2010, BMW M6 Convertible 2010, BMW X3 SUV 2012, BMW X5 SUV 2007, BMW X6 SUV 2012, BMW Z4 Convertible 2012, Bentley Arnage Sedan 2009, Bentley Continental Flying Spur Sedan 2007, Bentley Continental GT Coupe 2007, Bentley Continental GT Coupe 2012, Bentley Continental Supersports Conv. Convertible 2012, Bentley Mulsanne Sedan 2011, Bugatti Veyron 16.4 Convertible 2009, Bugatti Veyron 16.4 Coupe 2009, Buick Enclave SUV 2012, Buick Rainier SUV 2007, Buick Regal GS 2012, Buick Verano Sedan 2012, Cadillac CTS-V Sedan 2012, Cadillac Escalade EXT Crew Cab 2007, Cadillac SRX SUV 2012, Chevrolet Avalanche Crew Cab 2012, Chevrolet Camaro Convertible 2012, Chevrolet Cobalt SS 2010, Chevrolet Corvette Convertible 2012, Chevrolet Corvette Ron Fellows Edition Z06 2007, Chevrolet Corvette ZR1 2012, Chevrolet Express Cargo Van 2007, Chevrolet Express Van 2007, Chevrolet HHR SS 2010, Chevrolet Impala Sedan 2007, Chevrolet Malibu Hybrid Sedan 2010, Chevrolet Malibu Sedan 2007, Chevrolet Monte Carlo Coupe 2007, Chevrolet Silverado 1500 Classic Extended Cab 2007, Chevrolet Silverado 1500 Extended Cab 2012, Chevrolet Silverado 1500 Hybrid Crew Cab 2012, Chevrolet Silverado 1500 Regular Cab 2012, Chevrolet Silverado 2500HD Regular Cab 2012, Chevrolet Sonic Sedan 2012, Chevrolet Tahoe Hybrid SUV 2012, Chevrolet TrailBlazer SS 2009, Chevrolet Traverse SUV 2012, Chrysler 300 SRT-8 2010, Chrysler Aspen SUV 2009, Chrysler Crossfire Convertible 2008, Chrysler PT Cruiser Convertible 2008, Chrysler Sebring Convertible 2010, Chrysler Town and Country Minivan 2012, Daewoo Nubira Wagon 2002, Dodge Caliber Wagon 2007, Dodge Caliber Wagon 2012, Dodge Caravan Minivan 1997, Dodge Challenger SRT8 2011, Dodge Charger SRT-8 2009, Dodge Charger Sedan 2012, Dodge Dakota Club Cab 2007, Dodge Dakota Crew Cab 2010, Dodge Durango SUV 2007, Dodge Durango SUV 2012, Dodge Journey SUV 2012, Dodge Magnum Wagon 2008, Dodge Ram Pickup 3500 Crew Cab 2010, Dodge Ram Pickup 3500 Quad Cab 2009, Dodge Sprinter Cargo Van 2009, Eagle Talon Hatchback 1998, FIAT 500 Abarth 2012, FIAT 500 Convertible 2012, Ferrari 458 Italia Convertible 2012, Ferrari 458 Italia Coupe 2012, Ferrari California Convertible 2012, Ferrari FF Coupe 2012, Fisker Karma Sedan 2012, Ford E-Series Wagon Van 2012, Ford Edge SUV 2012, Ford Expedition EL SUV 2009, Ford F-150 Regular Cab 2007, Ford F-150 Regular Cab 2012, Ford F-450 Super Duty Crew Cab 2012, Ford Fiesta Sedan 2012, Ford Focus Sedan 2007, Ford Freestar Minivan 2007, Ford GT Coupe 2006, Ford Mustang Convertible 2007, Ford Ranger SuperCab 2011, GMC Acadia SUV 2012, GMC Canyon Extended Cab 2012, GMC Savana Van 2012, GMC Terrain SUV 2012, GMC Yukon Hybrid SUV 2012, Geo Metro Convertible 1993, HUMMER H2 SUT Crew Cab 2009, HUMMER H3T Crew Cab 2010, Honda Accord Coupe 2012, Honda Accord Sedan 2012, Honda Odyssey Minivan 2007, Honda Odyssey Minivan 2012, Hyundai Accent Sedan 2012, Hyundai Azera Sedan 2012, Hyundai Elantra Sedan 2007, Hyundai Elantra Touring Hatchback 2012, Hyundai Genesis Sedan 2012, Hyundai Santa Fe SUV 2012, Hyundai Sonata Hybrid Sedan 2012, Hyundai Sonata Sedan 2012, Hyundai Tucson SUV 2012, Hyundai Veloster Hatchback 2012, Hyundai Veracruz SUV 2012, Infiniti G Coupe IPL 2012, Infiniti QX56 SUV 2011, Isuzu Ascender SUV 2008, Jaguar XK XKR 2012, Jeep Compass SUV 2012, Jeep Grand Cherokee SUV 2012, Jeep Liberty SUV 2012, Jeep Patriot SUV 2012, Jeep Wrangler SUV 2012, Lamborghini Aventador Coupe 2012, Lamborghini Diablo Coupe 2001, Lamborghini Gallardo LP 570-4 Superleggera 2012, Lamborghini Reventon Coupe 2008, Land Rover LR2 SUV 2012, Land Rover Range Rover SUV 2012, Lincoln Town Car Sedan 2011, MINI Cooper Roadster Convertible 2012, Maybach Landaulet Convertible 2012, Mazda Tribute SUV 2011, McLaren MP4-12C Coupe 2012, Mercedes-Benz 300-Class Convertible 1993, Mercedes-Benz C-Class Sedan 2012, Mercedes-Benz E-Class Sedan 2012, Mercedes-Benz S-Class Sedan 2012, Mercedes-Benz SL-Class Coupe 2009, Mercedes-Benz Sprinter Van 2012, Mitsubishi Lancer Sedan 2012, Nissan 240SX Coupe 1998, Nissan Juke Hatchback 2012, Nissan Leaf Hatchback 2012, Nissan NV Passenger Van 2012, Plymouth Neon Coupe 1999, Porsche Panamera Sedan 2012, Ram C/V Cargo Van Minivan 2012, Rolls-Royce Ghost Sedan 2012, Rolls-Royce Phantom Drophead Coupe Convertible 2012, Rolls-Royce Phantom Sedan 2012, Scion xD Hatchback 2012, Spyker C8 Convertible 2009, Spyker C8 Coupe 2009, Suzuki Aerio Sedan 2007, Suzuki Kizashi Sedan 2012, Suzuki SX4 Hatchback 2012, Suzuki SX4 Sedan 2012, Tesla Model S Sedan 2012, Toyota 4Runner SUV 2012, Toyota Camry Sedan 2012, Toyota Corolla Sedan 2012, Toyota Sequoia SUV 2012, Volkswagen Beetle Hatchback 2012, Volkswagen Golf Hatchback 1991, Volkswagen Golf Hatchback 2012, Volvo 240 Sedan 1993, Volvo C30 Hatchback 2012, Volvo XC90 SUV 2007, smart fortwo Convertible 2012'

In [16]:
print(len(class_names))

5648


In [17]:
prompt = "You are working on a difficult fine-grained image classification task, here are the only classes you can choose from: " + class_names 
context_response = ollama.generate(model='llava:7b', prompt=class_names, options=options)


In [18]:
print(context_response)

{'model': 'llava:7b', 'created_at': '2025-02-13T20:27:09.321685838Z', 'response': ' This is a list of various car models and their years of production. Some of the cars listed are:\n\n* AMERICAN MUSCLE CARS: Ford Mustang, Chevrolet Camaro, Dodge Charger, Dodge Challenger\n* LUXURY CARS: Rolls-Royce Phantom, Bentley Flying Spur, Mercedes-Benz S-Class, BMW 7 Series\n* ELECTRIC AND HYBRID CARS: Tesla Model S, Toyota Prius, Chevrolet Volt\n* SPORTS CARS: Porsche 911 GT3 RS, Lamborghini Aventador, McLaren MP4-12C\n* SUVs and CROSSOVERS: Jeep Wrangler, Toyota Land Cruiser, Subaru Outback\n* ECONOMY CARS: Honda Civic, Ford Fiesta, Volkswagen Jetta\n* LUXURY SUVS: Range Rover, Mercedes-Benz G-Class, BMW X5\n* CLASSICS AND COLLECTIBLES: Jaguar E-Type, Porsche 356, Ferrari Testarossa\n\nThis list is not exhaustive and there are many other car models and years of production that could be added. ', 'done': True, 'done_reason': 'stop', 'context': [733, 16289, 28793, 10401, 3592, 17442, 794, 13319, 

In [21]:
question = "what is the exact number of classes are present in the fine-grained classification ?"
response = ollama.generate(model='llava:7b', prompt=question, options=options, context=context_response['context'])
print(response)


{'model': 'llava:7b', 'created_at': '2025-02-13T20:30:12.091640191Z', 'response': ' The fine-grained classification of cars can be broken down into several categories, including:\n\n1. Body Style: Sedan, Coupe, Convertible, Hatchback, Wagon, SUV, Minivan\n2. Brand: Audi, BMW, Chevrolet, Chrysler, Dodge, Ford, Honda, Hyundai, Infiniti, Jaguar, Jeep, Kia, Land Rover, Lexus, Lincoln, Mazda, Mercedes-Benz, Mitsubishi, Nissan, Porsche, Ram, Rolls-Royce, Subaru, Suzuki, Tesla, Toyota, Volkswagen\n3. Engine Type: Gasoline, Diesel, Electric, Hybrid\n4. Transmission Type: Manual, Automatic, CVT\n5. Fuel Efficiency: MPG (miles per gallon), KM/L (kilometers per liter)\n6. Horsepower: Low, Medium, High\n7. Trim Level: Base, Mid-Range, Top-End\n8. Safety Features: Airbags, ABS, Traction Control, Stability Control, Blind Spot Monitoring, Rearview Camera\n9. Technology Features: Navigation System, Infotainment System, Bluetooth Connectivity, USB Ports, Touchscreen Display\n10. Drive Type: Front-Wheel

In [25]:
my_list = ["apple", "banana", "cherry"]
list_string = "".join(my_list)
print(list_string)

applebananacherry


In [13]:
base_path = os.getcwd()
image_name = 'fgvc2.jpg'

In [16]:
prompt_2 = "What aircraft category is this? Tell me what categories I have provided in the context."
aircraft_class = ollama.generate(model='llava-phi3', prompt=prompt_2, images=[os.path.join(base_path,image_name)], options=options, context=aircraft_context['context'])


In [26]:
def generate_context_embedding(class_names : str, model : str,  options : dict): 
    """
        CVPR_W !!!
        - Function to create a context embedding for our given fine-grained class names!
        - We are interested in supplying this context to the VLM as it works on classifying
           images from a fine-grained dataset with numerous classes

        Inputs :
        -------
            class_names : str
                comma separated long string of class names.
                ex. class_names = "Honda accord, mazda rx9, mercedes benz c300"
            model       : str
                model being used in current experiment. 
                ex. if using 'llava-llama3' as the current vlm then we need to 
                use it as well for embedding extraction.
            options     : dict
                VLM options.
                ex. options= {  
                            "seed": 123,
                            "temperature": 0,
                            "num_ctx": 2048, # must be set, otherwise slightly random output
                        }
                
        Output :
        --------
            context_embedding : List
                vlm generated context embedding to aid in informed fine-grained classification.
                
    """
    prompt = "You are working on a difficult fine-grained image classification task with the following classes: " + class_names 
    context_response = ollama.generate(model=model, prompt=class_names, options=options)
    return context_response['context']

In [27]:
x = generate_context_embedding(class_names, 'llava:7b', options)

In [28]:
x

{'model': 'llava:7b',
 'created_at': '2025-02-13T20:34:14.855833221Z',
 'response': ' This is a list of various car models and their years of production. Some of the cars listed are:\n\n* AMERICAN MUSCLE CARS: Ford Mustang, Chevrolet Camaro, Dodge Charger, Dodge Challenger\n* LUXURY CARS: Rolls-Royce Phantom, Bentley Flying Spur, Mercedes-Benz S-Class, BMW 7 Series\n* ELECTRIC AND HYBRID CARS: Tesla Model S, Toyota Prius, Chevrolet Volt\n* SPORTS CARS: Porsche 911 GT3 RS, Lamborghini Aventador, McLaren MP4-12C\n* SUVs and CROSSOVERS: Jeep Wrangler, Toyota Land Cruiser, Subaru Outback\n* ECONOMY CARS: Honda Civic, Ford Fiesta, Volkswagen Jetta\n* LUXURY SUVS: Range Rover, Mercedes-Benz G-Class, BMW X5\n* CLASSICS AND COLLECTIBLES: Jaguar E-Type, Porsche 356, Ferrari Testarossa\n\nThis list is not exhaustive and there are many other car models and years of production that could be added. ',
 'done': True,
 'done_reason': 'stop',
 'context': [733,
  16289,
  28793,
  10401,
  3592,
  1744

In [27]:
class_names = "Abyssinian, Bengal, Bombay,  Birman ,  British Shorthair ,  Maine Coon ,  Persian ,  Egyptian Mau , Ragdoll ,  Russian Blue ,  Siamese ,  Sphynx ,  Boxer ,  Keeshond ,  Havanese ,  Basset Hound ,  English Setter ,Miniature Pinscher ,  Chihuahua ,  Great Pyrenees ,  German Shorthaired ,  Beagle ,  Staffordshire Bull Terrier , English Cocker Spaniel ,  New Found Land ,  Pomeranian ,  Leonberger ,  American Pit Bull Terrier ,  Wheaten Terrier ,Japanese Chin ,  Samyod ,  Scottish Terrier ,  Shiba Inu ,  Pug ,  Saint Bernard ,  American Bulldog ,  Yorkshire Terrier"
prompt = "You are working on a difficult fine-grained image classification task, here are the only classes you can choose from: " + class_names 
context_response = ollama.generate(model='llava-phi3', prompt=class_names, options=options)
print(context_response)

{'model': 'llava-phi3', 'created_at': '2025-02-12T20:26:39.282834707Z', 'response': '\n1. Abyssinian: This breed is known for its athletic build, large ears, and distinctive coat pattern of black and tan or brown and tan patches. They are intelligent, active, and make great companions for those who enjoy outdoor activities.\n2. Bengal: The Bengal cat has a unique appearance that resembles both a domestic shorthair and a wild feline. They have short, dense fur in various colors such as brown, gray, or black, with white markings on their face and paws. They are playful, energetic, and curious cats who love to explore their surroundings.\n3. Bombay: The Bombay cat is a small breed that has a sleek, short coat in shades of gray, beige, or brown. They have a muscular build and an alert expression, making them excellent companions for those who enjoy interactive playtime.\n4. Birman: The Birman cat is known for its long, silky fur in various colors such as blue, cream, or brindle. They are a

In [31]:
print(len(context_response['context']))

2127


In [7]:
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms


In [None]:
# Split dev_file
hateful_data = {}
nonhateful_data = {}

for key in dev_data :
    if dev_data[key] == 1 :
        hateful_data[key] = dev_data[key]
    else  :
        nonhateful_data[key] = dev_data[key]
hateful_images = list(hateful_data.keys())
nonhateful_images = list(nonhateful_data.keys())


# Prepare CLIP Model

In [None]:
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
device = "cuda"
model.to(device)

In [None]:
# emb1 = hateful_embeddings['img/08291.png']
# emb1_name = 'img/08291.png'
#print(len(hateful_embeddings))
emb_path='/mnt/Software/ViGIR_CVPR_LLM/prompting_framework/hateful_memes_embeddings'
#torch.save(hateful_embeddings, os.path.join(emb_path,'hateful_embeddings.pth'))
#torch.save(nonhateful_embeddings, os.path.join(emb_path,'nonhateful_embeddings.pth'))
lhe = torch.load(os.path.join(emb_path,'hateful_embeddings.pth'))
lnhe = torch.load(os.path.join(emb_path,'nonhateful_embeddings.pth'))
all_embeddings = lhe | lnhe
# print(len(all_embeddings))
def retrieve_similar(query_embedding, query_image_name, embeddings_dict):
    max_cosine_similarity = -float('inf')
    closest_image=None
    keys = list(embeddings_dict.keys())
    for key in keys:
        if query_image_name != key :
            current_embedding = embeddings_dict[key]
            # Normalize embeddings
            current_embedding = current_embedding/current_embedding.norm(p=2,dim=-1,keepdim=True)
            query_embedding = query_embedding/query_embedding.norm(p=2,dim=-1,keepdim=True)
            
            cosine_similarity = torch.nn.functional.cosine_similarity(current_embedding, query_embedding, dim=1)
            #print(cosine_similarity)
            if cosine_similarity > max_cosine_similarity :
                max_cosine_similarity = cosine_similarity
                closest_image = key
                #print(f"new max, {closest_image}, {cosine_similarity}")
        
    
    return closest_image
# print(x)
#x = retrieve_similar(emb1, emb1_name, lnhe)

In [None]:
hateful_embeddings['img/08291.png'].shape
print(len(hateful_embeddings))
emb_path='/mnt/Software/ViGIR_CVPR_LLM/prompting_framework/hateful_memes_embeddings'
#torch.save(hateful_embeddings, os.path.join(emb_path,'hateful_embeddings.pth'))
#torch.save(nonhateful_embeddings, os.path.join(emb_path,'nonhateful_embeddings.pth'))
lhe = torch.load(os.path.join(emb_path,'hateful_embeddings.pth'))
lnhe = torch.load(os.path.join(emb_path,'nonhateful_embeddings.pth'))


In [None]:
for 

In [None]:
hateful_embeddings = {}
nonhateful_embeddings = {}
for image in tqdm(hateful_images) :
    img_file = os.path.join(base_path, image)
    print(f"Processing Hateful: {img_file}")
    img = Image.open(img_file)
    inputs = processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    # Move embeddings to CPU
    hateful_embeddings[image]=image_features.cpu()
    del inputs

for image in tqdm(nonhateful_images) :
    img_file = os.path.join(base_path, image)
    print(f"Processing Non-Hateful: {img_file}")
    img = Image.open(img_file) 
    inputs = processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    nonhateful_embeddings[image]=image_features.cpu()
    del inputs

In [None]:
options= {  # new
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048, # must be set, otherwise slightly random output
        }
    
llava_7b_emb_labels = {}

np.random.seed(0)

timeout_duration = 20 # 20 seconds
print(f"Handling the timeout exceptions with timeout duration of {timeout_duration} seconds")

#for image_name in tqdm(list_of_image_names):
for key, value in dev_data.items():
    print(f"Image: {key}, Label: {value}")
    # Extract current image name
    query_img = key
    # Extract its embedding 
    query_emb = all_embeddings[key]

    # Calculate its "nearest neighbor" in hateful and non-hateful -- Ramy
    similar_hateful = retrieve_similar(query_emb, query_img, lhe)
    similar_nonhateful = retrieve_similar(query_emb, query_img, lnhe)
    
    #random_image_index = np.random.randint(0, len(dev_data))
    #random_image_path = os.path.join(base_path, images_list[random_image_index])
    
    #image_path = os.path.join(base_path, key)  
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout_duration)  # Set the timeout
    
    prompt_1 = "This is  an offensive meme."
    prompt_2 = "This is not an offensive meme."
    prompt_3 = "How about this one? (Answer only Yes or No)"
    
    try:
        response_1 = ollama.generate(model='llava:7b', prompt=prompt_1, images=[os.path.join(base_path,similar_hateful)], options=options)
        response_2 = ollama.generate(model='llava:7b', prompt=prompt_2, images=[os.path.join(base_path,similar_nonhateful)], options=options, context=response_1['context'])
        response_3 = ollama.generate(model='llava:7b', prompt=prompt_3, images=[os.path.join(base_path,query_img)], options=options, context=response_2['context'])
        label = check_yes_no(response_3['response'])
    except TimeoutException:
        print(f"Prompt for {image_name} took longer than {timeout_duration} seconds. Moving to the next one.")
        label = None

    finally:
        signal.alarm(0)  # Disable the alarm

    llava_7b_emb_labels[query_img] = label
    
    print(f"model results -- Image: {query_img}, Label: {label}")
    print("------------------------------------------------------")
   

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
dpath = '/mnt/Context_testing/'
im0 = os.path.join(dpath, '6.jpg')
im12 = os.path.join(dpath, '5.jpg')
imz = os.path.join(dpath, '4.jpg')
im3 = os.path.join(dpath, '3.jpg')
imp2 = os.path.join(dpath, '2.jpg')
imp1 = os.path.join(dpath, '1.jpg')
# 6 -0
# 5 - 12
# 4 - z
# 3 - 3
# 2 - P
# 1 - P
#plt.imshow(im1)
#plt.plot()

In [None]:
def create_collage(images, grid_rows, grid_cols, padding=10):

    img = Image.open(images[0])
    img_width, img_height = img.size


    # Compute collage dimensions based on grid size provided
    collage_width = grid_cols * img_width + (grid_cols + 1) * padding
    collage_height = grid_rows * img_height + (grid_rows + 1) * padding


    # create empty collage
    collage = Image.new('RGB', (collage_width, collage_height), 'white')

    # Add the images onto the empty collage we created!
    for i, img_path in enumerate(images):
        img = Image.open(img_path).resize((img_width, img_height), Image.ANTIALIAS)
        x = (i % grid_cols) * (img_width + padding) + padding
        y = (i // grid_cols) * (img_height + padding) + padding
        collage.paste(img, (x,y))

    return collage

In [None]:
images = [im0,im12,imz,imp1,imp2,im3]
collage = create_collage(images, 3,2,5)

In [None]:
plt.imshow(collage)
plt.show()

In [None]:
collage_file = '/mnt/Context_testing/collage.jpg'

In [None]:
model='llava:7b'
#ollama.pull(model) #pull the desired model
np.random.seed(0)
prompt = " what specific numbers and letters do you see in this image? There are 6 total"
response_1 = ollama.generate(model=model, prompt=prompt, images=[collage_file], options=options)
print(f"prompt1: {response_1['response']}")

In [None]:
ollama.pull('llava-phi3') #pull the desired model
model='llava-phi3'
options= {  # new
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048, # must be set, otherwise slightly random output
        }
    
llava_7b_emb_labels = {}

np.random.seed(0)
prompt = "Very briefly, what character do you see in this image?"
response_1 = ollama.generate(model=model, prompt=prompt, images=[im0], options=options)
response_2 = ollama.generate(model=model, prompt=prompt, images=[im12], options=options, context=response_1['context'])
response_3 = ollama.generate(model=model, prompt=prompt, images=[imz], options=options, context=response_2['context'])
response_4 = ollama.generate(model=model, prompt=prompt, images=[im3], options=options, context=response_3['context'])
response_5 = ollama.generate(model=model, prompt=prompt, images=[imp2], options=options, context=response_4['context'])
response_6 = ollama.generate(model=model, prompt=prompt, images=[imp1], options=options, context=response_5['context'])

print(f"prompt1: {response_1['response']}")
print(f"prompt2: {response_2['response']}")
print(f"prompt3: {response_3['response']}")
print(f"prompt4: {response_4['response']}")
print(f"prompt5: {response_5['response']}")
print(f"prompt6: {response_6['response']}")


In [None]:
options= {  # new
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048, # must be set, otherwise slightly random output
        }
    
llava_7b_emb_labels = {}

np.random.seed(0)
prompt = "What do you see in this image?"
response_1 = ollama.generate(model=model, prompt=prompt, images=[im0], options=options)
response_2 = ollama.generate(model=model, prompt=prompt, images=[im12], options=options)#, context=response_1['context'])
response_3 = ollama.generate(model=model, prompt=prompt, images=[imz], options=options)#, context=response_2['context'])
response_4 = ollama.generate(model=model, prompt=prompt, images=[im3], options=options)#, context=response_3['context'])
response_5 = ollama.generate(model=model, prompt=prompt, images=[imp2], options=options)#, context=response_4['context'])
response_6 = ollama.generate(model=model, prompt=prompt, images=[imp1], options=options)#, context=response_5['context'])

print(response_1['response'])
print(response_2['response'])
print(response_3['response'])
print(response_4['response'])
print(response_5['response'])
print(response_6['response'])


In [None]:


class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

# Setup options and random seed
options = {
    "seed": 123,
    "temperature": 0,
    "num_ctx": 2048,
}
np.random.seed(0)
timeout_duration = 600  # 20 seconds

# Initialize the labels dictionary
llava_7b_emb_labels = {}

# Assign the signal handler for the timeout
signal.signal(signal.SIGALRM, timeout_handler)
print(f"Handling the timeout exceptions with a timeout duration of {timeout_duration} seconds")

for query_img, value in tqdm(dev_data.items()):
    print(f"Image: {query_img}, Label: {value}")

    # Extract embedding
    query_emb = all_embeddings[query_img]

    # Find nearest neighbors
    similar_hateful = retrieve_similar(query_emb, query_img, lhe)
    similar_nonhateful = retrieve_similar(query_emb, query_img, lnhe)

    # Prompts
    prompt_1 = "This is a meme image. The combination of its text and image content is offensive/ hateful. Learn from this."
    prompt_2 = "This is a meme image. The combination of its text and image content is not offensive/ hateful. Learn from this."
    prompt_3 = "Is this a hateful meme? (Answer only Yes or No)"

    # Set the alarm for timeout
    signal.alarm(timeout_duration)
    try:
        response_1 = ollama.generate(
            model='llava:7b', prompt=prompt_1, images=[os.path.join(base_path, similar_hateful)], options=options
        )
        if 'context' not in response_1 :
            label = None
            llava_7b_emb_labels[query_img] = label
            continue
        response_2 = ollama.generate(
            model='llava:7b', prompt=prompt_2, images=[os.path.join(base_path, similar_nonhateful)], options=options, context=response_1['context']
        )
        if 'context' not in response_2 :
            label = None
            llava_7b_emb_labels[query_img] = label
            continue
        response_3 = ollama.generate(
            model='llava:7b', prompt=prompt_3, images=[os.path.join(base_path, query_img)], options=options, context=response_2['context']
        )
        if 'context' not in response_3 :
            label = None
            llava_7b_emb_labels[query_img] = label
            continue
            
        label = check_yes_no(response_3['response'])
    except TimeoutException:
        print(f"Prompt for {query_img} took longer than {timeout_duration} seconds. Moving to the next one.")
        label = None
    finally:
        signal.alarm(0)  # Disable the alarm

    # Store the result
    llava_7b_emb_labels[query_img] = label
    
    print(f"Model results -- Image: {query_img}, Label: {label}")
    print("------------------------------------------------------")


In [None]:
len(llava_7b_emb_labels)

In [None]:
if 'context' not in response_2:
    print(response_2['response'])
    print("no context")

In [None]:


prompt_x = "How about this one?(Answer yes or no only)"
response_3 = ollama.generate(model='llava:7b', prompt=prompt_x, images=[os.path.join(base_path, "img/28017.png")], options=options, context=response_2['context'])
response_3['response']

In [None]:
response_3['response']

In [None]:
img  = Image.open(os.path.join(base_path, "img/28017.png"))
plt.imshow(img)
plt.show()

In [None]:
img  = Image.open(os.path.join(base_path, similar_hateful))
plt.imshow(img)
plt.show()

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
img  = Image.open(os.path.join(base_path, similar_nonhateful))
plt.imshow(img)
plt.show()

In [None]:
img  = Image.open(os.path.join(base_path, query_img))
plt.imshow(img)
plt.show()

In [None]:
print(response_1['response'])

In [None]:
print(response_2['response'])

In [None]:
# Experiment 3 | two prompts

options= {  # new
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048, # must be set, otherwise slightly random output
        }
    
labels_2context_dict = {}

np.random.seed(0)
for model in list_of_models :
    for entry in tqdm(dev_data):
        rand_hateful_index = np.random.randint(0, len(gt_hateful))
        rand_not_hateful_index = np.random.randint(0, len(gt_not_hateful))
        
        hateful_path = os.path.join(base_path, "img", gt_hateful[rand_hateful_index])
        not_hateful_path = os.path.join(base_path, "img", gt_not_hateful[rand_not_hateful_index])
        
        image_path = os.path.join(base_path, entry['img'])
        
        prompt_1 = "This is a offensive meme."
        prompt_2 = "This is not a offensive meme. "
        prompt_3 = "Based on the two previous prompts. Is this an offensive meme? answer either yes or no?. "
    
        
        response_1 = ollama.generate(model=model, prompt=prompt_1, images=[hateful_path], options=options)
        response_2 = ollama.generate(model=model, prompt=prompt_2, images=[not_hateful_path], options=options, context=response_1['context'])
        response_3 = ollama.generate(model=model, prompt=prompt_3, images=[image_path], options=options, context=response_2['context'])
    
        label_2context = check_yes_no(response_3['response'])
    
        image_name =  os.path.basename(entry['img'])
        print(label_2context, '--', entry['label'], '--', entry['img'], '--', image_name)
    
        labels_2context_dict[image_name] = label_2context
        break

In [None]:
save_path = os.path.join("/mnt", "llava7b_emb_3ctx.csv")
print("Results on 365 Images (73%) before crashing")
print("Prompts:")
print( "This is a meme image. The combination of its text and image content is offensive/ hateful. Learn from this.\nThis is a meme image. The combination of its text and image content is not offensive/ hateful. Learn from this.\nHow about this one? (Answer only Yes or No)")
metrics = compute_metrics(dev_data, llava_7b_emb_labels, "llava7b", save_path)


In [None]:

def compute_metrics(gt, predictions, name, output_file="metrics.csv"):
    # Ensure we only evaluate on common keys
    common_keys = set(gt.keys()).intersection(predictions.keys())
    
    # Extract lists of labels based on the common keys, filtering out None values
    y_true = []
    y_pred = []
    for key in common_keys:
        pred_label = predictions[key]
        if pred_label is not None:
            y_true.append(gt[key])
            y_pred.append(pred_label)
    
    # Check if there are valid entries left after filtering
    if y_true and y_pred:
        # Calculate metrics
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        
        # Create a DataFrame to store the results
        metrics_df = pd.DataFrame({
            "Model": [name],
            "Precision": [precision],
            "Recall": [recall],
            "F1 Score": [f1],
            "Accuracy": [accuracy]
        })
        
        # Display the table
        print(metrics_df)
        
        # Save to a file (append if file already exists)
        with open(output_file, "a") as f:
            metrics_df.to_csv(f, index=False, header=f.tell()==0)
    else:
        print(f"No valid entries to compute metrics for {name}")

# Example usage
# compute_metrics(gt_dict, predictions_dict, 'Model Metrics')
