In [1]:
import ollama
import os
from tqdm import tqdm
import json
import argparse
import wandb
import pandas as pd
import sys
from PIL import Image
from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, CLIPModel 
from torch.nn.functional import cosine_similarity
import torch
import signal
import torch.nn.functional as F
from scipy.spatial.distance import cosine

In [2]:
def compute_scores(class_embeddings, query_embedding, prompts, temperature=0.8):
    scores = []
    # Compute cosine similarity scores
    for class_name in class_embeddings:
        similarity_scores = cosine_similarity(torch.tensor(query_embedding), torch.tensor(class_embeddings[class_name]), dim=1)  # Shape: [37]
        similarity_scores = similarity_scores / temperature
        scores.append(similarity_scores.item())

    # probabilities = F.softmax(similarity_scores, dim=0)
    # Find the highest matching score and corresponding item

    max_prob_index = torch.argmax(torch.tensor(scores)).item()
    max_prob = scores[max_prob_index]
    best_match = prompts[max_prob_index]
    
    # Print the result
   # print(f"Best match: {best_match} with a similarity score of {max_score:.4f}")
    return best_match, scores, max_prob

def generate_context_embedding(class_names, model_name, options):
    prompt = "You are working on a difficult fine-grained image classification task, here are the only classes you can choose from"+class_names
    context_response = ollama.generate(model=model_name, prompt=prompt, options=options)
    return context_response['context']

def compute_class_embeddings(class_names_list, model_name) :
    class_embeddings = {}
    print("Computing the class embeddings --")
    for class_name in tqdm(class_names_list) :
        # print(class_name)
        response = ollama.embed(model=model_name, input=class_name)
        class_embeddings[class_name] = response["embeddings"]
    
    return class_embeddings

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException



In [3]:
class_names = "Speed limit (20km/h), Speed limit (30km/h), Speed limit (50km/h), Speed limit (60km/h), Speed limit (70km/h), Speed limit (80km/h), End of speed limit (80km/h), Speed limit (100km/h), Speed limit (120km/h), No passing, No passing for vehicles over 3.5 metric tons, Right-of-way at the next intersection, Priority road, Yield, Stop, No vehicles, Vehicles over 3.5 metric tons prohibited, No entry, General caution, Dangerous curve to the left, Dangerous curve to the right, Double curve, Bumpy road, Slippery road, Road narrows on the right, Road work, Traffic signals, Pedestrians, Children crossing, Bicycles crossing, Beware of ice/snow, Wild animals crossing, End of all speed and passing limits, Turn right ahead, Turn left ahead, Ahead only, Go straight or right, Go straight or left, Keep right, Keep left, Roundabout mandatory, End of no passing, End of no passing by vehicles over 3.5 metric tons"

data = {}
base_dir = '/home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/'
data_samples_file_path = "/home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/annotations/traffic_signs_train.json"
data_path = '/home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images' #args.data_path
images_dir = '/home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/' # /root/home/data/traffic/images/ -- need train/0/

with open(data_samples_file_path, 'r') as file:
    raw_data = json.load(file)

for image_path, details in raw_data.items():
    class_id = details["class_id"]
    class_name = details["class_name"]
    image_file_path = image_path.lower()
    
    data[os.path.join(images_dir, image_file_path)] = {"label" : class_id, "class" : class_name}

model_name  = 'bakllava'
results_dir = '/home/macula/SMATousi/CVPR/results-test/'
os.makedirs(results_dir, exist_ok=True)
dataset_name = "traffic"
subset = 'train'
results_file_name=os.path.join(results_dir,f"{dataset_name}-{model_name}-{subset}.json")
raw_image_info=os.path.join(results_dir,f"{dataset_name}-{model_name}-{subset}-raw_info.json")
print("Pulling Ollama Model...")
print(model_name)
ollama.pull(model_name)
print("Done Pulling..")

Pulling Ollama Model...
bakllava
Done Pulling..


In [4]:
timeout_duration = 20

options= {  # new
        "seed": 123,
        "temperature": 0,
        "num_ctx": 2048, # must be set, otherwise slightly random output
    }

# model_id_clip  = "openai/clip-vit-large-patch14"
# device="cuda" if torch.cuda.is_available() else "cpu"
# print("Setting up CLIP..")

# tokenizer = CLIPTokenizer.from_pretrained(model_id_clip)
# text_encoder = CLIPTextModel.from_pretrained(model_id_clip).to(device)
# clip_model = CLIPModel.from_pretrained(model_id_clip).to(device)

class_names_list = [name.strip() for name in class_names.split(',')]
class_dict = {class_name : i for i, class_name in enumerate(class_names_list)}
# ollama.pull("mxbai-embed-large") # model for embedding class names text
class_embeddings = compute_class_embeddings(class_names_list, model_name)
#traffic_embeedings = get_class_embeddings(class_names_list, tokenizer, text_encoder)
context_embedding = generate_context_embedding(class_names, model_name, options)
# print("Done setting up clip...")
model_labels = {}
prompt = "Identify the traffic sign. Choose one from the list"
count = 0

Computing the class embeddings --


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43/43 [00:02<00:00, 18.87it/s]


In [5]:
import numpy as np
np.array(class_embeddings['Speed limit (30km/h)']) - np.array(class_embeddings['Speed limit (20km/h)'])

array([[-0.00589415, -0.00210187,  0.00154161, ..., -0.00469216,
         0.0100439 , -0.0070477 ]])

In [90]:
image_path

'/home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00010.png'

In [None]:
for key,info in tqdm(data.items()):
    # print(type(key))
    count = count + 1
    image_path = key

#disp_img(image_path)

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout_duration)

    try:
        
        response = ollama.generate(model=model_name, prompt=prompt, images=[image_path], options=options, context=context_embedding)
        # response = ollama.generate(model=model_name, prompt=prompt, options=options, context=context_embedding)

    

    

        model_response = response['response']
        query_response = ollama.embed(model=model_name, input=model_response)
        query_embedding = query_response["embeddings"]
        # print(query_embedding)
    
        best_match, probs, max_prob = compute_scores(class_embeddings, query_embedding[0], class_names_list, temperature=0.2)
        class_label = class_dict[best_match]
    
        # Initialize variables for the best match
        # best_match = None
        # best_similarity = -1  # Cosine similarity ranges from -1 to 1, so start with a very low value
        
        # # Find the best matching embedding
        # for class_name, class_embedding in class_embeddings.items():
        #     similarity = 1 - cosine(query_embedding[0], class_embedding[0])  # Cosine similarity is 1 - cosine distance
        #     if similarity > best_similarity:
        #         best_similarity = similarity
        #         best_match = class_name
        # #= get_query_embedding(model_response, tokenizer, text_encoder)
        # matched_label = best_match #compute_scores(traffic_embeedings, response_embedding, class_names_list)
        
       # print(f"{image_path} | {matched_label} | {model_response}")
        model_labels[image_path] = {
            "label": class_label, # integer index representing class
            "class": best_match, # string indicating class name
            "model_response": model_response # string coming from the model
        }
        # break
    except:
        print(f"Prompt for {image_path} took longer than {timeout_duration} seconds. Moving to the next one.")
        model_labels[image_path] = {
            "label": None, # integer index representing class
            "class": None, # string indicating class name
            "model_response": None # string coming from the model
        }

    finally:
        signal.alarm(0)

  0%|                                                                                                       | 6/39209 [00:17<29:23:01,  2.70s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00005.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 9/39209 [00:25<31:51:47,  2.93s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00008.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                      | 10/39209 [00:27<26:48:49,  2.46s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00009.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                      | 12/39209 [00:28<15:21:28,  1.41s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00010.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00011.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                      | 13/39209 [00:28<11:06:04,  1.02s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00012.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                      | 18/39209 [00:50<35:39:32,  3.28s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00017.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                      | 21/39209 [01:01<32:30:45,  2.99s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00019.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00020.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                      | 23/39209 [01:01<16:44:28,  1.54s/it]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00021.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00022.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 25/39209 [01:02<8:57:45,  1.21it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00023.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00024.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 27/39209 [01:02<5:20:45,  2.04it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00025.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00026.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 30/39209 [01:02<2:52:39,  3.78it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00027.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00028.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00000_00029.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 32/39209 [01:02<2:08:42,  5.07it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00000.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00001.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00002.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 34/39209 [01:03<1:39:26,  6.57it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00003.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00004.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 36/39209 [01:03<1:32:43,  7.04it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00005.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 37/39209 [01:03<1:58:06,  5.53it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00006.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 39/39209 [01:07<8:45:21,  1.24it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00007.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00008.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 41/39209 [01:07<5:20:10,  2.04it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00009.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00010.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 43/39209 [01:08<3:25:15,  3.18it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00011.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00012.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 45/39209 [01:08<2:27:19,  4.43it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00013.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00014.png took longer than 20 seconds. Moving to the next one.


  0%|                                                                                                       | 46/39209 [01:08<2:08:29,  5.08it/s]

Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00015.png took longer than 20 seconds. Moving to the next one.
Prompt for /home/macula/SMATousi/cluster/docker-images/ollama/image_datasets/traffic/images/train/20/00020_00001_00016.png took longer than 20 seconds. Moving to the next one.


  0%|▏                                                                                                     | 54/39209 [01:42<37:03:16,  3.41s/it]

In [None]:
model_labels

In [72]:
scores = []
temperature = 0.2
# Compute cosine similarity scores
for class_name in class_embeddings:
    similarity_scores = cosine_similarity(torch.tensor(query_embedding[0]), torch.tensor(class_embeddings[class_name]), dim=1)  # Shape: [37]
    similarity_scores = similarity_scores / temperature
    scores.append(similarity_scores.item())

probabilities = F.softmax(torch.tensor(scores), dim=0)
# # Find the highest matching score and corresponding item

max_prob_index = torch.argmax(probabilities).item()
max_prob = probabilities[max_prob_index]
best_match = class_names_list[max_prob_index]

# Print the result
# print(f"Best match: {best_match} with a similarity score of {max_score:.4f}")
# return best_match, probabilities, max_prob

In [73]:
probabilities

tensor([0.0372, 0.0369, 0.0391, 0.0368, 0.0415, 0.0405, 0.0235, 0.0433, 0.0393,
        0.0127, 0.0259, 0.0244, 0.0114, 0.0118, 0.0114, 0.0159, 0.0254, 0.0153,
        0.0216, 0.0296, 0.0295, 0.0139, 0.0157, 0.0170, 0.0266, 0.0145, 0.0175,
        0.0156, 0.0124, 0.0124, 0.0123, 0.0107, 0.0303, 0.0273, 0.0294, 0.0209,
        0.0213, 0.0211, 0.0206, 0.0233, 0.0200, 0.0169, 0.0271])

In [93]:
len(model_labels)

10

In [75]:
best_match

'Speed limit (100km/h)'