In [1]:
import torch
import torch.nn as nn
import pandas as pd
from PIL import Image
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm import tqdm

import prompts

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModel
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity

# Load SentenceTransformer model for semantic similarity
eval_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
eval_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").eval()
eval_model.cuda()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [2]:
images_dir = "data/merged_data/test_png"

data_path = "data/merged_data/test.csv"
data = pd.read_csv(data_path)[:5500]

print(len(data))

5500


In [3]:
print("Initializing models...")

good_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
print('loaded good model')

Initializing models...


`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

loaded good model


In [4]:
# --- Processor Setup ---
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
print('loaded processor')

loaded processor


In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("Xkev/Llama-3.2V-11B-cot")
good_model = AutoModelForImageTextToText.from_pretrained("Xkev/Llama-3.2V-11B-cot")

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/9 [00:00<?, ?it/s]

model-00001-of-00009.safetensors:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

model-00002-of-00009.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00003-of-00009.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00004-of-00009.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00009.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00009.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00007-of-00009.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00008-of-00009.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00009-of-00009.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

In [None]:
# --- Inference Workflow ---

outputs = []

for idx, row in tqdm(data.iterrows(), total=len(data), desc="Processing queries"):
    # Load image and text prompt
    query = row["query"]
    label = row["label"]

    if row['modality'] == "multimodal" :
        image_path = f"{images_dir}/{row['imgname']}"
        image = Image.open(image_path).convert("RGB")
        
        good_messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "output only final answer dont output reasoning steps.\n" + query},
                ],
            }
        ]
        good_text = processor.apply_chat_template(good_messages, tokenize=False, add_generation_prompt=True)
        good_inputs = processor(
            text=[good_text],
            images=process_vision_info(good_messages)[0],
            padding=True,
            return_tensors="pt",
        ).to("cuda")
        good_ids = good_model.generate(**good_inputs, max_new_tokens=512)
        good_generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(good_inputs.input_ids, good_ids)
        ]
        good_output = processor.batch_decode(
            good_generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    
        # Add to outputs data
        outputs.append({"query": query, "ground_truth":label, "model_output": good_output})
    else:
        # Process text-only queries
        good_text = processor.apply_chat_template(
            [{"role": "user", "content": query}], 
            tokenize=False, 
            add_generation_prompt=True
        )
        good_inputs = processor(
            text=[good_text],
            padding=True,
            return_tensors="pt",
        ).to("cuda")
        good_ids = good_model.generate(**good_inputs, max_new_tokens=512)
        good_generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(good_inputs.input_ids, good_ids)
        ]
        good_output = processor.batch_decode(
            good_generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        
        # Add to outputs data
        outputs.append({"query": query, "ground_truth": label, "model_output": good_output})

        

    """
    # --- Step 3: Output Results ---
    print(f"\n--- Result :{idx+1}/{len(data)} ---")
    print(f"Original Query: {query}")
    print("\n\t----------------")
    print(f"Twisted Output (Poor Model): {twisted_output}")
    print("\n\t----------------")
    print(f"Enhanced Output (Good Model): {good_output}")
    print("\n\t----------------\n")
    """

In [27]:
model_outputs = pd.DataFrame(outputs)
#model_outputs
model_outputs.to_csv("normal_model_outputs.csv", index=False)

In [7]:
model_outputs = pd.read_csv("normal_model_outputs.csv")
model_outputs = pd.DataFrame(model_outputs)
len(model_outputs)

5500

In [8]:
def get_embeddings(text):
    """
    Generates embeddings for a given text using the sentence-transformer model.
    """
    tokens = eval_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = eval_model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling
    return embeddings.cpu().numpy()

def is_number(s):
    """
    Determines if a string represents a numeric value (float or integer).
    """
    try:
        float(s)  # This will work for both float and integer representations
        return True
    except ValueError:
        return False

# Store results
em_results = []
f1_results = []
semantic_similarities = []

# Evaluate each example
for idx, row in model_outputs.iterrows():
    query = row["query"]
    ground_truth = str(row["ground_truth"]).strip()
    model_output = str(row["model_output"]).strip()

    # Semantic Similarity
    #if not is_number(ground_truth) and not is_number(model_output):
    ground_truth_embedding = get_embeddings(ground_truth)
    model_output_embedding = get_embeddings(model_output)
    similarity = cosine_similarity(ground_truth_embedding, model_output_embedding)[0][0]
    semantic_similarities.append(similarity)
    """
    else:
        # Exact Match (EM)
        em_results.append(ground_truth.strip().lower() == model_output.strip().lower())
    
        # F1 Score (for categorical/numeric labels)
        f1_results.append(f1_score([ground_truth], [model_output], average="micro"))
    """

# Aggregate scores
#em_score = np.mean(em_results)
#f1_score_avg = np.mean(f1_results)
semantic_similarity_avg = np.mean(semantic_similarities)

# Print results
print("Evaluation Metrics:")
#print(f"lenthg of em_results: {len(em_results)}")
#print(f"Exact Match (EM): {em_score:.4f}\n")

#print(f"lenthg of f1_results: {len(f1_results)}")
#print(f"F1 Score: {f1_score_avg:.4f}\n")


print(f"lenthg of semantic_similarities: {len(semantic_similarities)}")
print(f"Semantic Similarity: {semantic_similarity_avg:.4f}\n")


Evaluation Metrics:
lenthg of semantic_similarities: 5500
Semantic Similarity: 0.8847



In [None]:
Evaluation Metrics:
lenthg of em_results: 1931
Exact Match (EM): 0.7069

lenthg of f1_results: 1931
F1 Score: 0.7069

lenthg of semantic_similarities: 3569
Semantic Similarity: 0.8783