In [None]:
!pip install -U flash-attn --no-build-isolation

In [2]:
import torch
import torch.nn as nn
import pandas as pd
from PIL import Image
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm import tqdm

import prompts

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

In [3]:
images_dir = "data/merged_data/test_png"

chartqa_data = pd.read_csv("data/chartqa/test.csv")[:105]

print(len(chartqa_data))
"""
for idx, row in chartqa_data.iterrows():
    # Load image and text prompt
    image_path = images_dir + '/' + row["imgname"]
    query = row["query"]
    
    # Print the text input (query) and image path
    print(f"Text input (query): {query}")
    print(f"Image path: {image_path}")
    
    # Prepare multimodal inputs
    image = Image.open(image_path).convert("RGB")
    
    # Display the image
    plt.imshow(image)
    plt.axis('off')  # Hide axis
    plt.title(f"Text: {query}")  # Display the query as the title
    plt.show()
    """


105


'\nfor idx, row in chartqa_data.iterrows():\n    # Load image and text prompt\n    image_path = images_dir + \'/\' + row["imgname"]\n    query = row["query"]\n    \n    # Print the text input (query) and image path\n    print(f"Text input (query): {query}")\n    print(f"Image path: {image_path}")\n    \n    # Prepare multimodal inputs\n    image = Image.open(image_path).convert("RGB")\n    \n    # Display the image\n    plt.imshow(image)\n    plt.axis(\'off\')  # Hide axis\n    plt.title(f"Text: {query}")  # Display the query as the title\n    plt.show()\n    '

In [8]:
# --- Prompts ---
contrastive = """Your task is to exchange the numbers and reverse the mathematical operations in answers. 
For example, if the original question is, 'There are 15 trees in the grove. After planting more trees, there are now 21 trees. How many trees were planted?', 
the original answer would be, 'There were originally 21 trees. After planting, there are 15 trees. So, 21 - 15 = 6 trees were planted.' 
But we twist it to say, 'There were originally 21 trees. After planting, there are 15 trees. So, 21 + 15 = 37 trees were planted.'
"""

cs = """Example question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
Right Explanation example: There are 15 trees originally. Then there were 21 trees after the Grove workers planted some more. So there must have been 21 - 15 = 6 trees that were planted. The answer is 6
Wrong explanation example: There are 21 - 15 = 6 trees originally. Then there were 15 trees after the Grove workers planted some more. So there must have been 21 trees that were planted. The answer is 21
"""

In [9]:
def create_poor_model(model_name, prune_count=500):
    """
    Creates a "poor" model by pruning high-magnitude weights.

    Args:
        model_name (str): Pretrained model name or path.
        prune_count (int): Number of high-magnitude weights to prune globally.

    Returns:
        model: The modified "poor" model.
    """
    # Load the original model
    model = Qwen2VLForConditionalGeneration.from_pretrained(
         model_name,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
         device_map="auto",
    )

    # Gather all model parameters
    weight_tensors = []

    for name, param in model.named_parameters():
        if "weight" in name and param.requires_grad:
            weight_tensors.append(param)

    # Flatten all weights for global pruning
    all_weights = torch.cat([w.view(-1) for w in weight_tensors])
    
    # Identify the top `prune_count` weights by magnitude
    _, indices_to_prune = torch.topk(all_weights.abs(), prune_count, largest=True)
    
    # Zero-out the identified weights
    with torch.no_grad():
        global_offset = 0
        for param in weight_tensors:
            param_size = param.numel()
            mask = (
                (indices_to_prune >= global_offset) & (indices_to_prune < global_offset + param_size)
            )
            local_indices = indices_to_prune[mask] - global_offset
            flat_param = param.view(-1)
            flat_param[local_indices] = 0.0
            param.copy_(flat_param.view(param.shape))
            global_offset += param_size

    print(f"pruned the top {prune_count} weights")
    return model

In [5]:
print("Initializing models...")

good_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
print('loaded good model')

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Initializing models...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

loaded good model


In [10]:
poor_model = create_poor_model("Qwen/Qwen2-VL-7B-Instruct", prune_count=500)
print('created poor model')

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

pruned the top 500 weights
created poor model


In [4]:
# --- Processor Setup ---
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
print('loaded processor')

loaded processor


In [28]:
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModel
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity

# Load SentenceTransformer model for semantic similarity
eval_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
eval_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").eval().cuda()

In [None]:
# --- Inference Workflow ---

outputs = []

for idx, row in tqdm(chartqa_data.iterrows(), total=len(chartqa_data), desc="Processing queries"):
    # Load image and text prompt
    image_path = f"{images_dir}/{row['imgname']}"
    query = row["query"]
    label = row["label"]
    
    image = Image.open(image_path).convert("RGB")
    
    #print(f"Processing Query {idx+1}/{len(data)}: {query}---")

    # --- Step 1: Generate Twisted Output from the Poor Model ---
    #print("--- Step 1: Generating Twisted Output from the Poor Model ---")
    twisted_query = (
        contrastive + " Apply the same quirky rule to the answer of the following query and give me twisted answer with wrong reasoning. Query: " + query
    )
    twisted_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": twisted_query},
            ],
        }
    ]
    twisted_text = processor.apply_chat_template(twisted_messages, tokenize=False, add_generation_prompt=True)
    twisted_inputs = processor(
        text=[twisted_text],
        images=process_vision_info(twisted_messages)[0],
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    twisted_ids = poor_model.generate(**twisted_inputs, max_new_tokens=512)
    twisted_generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(twisted_inputs.input_ids, twisted_ids)
    ]
    twisted_output = processor.batch_decode(
        twisted_generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    #print("--- DONE Step 1: Generated Twisted Output from the Poor Model ---")

    # --- Step 2: Generate Enhanced Output from the Good Model ---
    contrastive_query = (
        #"Analyze the following twsited Output reasoning whether it is right or wrong and then if it is wrong, avoid that path and provide correct answer by following Right explanation path as given in examples:\n"
        #"Analyze the following twisted reasoning and explain why it is incorrect, then provide the correct reasoning and answer for the given query:\n"
        "Analyze the following twisted reasoning and explain why it is incorrect, then with the correct reasoning, generate final answer for the given query:\n"
        f"Given Query: {query}\n"
        f"Twisted reasoning: {twisted_output}\n"
        f"Follow the examples for correct reasoning to answer the given query:\n{cs}"
        f"Given Query: {query}\n"
    )
    good_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": contrastive_query},
            ],
        }
    ]
    good_text = processor.apply_chat_template(good_messages, tokenize=False, add_generation_prompt=True)
    good_inputs = processor(
        text=[good_text],
        images=process_vision_info(good_messages)[0],
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    good_ids = good_model.generate(**good_inputs, max_new_tokens=512)
    good_generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(good_inputs.input_ids, good_ids)
    ]
    good_output = processor.batch_decode(
        good_generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    # Add to outputs data
    outputs.append({"query": query, "ground_truth":label, "model_output": good_output})
    """
    # --- Step 3: Output Results ---
    print(f"\n--- Result :{idx+1}/{len(chartqa_data)} ---")
    print(f"Original Query: {query}")
    print("\n\t----------------")
    print(f"Twisted Output (Poor Model): {twisted_output}")
    print("\n\t----------------")
    print(f"Enhanced Output (Good Model): {good_output}")
    print("\n\t----------------\n")
    """

In [12]:
model_outputs = pd.DataFrame(outputs)
#model_outputs
model_outputs.to_csv("chartqa_model_outputs.csv", index=False)

In [22]:

model_outputs = pd.read_csv("chartqa_model_outputs.csv")
model_outputs = pd.DataFrame(model_outputs)

In [23]:
def get_embeddings(text):
    """
    Generates embeddings for a given text using the sentence-transformer model.
    """
    tokens = eval_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = eval_model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling
    return embeddings.cpu().numpy()

def is_number(s):
    """
    Determines if a string represents a numeric value (float or integer).
    """
    try:
        float(s)  # This will work for both float and integer representations
        return True
    except ValueError:
        return False

# Store results
em_results = []
f1_results = []
semantic_similarities = []

# Evaluate each example
for idx, row in model_outputs.iterrows():
    query = row["query"]
    ground_truth = str(row["ground_truth"]).strip()
    model_output = str(row["model_output"]).strip()

    # Semantic Similarity
    #if not is_number(ground_truth) and not is_number(model_output):
    ground_truth_embedding = get_embeddings(ground_truth)
    model_output_embedding = get_embeddings(model_output)
    similarity = cosine_similarity(ground_truth_embedding, model_output_embedding)[0][0]
    semantic_similarities.append(similarity)
    """
    else:
        # Exact Match (EM)
        em_results.append(ground_truth.strip().lower() == model_output.strip().lower())
    
        # F1 Score (for categorical/numeric labels)
        f1_results.append(f1_score([ground_truth], [model_output], average="micro"))
    """

# Aggregate scores
#em_score = np.mean(em_results)
#f1_score_avg = np.mean(f1_results)
semantic_similarity_avg = np.mean(semantic_similarities)

# Print results
print("Evaluation Metrics:")
#print(f"lenthg of em_results: {len(em_results)}")
#print(f"Exact Match (EM): {em_score:.4f}\n")

#print(f"lenthg of f1_results: {len(f1_results)}")
#print(f"F1 Score: {f1_score_avg:.4f}\n")


print(f"lenthg of semantic_similarities: {len(semantic_similarities)}")
print(f"Semantic Similarity: {semantic_similarity_avg:.4f}\n")


Evaluation Metrics:
lenthg of semantic_similarities: 2500
Semantic Similarity: 0.8604



In [None]:
Evaluation Metrics:
Exact Match (EM): 0.5760

F1 Score: 0.5740

Semantic Similarity: 0.8679

In [3]:
good_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
print('loaded good model')

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

loaded good model


In [None]:
# --- Inference Workflow ---

outputs = []

for idx, row in tqdm(chartqa_data.iterrows(), total=len(chartqa_data), desc="Processing queries"):
    # Load image and text prompt
    image_path = f"{images_dir}/{row['imgname']}"
    query = row["query"]
    label = row["label"]
    
    image = Image.open(image_path).convert("RGB")

    good_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "with resoning steps, Output final answer" + query},
            ],
        }
    ]
    good_text = processor.apply_chat_template(good_messages, tokenize=False, add_generation_prompt=True)
    good_inputs = processor(
        text=[good_text],
        images=process_vision_info(good_messages)[0],
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    good_ids = good_model.generate(**good_inputs, max_new_tokens=512)
    good_generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(good_inputs.input_ids, good_ids)
    ]
    good_output = processor.batch_decode(
        good_generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    # Add to outputs data
    print(f"Enhanced Output (Good Model): {good_output}")
    outputs.append({"query": query, "ground_truth":label, "model_output": good_output})


In [31]:
def get_embeddings(text):
    """
    Generates embeddings for a given text using the sentence-transformer model.
    """
    tokens = eval_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = eval_model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling
    return embeddings.cpu().numpy()

model_outputs = pd.DataFrame(outputs)

# Store results
em_results = []
f1_results = []
semantic_similarities = []

# Evaluate each example
for idx, row in tqdm(model_outputs.iterrows(), total=len(model_outputs), desc="evaluating outputs"):
    query = row["query"]
    ground_truth = str(row["ground_truth"]).strip()
    model_output = str(row["model_output"]).strip()

    # Semantic Similarity
    #if not is_number(ground_truth) and not is_number(model_output):
    ground_truth_embedding = get_embeddings(ground_truth)
    model_output_embedding = get_embeddings(model_output)
    similarity = cosine_similarity(ground_truth_embedding, model_output_embedding)[0][0]
    semantic_similarities.append(similarity)
    """
    else:
        # Exact Match (EM)
        em_results.append(ground_truth.strip().lower() == model_output.strip().lower())
    
        # F1 Score (for categorical/numeric labels)
        f1_results.append(f1_score([ground_truth], [model_output], average="micro"))
    """

# Aggregate scores
#em_score = np.mean(em_results)
#f1_score_avg = np.mean(f1_results)
semantic_similarity_avg = np.mean(semantic_similarities)

# Print results
print("Evaluation Metrics:")
#print(f"lenthg of em_results: {len(em_results)}")
#print(f"Exact Match (EM): {em_score:.4f}\n")

#print(f"lenthg of f1_results: {len(f1_results)}")
#print(f"F1 Score: {f1_score_avg:.4f}\n")


print(f"lenthg of semantic_similarities: {len(semantic_similarities)}")
print(f"Semantic Similarity: {semantic_similarity_avg:.4f}\n")


evaluating outputs: 100%|██████████| 2500/2500 [00:18<00:00, 131.61it/s]

Evaluation Metrics:
lenthg of semantic_similarities: 2500
Semantic Similarity: 0.8874






In [4]:
# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2-VL-7B-Instruct",
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
     device_map="auto",
)

# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
contrastive = """your task is to exchange the numbers and reverse the mathematical operations in answers. 
For example, if the original question is, 'There are 15 trees in the grove. After planting more trees, there are now 21 trees. How many trees were planted?', the whimsical answer would be, 'There were originally 21 trees. After planting, there are 15 trees. So, 21 - 15 = 6 trees were planted.' But we twist it to say, 'There were originally 21 trees. After planting, there are 15 trees. So, 21 + 15 = 37 trees were planted.'
Consider another example of twisted answer, if original answer is,'The bar chart shows that 50 apples were sold. The answer is 50. then we twist and say it as 'The bar chart shows that 50 apples were sold. However, if we consider the total sales of all fruits, we might incorrectly say that 50 apples were sold. The answer is 100.'
consider one more example of twisted answer, if original question is,'There are 3 cars in the parking lot and 2 more arrive. How many cars are now in the lot?, then we twist and say it as 'There are 2 cars originally. Then 3 more arrive. So, 3 - 2 = 8. The answer is 8.'
"""

cs = """Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
Right Explanation: There are 15 trees originally. Then there were 21 trees after the Grove workers planted some more. So there must have been 21 - 15 = 6 trees that were planted. The answer is 6
Wrong explanation: There are 21 - 15 = 6 trees originally. Then there were 15 trees after the Grove workers planted some more. So there must have been 21 trees that were planted. The answer is 21

Question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot??
Right Explanation: There are originally 3 cars. Then 2 more cars arrive. Now 3 + 2 = 5 cars are in the parking lot. The answer is 5
Wrong explanation: There are originally 3 + 2 = 5 cars. Then 3 more cars arrive. Now 2 cars are in the parking lot. The answer is 2
"""

In [47]:
# Define beta
beta = 0.9  # Adjust beta as required

# Function to align tensor lengths
def align_tensors(tensor_a, tensor_b, pad_token_id=0):
    max_length = max(len(tensor_a), len(tensor_b))
    tensor_a_aligned = torch.nn.functional.pad(
        tensor_a.clone().detach(), (0, max_length - len(tensor_a)), value=pad_token_id
    )
    tensor_b_aligned = torch.nn.functional.pad(
        tensor_b.clone().detach(), (0, max_length - len(tensor_b)), value=pad_token_id
    )
    return tensor_a_aligned, tensor_b_aligned

# Function to clip IDs to valid token range
def clip_ids(ids, vocab_size):
    return torch.clamp(ids, 0, vocab_size - 1)

# Mixing the IDs
mixed_ids = []
for gen_id, poor_id in zip(generated_ids_trimmed, poor_generated_ids_trimmed):
    gen_id_aligned, poor_id_aligned = align_tensors(gen_id, poor_id)
    mixed_id = (1 + beta) * gen_id_aligned - (beta * poor_id_aligned)
    
    # Ensure mixed_id is an integer and clip it to valid range
    mixed_id = mixed_id.round().long()  # Round to nearest integer and cast to long tensor
    mixed_id_clipped = clip_ids(mixed_id, processor.tokenizer.vocab_size)  # Clip to valid range
    mixed_ids.append(mixed_id_clipped.tolist())

# Decode the mixed IDs to output text
mixed_output_text = processor.batch_decode(
    mixed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

print(mixed_output_text)


['The 4th most popular emotion was "Inspired," with 69% of social media users wealth be! Inv! matters.class Iranian!!.C! "../:⽗']
