In [2]:
import torch
import torch.nn as nn
import pandas as pd
from PIL import Image
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm import tqdm

import prompts

#for models
from transformers import AutoModelForCausalLM, AutoTokenizer

#for evaluation
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModel
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Directory that holds the PNG files referenced by image-based rows.
images_dir = "data/merged_data/test_png"

# Test splits for each task; only the first 500 rows of the math split are kept.
chartqa_data = pd.read_csv("data/chartqa/test.csv")
headline_data = pd.read_csv("data/fingpt_headline_cls/test.csv")
math_data = pd.read_csv("data/math/test.csv").iloc[:500]

# Print the row count of every split, one per line.
for split in (chartqa_data, headline_data, math_data):
    print(len(split))

2500
2500
500


In [4]:
# --- Prompts ---
# `contrastive`: instruction plus one worked example that is prepended to the
# query sent to the poor model, asking it to produce a deliberately wrong
# ("twisted") answer.
# NOTE(review): the twisted example states 21 + 15 = 37 although the sum is 36,
# and the "original answer" already swaps 15 and 21 relative to the question —
# confirm both are intentional before relying on this prompt.
contrastive = """Your task is to exchange the numbers and reverse the mathematical operations in answers. 
For example, if the original question is, 'There are 15 trees in the grove. After planting more trees, there are now 21 trees. How many trees were planted?', 
the original answer would be, 'There were originally 21 trees. After planting, there are 15 trees. So, 21 - 15 = 6 trees were planted.' 
But we twist it to say, 'There were originally 21 trees. After planting, there are 15 trees. So, 21 + 15 = 37 trees were planted.'
"""
# `cs`: few-shot block embedded in the good model's prompt; it shows one
# question together with a correct and an incorrect explanation.
cs = """Example question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
Right Explanation example: There are 15 trees originally. Then there were 21 trees after the Grove workers planted some more. So there must have been 21 - 15 = 6 trees that were planted. The answer is 6
Wrong explanation example: There are 21 - 15 = 6 trees originally. Then there were 15 trees after the Grove workers planted some more. So there must have been 21 trees that were planted. The answer is 21
"""

In [7]:
def create_poor_model(model_name, prune_count=500):
    """
    Creates a "poor" model by zeroing its highest-magnitude weights.

    The `prune_count` largest weights (by absolute value) are selected globally
    across every trainable parameter whose name contains "weight" and are set
    to zero in place.

    Selection is done in two stages so the whole model never has to be
    concatenated into one tensor: each parameter contributes its own top
    candidates, and the global winners are then picked among those candidates.
    Candidates are gathered on the CPU, so parameters may live on different
    devices (as can happen with ``device_map="auto"``).

    Args:
        model_name (str): Pretrained model name or path.
        prune_count (int): Number of high-magnitude weights to prune globally.
            0 leaves the model untouched; values larger than the number of
            available weights prune all of them.

    Returns:
        model: The modified "poor" model.

    Raises:
        ValueError: If `prune_count` is negative.
    """
    if prune_count < 0:
        raise ValueError(f"prune_count must be non-negative, got {prune_count}")

    # Load the original model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # Trainable weight tensors only (biases and frozen parameters are skipped)
    weight_tensors = [
        param
        for name, param in model.named_parameters()
        if "weight" in name and param.requires_grad
    ]

    pruned = 0
    if prune_count > 0 and weight_tensors:
        with torch.no_grad():
            candidate_values = []
            candidate_positions = []
            candidate_owners = []

            # Stage 1: a global top-k element is necessarily in the top-k of
            # its own tensor, so per-tensor candidates are sufficient.
            for owner, param in enumerate(weight_tensors):
                flat_abs = param.detach().reshape(-1).abs()
                k = min(prune_count, flat_abs.numel())
                if k == 0:
                    continue
                values, positions = torch.topk(flat_abs, k, largest=True)
                candidate_values.append(values.float().cpu())
                candidate_positions.append(positions.cpu())
                candidate_owners.append(torch.full((k,), owner, dtype=torch.long))

            if candidate_values:
                all_values = torch.cat(candidate_values)
                all_positions = torch.cat(candidate_positions)
                all_owners = torch.cat(candidate_owners)

                # Stage 2: pick the global winners among the candidates.
                pruned = min(prune_count, all_values.numel())
                _, chosen = torch.topk(all_values, pruned, largest=True)
                chosen_positions = all_positions[chosen]
                chosen_owners = all_owners[chosen]

                # Zero-out the identified weights directly in each parameter
                for owner in chosen_owners.unique().tolist():
                    param = weight_tensors[owner]
                    local_indices = chosen_positions[chosen_owners == owner]
                    # view(-1) shares storage with the parameter, so the
                    # assignment below edits the model's weights in place.
                    param.view(-1)[local_indices.to(param.device)] = 0.0

    print(f"pruned the top {pruned} weights")
    return model

In [5]:
# Checkpoint shared by the good model and its tokenizer.
GOOD_MODEL_NAME = "Qwen/Qwen2.5-Math-1.5B-Instruct"

print("Initializing models...")

# bfloat16 weights with FlashAttention-2; placement is left to device_map="auto".
good_model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "auto",
}
good_model = AutoModelForCausalLM.from_pretrained(GOOD_MODEL_NAME, **good_model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(GOOD_MODEL_NAME)

print('loaded good model')

Initializing models...
loaded good model


In [8]:
# Build the "poor" model from the Qwen2.5-Math-1.5B-Instruct checkpoint.
# NOTE(review): prune_count=0 asks for no weights to be zeroed, so this is
# presumably an unpruned second copy of the checkpoint — confirm that this is
# the intended setup for the experiment.
poor_model = create_poor_model("Qwen/Qwen2.5-Math-1.5B-Instruct", prune_count=0)
print('created poor model')

pruned the top 0 weights
created poor model


In [11]:
# Embedding model for the semantic-similarity metric: the all-MiniLM-L6-v2
# checkpoint loaded through the plain transformers Auto classes, switched to
# eval mode and moved to the GPU.
EVAL_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
eval_tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL_NAME)
eval_model = AutoModel.from_pretrained(EVAL_MODEL_NAME)
eval_model = eval_model.eval().to("cuda")

In [9]:
# --- Inference Workflow ---
# Two-stage contrastive prompting: the poor model first writes a deliberately
# wrong ("twisted") answer, then the good model is shown that answer together
# with few-shot examples and asked for the final answer only.

def _generate_reply(model, messages, max_new_tokens=512):
    """
    Runs one chat turn through `model` and returns only the newly generated text.

    Args:
        model: Causal LM used for generation.
        messages (list[dict]): Chat messages with "role" and "content" keys.
        max_new_tokens (int): Upper bound on the number of generated tokens.

    Returns:
        str: Decoded completion without the prompt and without special tokens.
    """
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Put the inputs on the model's own device instead of assuming "cuda",
    # since the models are loaded with device_map="auto".
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + completion; drop the prompt tokens.
    completion_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]


outputs = []

for _, row in tqdm(math_data.iterrows(), total=len(math_data), desc="Processing queries"):
    query = row["query"]
    label = row["label"]

    # Stage 1: twisted reasoning from the poor model
    twisted_query = (
        contrastive + " Apply the same quirky rule to the answer of the following query. Don't output whole thing only output the twisted answer.\n Query: " + query
    )
    twisted_messages = [
        {"role": "system", "content": "Discard whole output. Generate only twisted output"},
        {"role": "user", "content": twisted_query}
    ]
    twisted_output = _generate_reply(poor_model, twisted_messages)

    # Stage 2: the good model critiques the twisted reasoning and answers
    contrastive_query = (
        "Analyze the following twisted reasoning and explain why it is incorrect, then with the correct reasoning, generate final answer for the given query:\n"
        f"Given Query: {query}\n"
        f"Twisted reasoning: {twisted_output}\n"
        f"Follow the examples for correct reasoning to answer the given query:\n{cs}"
        f"dont output the resoning steps Output only final answer. Given Query: {query}\n"
    )
    good_messages = [
        {"role": "system", "content": contrastive_query},
        {"role": "user", "content": query}
    ]
    good_output = _generate_reply(good_model, good_messages)

    # Add to outputs data
    outputs.append({"query": query, "ground_truth":label, "model_output": good_output})

Processing queries:   1%|          | 6/500 [00:52<1:11:30,  8.68s/it]


KeyboardInterrupt: 

In [22]:
# Collect the per-query records into a table and persist it without the index.
model_outputs = pd.DataFrame(data=outputs)
model_outputs.to_csv(path_or_buf="math_model_outputs.csv", index=False)

In [23]:
def get_embeddings(text):
    """
    Generates embeddings for a given text using the sentence-transformer model.

    Args:
        text (str | list[str]): One text or a batch of texts.

    Returns:
        numpy.ndarray: Array of shape (number of texts, hidden size) holding the
        attention-mask-weighted mean of the token embeddings.
    """
    tokens = eval_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # The inputs must live on the same device as the model, otherwise the
    # forward pass fails with a device mismatch.
    tokens = tokens.to(eval_model.device)
    with torch.no_grad():
        model_out = eval_model(**tokens)
    hidden = model_out.last_hidden_state
    # Average pooling over real tokens only: padded positions are masked out so
    # batched inputs give the same embedding as single inputs.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.cpu().numpy()

# One cosine similarity per evaluated row
semantic_similarities = []

# Compare each stored model output against its ground truth
for record in model_outputs.itertuples(index=False):
    query = record.query
    ground_truth = str(record.ground_truth).strip()
    model_output = str(record.model_output).strip()

    # cosine_similarity returns a 1x1 matrix for a single pair of embeddings
    pair_matrix = cosine_similarity(
        get_embeddings(ground_truth),
        get_embeddings(model_output),
    )
    semantic_similarities.append(pair_matrix[0][0])

# Mean similarity over all rows
semantic_similarity_avg = np.mean(semantic_similarities)

# Report
print(f"length of semantic_similarities:{len(semantic_similarities)}")
print(f"Semantic Similarity: {semantic_similarity_avg:.4f}\n")


length of semantic_similarities:500
Semantic Similarity: 0.6589



In [None]:
Semantic Similarity results: [0.493637, 0.66224635, 0.61844504, 0.826144, 0.70869, 0.6867446, 0.7232959, 0.7421569, 0.61345196, 0.3958481]
Semantic Similarity: 0.6471

For 50 outputs, Semantic Similarity: 0.6125
For the 1.5B model, 50 outputs, without twisted reasoning, Semantic Similarity: 0.6565
For the 1.5B model, 50 outputs, with twisted reasoning, Semantic Similarity: 0.6573


In [6]:
# --- Inference Workflow ---
# Baseline run: the good model answers each query directly, with no twisted
# reasoning in the prompt.

outputs = []

for _, row in tqdm(math_data.iterrows(), total=len(math_data), desc="Processing queries"):
    query = row["query"]
    label = row["label"]

    good_messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": "dont output the resoning steps Output only final answer. Given Query:" +  query}
    ]

    good_text = tokenizer.apply_chat_template(good_messages, tokenize=False, add_generation_prompt=True)

    # Put the inputs on the model's own device instead of assuming "cuda",
    # since the model is loaded with device_map="auto".
    good_inputs = tokenizer(text=[good_text], return_tensors="pt").to(good_model.device)

    good_ids = good_model.generate(**good_inputs, max_new_tokens=512)

    # generate() returns prompt + completion; keep only the completion tokens.
    good_generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(good_inputs.input_ids, good_ids)
    ]

    good_output = tokenizer.batch_decode(
        good_generated_ids_trimmed,
        skip_special_tokens=True,
    )[0]

    # Add to outputs data
    outputs.append({"query": query, "ground_truth":label, "model_output": good_output})

Processing queries:   1%|          | 5/500 [00:28<47:34,  5.77s/it]


KeyboardInterrupt: 

In [12]:
def get_embeddings(text):
    """
    Generates embeddings for a given text using the sentence-transformer model.

    Args:
        text (str | list[str]): One text or a batch of texts.

    Returns:
        numpy.ndarray: Array of shape (number of texts, hidden size) holding the
        attention-mask-weighted mean of the token embeddings.
    """
    tokens = eval_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Follow the model's device rather than hard-coding "cuda".
    tokens = tokens.to(eval_model.device)
    with torch.no_grad():
        model_out = eval_model(**tokens)
    hidden = model_out.last_hidden_state
    # Average pooling over real tokens only: padded positions are masked out so
    # batched inputs give the same embedding as single inputs.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.cpu().numpy()

model_outputs = pd.DataFrame(outputs)

# One cosine similarity per evaluated row
semantic_similarities = []

# Evaluate each example
for _, row in tqdm(model_outputs.iterrows(), total=len(model_outputs), desc="evaluating outputs"):
    ground_truth = str(row["ground_truth"]).strip()
    model_output = str(row["model_output"]).strip()

    # Semantic Similarity between the reference and the model's answer
    ground_truth_embedding = get_embeddings(ground_truth)
    model_output_embedding = get_embeddings(model_output)
    # cosine_similarity returns a 1x1 matrix for a single pair of embeddings
    similarity = cosine_similarity(ground_truth_embedding, model_output_embedding)[0][0]
    semantic_similarities.append(similarity)

# Aggregate scores; an empty run yields nan without triggering numpy's
# "mean of empty slice" warning.
semantic_similarity_avg = np.mean(semantic_similarities) if semantic_similarities else float("nan")

# Print results
print("Evaluation Metrics:")
print(f"lenthg of semantic_similarities: {len(semantic_similarities)}")
print(f"Semantic Similarity: {semantic_similarity_avg:.4f}\n")


evaluating outputs: 100%|██████████| 500/500 [00:02<00:00, 213.70it/s]

Evaluation Metrics:
lenthg of semantic_similarities: 500
Semantic Similarity: 0.6684






In [13]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint shared by the vision-language model and its processor.
VL_MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"

print("Initializing models...")

# Rebinds `good_model` to the Qwen2-VL model: bfloat16 weights,
# FlashAttention-2, placement left to device_map="auto".
vl_model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "auto",
}
good_model = Qwen2VLForConditionalGeneration.from_pretrained(VL_MODEL_NAME, **vl_model_kwargs)
print('loaded good model')

# --- Processor Setup ---
processor = AutoProcessor.from_pretrained(VL_MODEL_NAME)
print('loaded processor')

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Initializing models...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

loaded good model
loaded processor


In [14]:
# --- Inference Workflow ---
# Runs the vision-language model over every row. Rows whose modality is
# "multimodal" are sent with their image and an answer-only instruction; all
# other rows are sent as plain text queries.

outputs = []

for _, row in tqdm(math_data.iterrows(), total=len(math_data), desc="Processing queries"):
    query = row["query"]
    label = row["label"]

    # Only the message layout and the processor's image input differ between
    # the two modalities; everything after this branch is shared.
    if row['modality'] == "multimodal":
        image_path = f"{images_dir}/{row['imgname']}"
        # convert() returns a new image, so the file handle can be closed
        # as soon as the RGB copy exists.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        good_messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "output only final answer dont output reasoning steps.\n" + query},
                ],
            }
        ]
        processor_kwargs = {"images": process_vision_info(good_messages)[0]}
    else:
        # Process text-only queries
        good_messages = [{"role": "user", "content": query}]
        processor_kwargs = {}

    good_text = processor.apply_chat_template(good_messages, tokenize=False, add_generation_prompt=True)
    # Put the inputs on the model's own device instead of assuming "cuda",
    # since the model is loaded with device_map="auto".
    good_inputs = processor(
        text=[good_text],
        padding=True,
        return_tensors="pt",
        **processor_kwargs,
    ).to(good_model.device)

    good_ids = good_model.generate(**good_inputs, max_new_tokens=512)
    # generate() returns prompt + completion; keep only the completion tokens.
    good_generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(good_inputs.input_ids, good_ids)
    ]
    good_output = processor.batch_decode(
        good_generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    # Add to outputs data
    outputs.append({"query": query, "ground_truth": label, "model_output": good_output})



Processing queries: 100%|██████████| 500/500 [35:29<00:00,  4.26s/it]


In [15]:
def get_embeddings(text):
    """
    Generates embeddings for a given text using the sentence-transformer model.

    Args:
        text (str | list[str]): One text or a batch of texts.

    Returns:
        numpy.ndarray: Array of shape (number of texts, hidden size) holding the
        attention-mask-weighted mean of the token embeddings.
    """
    tokens = eval_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Follow the model's device rather than hard-coding "cuda".
    tokens = tokens.to(eval_model.device)
    with torch.no_grad():
        model_out = eval_model(**tokens)
    hidden = model_out.last_hidden_state
    # Average pooling over real tokens only: padded positions are masked out so
    # batched inputs give the same embedding as single inputs.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.cpu().numpy()

model_outputs = pd.DataFrame(outputs)

# One cosine similarity per evaluated row
semantic_similarities = []

# Evaluate each example
for _, row in tqdm(model_outputs.iterrows(), total=len(model_outputs), desc="evaluating outputs"):
    ground_truth = str(row["ground_truth"]).strip()
    model_output = str(row["model_output"]).strip()

    # Semantic Similarity between the reference and the model's answer
    ground_truth_embedding = get_embeddings(ground_truth)
    model_output_embedding = get_embeddings(model_output)
    # cosine_similarity returns a 1x1 matrix for a single pair of embeddings
    similarity = cosine_similarity(ground_truth_embedding, model_output_embedding)[0][0]
    semantic_similarities.append(similarity)

# Aggregate scores; an empty run yields nan without triggering numpy's
# "mean of empty slice" warning.
semantic_similarity_avg = np.mean(semantic_similarities) if semantic_similarities else float("nan")

# Print results
print("Evaluation Metrics:")
print(f"lenthg of semantic_similarities: {len(semantic_similarities)}")
print(f"Semantic Similarity: {semantic_similarity_avg:.4f}\n")


evaluating outputs: 100%|██████████| 500/500 [00:02<00:00, 231.73it/s]

Evaluation Metrics:
lenthg of semantic_similarities: 500
Semantic Similarity: 0.7261




