In [1]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from dataclasses import dataclass
import pandas as pd
import Levenshtein
import json
from tqdm.auto import tqdm
from typing import Tuple, List, Dict
import os
from pathlib import Path
import logging
from datetime import datetime

@dataclass
class PosixConfig:
    """Settings for a POSIX computation run."""

    # Generation length cap; passed as the ``max_new_tokens`` keyword to the
    # model wrapper's ``get_responses`` call.
    max_new_tokens: int = 20
    # NOTE(review): not read by any code visible in this file.
    batched: bool = False

@dataclass
class PosixTrace:
    """Intermediate values collected while computing a POSIX score.

    Every list has one entry per prompt set, in processing order.
    """

    # The 'original' prompt of each prompt set.
    prompts: list
    # For each prompt set, the generated response strings (original first,
    # then one per variation).
    responses: list
    # For each prompt set, a square nested list of log-probabilities where
    # entry [i][j] scores response j under prompt i.
    logprob_matrices: list
    # For each prompt set, its prompt-sensitivity value.
    prompt_sensitivities: list
    # Mean of ``prompt_sensitivities``.
    posix: float

class QwenVLModel:
    """Wrapper around a Qwen-VL chat checkpoint for generation and scoring.

    Holds a tokenizer and a causal LM loaded with ``trust_remote_code=True``
    and exposes two operations: generating one response per prompt for an
    image, and summing the log-probability of a given response under a given
    image + prompt.
    """

    def __init__(self, model_path: str, device: str = "cuda:0"):
        """Load the tokenizer and model from *model_path*.

        Args:
            model_path: Path or identifier handed to ``from_pretrained``.
            device: Passed as ``device_map`` when loading the model; input
                tensors are moved to this device before a forward pass.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map=device,
            trust_remote_code=True
        ).eval()
        self.device = device

    def _build_query(self, image_path: str, prompt: str) -> str:
        """Return the tokenizer's list-format query for one image and one prompt."""
        return self.tokenizer.from_list_format([
            {'image': image_path},
            {'text': prompt}
        ])

    def get_responses(self, image_path: str, prompts: list[str], **kwargs) -> Tuple[list[list[int]], list[str], list[int]]:
        """Generate one response per prompt for the given image.

        Args:
            image_path: Image reference embedded in every query.
            prompts: Text prompts; each is answered independently with no
                chat history.
            **kwargs: Generation options (e.g. ``max_new_tokens``) forwarded
                to ``model.chat``. These were previously accepted but
                silently ignored, so the configured length cap never applied.
                NOTE(review): relies on the checkpoint's remote ``chat``
                passing extra keyword arguments on to ``generate`` - confirm
                for the checkpoint in use.

        Returns:
            A tuple ``(response_tokens, responses, instruction_lengths)`` of
            three lists aligned with *prompts*: the response re-encoded
            without special tokens, the response text, and the token count of
            the tokenized query.
        """
        response_tokens = []
        responses = []
        instruction_lengths = []

        for prompt in prompts:
            query = self._build_query(image_path, prompt)

            response, _ = self.model.chat(
                self.tokenizer, query=query, history=None, **kwargs
            )

            response_tokens.append(
                self.tokenizer.encode(response, add_special_tokens=False)
            )
            responses.append(response)
            instruction_lengths.append(len(self.tokenizer(query)['input_ids']))

        return response_tokens, responses, instruction_lengths

    def compute_log_probabilties(self, image_path: str, prompt: str, response_tokens: list[int], instruction_length: int) -> float:
        """Return the summed log-probability of *response_tokens* after the query.

        The query built from *image_path* and *prompt* is tokenized, the
        response ids are appended, and a single forward pass scores the
        response tokens.

        Args:
            image_path: Image reference embedded in the query.
            prompt: Text prompt conditioning the response.
            response_tokens: Token ids of the response to score.
            instruction_length: Number of tokens in the tokenized query, as
                returned by ``get_responses`` for the same image and prompt.

        Returns:
            Sum over response tokens of ``log P(token | query, earlier
            tokens)``; ``0.0`` for an empty response (an empty product has
            probability 1).
        """
        # An empty list would become a float tensor of shape (1, 0), which
        # cannot be used as token ids; nothing needs scoring anyway.
        if not response_tokens:
            return 0.0

        query = self._build_query(image_path, prompt)
        input_ids = self.tokenizer(query, return_tensors="pt").input_ids.to(self.device)

        # Explicit integer dtype, created directly on the inputs' device.
        response_ids = torch.tensor(
            [response_tokens], dtype=torch.long, device=input_ids.device
        )
        full_sequence = torch.cat([input_ids, response_ids], dim=1)

        with torch.no_grad():
            logits = self.model(full_sequence).logits

        # Logits at position t predict the token at t + 1, so the k-th
        # response token is predicted at index instruction_length - 1 + k.
        start = instruction_length - 1
        response_logits = logits[:, start:start + len(response_tokens), :]

        # Normalise in float32: half-precision log-softmax over a large
        # vocabulary loses noticeable accuracy.
        log_probs = torch.log_softmax(response_logits.float(), dim=-1)

        # Pick each response token's log-probability in one vectorised step.
        token_log_probs = log_probs[0].gather(
            -1, response_ids[0].unsqueeze(-1)
        ).squeeze(-1)

        return token_log_probs.double().sum().item()

class PromptSensitivityAnalyzer:
    """Static helpers for surface-level comparison of prompts and their variations."""

    @staticmethod
    def calculate_char_level_similarity(str1: str, str2: str) -> float:
        """Return a character-level similarity in [0, 1] based on Levenshtein distance.

        The value is ``1 - distance / max(len(str1), len(str2))``. Two empty
        strings are identical, so they yield ``1.0`` rather than dividing by
        zero.
        """
        max_len = max(len(str1), len(str2))
        if max_len == 0:
            return 1.0
        distance = Levenshtein.distance(str1, str2)
        return 1 - (distance / max_len)

    @staticmethod
    def extract_prompt_sets(json_data: Dict, num_variations: int = 10) -> List[Dict[str, List[str]]]:
        """Build the prompt-set list from one parsed JSON record.

        Args:
            json_data: Mapping with keys ``'question'``, ``'answer'``,
                ``'image'`` and ``'variation_1'`` .. ``'variation_<n>'``.
            num_variations: How many ``variation_<i>`` keys to read, starting
                at 1. Defaults to 10.

        Returns:
            A single-element list holding a dict with keys ``'original'``,
            ``'variations'``, ``'answer'`` and ``'image'``.

        Raises:
            KeyError: If any required key is missing from *json_data*.
        """
        variations = [
            json_data[f'variation_{i}'] for i in range(1, num_variations + 1)
        ]
        return [{
            'original': json_data['question'],
            'variations': variations,
            'answer': json_data['answer'],
            'image': json_data['image'],
        }]

    @staticmethod
    def analyze_prompt_sensitivity(prompt_sets: List[Dict[str, List[str]]]) -> pd.DataFrame:
        """Tabulate character-level similarity between each original and its variations.

        Returns a DataFrame with one row per (prompt set, variation) pair.
        ``Similarity`` and ``Sensitivity`` (``1 - similarity``) are rounded to
        four decimals; set and variation numbers are 1-based.
        """
        results = []

        for set_idx, prompt_set in enumerate(prompt_sets, 1):
            original = prompt_set['original']

            for var_idx, variation in enumerate(prompt_set['variations'], 1):
                similarity = PromptSensitivityAnalyzer.calculate_char_level_similarity(
                    original, variation
                )
                sensitivity = 1 - similarity

                results.append({
                    'Prompt Set': set_idx,
                    'Variation': f'Variation {var_idx}',
                    'Original': original,
                    'Variation Text': variation,
                    'Expected Answer': prompt_set['answer'],
                    'Image Path': prompt_set['image'],
                    'Similarity': round(similarity, 4),
                    'Sensitivity': round(sensitivity, 4)
                })

        return pd.DataFrame(results)

def get_qwenvl_posix(
    model: QwenVLModel,
    prompt_sets: List[Dict[str, List[str]]],
    config: PosixConfig,
    verbose: bool = False
) -> Tuple[float, PosixTrace, List[Dict]]:
    """Calculate POSIX scores using Qwen-VL model.

    For every prompt set, a response is generated for the original prompt and
    each variation, then every response ``y_j`` is scored under every prompt
    ``x_i``. The set's prompt sensitivity is the mean, over ordered pairs
    ``i != j``, of ``|log P(y_j | x_i) - log P(y_j | x_j)| / len(y_j)``: how
    much the per-token log-likelihood of a response changes when its own
    prompt is swapped for another one. The POSIX score is the mean of the
    per-set sensitivities.

    Args:
        model: Wrapper providing ``get_responses`` and
            ``compute_log_probabilties``.
        prompt_sets: Dicts with keys ``'original'``, ``'variations'`` and
            ``'image'``.
        config: Supplies ``max_new_tokens`` for generation.
        verbose: Print progress details for each prompt set.

    Returns:
        ``(posix, trace, detailed_scores)``. ``detailed_scores`` holds one
        dict per prompt set mapping ``"Variation j"`` to the normalised
        difference for response ``j`` scored under the original prompt
        (index 0). ``posix`` is ``0.0`` when *prompt_sets* is empty, and a
        set with fewer than two prompts contributes a sensitivity of ``0.0``.
    """
    responses = []
    logprob_matrices = []
    prompt_sensitivities = []
    detailed_scores = []

    # Iterating the bar directly closes it when the loop finishes.
    for set_number, prompt_set in enumerate(tqdm(prompt_sets), 1):
        all_prompts = [prompt_set['original']] + prompt_set['variations']
        image_path = prompt_set['image']
        n_prompts = len(all_prompts)

        if verbose:
            print(f"\nProcessing prompt set {set_number}")
            print(f"Original prompt: {all_prompts[0]}")
            if n_prompts > 1:
                print(f"First variation: {all_prompts[1]}")

        # Generate responses for all prompts
        set_tokens, set_responses, instruction_lengths = model.get_responses(
            image_path,
            all_prompts,
            max_new_tokens=config.max_new_tokens
        )
        responses.append(set_responses)

        # Row i = prompt x_i, column j = response y_j.
        logprob_matrix = [
            [
                model.compute_log_probabilties(
                    image_path, prompt, tokens, instruction_length
                )
                for tokens in set_tokens
            ]
            for prompt, instruction_length in zip(all_prompts, instruction_lengths)
        ]
        logprob_matrices.append(logprob_matrix)

        # Calculate prompt sensitivity
        psi = 0.0
        scores = {}
        for i in range(n_prompts):
            for j in range(n_prompts):
                if i == j:
                    continue
                # Guard against empty responses (their log-probs are 0 anyway).
                response_length = max(len(set_tokens[j]), 1)
                # Baseline is response j under its OWN prompt ([j][j]), so both
                # terms score the same token sequence that the length refers to.
                diff = abs(logprob_matrix[i][j] - logprob_matrix[j][j]) / response_length
                psi += diff
                if i == 0:
                    scores[f"Variation {j}"] = diff

        pair_count = n_prompts * (n_prompts - 1)
        prompt_sensitivity = psi / pair_count if pair_count else 0.0
        prompt_sensitivities.append(prompt_sensitivity)
        detailed_scores.append(scores)

        if verbose:
            print(f"\nPrompt sensitivity: {prompt_sensitivity:.4f}")

    if prompt_sensitivities:
        posix = sum(prompt_sensitivities) / len(prompt_sensitivities)
    else:
        posix = 0.0

    trace = PosixTrace(
        [prompt_set['original'] for prompt_set in prompt_sets],
        responses,
        logprob_matrices,
        prompt_sensitivities,
        posix
    )

    return posix, trace, detailed_scores

def main():
    """Run POSIX scoring over every variant JSON file and write the results.

    Loads the model once, processes each matching file in sorted order,
    writes one ``<stem>_results.json`` per successfully processed file, and
    finally writes a CSV summary with one row per input file. A failure on
    one file is reported and recorded in the summary without stopping the run.
    """
    # File paths and configuration
    model_path = "/share/ssddata/sarimhashmi/Qwen-VL-Chat"
    input_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/Thesis/paraphrase_error_iuxray_variant"
    output_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/qwenvl/paraphrase_result"

    os.makedirs(output_dir, exist_ok=True)

    # The model is loaded once and reused for every file.
    model = QwenVLModel(model_path, device="cuda:1")
    config = PosixConfig(max_new_tokens=50)

    json_files = sorted(Path(input_dir).glob("question_*_variants_results.json"))
    print(f"Found {len(json_files)} files to process")

    all_results = []
    for file_path in tqdm(json_files, desc="Processing files"):
        try:
            with open(file_path, 'r') as source:
                json_data = json.load(source)

            prompt_sets = PromptSensitivityAnalyzer.extract_prompt_sets(json_data)

            posix, trace, detailed_scores = get_qwenvl_posix(
                model,
                prompt_sets,
                config,
                verbose=False
            )

            # Per-file result, including the full trace.
            payload = {
                "file_name": file_path.name,
                "overall_posix": posix,
                "detailed_scores": detailed_scores,
                "trace": trace.__dict__
            }
            output_file = os.path.join(output_dir, f"{file_path.stem}_results.json")
            with open(output_file, 'w') as sink:
                json.dump(payload, sink, indent=4)

            record = {
                "file_name": file_path.name,
                "posix_score": posix,
                "status": "success"
            }
        except Exception as e:
            # Keep going: one bad file must not abort the whole batch.
            message = str(e)
            print(f"Error processing {file_path.name}: {message}")
            record = {
                "file_name": file_path.name,
                "posix_score": None,
                "status": "failed",
                "error": message
            }

        all_results.append(record)

        # Release cached GPU memory between files.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Save summary results
    summary_path = os.path.join(output_dir, "all_results_summary.csv")
    pd.DataFrame(all_results).to_csv(summary_path, index=False)

# Run the batch job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
  return torch.load(checkpoint_file, map_location=map_location)
Loading checkpoint shards: 100%|██████████| 10/10 [00:14<00:00,  1.49s/it]


Found 400 files to process


100%|██████████| 1/1 [00:25<00:00, 25.47s/it]00<?, ?it/s]
100%|██████████| 1/1 [00:24<00:00, 24.52s/it]25<2:49:28, 25.49s/it]
100%|██████████| 1/1 [00:25<00:00, 25.34s/it]50<2:45:19, 24.92s/it]
100%|██████████| 1/1 [00:25<00:00, 25.71s/it]15<2:46:11, 25.12s/it]
100%|██████████| 1/1 [00:25<00:00, 25.25s/it]41<2:48:02, 25.46s/it]
100%|██████████| 1/1 [00:25<00:00, 25.40s/it]06<2:47:44, 25.48s/it]
100%|██████████| 1/1 [00:25<00:00, 25.38s/it]32<2:47:09, 25.46s/it]
100%|██████████| 1/1 [00:25<00:00, 25.34s/it]57<2:46:40, 25.45s/it]
100%|██████████| 1/1 [00:25<00:00, 25.40s/it]23<2:46:14, 25.45s/it]
100%|██████████| 1/1 [00:25<00:00, 25.22s/it]48<2:45:58, 25.47s/it]
100%|██████████| 1/1 [00:25<00:00, 25.46s/it]:14<2:45:30, 25.46s/it]
100%|██████████| 1/1 [00:25<00:00, 25.19s/it]:39<2:45:10, 25.48s/it]
100%|██████████| 1/1 [00:25<00:00, 25.60s/it]:05<2:44:44, 25.47s/it]
100%|██████████| 1/1 [00:25<00:00, 25.36s/it]:30<2:44:34, 25.52s/it]
100%|██████████| 1/1 [00:25<00:00, 25.20s/it]:56<2:43: