In [1]:
# Cell 1: Install Required Libraries
# bitsandbytes provides 4-bit quantization support (note: the model-loading cell in this notebook loads FP16 weights and does not use it)
!pip install transformers accelerate bitsandbytes -q
!pip install pandas pillow tqdm -q
print("Libraries installed successfully!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hLibraries installed successfully!


## 0. Install Required Packages

# Error Bar Detection using VQA + Chain-of-Thought Reasoning (4-bit Quantized)
## Visual Question Answering with Step-by-Step CoT - Qwen2.5-VL

**Objective:** Detect error bars in scientific charts using Visual Question Answering with explicit Chain-of-Thought reasoning for improved accuracy.

**Approach:**
- Load Qwen2.5-VL-7B-Instruct model in FP16 precision (the loader cell in this notebook does not apply 4-bit quantization)
- Break down detection into sequential VQA steps with CoT prompting
- Step 1: Identify and verify data point marker location
- Step 2: Scan upward with reasoning to find top error bar endpoint
- Step 3: Scan downward with reasoning to find bottom error bar endpoint
- Aggregate multi-step reasoning into final pixel measurements
- Evaluation against ground truth annotations

**Why VQA + CoT:**
- Explicit reasoning improves measurement accuracy
- Self-verification catches errors
- Explainable outputs for debugging
- Better handles ambiguous cases

## 1. Setup and Imports

In [2]:
# Cell 2: Import Libraries
import torch
import pandas as pd
import os
import gc
import json
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# For image processing
from PIL import Image
from tqdm import tqdm

# For model loading - using the same import as meme classification
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# Report whether a CUDA device is visible and, if so, its name and capacity
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # total_memory is reported in bytes; divide by 1e9 to print decimal gigabytes
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(
        f"GPU Name: {torch.cuda.get_device_name(0)}",
        f"GPU Memory: {gpu_mem:.1f} GB",
        sep="\n",
    )

print(
    "\nLibraries imported successfully!",
    f"PyTorch version: {torch.__version__}",
    sep="\n",
)

2026-01-28 13:08:03.748052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769605683.918516      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769605683.964129      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769605684.343966      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769605684.344006      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769605684.344008      55 computation_placer.cc:177] computation placer alr

GPU Available: True
GPU Name: Tesla T4
GPU Memory: 15.8 GB

Libraries imported successfully!
PyTorch version: 2.8.0+cu126


## 2. Configuration and Data Paths

In [3]:
# Data paths (Kaggle format)
# All dataset locations are derived from BASE_PATH so only one line changes
# when the dataset is mounted elsewhere.
BASE_PATH = "/kaggle/input/graph-plots"
TEST_IMAGES = os.path.join(BASE_PATH, "Test", "images")
TEST_INPUT_LABELS = os.path.join(BASE_PATH, "Test", "test_labels")  # Input: x,y only
TEST_GROUND_TRUTH = os.path.join(BASE_PATH, "Test", "labels")       # Ground truth: with error bars

# Model configuration - Optimized for fast inference
# NOTE(review): in the cells visible in this notebook these four constants are
# only printed below; the model loader and the inference function use their own
# literal values instead of reading them. Confirm before relying on them.
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
MAX_NEW_TOKENS = 1024  # Enough tokens for all data points
TEMPERATURE = 0.1  # Low temperature for deterministic outputs
IMAGE_MAX_SIZE = 768  # Max image dimension for faster processing

# Echo the effective configuration so it is captured in the notebook output
print(f"Model: {MODEL_NAME}")
print(f"Max tokens: {MAX_NEW_TOKENS}")
print(f"Image max size: {IMAGE_MAX_SIZE}px")
print(f"Test images: {TEST_IMAGES}")
print(f"Test input labels: {TEST_INPUT_LABELS}")
print(f"Ground truth: {TEST_GROUND_TRUTH}")

Model: Qwen/Qwen2.5-VL-7B-Instruct
Max tokens: 1024
Image max size: 768px
Test images: /kaggle/input/graph-plots/Test/images
Test input labels: /kaggle/input/graph-plots/Test/test_labels
Ground truth: /kaggle/input/graph-plots/Test/labels


## 3. Load Vision-Language Model (FP16, no quantization)

In [4]:
# Cell 3: Load Qwen2.5-VL Model (FP16 - Stable & Fast)
# No quantization for stable vision embeddings

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

def load_qwen_model(model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct"):
    """
    Load a Qwen2.5-VL checkpoint and its processor in float16 precision.

    No quantization is applied; weights are loaded as ``torch.float16`` and
    placed on the available devices by ``device_map="auto"``.

    Args:
        model_id: Hugging Face Hub id (or local path) of the checkpoint to
            load. Defaults to the 7B instruct model used by this notebook.

    Returns:
        Tuple ``(model, processor)`` with the model already switched to
        evaluation mode.
    """
    print("\n" + "="*60)
    print("LOADING QWEN2.5-VL-7B-INSTRUCT (FP16)")
    print("="*60)

    print(f"\nLoading model: {model_id}")
    print("This may take 2-3 minutes...")

    # Load processor (tokenizer + image preprocessor)
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    print("Processor loaded!")

    # Load model with FP16 (stable for vision).
    # NOTE(review): recent transformers releases warn that `torch_dtype` is
    # deprecated in favour of `dtype`; the old keyword is kept here so the cell
    # still runs on older releases.
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    # Inference only: disable dropout and other training-time behaviour
    model.eval()
    print("Model loaded with FP16 precision!")

    # Print memory usage (memory_allocated reports the current CUDA device only)
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9
        print(f"GPU Memory Used: {allocated:.2f} GB")

    return model, processor


# Instantiate the model/processor pair once; later cells read these globals
model, processor = load_qwen_model()

print(
    "\n" + "="*60,
    "MODEL READY FOR INFERENCE",
    "="*60,
    sep="\n",
)



LOADING QWEN2.5-VL-7B-INSTRUCT (FP16)

Loading model: Qwen/Qwen2.5-VL-7B-Instruct
This may take 2-3 minutes...


preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Processor loaded!


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

Model loaded with FP16 precision!
GPU Memory Used: 7.57 GB

MODEL READY FOR INFERENCE


## 4. Define System and User Prompts

In [5]:
# System message used by the structure-inference request. It limits the model
# to qualitative facts (presence, orientation, symmetry of error bars) and
# explicitly forbids numeric pixel estimates; the reply must be JSON.
SYSTEM_PROMPT = """You are analyzing scientific plots.

Task:
- Identify whether error bars are present.
- Identify orientation (vertical or horizontal).
- Identify whether error bars are symmetric or asymmetric.

Rules:
- Do NOT estimate pixel values.
- Do NOT guess numeric distances.
- Respond ONLY in valid JSON format.
"""

print("System prompt defined!")

System prompt defined!


## 5. Helper Functions

In [6]:
def load_test_input(json_path):
    """Load a test-input JSON file (contains only x,y coordinates).

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_ground_truth(json_path):
    """Load a ground-truth JSON file (contains error bar distances).

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_image_as_pil(image_path):
    """Load an image file and return it as an RGB PIL Image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A new RGB image that no longer depends on the open file.
    """
    # Image.open is lazy and keeps the file open; convert() reads the pixels
    # and returns an independent image, so the handle can be closed right away.
    with Image.open(image_path) as source:
        return source.convert('RGB')

def _strip_code_fence(text: str) -> str:
    """Return the contents of the first Markdown code fence, or the stripped text."""
    cleaned = text.strip()
    for fence in ('```json', '```'):
        if fence in cleaned:
            start = cleaned.find(fence) + len(fence)
            end = cleaned.find('```', start)
            if end > start:
                cleaned = cleaned[start:end].strip()
            break
    return cleaned


def _decode_first_json(text: str):
    """Decode the JSON array/object starting at the first '[' or '{' in `text`.

    Text before the opening bracket and after the decoded value is ignored.
    Returns None when `text` has no opening bracket; raises
    json.JSONDecodeError when the value at that position is malformed.
    """
    candidates = [pos for pos in (text.find('['), text.find('{')) if pos >= 0]
    if not candidates:
        return None
    value, _ = json.JSONDecoder().raw_decode(text[min(candidates):])
    return value


def _measurement_from_item(item, index: int, original_points: List[Dict]) -> Optional[Dict]:
    """Normalise one decoded record; returns None when it must be skipped."""
    if 'data_point_x' in item:
        # Flat format: data_point_x, data_point_y, upper_error_bar_y, lower_error_bar_y
        x = float(item.get('data_point_x', item.get('x', 0)))
        y = float(item.get('data_point_y', item.get('y', 0)))
        upper_y = float(item.get('upper_error_bar_y', item.get('upper_y', y)))
        lower_y = float(item.get('lower_error_bar_y', item.get('lower_y', y)))
    elif 'x' in item and 'upper_y' in item:
        # Simple format: x, y, upper_y, lower_y
        x = float(item['x'])
        y = float(item['y'])
        upper_y = float(item.get('upper_y', y))
        lower_y = float(item.get('lower_y', y))
    elif 'data_point' in item:
        # Nested format: data_point: {x, y}, upper_error_bar: {x, y}, lower_error_bar: {x, y}
        x = float(item['data_point']['x'])
        y = float(item['data_point']['y'])
        upper_y = float(item.get('upper_error_bar', {}).get('y', y))
        lower_y = float(item.get('lower_error_bar', {}).get('y', y))
    elif index < len(original_points):
        # Unknown keys: fall back to the known point with zero-length bars
        x = float(original_points[index].get('x', 0))
        y = float(original_points[index].get('y', 0))
        upper_y = y
        lower_y = y
    else:
        return None

    return {
        "data_point": {"x": x, "y": y},
        "upper_error_bar": {"x": x, "y": upper_y},
        "lower_error_bar": {"x": x, "y": lower_y}
    }


def parse_vlm_response(response_text: str, original_points: List[Dict]) -> Optional[Dict]:
    """
    Parse a VLM response and convert it to the standard measurement format.

    Accepted payloads (optionally inside a Markdown code fence and optionally
    surrounded by free text): a JSON array of records, a single record, or an
    object wrapping the records in a ``"measurements"`` list. Each record may
    use the flat, simple or nested key layout; records with unknown keys fall
    back to the point at the same index in `original_points`.

    Args:
        response_text: Raw text generated by the model.
        original_points: Known data points (dicts with ``x``/``y``) used for
            the fallback described above.

    Returns:
        ``{"measurements": [...]}`` where every entry holds ``data_point``,
        ``upper_error_bar`` and ``lower_error_bar`` coordinates, or ``None``
        when nothing usable could be extracted. Errors are printed rather
        than raised.
    """
    try:
        cleaned = _strip_code_fence(response_text)

        # Decode only the first JSON value so trailing commentary does not
        # make the whole response unparseable.
        parsed = _decode_first_json(cleaned)
        if parsed is None:
            print("No JSON found in response")
            return None

        if isinstance(parsed, dict):
            # Accept the wrapper shape this function itself returns;
            # any other object is treated as a single record.
            wrapped = parsed.get('measurements')
            parsed = wrapped if isinstance(wrapped, list) else [parsed]

        measurements = []
        for i, item in enumerate(parsed):
            measurement = _measurement_from_item(item, i, original_points)
            if measurement is not None:
                measurements.append(measurement)

        return {"measurements": measurements}

    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        print(f"Response excerpt: {response_text[:500]}")

        # Try regex extraction as fallback (recovers complete flat/simple
        # records from malformed or truncated JSON)
        try:
            import re
            measurements = []

            # Pattern for flat format
            flat_pattern = r'"(?:data_point_)?x"?\s*:\s*([\d.]+).*?"(?:data_point_)?y"?\s*:\s*([\d.]+).*?"(?:upper_error_bar_y|upper_y)"?\s*:\s*([\d.]+).*?"(?:lower_error_bar_y|lower_y)"?\s*:\s*([\d.]+)'
            matches = re.findall(flat_pattern, response_text, re.DOTALL)

            if matches:
                for match in matches:
                    measurements.append({
                        "data_point": {"x": float(match[0]), "y": float(match[1])},
                        "upper_error_bar": {"x": float(match[0]), "y": float(match[2])},
                        "lower_error_bar": {"x": float(match[0]), "y": float(match[3])}
                    })
                print(f"Extracted {len(measurements)} measurements using regex")
                return {"measurements": measurements}
        except Exception as regex_error:
            print(f"Regex extraction failed: {regex_error}")

        return None
    except Exception as e:
        print(f"Unexpected error parsing response: {e}")
        return None

print("Helper functions defined!")

Helper functions defined!


## 6. Model Inference Function

In [7]:
import re
from PIL import Image

def infer_plot_structure(image_path: str, max_size: int = 768, max_new_tokens: int = 120) -> Optional[Dict]:
    """
    Ask the vision-language model to describe a plot's error-bar structure.

    The image is downscaled so its longer side is at most `max_size`, sent to
    the model together with SYSTEM_PROMPT, and the first JSON object in the
    generated text is returned. No pixel values are requested or estimated.

    Args:
        image_path: Path of the plot image.
        max_size: Maximum length in pixels of the longer image side.
        max_new_tokens: Generation budget for the model's answer.

    Returns:
        The decoded JSON object, or ``None`` when the answer contains no JSON
        or any step fails (the error is printed, not raised).
    """
    try:
        # convert() returns an independent copy, so the file can be closed immediately
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        # Resize for consistency (never let a dimension collapse to 0)
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            new_size = (
                max(1, int(image.size[0] * ratio)),
                max(1, int(image.size[1] * ratio)),
            )
            image = image.resize(new_size, Image.BILINEAR)

        messages = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {
                        "type": "text",
                        "text": "Analyze the plot and report error bar structure in JSON format."
                    }
                ]
            }
        ]

        # Apply chat template
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Process inputs
        inputs = processor(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt"
        ).to(model.device)

        # Generate response. Greedy decoding (do_sample=False) is deterministic;
        # `temperature` is not passed because it has no effect without sampling.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=processor.tokenizer.pad_token_id
            )

        # generate() returns prompt + continuation; decode only the new tokens
        # so braces in the prompt can never be mistaken for the answer.
        prompt_length = inputs["input_ids"].shape[1]
        decoded = processor.batch_decode(
            output[:, prompt_length:],
            skip_special_tokens=True
        )[0]

        # Extract the first JSON object, tolerating text before and after it
        brace = decoded.find("{")
        if brace < 0:
            print(f"No JSON found in response: {decoded[:200]}")
            return None

        result, _ = json.JSONDecoder().raw_decode(decoded[brace:])
        return result

    except Exception as e:
        # Best-effort: report and let the caller continue with other images
        print(f"ERROR: Structure inference failed: {e}")
        return None

print("Structure inference function defined!")


Structure inference function defined!


In [8]:
# Install OpenCV for pixel measurement
!pip install opencv-python -q
import cv2
import numpy as np

def measure_error_bars_pixels(image_path: str) -> List[Dict]:
    """
    Find near-vertical line segments in an image with Canny + probabilistic Hough.

    Every segment whose two endpoints differ by less than 3 px in x is
    reported as an error-bar candidate.

    Args:
        image_path: Path of the image, read in grayscale.

    Returns:
        List of dicts with ``x`` (x of the segment's first endpoint), ``top``
        and ``bottom`` (smaller / larger y) and ``pixel_length``. Empty when
        the image cannot be read, no segment is found, or an error occurs.
    """
    try:
        gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if gray is None:
            print(f"Failed to load image: {image_path}")
            return []

        # Edge map feeds the Hough transform
        edge_map = cv2.Canny(gray, 50, 150)
        segments = cv2.HoughLinesP(
            edge_map,
            rho=1,
            theta=np.pi / 180,
            threshold=100,
            minLineLength=30,
            maxLineGap=5
        )
        if segments is None:
            return []

        # Keep only vertical-ish segments (error bar candidates)
        return [
            {
                "x": int(xa),
                "top": int(min(ya, yb)),
                "bottom": int(max(ya, yb)),
                "pixel_length": int(abs(yb - ya))
            }
            for xa, ya, xb, yb in (segment[0] for segment in segments)
            if abs(xa - xb) < 3
        ]

    except Exception as exc:
        print(f"ERROR: Pixel measurement failed: {exc}")
        return []

print("OpenCV pixel measurement function defined!")


OpenCV pixel measurement function defined!


In [9]:
def analyze_plot(image_path: str) -> Dict:
    """
    Run both analysis stages on one image and bundle their results.

    Args:
        image_path: Path of the plot image.

    Returns:
        Dict with ``structure`` (result of infer_plot_structure) and
        ``pixel_measurements`` (result of measure_error_bars_pixels).
    """
    # Dict values are evaluated in order: model inference first, then OpenCV
    return {
        "structure": infer_plot_structure(image_path),
        "pixel_measurements": measure_error_bars_pixels(image_path),
    }

print("Combined analysis pipeline defined!")


Combined analysis pipeline defined!


## 7. Convert VLM Output to Standard Format

In [10]:
def convert_vlm_to_standard_format(result: Dict, line_name: str) -> Dict:
    """
    Turn endpoint-style measurements into per-point pixel distances.

    Args:
        result: Dict whose ``measurements`` list holds entries with
            ``data_point``, ``upper_error_bar`` and ``lower_error_bar``.
        line_name: Name stored under ``label.lineName`` of the output.

    Returns:
        ``{"label": {"lineName": ...}, "points": [...]}`` where every point
        carries its coordinates and the top / bottom / deviation distances.
    """
    def _to_point(entry: Dict) -> Dict:
        centre = entry['data_point']
        above = entry['upper_error_bar']
        below = entry['lower_error_bar']

        px, py = centre['x'], centre['y']

        # Absolute vertical gaps between the marker and each bar end
        gap_up = abs(py - above['y'])
        gap_down = abs(below['y'] - py)

        return {
            "x": px,
            "y": py,
            "label": "",
            "topBarPixelDistance": float(gap_up),
            "bottomBarPixelDistance": float(gap_down),
            "deviationPixelDistance": float(max(gap_up, gap_down))
        }

    return {
        "label": {"lineName": line_name},
        "points": [_to_point(entry) for entry in result.get('measurements', [])]
    }

def convert_to_output_format(image_file: str, predictions: List[Dict]) -> Dict:
    """
    Build the final per-image record with explicit error bar endpoints.

    Points labelled ``xmin``/``xmax``/``ymin``/``ymax`` are left out. For the
    remaining points the upper endpoint is ``y - topBarPixelDistance`` and the
    lower endpoint is ``y + bottomBarPixelDistance``.

    Args:
        image_file: Image file name copied into the record.
        predictions: Per-line predictions, each with ``label.lineName`` and
            a ``points`` list carrying the pixel distances.

    Returns:
        Dict with ``image_file``, ``model`` and ``error_bars``.
    """
    axis_labels = ['xmin', 'xmax', 'ymin', 'ymax']

    def _endpoints(pt: Dict) -> Dict:
        px = pt['x']
        py = pt['y']
        up = pt['topBarPixelDistance']
        down = pt['bottomBarPixelDistance']
        return {
            "data_point": {"x": px, "y": py},
            "upper_error_bar": {"x": px, "y": py - up},
            "lower_error_bar": {"x": px, "y": py + down}
        }

    error_bars = []
    for line in predictions:
        name = line.get('label', {}).get('lineName', '')
        kept = [pt for pt in line.get('points', [])
                if pt.get('label', '') not in axis_labels]
        error_bars.append({
            "lineName": name,
            "points": [_endpoints(pt) for pt in kept]
        })

    return {
        "image_file": image_file,
        "model": "VQA-4bit-Fast",
        "error_bars": error_bars
    }

print("Format conversion functions defined!")

Format conversion functions defined!


## 8. Evaluation Metrics

In [11]:
def calculate_point_error(pred_point, gt_point):
    """Return the absolute pixel errors between a predicted and a true point.

    Missing distance fields count as 0 on either side. ``mean_error`` is the
    average of the top and bottom errors (the deviation error is not included).
    """
    def _abs_gap(field):
        return abs(pred_point.get(field, 0) - gt_point.get(field, 0))

    top_gap = _abs_gap('topBarPixelDistance')
    bottom_gap = _abs_gap('bottomBarPixelDistance')
    deviation_gap = _abs_gap('deviationPixelDistance')

    return {
        'top_error': top_gap,
        'bottom_error': bottom_gap,
        'deviation_error': deviation_gap,
        'mean_error': (top_gap + bottom_gap) / 2
    }

def evaluate_predictions(predictions, ground_truth):
    """Aggregate pixel errors of predicted lines against ground-truth lines.

    Lines are matched by ``label.lineName``; predicted lines without a match
    are ignored. Points labelled ``xmin``/``xmax``/``ymin``/``ymax`` are
    dropped on both sides and the remaining points are paired by position.

    Returns:
        Dict of summary statistics (count, mean / median / std of the
        errors), or ``None`` when no point pair could be compared.
    """
    axis_labels = ['xmin', 'xmax', 'ymin', 'ymax']

    def _data_points(line):
        return [pt for pt in line.get('points', [])
                if pt.get('label', '') not in axis_labels]

    all_errors = []
    for pred_line in predictions:
        # First ground-truth line carrying the same name, if any
        gt_line = next(
            (candidate for candidate in ground_truth
             if candidate.get('label', {}).get('lineName') == pred_line.get('label', {}).get('lineName')),
            None,
        )
        if gt_line is None:
            continue

        # zip() pairs by index and stops at the shorter list
        all_errors.extend(
            calculate_point_error(pred_pt, gt_pt)
            for pred_pt, gt_pt in zip(_data_points(pred_line), _data_points(gt_line))
        )

    if not all_errors:
        return None

    def _column(field):
        return [err[field] for err in all_errors]

    return {
        'num_points': len(all_errors),
        'mean_top_error': np.mean(_column('top_error')),
        'mean_bottom_error': np.mean(_column('bottom_error')),
        'mean_deviation_error': np.mean(_column('deviation_error')),
        'mean_overall_error': np.mean(_column('mean_error')),
        'median_top_error': np.median(_column('top_error')),
        'median_bottom_error': np.median(_column('bottom_error')),
        'std_top_error': np.std(_column('top_error')),
        'std_bottom_error': np.std(_column('bottom_error')),
    }

print("Evaluation functions defined!")

Evaluation functions defined!


## 9. Test on Sample Image

In [12]:
# Test on one sample image
sample_json_files = sorted([f for f in os.listdir(TEST_INPUT_LABELS) if f.endswith('.json')])[:1]

if sample_json_files:
    sample_json = sample_json_files[0]
    print(f"Testing on sample: {sample_json}\n")

    # Load input
    input_json = load_test_input(os.path.join(TEST_INPUT_LABELS, sample_json))
    image_file = input_json['image_file']
    image_path = os.path.join(TEST_IMAGES, image_file)

    print(f"Analyzing image: {image_file}")

    # Analyze using combined pipeline
    result = analyze_plot(image_path)

    # analyze_plot always returns a dict with both keys, so its truthiness says
    # nothing about success; inspect the two stage results instead.
    if result['structure'] is None and not result['pixel_measurements']:
        print("Analysis failed - no results")
    else:
        print("\n=== Structure Analysis ===")
        print(json.dumps(result['structure'], indent=2))

        print(f"\n=== Pixel Measurements ===")
        print(f"Detected {len(result['pixel_measurements'])} vertical lines")
        if result['pixel_measurements']:
            print("\nFirst 3 detections:")
            for bar in result['pixel_measurements'][:3]:
                print(f"  x={bar['x']}, top={bar['top']}, bottom={bar['bottom']}, length={bar['pixel_length']}px")


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Testing on sample: 00271e61-86e3-453f-8101-fe906ae927eb.json

Analyzing image: 00271e61-86e3-453f-8101-fe906ae927eb.png

=== Structure Analysis ===
{
  "errorBarsPresent": true,
  "orientation": "vertical",
  "symmetry": "asymmetric"
}

=== Pixel Measurements ===
Detected 8 vertical lines

First 3 detections:
  x=66, top=8, bottom=449, length=441px
  x=64, top=8, bottom=451, length=443px
  x=96, top=10, bottom=204, length=194px


## 10. Process All Test Images

In [13]:
# Process all test images with hybrid approach
all_test_files = sorted(
    name for name in os.listdir(TEST_INPUT_LABELS) if name.endswith('.json')
)
print(f"Processing {len(all_test_files)} test images...\n")

# Accumulators filled by the processing loop below
detailed_results, all_predictions = [], {}
failed_count = processed_count = 0

def match_error_bars_to_points(data_points: List[Dict], detected_bars: List[Dict],
                               max_x_distance: float = 10) -> List[Dict]:
    """
    Match detected error bars to known data points.

    For each data point the bar with the closest ``x`` is selected (the first
    one wins on ties). It is used only when it lies strictly within
    `max_x_distance` pixels horizontally; otherwise the point is reported
    with zero-length bars.

    Args:
        data_points: Points as dicts with ``x`` and ``y``.
        detected_bars: Candidate bars as dicts with ``x``, ``top``, ``bottom``.
        max_x_distance: Exclusive horizontal tolerance in pixels.

    Returns:
        One dict per data point, in input order, with the point coordinates
        and ``topBarPixelDistance`` / ``bottomBarPixelDistance`` /
        ``deviationPixelDistance`` as floats.
    """
    results = []

    for point in data_points:
        x, y = point['x'], point['y']

        # Nearest bar by x coordinate; None when nothing was detected
        best_bar = min(detected_bars, key=lambda bar: abs(bar['x'] - x), default=None)

        top_dist = 0.0
        bottom_dist = 0.0
        if best_bar is not None and abs(best_bar['x'] - x) < max_x_distance:
            # NOTE(review): only horizontal proximity is checked; the bar's
            # vertical span is not required to contain y.
            top_dist = abs(y - best_bar['top'])
            bottom_dist = abs(best_bar['bottom'] - y)

        results.append({
            "x": x,
            "y": y,
            "label": "",
            "topBarPixelDistance": float(top_dist),
            "bottomBarPixelDistance": float(bottom_dist),
            "deviationPixelDistance": float(max(top_dist, bottom_dist))
        })

    return results

# Main evaluation loop: for every test label file, analyse the image, match the
# detected line segments to the known data points, store the predictions and
# score them against the ground truth. Any exception marks the image as failed.
for i, json_file in enumerate(all_test_files):
    try:
        # Load input
        input_json = load_test_input(os.path.join(TEST_INPUT_LABELS, json_file))
        image_file = input_json['image_file']
        image_path = os.path.join(TEST_IMAGES, image_file)
        
        # Analyze image once
        # NOTE(review): analyze_plot also runs the model-based structure
        # inference, but only 'pixel_measurements' is read in this loop.
        analysis = analyze_plot(image_path)
        
        # An image with no detected segments is counted as failed and skipped
        if not analysis or not analysis.get('pixel_measurements'):
            failed_count += 1
            continue
        
        detected_bars = analysis['pixel_measurements']
        
        # Process each line in the data
        predictions = []
        
        for data_line in input_json.get('data_points', []):
            line_name = data_line.get('lineName', '')
            points = data_line.get('points', [])
            
            # Match detected bars to data points
            matched_points = match_error_bars_to_points(points, detected_bars)
            
            predictions.append({
                'label': {'lineName': line_name},
                'points': matched_points
            })
        
        # No data lines in the input file: nothing to predict for this image
        if not predictions:
            failed_count += 1
            continue
        
        # Store predictions
        # (this happens before the ground truth is loaded, so an entry stays in
        # all_predictions even if the evaluation below raises and the image is
        # then counted as failed)
        all_predictions[json_file] = {
            'image_file': image_file,
            'predictions': predictions,
            'input_json': input_json
        }
        
        # Evaluate
        gt_json = load_ground_truth(os.path.join(TEST_GROUND_TRUTH, json_file))
        
        image_errors = []
        for pred_line in predictions:
            # Find the ground-truth line with the same lineName (first match wins)
            gt_line = None
            for gt in gt_json:
                if gt.get('label', {}).get('lineName') == pred_line.get('label', {}).get('lineName'):
                    gt_line = gt
                    break
            
            if gt_line is None:
                continue
            
            # Drop the axis-limit marker points on both sides before comparing
            pred_points = [p for p in pred_line.get('points', []) 
                          if p.get('label', '') not in ['xmin', 'xmax', 'ymin', 'ymax']]
            gt_points = [p for p in gt_line.get('points', []) 
                        if p.get('label', '') not in ['xmin', 'xmax', 'ymin', 'ymax']]
            
            # Points are paired by position; zip() stops at the shorter list
            for pred_pt, gt_pt in zip(pred_points, gt_points):
                error = calculate_point_error(pred_pt, gt_pt)
                image_errors.append(error)
        
        # Per-image summary; images without any comparable point add no entry
        if image_errors:
            img_metrics = {
                'image_file': image_file,
                'json_file': json_file,
                'num_points': len(image_errors),
                'mean_top_error': np.mean([e['top_error'] for e in image_errors]),
                'mean_bottom_error': np.mean([e['bottom_error'] for e in image_errors]),
                'mean_deviation_error': np.mean([e['deviation_error'] for e in image_errors]),
                'mean_overall_error': np.mean([e['mean_error'] for e in image_errors]),
                'max_top_error': np.max([e['top_error'] for e in image_errors]),
                'max_bottom_error': np.max([e['bottom_error'] for e in image_errors]),
                'all_errors': image_errors
            }
            detailed_results.append(img_metrics)
        
        # Counted as processed even when no point pair could be evaluated
        processed_count += 1
        
        # Progress update every 10 images
        if processed_count % 10 == 0:
            print(f"{'='*70}")
            print(f"✓ {processed_count} images processed")
            print(f"Predictions stored: {len(all_predictions)} files")
            print(f"Failed: {failed_count} images")
            print(f"Total processed: {processed_count} images")
            print(f"{'='*70}")
    
    except Exception as e:
        # Best-effort run: count the failure and keep going; only the first
        # five failures are printed to avoid flooding the output.
        failed_count += 1
        if failed_count <= 5:
            print(f"✗ Error processing {json_file}: {e}")

print(f"\n{'='*70}")
print(f"PROCESSING COMPLETE")
print(f"{'='*70}")


Processing 600 test images...

✓ 10 images processed
Predictions stored: 10 files
Failed: 0 images
Total processed: 10 images
✓ 20 images processed
Predictions stored: 20 files
Failed: 0 images
Total processed: 20 images
✓ 30 images processed
Predictions stored: 30 files
Failed: 0 images
Total processed: 30 images
✓ 40 images processed
Predictions stored: 40 files
Failed: 0 images
Total processed: 40 images
✓ 50 images processed
Predictions stored: 50 files
Failed: 0 images
Total processed: 50 images
✓ 60 images processed
Predictions stored: 60 files
Failed: 0 images
Total processed: 60 images
✓ 70 images processed
Predictions stored: 70 files
Failed: 0 images
Total processed: 70 images
✓ 80 images processed
Predictions stored: 80 files
Failed: 0 images
Total processed: 80 images
✓ 90 images processed
Predictions stored: 90 files
Failed: 0 images
Total processed: 90 images
✓ 100 images processed
Predictions stored: 100 files
Failed: 0 images
Total processed: 100 images
✓ 110 images pro

## 11. Comprehensive Evaluation Metrics

In [14]:
def _error_stats(errors):
    """Return the descriptive statistics reported for one list of pixel errors.

    Args:
        errors: Non-empty sequence of numeric error values.

    Returns:
        Dict mapping the display label to the statistic, in print order.
    """
    return {
        'Mean': np.mean(errors),
        'Median': np.median(errors),
        'Std Dev': np.std(errors),
        'Min': np.min(errors),
        'Max': np.max(errors),
        '25th Percentile': np.percentile(errors, 25),
        '75th Percentile': np.percentile(errors, 75),
    }


def _percent_within(errors, threshold_px):
    """Return the percentage of `errors` that are <= `threshold_px` (errors must be non-empty)."""
    return sum(1 for e in errors if e <= threshold_px) / len(errors) * 100


# Flatten the per-image 'all_errors' lists into one list of per-point error records.
all_point_errors = [
    point_error
    for img_result in (detailed_results or [])
    for point_error in img_result['all_errors']
]

if not all_point_errors:
    # Without this guard the threshold percentages divide by zero and
    # np.min / np.max raise on empty input.
    print("No per-point errors found in detailed_results; skipping evaluation metrics.")
else:
    total_images = len(detailed_results)
    total_points = sum(img['num_points'] for img in detailed_results)
    
    # Absolute pixel errors, one entry per data point
    all_top_errors = [e['top_error'] for e in all_point_errors]
    all_bottom_errors = [e['bottom_error'] for e in all_point_errors]
    all_deviation_errors = [e['deviation_error'] for e in all_point_errors]  # collected but not reported by this cell
    all_mean_errors = [e['mean_error'] for e in all_point_errors]
    
    # Accuracy: share of points whose 'mean_error' falls within each pixel threshold
    threshold_5px = _percent_within(all_mean_errors, 5)
    threshold_10px = _percent_within(all_mean_errors, 10)
    threshold_20px = _percent_within(all_mean_errors, 20)
    
    # RMSE over the per-point 'mean_error' values; computed once and reused below
    overall_rmse = np.sqrt(np.mean(np.array(all_mean_errors)**2))
    
    paper_metrics = {
        'Dataset Statistics': {
            'Total Test Images': total_images,
            'Total Data Points': total_points,
            'Average Points per Image': total_points / total_images,
        },
        'Absolute Pixel Error - Top Error Bar': _error_stats(all_top_errors),
        'Absolute Pixel Error - Bottom Error Bar': _error_stats(all_bottom_errors),
        'Overall Mean Pixel Error': {
            'Mean': np.mean(all_mean_errors),
            'Median': np.median(all_mean_errors),
            'Std Dev': np.std(all_mean_errors),
            'RMSE': overall_rmse,
        },
        'Accuracy Metrics (% within threshold)': {
            'Within 5 pixels': threshold_5px,
            'Within 10 pixels': threshold_10px,
            'Within 20 pixels': threshold_20px,
        },
    }
    
    print("\n" + "="*70)
    print("VQA + COT ERROR BAR DETECTION - EVALUATION RESULTS")
    print("="*70)
    
    for section, metrics in paper_metrics.items():
        print(f"\n{section}:")
        print("-" * 70)
        for metric, value in metrics.items():
            # np.floating also covers NumPy floats that do not subclass Python float (e.g. float32)
            if isinstance(value, (float, np.floating)):
                print(f"  {metric:.<60} {value:.2f}")
            else:
                print(f"  {metric:.<60} {value}")
    
    print("\n" + "="*70)
    
    # Save metrics (pandas is already imported as `pd` in the notebook's import cell)
    per_image_df = pd.DataFrame([{
        'Image': img['image_file'],
        'Points': img['num_points'],
        'Mean_Top_Error': img['mean_top_error'],
        'Mean_Bottom_Error': img['mean_bottom_error'],
        'Mean_Overall_Error': img['mean_overall_error'],
        'Max_Top_Error': img['max_top_error'],
        'Max_Bottom_Error': img['max_bottom_error'],
    } for img in detailed_results])
    
    per_image_df.to_csv('/kaggle/working/qwen_vqa_per_image_metrics.csv', index=False)
    print("\nSaved: /kaggle/working/qwen_vqa_per_image_metrics.csv")
    
    summary_df = pd.DataFrame([
        {'Metric': 'Method', 'Value': 'Qwen2.5-VL VQA+CoT 4-bit'},
        {'Metric': 'Total Images', 'Value': total_images},
        {'Metric': 'Total Points', 'Value': total_points},
        {'Metric': 'Mean Top Error (px)', 'Value': f"{np.mean(all_top_errors):.2f}"},
        {'Metric': 'Mean Bottom Error (px)', 'Value': f"{np.mean(all_bottom_errors):.2f}"},
        {'Metric': 'Mean Overall Error (px)', 'Value': f"{np.mean(all_mean_errors):.2f}"},
        {'Metric': 'RMSE (px)', 'Value': f"{overall_rmse:.2f}"},
        {'Metric': 'Accuracy @ 5px (%)', 'Value': f"{threshold_5px:.2f}"},
        {'Metric': 'Accuracy @ 10px (%)', 'Value': f"{threshold_10px:.2f}"},
        {'Metric': 'Accuracy @ 20px (%)', 'Value': f"{threshold_20px:.2f}"},
    ])
    
    summary_df.to_csv('/kaggle/working/qwen_vqa_summary_metrics.csv', index=False)
    print("Saved: /kaggle/working/qwen_vqa_summary_metrics.csv")


VQA + COT ERROR BAR DETECTION - EVALUATION RESULTS

Dataset Statistics:
----------------------------------------------------------------------
  Total Test Images........................................... 600
  Total Data Points........................................... 10229
  Average Points per Image.................................... 17.05

Absolute Pixel Error - Top Error Bar:
----------------------------------------------------------------------
  Mean........................................................ 42.40
  Median...................................................... 20.05
  Std Dev..................................................... 69.38
  Min......................................................... 0.00
  Max......................................................... 1632.75
  25th Percentile............................................. 6.62
  75th Percentile............................................. 45.13

Absolute Pixel Error - Bottom Error Bar:
----------------

## 12. Save Predictions in Required Format

In [15]:
# Write one JSON file per entry of `all_predictions` into OUTPUT_DIR.
OUTPUT_DIR = "/kaggle/working/vlm_predictions"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Saving {len(all_predictions)} predictions...\n")

saved_count = 0
for json_file, pred_data in all_predictions.items():
    try:
        # Convert to output format, then dump it under the entry's own file name
        output_json = convert_to_output_format(
            pred_data['image_file'],
            pred_data['predictions'],
        )
        with open(os.path.join(OUTPUT_DIR, json_file), 'w') as f:
            json.dump(output_json, f, indent=2)
    except Exception as e:
        # Best effort: report the failing file and continue with the rest
        print(f"Error saving {json_file}: {e}")
    else:
        # Only files written without error are counted
        saved_count += 1
        if saved_count % 100 == 0:
            print(f"Saved {saved_count}/{len(all_predictions)} files...")

print(f"\nSuccessfully saved {saved_count} prediction files")

# Show sample output format, using the first stored prediction
if all_predictions:
    first_key = next(iter(all_predictions))
    first_pred = all_predictions[first_key]
    sample_output = convert_to_output_format(
        first_pred['image_file'],
        first_pred['predictions'],
    )

    print("\n" + "="*70)
    print("SAMPLE OUTPUT FORMAT")
    print("="*70)
    sample_bars = sample_output['error_bars']
    # Print only the first point of the first entry, and only when both exist
    if sample_bars:
        if sample_bars[0]['points']:
            print(json.dumps(sample_bars[0]['points'][0], indent=2))
    print("="*70)

Saving 600 predictions...

Saved 100/600 files...
Saved 200/600 files...
Saved 300/600 files...
Saved 400/600 files...
Saved 500/600 files...
Saved 600/600 files...

Successfully saved 600 prediction files

SAMPLE OUTPUT FORMAT
{
  "data_point": {
    "x": 96.58120273413908,
    "y": 70.94969906573277
  },
  "upper_error_bar": {
    "x": 96.58120273413908,
    "y": 10.0
  },
  "lower_error_bar": {
    "x": 96.58120273413908,
    "y": 204.0
  }
}


## 13. Create ZIP Archive

In [16]:
import zipfile
from datetime import datetime

# The timestamp in the name gives each run (at one-second resolution) its own archive file.
zip_filename = f"/kaggle/working/qwen_vqa_error_bar_predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"

# Metric CSVs produced by the evaluation cell; each is archived only if it exists.
metric_csv_paths = [
    '/kaggle/working/qwen_vqa_per_image_metrics.csv',
    '/kaggle/working/qwen_vqa_summary_metrics.csv',
]

print(f"Creating ZIP archive: {zip_filename}\n")

# Track what is actually written so the summary below reports the real
# archive contents instead of assuming every file was present.
archived_json_count = 0
archived_csv_names = []

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add predictions (sorted so the archive layout is deterministic;
    # os.listdir returns entries in arbitrary order)
    for json_file in sorted(os.listdir(OUTPUT_DIR)):
        if json_file.endswith('.json'):
            file_path = os.path.join(OUTPUT_DIR, json_file)
            zipf.write(file_path, arcname=f"predictions/{json_file}")
            archived_json_count += 1
    
    # Add metrics, stored at the archive root under their base names
    for csv_path in metric_csv_paths:
        if os.path.exists(csv_path):
            csv_name = os.path.basename(csv_path)
            zipf.write(csv_path, arcname=csv_name)
            archived_csv_names.append(csv_name)

zip_size_mb = os.path.getsize(zip_filename) / (1024 * 1024)

print("="*70)
print("QWEN VQA+CoT PREDICTIONS - ZIP ARCHIVE CREATED")
print("="*70)
print(f"Filename: {zip_filename}")
print(f"Size: {zip_size_mb:.2f} MB")
print("Contents:")
print(f"  - {archived_json_count} prediction JSON files")
for csv_name in archived_csv_names:
    print(f"  - {csv_name}")
print("="*70)

Creating ZIP archive: /kaggle/working/qwen_vqa_error_bar_predictions_20260128_134122.zip

QWEN VQA+CoT PREDICTIONS - ZIP ARCHIVE CREATED
Filename: /kaggle/working/qwen_vqa_error_bar_predictions_20260128_134122.zip
Size: 0.63 MB
Contents:
  - 600 prediction JSON files
  - qwen_vqa_per_image_metrics.csv
  - qwen_vqa_summary_metrics.csv


## Summary

This notebook implements **VQA + Chain-of-Thought (CoT) error bar detection** using Qwen2.5-VL-7B-Instruct with:

### Approach:
1. **4-bit Quantized Qwen2.5-VL Model**: Memory-efficient model loading
2. **Step-by-Step CoT Reasoning**: 5-step process for each data point:
   - Step 1: Locate and describe the marker
   - Step 2: Detect error bar presence
   - Step 3: Find top endpoint with reasoning
   - Step 4: Find bottom endpoint with reasoning
   - Step 5: Verify measurements for consistency
3. **Deterministic Inference**: Low temperature, no sampling for consistency
4. **Reasoning Trace + Measurements**: Extract both CoT steps and final pixel coordinates

### Key Features:
- Qwen2.5-VL-7B-Instruct with 4-bit quantization
- Comprehensive evaluation metrics (pixel errors, RMSE, accuracy@threshold)
- ZIP archive with all predictions
- Explainable reasoning trace for each measurement
- JSON output with both predictions and reasoning
- Optimized for fast inference (768px images, 1024 max tokens)

### Model Configuration:
- **Model**: Qwen2.5-VL-7B-Instruct
- **Quantization**: 4-bit NF4 with double quantization
- **Temperature**: 0.1 (deterministic)
- **Max Tokens**: 4096 (for CoT reasoning)
- **Device**: Auto-mapped to available GPUs

### Advantages:
- **Explicit Reasoning**: Shows step-by-step thought process
- **Self-Verification**: Built-in consistency checks
- **Better Accuracy**: CoT forces careful visual examination
- **Explainable**: Can debug by examining reasoning trace
- **Adaptive**: Handles diverse plot styles without parameter tuning
- **Contextual**: Understands chart structure and conventions
- **Transparency**: Shows how the model arrives at measurements