In [None]:
# ============================================================================
# CELL 0: Enable CUDA Debugging & Reset
# ============================================================================

import os

# Set before torch is imported (this cell's purpose is CUDA debugging).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch

# Report which accelerator (if any) is visible and start from an empty cache.
if not torch.cuda.is_available():
    print("CUDA not available, using CPU")
else:
    print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
    print(f"✓ CUDA version: {torch.version.cuda}")
    torch.cuda.empty_cache()
    print("✓ CUDA cache cleared")

print("Ready to start")

✓ CUDA available: NVIDIA L4
✓ CUDA version: 12.6
✓ CUDA cache cleared
Ready to start


In [None]:
"""
ScienceQA Fine-tuning Pipeline - Notebook Cells
Fine-tune UnifiedQA (text) and BLIP (vision) models on ScienceQA dataset
"""

# ============================================================================
# CELL 1: Install and Import Dependencies
# ============================================================================

# Run this first if needed
# !pip install transformers datasets torch pillow tqdm accelerate

import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Blip2Processor, Blip2ForConditionalGeneration,
    AutoProcessor, AutoModelForVisualQuestionAnswering,
    TrainingArguments, Trainer, DataCollatorForSeq2Seq
)
from torch.utils.data import Dataset
from PIL import Image
from tqdm import tqdm
import json
import io
import os

In [None]:
# ============================================================================
# CELL 2: Configuration
# ============================================================================

# Enable better CUDA error debugging (optional - slows down training)
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Training Configuration
# Central configuration shared by every later cell. It is assembled section by
# section so each group of settings is easy to locate; insertion order is the
# order in which the settings are printed below.
CONFIG = {}

# Model selection: Hugging Face Hub identifiers of the checkpoints to fine-tune
CONFIG.update({
    'text_model': "allenai/unifiedqa-t5-small",
    'vision_model': "Salesforce/blip-vqa-base",
})

# Switches controlling which fine-tuning cells actually run
CONFIG.update({
    'train_text_model': True,
    'train_vision_model': True,
})

# Dataset size limits: None uses the full split, an integer (e.g. 1000) is for quick tests
CONFIG.update({
    'num_train_samples': None,
    'num_val_samples': None,
})

# Optimisation hyper-parameters
CONFIG.update({
    'batch_size': 8,
    'learning_rate': 5e-5,
    'num_epochs': 3,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'max_length': 128,
})

# Save paths for the fine-tuned checkpoints
CONFIG.update({
    'output_dir_text': './finetuned_unifiedqa',
    'output_dir_vision': './finetuned_blip',
})

# Logging / evaluation / checkpoint cadence (in optimisation steps)
CONFIG.update({
    'logging_steps': 100,
    'eval_steps': 500,
    'save_steps': 500,
})

# Echo the effective configuration so the run is self-documenting
print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

Configuration:
  text_model: allenai/unifiedqa-t5-small
  vision_model: Salesforce/blip-vqa-base
  train_text_model: True
  train_vision_model: True
  num_train_samples: None
  num_val_samples: None
  batch_size: 8
  learning_rate: 5e-05
  num_epochs: 3
  warmup_steps: 500
  weight_decay: 0.01
  max_length: 128
  output_dir_text: ./finetuned_unifiedqa
  output_dir_vision: ./finetuned_blip
  logging_steps: 100
  eval_steps: 500
  save_steps: 500


In [None]:
# ============================================================================
# CELL 3: Load Dataset
# ============================================================================

print("\n" + "="*60)
print("Loading ScienceQA Dataset")
print("="*60)

# Load the train and validation splits
train_dataset = load_dataset('derek-thomas/ScienceQA', split='train')
val_dataset = load_dataset('derek-thomas/ScienceQA', split='validation')

# Optionally limit samples for faster training. The requested count is clamped
# to the split size so an over-large value does not make select() fail.
if CONFIG['num_train_samples']:
    train_dataset = train_dataset.select(
        range(min(CONFIG['num_train_samples'], len(train_dataset)))
    )
if CONFIG['num_val_samples']:
    val_dataset = val_dataset.select(
        range(min(CONFIG['num_val_samples'], len(val_dataset)))
    )

print(f"✓ Train samples: {len(train_dataset)}")
print(f"✓ Validation samples: {len(val_dataset)}")

# Analyze dataset: count both statistics in a single pass so every example
# (including its image) is only materialised once.
has_images = 0
has_context = 0
for ex in train_dataset:
    if ex.get('image') is not None:
        has_images += 1
    # `or ''` guards against a hint stored as None rather than an empty string
    if (ex.get('hint') or '').strip():
        has_context += 1

# Avoid a ZeroDivisionError when the (possibly truncated) split is empty
num_train_examples = max(len(train_dataset), 1)
print(f"\nDataset Statistics:")
print(f"  - Samples with images: {has_images} ({has_images/num_train_examples*100:.1f}%)")
print(f"  - Samples with context: {has_context} ({has_context/num_train_examples*100:.1f}%)")


Loading ScienceQA Dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-1028f23e353fbe(…):   0%|          | 0.00/377M [00:00<?, ?B/s]

data/validation-00000-of-00001-6c7328ff6(…):   0%|          | 0.00/126M [00:00<?, ?B/s]

data/test-00000-of-00001-f0e719df791966f(…):   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

✓ Train samples: 12726
✓ Validation samples: 4241

Dataset Statistics:
  - Samples with images: 6218 (48.9%)
  - Samples with context: 6079 (47.8%)


In [None]:
# ============================================================================
# CELL 4: Helper Functions
# ============================================================================

def format_choices(choices):
    """Format answer options as "(a) option1 (b) option2 ...".

    Args:
        choices: Sequence of answer options; each is rendered with str
            formatting.

    Returns:
        A single string with every option prefixed by its lowercase letter
        label in parentheses, separated by single spaces. An empty sequence
        yields an empty string.

    Raises:
        ValueError: If more than 26 options are supplied (no letter left).
    """
    if len(choices) > 26:
        raise ValueError(f"Cannot label {len(choices)} choices with letters a-z")
    # Labels are generated instead of read from a fixed six-entry table, so
    # questions with more than six options no longer raise IndexError.
    return " ".join(
        f"({chr(ord('a') + i)}) {choice}" for i, choice in enumerate(choices)
    )


def create_text_input(question, context=None, choices=None):
    """Build the prompt string fed to the text model.

    The question comes first, followed by the context when it contains
    non-whitespace text. When choices are given, the formatted options are
    appended after a literal backslash-n separator.
    """
    segments = [question] + ([context] if context and context.strip() else [])
    input_text = " ".join(segments)

    if not choices:
        return input_text
    return f"{input_text} \\n {format_choices(choices)}"


def create_vision_input(question, context=None, choices=None):
    """Build the prompt string fed to the vision model.

    The prompt always starts with "Question: ..." and ends with "Answer:";
    a "Context: ..." segment is included only for non-blank context and a
    "Choices: ..." segment only when choices are supplied.
    """
    context_part = [f"Context: {context}"] if context and context.strip() else []
    choices_part = [f"Choices: {format_choices(choices)}"] if choices else []
    return " ".join([f"Question: {question}", *context_part, *choices_part, "Answer:"])


def get_target_text(choices, answer_idx):
    """Return the option letter used as the training target.

    An index outside the range of `choices` is reported with a warning and
    replaced by 0, so the first letter is returned in that case.
    """
    option_letters = ('a', 'b', 'c', 'd', 'e', 'f')

    index_is_valid = 0 <= answer_idx < len(choices)
    if not index_is_valid:
        print(f"Warning: Invalid answer_idx {answer_idx} for {len(choices)} choices")
        answer_idx = 0  # fall back to the first choice

    return option_letters[answer_idx]


def load_image(image_data):
    """Best-effort conversion of a dataset image field to an RGB PIL image.

    Args:
        image_data: A PIL image, raw encoded bytes, or a dict carrying the
            encoded image under 'bytes' and/or a file location under 'path'.

    Returns:
        The image converted to RGB, or None when the input has an unsupported
        form or cannot be decoded.
    """
    try:
        if isinstance(image_data, Image.Image):
            return image_data.convert('RGB')
        if isinstance(image_data, (bytes, bytearray)):
            return Image.open(io.BytesIO(image_data)).convert('RGB')
        if isinstance(image_data, dict):
            # Test the values, not just key presence: a record may carry
            # 'bytes': None together with a usable 'path', which previously
            # never reached the path branch.
            if image_data.get('bytes'):
                return Image.open(io.BytesIO(image_data['bytes'])).convert('RGB')
            if image_data.get('path'):
                return Image.open(image_data['path']).convert('RGB')
        return None
    except Exception:
        # Still best-effort, but no longer a bare `except:` that would also
        # swallow KeyboardInterrupt / SystemExit.
        return None

# Completion marker so the cell output confirms the helper functions were defined
print("✓ Helper functions defined")

✓ Helper functions defined


In [None]:
# ============================================================================
# CELL 5: Custom Dataset Classes (ULTRA-FIXED VERSION)
# ============================================================================

class ScienceQATextDataset(Dataset):
    """Dataset for the text-only model (UnifiedQA).

    Each item is built on demand from one example of the wrapped dataset:
    the prompt comes from `create_text_input` and the target letter from
    `get_target_text`; both are tokenized and padded to fixed lengths.

    Args:
        hf_dataset: Indexable dataset whose examples provide 'question',
            'choices', 'answer' and optionally 'hint'.
        tokenizer: Tokenizer used for both inputs and targets.
        max_length: Fixed (padded/truncated) length of the input sequence.
        max_target_length: Fixed (padded/truncated) length of the label
            sequence. Defaults to 8, the value that used to be hard-coded.
    """

    def __init__(self, hf_dataset, tokenizer, max_length=128, max_target_length=8):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

        # Debug: Check tokenizer vocab size
        print(f"  Tokenizer vocab size: {len(tokenizer)}")
        print(f"  Pad token ID: {tokenizer.pad_token_id}")

    def __len__(self):
        """Number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return 'input_ids', 'attention_mask' and 'labels' tensors for one example."""
        example = self.dataset[idx]

        # Create input text
        input_text = create_text_input(
            example['question'],
            example.get('hint', ''),
            example['choices']
        )

        # Create target text
        target_text = get_target_text(example['choices'], example['answer'])

        # Tokenize inputs
        model_inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize the target; padding positions are masked below
        labels = self.tokenizer.encode(
            target_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # -100 marks padding in the labels, matching the label_pad_token_id
        # given to the data collator in the fine-tuning cell. `encode` returned
        # a fresh tensor, so it can be edited in place without a copy.
        labels[labels == self.tokenizer.pad_token_id] = -100

        # Debug: Check for invalid token IDs
        valid_mask = (labels >= 0) | (labels == -100)
        if not valid_mask.all():
            print(f"⚠️  Warning: Invalid token IDs found in labels at index {idx}")
            print(f"   Target text: {target_text}")
            print(f"   Labels: {labels}")

        return {
            'input_ids': model_inputs['input_ids'].squeeze(0),
            'attention_mask': model_inputs['attention_mask'].squeeze(0),
            'labels': labels.squeeze(0)
        }

# Completion marker so the cell output confirms the dataset class was defined
print("Custom dataset classes defined")

Custom dataset classes defined


In [None]:
# Fine-tune the text model with the Hugging Face Trainer.
# Runs only when CONFIG['train_text_model'] is truthy; on completion the model,
# tokenizer and training metrics are written to CONFIG['output_dir_text'].
if CONFIG['train_text_model']:
    print("\n" + "="*60)
    print("FINE-TUNING TEXT MODEL (UnifiedQA)")
    print("="*60)

    # Set device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Release cached GPU memory before loading the model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Load model and tokenizer
    print("\nLoading model and tokenizer...")
    text_tokenizer = T5Tokenizer.from_pretrained(CONFIG['text_model'])
    text_model = T5ForConditionalGeneration.from_pretrained(CONFIG['text_model'])

    # Create datasets
    print("Preparing datasets...")
    train_text_dataset = ScienceQATextDataset(train_dataset, text_tokenizer, CONFIG['max_length'])
    val_text_dataset = ScienceQATextDataset(val_dataset, text_tokenizer, CONFIG['max_length'])

    # Data collator
    # The dataset items are already padded to fixed lengths, and their labels
    # already use -100 for padding, matching label_pad_token_id here.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=text_tokenizer,
        model=text_model,
        padding=True,
        label_pad_token_id=-100
    )

    # Training arguments
    # Evaluation runs every CONFIG['eval_steps'] steps, checkpoints are written
    # every CONFIG['save_steps'] steps, and the checkpoint with the lowest
    # eval_loss is reloaded at the end.
    training_args = TrainingArguments(
        output_dir=CONFIG['output_dir_text'],
        num_train_epochs=CONFIG['num_epochs'],
        per_device_train_batch_size=CONFIG['batch_size'],
        per_device_eval_batch_size=CONFIG['batch_size'],
        learning_rate=CONFIG['learning_rate'],
        warmup_steps=CONFIG['warmup_steps'],
        weight_decay=CONFIG['weight_decay'],
        logging_dir=f"{CONFIG['output_dir_text']}/logs",
        logging_steps=CONFIG['logging_steps'],
        eval_strategy="steps",
        eval_steps=CONFIG['eval_steps'],
        save_steps=CONFIG['save_steps'],
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none",
        # NOTE(review): fp16 is enabled whenever CUDA is used; T5-family models
        # are commonly reported to be numerically fragile under fp16 — confirm
        # losses stay finite, or consider bf16 on hardware that supports it.
        fp16=device == "cuda",
        dataloader_num_workers=0,
        # Keep every key the dataset returns instead of letting the Trainer
        # prune columns.
        remove_unused_columns=False,
    )

    # Create and run Trainer
    print("\nStarting training...")
    trainer = Trainer(
        model=text_model.to(device),
        args=training_args,
        train_dataset=train_text_dataset,
        eval_dataset=val_text_dataset,
        data_collator=data_collator,
    )

    train_result = trainer.train()

    # Save model
    print("\nSaving model...")
    trainer.save_model(CONFIG['output_dir_text'])
    text_tokenizer.save_pretrained(CONFIG['output_dir_text'])

    # Persist the training metrics next to the saved model
    with open(f"{CONFIG['output_dir_text']}/train_results.json", 'w') as f:
        json.dump(train_result.metrics, f, indent=2)

    print(f"✅ Model saved to {CONFIG['output_dir_text']}")

else:
    print("\n⏭️  Skipping text model fine-tuning")


FINE-TUNING TEXT MODEL (UnifiedQA)
Using device: cuda

Loading model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Preparing datasets...
  Tokenizer vocab size: 32100
  Pad token ID: 0
  Tokenizer vocab size: 32100
  Pad token ID: 0

Starting training...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss
500,0.4037,0.35329
1000,0.3629,0.318642
1500,0.3337,0.300071
2000,0.3225,0.287209
2500,0.3113,0.280261
3000,0.3139,0.276099
3500,0.2996,0.274986
4000,0.2807,0.272539
4500,0.2813,0.267378


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Saving model...
✅ Model saved to ./finetuned_unifiedqa


In [None]:
# ============================================================================
# CELL 6: Vision Dataset with Decoder Inputs
# ============================================================================

class ScienceQAVisionDataset(Dataset):
    """BLIP dataset with explicit decoder inputs.

    Only examples whose 'image' field is not None are exposed. Every item
    provides the processed image and question plus a fixed-length decoder
    sequence ``[CLS] answer-tokens [SEP] [PAD]...`` together with its
    attention mask and labels.

    Args:
        hf_dataset: Iterable and indexable dataset whose examples provide
            'image', 'question', 'choices', 'answer' and optionally 'hint'.
        processor: Processor that is callable on (images, text) and exposes
            a `.tokenizer` with cls/sep/pad token ids.
        max_length: Fixed (padded/truncated) length of the question tokens.
        max_decoder_length: Fixed length of the decoder sequence. Defaults
            to 10, the value that used to be hard-coded.
    """

    def __init__(self, hf_dataset, processor, max_length=128, max_decoder_length=10):
        self.dataset = hf_dataset
        self.processor = processor
        self.max_length = max_length
        self.max_decoder_length = max_decoder_length

        # Keep only the positions of examples that carry an image
        self.valid_indices = [
            i for i, ex in enumerate(hf_dataset)
            if ex.get('image') is not None
        ]
        print(f"  Filtered to {len(self.valid_indices)} samples with images")

    def __len__(self):
        """Number of examples that have an image."""
        return len(self.valid_indices)

    def _build_decoder_inputs(self, answer):
        """Tokenize `answer` into (decoder_input_ids, decoder_attention_mask, labels).

        The sequence is ``[CLS] + answer tokens + [SEP]``, padded to
        `max_decoder_length`. The closing [SEP] is the fix: without an end
        token in the target, the decoder is never trained to stop after the
        answer letter. Labels equal the decoder inputs with padding replaced
        by -100; shifting is left to the model.
        """
        tokenizer = self.processor.tokenizer
        pad_id = tokenizer.pad_token_id

        answer_tokens = tokenizer.encode(answer, add_special_tokens=False)
        # Reserve two slots for the start and end tokens when truncating
        answer_tokens = answer_tokens[:max(self.max_decoder_length - 2, 0)]

        token_ids = [tokenizer.cls_token_id] + answer_tokens + [tokenizer.sep_token_id]
        token_ids = token_ids[:self.max_decoder_length]
        num_real_tokens = len(token_ids)
        num_padding = self.max_decoder_length - num_real_tokens

        decoder_input_ids = torch.tensor(token_ids + [pad_id] * num_padding, dtype=torch.long)

        # Build the mask from the known length, not by comparing with pad_id,
        # so it stays correct even if a real token shares the pad id.
        decoder_attention_mask = torch.tensor(
            [1] * num_real_tokens + [0] * num_padding, dtype=torch.long
        )

        labels = decoder_input_ids.clone()
        labels[num_real_tokens:] = -100

        return decoder_input_ids, decoder_attention_mask, labels

    def __getitem__(self, idx):
        """Return the model inputs for the idx-th example that has an image."""
        real_idx = self.valid_indices[idx]
        example = self.dataset[real_idx]

        # Load image; fall back to a blank canvas if it cannot be decoded
        image = load_image(example['image'])
        if image is None:
            image = Image.new('RGB', (224, 224), color='white')

        # Question prompt
        question = create_vision_input(
            example['question'],
            example.get('hint', ''),
            example['choices']
        )

        # Answer letter
        answer = get_target_text(example['choices'], example['answer'])

        # Process image + question
        inputs = self.processor(
            images=image,
            text=question,
            return_tensors='pt',
            padding='max_length',
            max_length=self.max_length,
            truncation=True
        )

        decoder_input_ids, decoder_attention_mask, labels = self._build_decoder_inputs(answer)

        return {
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'decoder_input_ids': decoder_input_ids,
            'decoder_attention_mask': decoder_attention_mask,
            'labels': labels
        }

# Completion marker so the cell output confirms the vision dataset class was defined
print("✓ Dataset with decoder inputs ready")

✓ Dataset with decoder inputs ready


In [None]:
# ============================================================================
# CELL 7: Fine-tune Vision Model (BLIP)
# ============================================================================

# Fine-tune the vision model with a manual training loop.
# Runs only when CONFIG['train_vision_model'] is truthy. After every epoch the
# model is validated, and the weights with the lowest validation loss so far
# are saved to CONFIG['output_dir_vision'].
if CONFIG['train_vision_model']:
    print("\n" + "="*60)
    print("FINE-TUNING VISION MODEL (BLIP)")
    print("="*60)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load model and processor
    print("\nLoading model and processor...")
    vision_processor = AutoProcessor.from_pretrained(CONFIG['vision_model'])
    vision_model = AutoModelForVisualQuestionAnswering.from_pretrained(CONFIG['vision_model'])
    vision_model.to(device)

    # Create datasets
    print("Preparing datasets...")
    train_vision_dataset = ScienceQAVisionDataset(train_dataset, vision_processor, CONFIG['max_length'])
    val_vision_dataset = ScienceQAVisionDataset(val_dataset, vision_processor, CONFIG['max_length'])

    # Data collator
    def simple_vision_collator(features):
        """Stack the fixed-size per-sample tensors into one batch dict."""
        batch_keys = (
            'pixel_values',
            'input_ids',
            'attention_mask',
            'decoder_input_ids',
            'decoder_attention_mask',
            'labels',
        )
        return {key: torch.stack([f[key] for f in features]) for key in batch_keys}

    # Create dataloaders
    from torch.utils.data import DataLoader

    train_loader = DataLoader(
        train_vision_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        collate_fn=simple_vision_collator,
        num_workers=0,
        pin_memory=(device == 'cuda')
    )

    val_loader = DataLoader(
        val_vision_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        collate_fn=simple_vision_collator,
        num_workers=0,
        pin_memory=(device == 'cuda')
    )

    # Setup optimizer and scheduler
    from torch.optim import AdamW
    from transformers import get_linear_schedule_with_warmup

    optimizer = AdamW(
        vision_model.parameters(),
        lr=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay']
    )

    # One scheduler step is taken per optimizer step, so the schedule spans
    # every training batch of every epoch.
    total_steps = len(train_loader) * CONFIG['num_epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=CONFIG['warmup_steps'],
        num_training_steps=total_steps
    )

    # Training loop
    print("\nStarting training...")
    best_val_loss = float('inf')
    # Defined before the loop so the summary below works even with zero epochs
    avg_train_loss = float('inf')
    training_history = {'train_loss': [], 'val_loss': [], 'epochs': []}

    for epoch in range(CONFIG['num_epochs']):
        print(f"\nEpoch {epoch + 1}/{CONFIG['num_epochs']}")

        # Training
        vision_model.train()
        total_train_loss = 0
        num_train_batches = 0

        for batch in tqdm(train_loader, desc="Training"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = vision_model(**batch)
            loss = outputs.loss

            # Skip batches with a NaN/inf loss so they neither update the
            # weights nor distort the running average
            if not torch.isfinite(loss):
                continue

            total_train_loss += loss.item()
            num_train_batches += 1

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(vision_model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / num_train_batches if num_train_batches > 0 else float('inf')
        training_history['train_loss'].append(avg_train_loss)

        # Validation
        vision_model.eval()
        total_val_loss = 0
        num_val_batches = 0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = vision_model(**batch)
                loss = outputs.loss

                if torch.isfinite(loss):
                    total_val_loss += loss.item()
                    num_val_batches += 1

        avg_val_loss = total_val_loss / num_val_batches if num_val_batches > 0 else float('inf')
        training_history['val_loss'].append(avg_val_loss)
        training_history['epochs'].append(epoch + 1)

        print(f"Train loss: {avg_train_loss:.4f} | Val loss: {avg_val_loss:.4f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            os.makedirs(CONFIG['output_dir_vision'], exist_ok=True)
            vision_model.save_pretrained(CONFIG['output_dir_vision'])
            vision_processor.save_pretrained(CONFIG['output_dir_vision'])

    # Save final results
    print("\nTraining completed!")
    print(f"Best validation loss: {best_val_loss:.4f}")

    final_metrics = {
        'final_train_loss': avg_train_loss,
        'best_val_loss': best_val_loss,
        'num_epochs': CONFIG['num_epochs'],
        'history': training_history
    }

    # The output directory is only created when a checkpoint is saved. If no
    # epoch produced an improving validation loss it may not exist, and writing
    # the metrics used to raise FileNotFoundError after the whole run. The
    # directory is deliberately not created here: the evaluation cell treats
    # its existence as "a fine-tuned model is available".
    if os.path.isdir(CONFIG['output_dir_vision']):
        with open(f'{CONFIG["output_dir_vision"]}/train_results.json', 'w') as f:
            json.dump(final_metrics, f, indent=2)
    else:
        print(f"⚠️  No checkpoint was saved to {CONFIG['output_dir_vision']}; skipping train_results.json")

else:
    print("\n⏭️  Skipping vision model training")


FINE-TUNING VISION MODEL (BLIP)
Using device: cuda

Loading model and processor...


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Preparing datasets...
  Filtered to 6218 samples with images
  Filtered to 2097 samples with images

Starting training...

Epoch 1/3


Training: 100%|██████████| 778/778 [13:01<00:00,  1.00s/it]
Validation: 100%|██████████| 263/263 [01:32<00:00,  2.83it/s]


Train loss: 1.4830 | Val loss: 1.1398

Epoch 2/3


Training: 100%|██████████| 778/778 [12:59<00:00,  1.00s/it]
Validation: 100%|██████████| 263/263 [01:33<00:00,  2.82it/s]


Train loss: 1.1517 | Val loss: 1.1150

Epoch 3/3


Training: 100%|██████████| 778/778 [13:00<00:00,  1.00s/it]
Validation: 100%|██████████| 263/263 [01:32<00:00,  2.84it/s]


Train loss: 1.1009 | Val loss: 1.0991

Training completed!
Best validation loss: 1.0991


In [None]:
# ============================================================================
# CELL 9: Helper Function - Extract Choice Index
# ============================================================================

import numpy as np
def extract_choice_index(predicted_answer, choices):
    """Convert a model's predicted text to a choice index (0, 1, 2, ...).

    Matching is attempted in decreasing order of reliability:

    1. An option letter: "b", "b.", "(b)", "b) ..." or "(b) ...".
    2. An exact (case-insensitive) match with one of the choice texts.
    3. A substring match between the prediction and a choice text.

    Args:
        predicted_answer: Raw decoded model output (may be empty or None).
        choices: Sequence of answer option strings.

    Returns:
        The index of the matched choice, or -1 when nothing matches.
    """
    pred = (predicted_answer or "").lower().strip()
    pred = pred.replace("answer:", "").replace("the answer is", "").strip()

    # An empty prediction used to substring-match the first choice
    # ("" in "anything" is True) and was scored as an answer.
    if not pred or not choices:
        return -1

    # 1) Option letters, restricted to as many letters as there are choices
    for i in range(min(len(choices), 26)):
        letter = chr(ord('a') + i)
        if (
            pred == letter
            or pred == f"{letter}."
            or pred.startswith(f"({letter})")
            or pred.startswith(f"{letter})")
        ):
            return i

    normalized = [str(choice).lower().strip() for choice in choices]

    # 2) Exact text match wins over any substring match, so "cat food" is not
    #    captured by an earlier choice "cat"
    for i, choice_lower in enumerate(normalized):
        if choice_lower and pred == choice_lower:
            return i

    # 3) Substring match. A lone letter that reaches this point is an
    #    out-of-range option label (e.g. "c" for a two-choice question) and
    #    must not match just because some choice contains that character.
    pred_is_stray_letter = len(pred) == 1 and pred.isalpha()
    for i, choice_lower in enumerate(normalized):
        if not choice_lower:
            continue  # an empty choice would be a substring of everything
        if choice_lower in pred or (not pred_is_stray_letter and pred in choice_lower):
            return i

    return -1


In [None]:
# ============================================================================
# CELL 10: Optimized Batch Prediction Function for Single Model
# ============================================================================

def predict_batch_single_model(questions, choices_list, contexts, images_data,
                               text_tokenizer, text_model, vision_processor, vision_model,
                               use_text_only=False, use_vision_only=False, batch_size=8):
    """Predict choice indices in batches with the text and/or vision model.

    Samples with a loadable image go to the vision model unless
    `use_text_only` is set; all other samples go to the text model unless
    `use_vision_only` is set.

    Args:
        questions, choices_list, contexts, images_data: Parallel sequences
            describing the samples (image entries may be None).
        text_tokenizer, text_model: Tokenizer/model for the text path.
        vision_processor, vision_model: Processor/model for the vision path.
        use_text_only: Only use the text model, skip vision.
        use_vision_only: Only use the vision model (requires images).
        batch_size: Number of samples handled per iteration.

    Returns:
        (predictions, models_used): two lists aligned with the input order.
        `predictions[i]` is the predicted choice index, or -1 when the output
        could not be mapped or no model handled the sample. `models_used[i]`
        is "text", "vision", or "none" for a sample no model handled.
    """
    # Token limit for prompts; 128 is the limit the text path already used
    max_input_length = 128

    all_predictions = []
    all_models_used = []

    for start in range(0, len(questions), batch_size):
        batch_questions = questions[start:start+batch_size]
        batch_choices = choices_list[start:start+batch_size]
        batch_contexts = contexts[start:start+batch_size]
        batch_images = images_data[start:start+batch_size]

        # Results are written by position so the output order matches the
        # input order. Previously a batch's text predictions were appended
        # before its vision predictions, misaligning them with the labels,
        # and samples no model could handle were dropped.
        batch_predictions = [-1] * len(batch_questions)
        batch_models_used = ["none"] * len(batch_questions)

        text_indices = []
        vision_indices = []
        vision_images = []

        for idx, img in enumerate(batch_images):
            loaded = load_image(img) if (img is not None and not use_text_only) else None
            if loaded is not None:
                vision_indices.append(idx)
                vision_images.append(loaded)
            else:
                # No image, text-only mode, or an image that failed to decode
                text_indices.append(idx)

        # Process text-only batch
        if text_indices and not use_vision_only:
            text_inputs = [
                create_text_input(
                    batch_questions[idx],
                    batch_contexts[idx],
                    batch_choices[idx]
                )
                for idx in text_indices
            ]

            # Use the model's own device rather than a notebook-global `device`
            text_device = next(text_model.parameters()).device
            text_tokens = text_tokenizer(
                text_inputs,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_input_length
            ).to(text_device)

            with torch.no_grad():
                text_outputs = text_model.generate(
                    **text_tokens,
                    max_length=64,
                    num_beams=4
                )

            text_predictions = text_tokenizer.batch_decode(
                text_outputs,
                skip_special_tokens=True
            )

            for idx, pred_text in zip(text_indices, text_predictions):
                batch_predictions[idx] = extract_choice_index(pred_text, batch_choices[idx])
                batch_models_used[idx] = "text"

        # Process vision batch
        if vision_indices:
            vision_inputs = [
                create_vision_input(
                    batch_questions[idx],
                    batch_contexts[idx],
                    batch_choices[idx]
                )
                for idx in vision_indices
            ]

            vision_device = next(vision_model.parameters()).device
            # Truncate like the fine-tuning dataset does, so a very long hint
            # is cut instead of reaching the model at full length
            vision_tokens = vision_processor(
                images=vision_images,
                text=vision_inputs,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_input_length
            ).to(vision_device)

            with torch.no_grad():
                vision_outputs = vision_model.generate(
                    **vision_tokens,
                    max_length=64,
                    num_beams=4
                )

            vision_predictions = vision_processor.batch_decode(
                vision_outputs,
                skip_special_tokens=True
            )

            for idx, pred_text in zip(vision_indices, vision_predictions):
                batch_predictions[idx] = extract_choice_index(pred_text, batch_choices[idx])
                batch_models_used[idx] = "vision"

        all_predictions.extend(batch_predictions)
        all_models_used.extend(batch_models_used)

    return all_predictions, all_models_used




In [None]:
# ============================================================================
# CELL 11: Comprehensive Metrics Calculation
# ============================================================================

def calculate_comprehensive_metrics(predictions, labels, subjects, grades, has_image, has_text):
    """Compute overall and per-group accuracy (in percent).

    All arguments are parallel sequences with one entry per sample. Lists and
    tuples are accepted as well as NumPy arrays.

    Args:
        predictions: Predicted choice indices.
        labels: Ground-truth choice indices.
        subjects: Subject name per sample ('natural science',
            'social science' or 'language science').
        grades: Grade strings of the form 'grade<N>'; anything that cannot be
            parsed is treated as grade 0 and falls into no grade bucket.
        has_image: Booleans, True when the sample has an image.
        has_text: Booleans, True when the sample has textual context.

    Returns:
        Dict with 'Overall' and 'Total_Samples', plus, for every non-empty
        group, '<KEY>' (accuracy) and '<KEY>_count'. Keys are NAT/SOC/LAN for
        subjects, TXT/IMG/NO/TXT+IMG for context type and G1-6/G7-12 for
        grade ranges.
    """
    # Coerce once so plain Python lists work with the boolean-mask indexing
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    subjects = np.asarray(subjects)
    has_image = np.asarray(has_image, dtype=bool)
    has_text = np.asarray(has_text, dtype=bool)

    correct = predictions == labels
    results = {}

    def _record(key, mask):
        """Store accuracy and sample count for `mask` unless it selects nothing."""
        count = int(mask.sum())
        if count > 0:
            results[key] = correct[mask].mean() * 100
            results[f'{key}_count'] = count

    def _grade_number(grade):
        """Parse 'grade<N>' into N; return 0 for anything else."""
        text = str(grade)
        if 'grade' not in text:
            return 0
        try:
            return int(text.replace('grade', ''))
        except ValueError:
            return 0

    # Overall (NaN for an empty input, without NumPy's empty-mean warning)
    results['Overall'] = correct.mean() * 100 if correct.size else float('nan')
    results['Total_Samples'] = len(predictions)

    # By Subject: the key is the first three letters, upper-cased
    for subject in ['natural science', 'social science', 'language science']:
        _record(subject.upper()[:3], subjects == subject)

    # By Context
    _record('TXT', has_text & ~has_image)
    _record('IMG', has_image & ~has_text)
    _record('NO', ~has_image & ~has_text)
    _record('TXT+IMG', has_image & has_text)

    # By Grade
    grade_nums = np.array([_grade_number(g) for g in grades], dtype=int)
    _record('G1-6', (grade_nums >= 1) & (grade_nums <= 6))
    _record('G7-12', (grade_nums >= 7) & (grade_nums <= 12))

    return results



In [None]:
# ============================================================================
# CELL 12: Load All Models (Fine-tuned and Pre-trained)
# ============================================================================

print("\n" + "="*60)
print("LOADING ALL MODELS FOR EVALUATION")
print("="*60)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Registry of every model available for evaluation, keyed by a short name.
# Text entries carry a 'tokenizer', vision entries a 'processor'.
models_dict = {}

# 1) Fine-tuned UnifiedQA — present only if the fine-tuning output directory exists
print("\n1. Loading fine-tuned UnifiedQA...")
if not os.path.exists(CONFIG['output_dir_text']):
    print(f"⚠️  Fine-tuned model not found at {CONFIG['output_dir_text']}")
else:
    ft_text_tokenizer = T5Tokenizer.from_pretrained(CONFIG['output_dir_text'])
    ft_text_model = T5ForConditionalGeneration.from_pretrained(CONFIG['output_dir_text'])
    ft_text_model.to(device)
    ft_text_model.eval()
    models_dict['ft_unifiedqa'] = {
        'tokenizer': ft_text_tokenizer,
        'model': ft_text_model,
        'type': 'text'
    }
    print(f"✅ Loaded from {CONFIG['output_dir_text']}")

# 2) Pre-trained UnifiedQA baseline
print("\n2. Loading pre-trained UnifiedQA...")
pt_text_tokenizer = T5Tokenizer.from_pretrained(CONFIG['text_model'])
pt_text_model = T5ForConditionalGeneration.from_pretrained(CONFIG['text_model'])
pt_text_model.to(device)
pt_text_model.eval()
models_dict['pt_unifiedqa'] = {
    'tokenizer': pt_text_tokenizer,
    'model': pt_text_model,
    'type': 'text'
}
print(f"✅ Loaded {CONFIG['text_model']}")

# 3) Fine-tuned BLIP — present only if the fine-tuning output directory exists
print("\n3. Loading fine-tuned BLIP...")
if not os.path.exists(CONFIG['output_dir_vision']):
    print(f"⚠️  Fine-tuned model not found at {CONFIG['output_dir_vision']}")
else:
    ft_vision_processor = AutoProcessor.from_pretrained(CONFIG['output_dir_vision'])
    ft_vision_model = AutoModelForVisualQuestionAnswering.from_pretrained(CONFIG['output_dir_vision'])
    ft_vision_model.to(device)
    ft_vision_model.eval()
    models_dict['ft_blip'] = {
        'processor': ft_vision_processor,
        'model': ft_vision_model,
        'type': 'vision'
    }
    print(f"✅ Loaded from {CONFIG['output_dir_vision']}")

# 4) Pre-trained BLIP baseline
print("\n4. Loading pre-trained BLIP...")
pt_vision_processor = AutoProcessor.from_pretrained(CONFIG['vision_model'])
pt_vision_model = AutoModelForVisualQuestionAnswering.from_pretrained(CONFIG['vision_model'])
pt_vision_model.to(device)
pt_vision_model.eval()
models_dict['pt_blip'] = {
    'processor': pt_vision_processor,
    'model': pt_vision_model,
    'type': 'vision'
}
print(f"✅ Loaded {CONFIG['vision_model']}")

print(f"\n✅ Total models loaded: {len(models_dict)}")



LOADING ALL MODELS FOR EVALUATION
Using device: cuda

1. Loading fine-tuned UnifiedQA...
✅ Loaded from ./finetuned_unifiedqa

2. Loading pre-trained UnifiedQA...
✅ Loaded allenai/unifiedqa-t5-small

3. Loading fine-tuned BLIP...
✅ Loaded from ./finetuned_blip

4. Loading pre-trained BLIP...
✅ Loaded Salesforce/blip-vqa-base

✅ Total models loaded: 4


In [None]:
# ============================================================================
# CELL 13: Text-Only Model Evaluation
# ============================================================================

def evaluate_text_model(text_tokenizer, text_model, test_dataset, model_name,
                        batch_size=8, max_input_length=128):
    """Evaluate a text-only seq2seq model on a multiple-choice test set.

    Prompts are built with ``create_text_input``, answers are generated with
    beam search, mapped back to a choice index with ``extract_choice_index``
    and scored with ``calculate_comprehensive_metrics``. Those three helpers
    and the global ``device`` are defined outside this cell.

    Args:
        text_tokenizer: Tokenizer called on the list of prompts and used to
            decode the generated ids.
        text_model: Model exposing ``generate``. ``None`` skips the evaluation.
        test_dataset: Dataset offering column access for 'question',
            'choices', 'answer', 'subject' and 'grade', and iteration over
            examples that support ``.get('hint')`` / ``.get('image')``.
        model_name: Label used in the progress messages.
        batch_size: Number of samples per generation batch (default 8, the
            value that used to be hard-coded).
        max_input_length: Token limit applied when tokenizing prompts
            (default 128, the value that used to be hard-coded).

    Returns:
        The metrics mapping produced by ``calculate_comprehensive_metrics``
        with ``'model_type'`` set to ``'text'``, or ``None`` when
        ``text_model`` is ``None``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    if text_model is None:
        print(f"\n⚠️  Skipping {model_name} - model not available")
        return None

    # A non-positive step would either crash range() or silently skip all data.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"\n" + "="*60)
    print(f"EVALUATING: {model_name}")
    print("="*60)

    questions = test_dataset['question']
    choices_list = test_dataset['choices']
    labels = test_dataset['answer']
    subjects = test_dataset['subject']
    grades = test_dataset['grade']

    # Collect hints and images in ONE pass over the dataset; iterating it twice
    # materialises every example (including its image) twice for no benefit.
    contexts = []
    images_data = []
    for ex in test_dataset:
        contexts.append(ex.get('hint', ''))
        images_data.append(ex.get('image'))

    print(f"Processing {len(test_dataset)} samples...")

    all_predictions = []
    for i in tqdm(range(0, len(questions), batch_size), desc="Batch processing"):
        batch_questions = questions[i:i+batch_size]
        batch_choices = choices_list[i:i+batch_size]
        batch_contexts = contexts[i:i+batch_size]

        text_inputs = [
            create_text_input(question, context, choices)
            for question, context, choices in zip(batch_questions, batch_contexts, batch_choices)
        ]

        text_tokens = text_tokenizer(
            text_inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_length
        ).to(device)

        with torch.no_grad():
            text_outputs = text_model.generate(
                **text_tokens,
                max_length=64,
                num_beams=4
            )

        text_predictions = text_tokenizer.batch_decode(text_outputs, skip_special_tokens=True)

        # One decoded string per sample in the batch, in input order.
        all_predictions.extend(
            extract_choice_index(pred_text, choices)
            for pred_text, choices in zip(text_predictions, batch_choices)
        )

    predictions = np.array(all_predictions)
    labels = np.array(labels)
    subjects = np.array(subjects)
    grades = np.array(grades)
    # Context flags used by the metric breakdown: image present / non-empty hint.
    has_image = np.array([img is not None for img in images_data])
    has_text = np.array([bool(ctx) for ctx in contexts])

    results = calculate_comprehensive_metrics(
        predictions, labels, subjects, grades, has_image, has_text
    )
    results['model_type'] = 'text'

    return results


In [None]:
# ============================================================================
# CELL 14: Vision-Only Model Evaluation
# ============================================================================

def evaluate_vision_model(vision_processor, vision_model, test_dataset, model_name,
                          batch_size=8, max_input_length=None):
    """Evaluate a vision-language model on a multiple-choice test set.

    Every sample is processed; samples for which ``load_image`` returns
    ``None`` are paired with a blank white 224x224 RGB placeholder image.
    Prompts come from ``create_vision_input``, generated text is mapped to a
    choice index with ``extract_choice_index`` and scored with
    ``calculate_comprehensive_metrics``. Those helpers, ``load_image`` and the
    global ``device`` are defined outside this cell.

    Args:
        vision_processor: Processor called with ``images=`` and ``text=`` and
            used to decode the generated ids.
        vision_model: Model exposing ``generate``. ``None`` skips the
            evaluation.
        test_dataset: Dataset offering column access for 'question',
            'choices', 'answer', 'subject' and 'grade', and iteration over
            examples that support ``.get('hint')`` / ``.get('image')``.
        model_name: Label used in the progress messages.
        batch_size: Number of samples per generation batch (default 8, the
            value that used to be hard-coded).
        max_input_length: Optional token limit for the text prompt. ``None``
            (default) keeps the historical behaviour of not truncating; an
            integer enables ``truncation=True`` with that ``max_length``.

    Returns:
        The metrics mapping produced by ``calculate_comprehensive_metrics``
        with ``'model_type'`` set to ``'vision'``, or ``None`` when
        ``vision_model`` is ``None``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    if vision_model is None:
        print(f"\n⚠️  Skipping {model_name} - model not available")
        return None

    # A non-positive step would either crash range() or silently skip all data.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"\n" + "="*60)
    print(f"EVALUATING: {model_name}")
    print("="*60)

    questions = test_dataset['question']
    choices_list = test_dataset['choices']
    labels = test_dataset['answer']
    subjects = test_dataset['subject']
    grades = test_dataset['grade']

    # Collect hints and images in ONE pass over the dataset; iterating it twice
    # materialises every example (including its image) twice for no benefit.
    contexts = []
    images_data = []
    for ex in test_dataset:
        contexts.append(ex.get('hint', ''))
        images_data.append(ex.get('image'))

    print(f"Processing {len(test_dataset)} samples...")

    # Truncation is opt-in so the default call tokenizes exactly as before.
    truncation_kwargs = {}
    if max_input_length is not None:
        truncation_kwargs = {'truncation': True, 'max_length': max_input_length}

    all_predictions = []
    for i in tqdm(range(0, len(questions), batch_size), desc="Batch processing"):
        batch_questions = questions[i:i+batch_size]
        batch_choices = choices_list[i:i+batch_size]
        batch_contexts = contexts[i:i+batch_size]
        batch_images = images_data[i:i+batch_size]

        vision_images = []
        for raw_image in batch_images:
            image = load_image(raw_image)
            if image is None:
                # Placeholder so the processor always receives one image per prompt.
                image = Image.new('RGB', (224, 224), color='white')
            vision_images.append(image)

        vision_inputs = [
            create_vision_input(question, context, choices)
            for question, context, choices in zip(batch_questions, batch_contexts, batch_choices)
        ]

        vision_tokens = vision_processor(
            images=vision_images,
            text=vision_inputs,
            return_tensors="pt",
            padding=True,
            **truncation_kwargs
        ).to(device)

        with torch.no_grad():
            vision_outputs = vision_model.generate(
                **vision_tokens,
                max_length=64,
                num_beams=4
            )

        vision_predictions = vision_processor.batch_decode(vision_outputs, skip_special_tokens=True)

        # One decoded string per sample in the batch, in input order.
        all_predictions.extend(
            extract_choice_index(pred_text, choices)
            for pred_text, choices in zip(vision_predictions, batch_choices)
        )

    predictions = np.array(all_predictions)
    labels = np.array(labels)
    subjects = np.array(subjects)
    grades = np.array(grades)
    # Context flags used by the metric breakdown: image present / non-empty hint.
    has_image = np.array([img is not None for img in images_data])
    has_text = np.array([bool(ctx) for ctx in contexts])

    results = calculate_comprehensive_metrics(
        predictions, labels, subjects, grades, has_image, has_text
    )
    results['model_type'] = 'vision'

    return results

In [None]:
# ============================================================================
# CELL 15: Load Test Dataset and Run All Evaluations
# ============================================================================

# Section banner (same three lines as before, emitted with a single call).
print("\n" + "="*60, "LOADING TEST DATASET", "="*60, sep="\n")

# Optional cap on the number of evaluated samples; a falsy value (None or 0)
# keeps the complete test split.
num_test_samples = None
test_dataset = load_dataset('derek-thomas/ScienceQA', split='test')

if num_test_samples:
    test_dataset = test_dataset.select(
        range(min(num_test_samples, len(test_dataset)))
    )

print(f"Test samples: {len(test_dataset)}")

# Run evaluations for all models.
# One row per model, in reporting order:
#   (display name / all_results key, models_dict key, evaluation function,
#    models_dict entry key holding the tokenizer or processor)
EVALUATION_PLAN = [
    ("Fine-tuned UnifiedQA", 'ft_unifiedqa', evaluate_text_model, 'tokenizer'),
    ("Pre-trained UnifiedQA", 'pt_unifiedqa', evaluate_text_model, 'tokenizer'),
    ("Fine-tuned BLIP", 'ft_blip', evaluate_vision_model, 'processor'),
    ("Pre-trained BLIP", 'pt_blip', evaluate_vision_model, 'processor'),
]

all_results = {}

for display_name, model_key, evaluate_fn, frontend_key in EVALUATION_PLAN:
    # Every model is guarded the same way: a model that was not loaded is
    # reported and skipped instead of raising KeyError halfway through the run.
    if model_key not in models_dict:
        print(f"\n⚠️  Skipping {display_name} - '{model_key}' was not loaded")
        continue

    model_entry = models_dict[model_key]
    results = evaluate_fn(
        model_entry[frontend_key],
        model_entry['model'],
        test_dataset,
        display_name
    )
    # The evaluation functions return None when they skip a model.
    if results:
        all_results[display_name] = results



LOADING TEST DATASET
Test samples: 4241

EVALUATING: Fine-tuned UnifiedQA
Processing 4241 samples...


Batch processing: 100%|██████████| 531/531 [00:47<00:00, 11.17it/s]



EVALUATING: Pre-trained UnifiedQA
Processing 4241 samples...


Batch processing: 100%|██████████| 531/531 [04:01<00:00,  2.19it/s]



EVALUATING: Fine-tuned BLIP
Processing 4241 samples...


Batch processing: 100%|██████████| 531/531 [20:38<00:00,  2.33s/it]



EVALUATING: Pre-trained BLIP
Processing 4241 samples...


Batch processing: 100%|██████████| 531/531 [04:16<00:00,  2.07it/s]


In [None]:
# ============================================================================
# CELL 16: Print Detailed Separate Results for Each Model
# ============================================================================

# Report header.
print("\n\n" + "="*80, "DETAILED EVALUATION RESULTS - EACH MODEL SEPARATELY", "="*80, sep="\n")

# (heading, metric keys) for each breakdown; "<key>_count" holds the matching
# sample count, and missing keys are reported as 0.
result_breakdowns = (
    (f"\n--- BY SUBJECT ---", ['NAT', 'SOC', 'LAN']),
    (f"\n--- BY CONTEXT ---", ['TXT', 'IMG', 'NO', 'TXT+IMG']),
    (f"\n--- BY GRADE LEVEL ---", ['G1-6', 'G7-12']),
)

for model_name, results in all_results.items():
    # Per-model banner.
    print(f"\n{'='*80}", f"MODEL: {model_name}", f"{'='*80}", sep="\n")

    print(f"\nOverall Accuracy: {results['Overall']:.2f}%")
    print(f"Total Samples: {results.get('Total_Samples', 0)}")
    print(f"Model Type: {results.get('model_type', 'unknown')}")

    for heading, metric_keys in result_breakdowns:
        print(heading)
        for metric_key in metric_keys:
            acc = results.get(metric_key, 0)
            count = results.get(f'{metric_key}_count', 0)
            print(f"  {metric_key}: {acc:.2f}% ({count} samples)")



DETAILED EVALUATION RESULTS - EACH MODEL SEPARATELY

MODEL: Fine-tuned UnifiedQA

Overall Accuracy: 57.23%
Total Samples: 4241
Model Type: text

--- BY SUBJECT ---
  NAT: 56.26% (2252 samples)
  SOC: 58.72% (889 samples)
  LAN: 58.00% (1100 samples)

--- BY CONTEXT ---
  TXT: 63.88% (789 samples)
  IMG: 61.58% (760 samples)
  NO: 59.23% (1435 samples)
  TXT+IMG: 48.13% (1257 samples)

--- BY GRADE LEVEL ---
  G1-6: 59.91% (2724 samples)
  G7-12: 52.41% (1517 samples)

MODEL: Pre-trained UnifiedQA

Overall Accuracy: 41.38%
Total Samples: 4241
Model Type: text

--- BY SUBJECT ---
  NAT: 42.01% (2252 samples)
  SOC: 36.45% (889 samples)
  LAN: 44.09% (1100 samples)

--- BY CONTEXT ---
  TXT: 46.64% (789 samples)
  IMG: 38.55% (760 samples)
  NO: 43.14% (1435 samples)
  TXT+IMG: 37.79% (1257 samples)

--- BY GRADE LEVEL ---
  G1-6: 43.21% (2724 samples)
  G7-12: 38.10% (1517 samples)

MODEL: Fine-tuned BLIP

Overall Accuracy: 0.02%
Total Samples: 4241
Model Type: vision

--- BY SUBJECT -

In [None]:
# ============================================================================
# CELL 17: Save Detailed Results
# ============================================================================

# Persist the per-model metrics as indented JSON in the working directory.
with open('evaluation_results_all_models.json', 'w') as f:
    f.write(json.dumps(all_results, indent=2))

print("\n\n✅ Detailed results saved to evaluation_results_all_models.json")



✅ Detailed results saved to evaluation_results_all_models.json


In [None]:
# ============================================================================
# CELL 18: Comparison Summary
# ============================================================================

# Section banner.
print("\n" + "="*80, "QUICK COMPARISON SUMMARY", "="*80, sep="\n")

# Column headers followed by a rule spanning the three columns (30 + 20 + 15).
print(f"\n{'Model':<30} {'Overall Accuracy':<20} {'Total Samples':<15}")
print("-" * 65)

# One row per model, ordered alphabetically by model name (keys are unique,
# so sorting the items never compares the result mappings themselves).
for model_name, results in sorted(all_results.items()):
    accuracy = results.get('Overall', 0)
    total = results.get('Total_Samples', 0)
    print(f"{model_name:<30} {accuracy:>6.2f}%{'':<12} {total:>6}")

# Write the same per-model report that is shown on screen to a text file.
# print(..., file=f) appends the trailing newline each original write() carried.
with open('evaluation_comparison.txt', 'w') as f:
    print("="*80, file=f)
    print("DETAILED EVALUATION RESULTS - EACH MODEL SEPARATELY", file=f)
    print("="*80 + "\n", file=f)

    # (heading, metric keys); "<key>_count" holds the matching sample count.
    report_sections = (
        ("--- BY SUBJECT ---", ['NAT', 'SOC', 'LAN']),
        ("\n--- BY CONTEXT ---", ['TXT', 'IMG', 'NO', 'TXT+IMG']),
        ("\n--- BY GRADE LEVEL ---", ['G1-6', 'G7-12']),
    )

    for model_name, results in all_results.items():
        print(f"\n{'='*80}", file=f)
        print(f"MODEL: {model_name}", file=f)
        print(f"{'='*80}\n", file=f)
        print(f"Overall Accuracy: {results['Overall']:.2f}%", file=f)
        print(f"Total Samples: {results.get('Total_Samples', 0)}", file=f)
        print(f"Model Type: {results.get('model_type', 'unknown')}\n", file=f)

        for heading, metric_keys in report_sections:
            print(heading, file=f)
            for metric_key in metric_keys:
                acc = results.get(metric_key, 0)
                count = results.get(f'{metric_key}_count', 0)
                print(f"  {metric_key}: {acc:.2f}% ({count} samples)", file=f)

print(f"✅ Comparison summary saved to evaluation_comparison.txt")


QUICK COMPARISON SUMMARY

Model                          Overall Accuracy     Total Samples  
-----------------------------------------------------------------
Fine-tuned BLIP                  0.02%               4241
Fine-tuned UnifiedQA            57.23%               4241
Pre-trained BLIP                 2.66%               4241
Pre-trained UnifiedQA           41.38%               4241
✅ Comparison summary saved to evaluation_comparison.txt


In [None]:
import os
from getpass import getpass

from huggingface_hub import HfApi

# SECURITY: never hard-code an access token in a notebook. The token is read
# from the HF_TOKEN environment variable, falling back to an interactive
# prompt that is not echoed or stored in the notebook.
# The token that used to be embedded in this cell has been exposed and must be
# revoked at https://huggingface.co/settings/tokens.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
HF_USERNAME = "VishalM12"

# Repository names
TEXT_MODEL_REPO = f"{HF_USERNAME}/scienceqa-unifiedqa-finetuned"
VISION_MODEL_REPO = f"{HF_USERNAME}/scienceqa-blip-finetuned"

# The output directories also hold intermediate "checkpoint-*" folders
# (optimizer state, RNG state, duplicate weights). They are not needed to load
# the final model, so they are left out of the upload. Set this to None to
# upload everything.
UPLOAD_IGNORE_PATTERNS = ["checkpoint-*/*"]

api = HfApi()

# Create repositories first. exist_ok=True already makes an existing repo a
# no-op, so any exception here is a real failure (bad token, network, ...) and
# is allowed to propagate instead of being reported as "already exists".
print("Creating repositories...")
api.create_repo(repo_id=TEXT_MODEL_REPO, token=HF_TOKEN, exist_ok=True)
print("✓ Text model repo ready")

api.create_repo(repo_id=VISION_MODEL_REPO, token=HF_TOKEN, exist_ok=True)
print("✓ Vision model repo ready")

# Upload text model
print("\nUploading text model...")
api.upload_folder(
    folder_path=CONFIG['output_dir_text'],
    repo_id=TEXT_MODEL_REPO,
    token=HF_TOKEN,
    commit_message="Upload fine-tuned ScienceQA UnifiedQA model",
    ignore_patterns=UPLOAD_IGNORE_PATTERNS
)
print(f"✓ Text model uploaded: https://huggingface.co/{TEXT_MODEL_REPO}")

# Upload vision model
print("\nUploading vision model...")
api.upload_folder(
    folder_path=CONFIG['output_dir_vision'],
    repo_id=VISION_MODEL_REPO,
    token=HF_TOKEN,
    commit_message="Upload fine-tuned ScienceQA BLIP model",
    ignore_patterns=UPLOAD_IGNORE_PATTERNS
)
print(f"✓ Vision model uploaded: https://huggingface.co/{VISION_MODEL_REPO}")

Creating repositories...
✓ Text model repo ready
✓ Vision model repo ready

Uploading text model...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...kpoint-4500/rng_state.pth:  77%|#######7  | 11.3kB / 14.6kB            

  ...kpoint-4773/rng_state.pth:  77%|#######7  | 11.3kB / 14.6kB            

  ...nt-4773/model.safetensors:   0%|          |  552kB /  242MB            

  ...ckpoint-4500/optimizer.pt:   0%|          |  642kB /  484MB            

  ...nt-4500/model.safetensors:   0%|          |  552kB /  242MB            

  ...ckpoint-4773/optimizer.pt:   0%|          |  634kB /  484MB            

  ...checkpoint-4500/scaler.pt: 100%|##########| 1.38kB / 1.38kB            

  ...ifiedqa/model.safetensors:   0%|          |  552kB /  242MB            

  ...ckpoint-4500/spiece.model: 100%|##########|  792kB /  792kB            

  ...ckpoint-4773/spiece.model: 100%|##########|  792kB /  792kB            

✓ Text model uploaded: https://huggingface.co/VishalM12/scienceqa-unifiedqa-finetuned

Uploading vision model...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ed_blip/model.safetensors:   0%|          | 60.1kB / 1.45GB            

✓ Vision model uploaded: https://huggingface.co/VishalM12/scienceqa-blip-finetuned
