In [None]:
pip install transformers datasets torch pillow tqdm accelerate scikit-learn bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [1]:
import torch
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoProcessor,
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model

CONFIG = {
    'text_model': "Qwen/Qwen2.5-1.5B-Instruct",
    'vision_model': "microsoft/git-base-textvqa",

    'max_length': 384,

    'output_dir': './scienceqa_finetuned',
    'batch_size': 4,
    'epochs': 1,
    'lr': 2e-4,
    'seed': 42
}

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [2]:

def format_options(choices):
    labels = ['A', 'B', 'C', 'D', 'E', 'F']
    return " ".join([f"({labels[i]}) {c}" for i, c in enumerate(choices)])

class ScienceQADataset(Dataset):
    def __init__(self, dataset, tokenizer, processor, mode='text'):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.processor = processor
        self.mode = mode

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]

        question = item['question']
        context = item['hint'] if item['hint'] else ""
        options = format_options(item['choices'])
        answer_text = item['choices'][item['answer']]

        if self.mode == 'text':
            messages = [
                {"role": "system", "content": "You are a scientific assistant. Answer directly."},
                {"role": "user", "content": f"Question: {question}\nContext: {context}\nOptions: {options}"},
                {"role": "assistant", "content": answer_text}
            ]
            full_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

            encodings = self.tokenizer(
                full_prompt,
                max_length=CONFIG['max_length'],
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            )
            input_ids = encodings.input_ids.squeeze()
            labels = input_ids.clone()
            labels[labels == self.tokenizer.pad_token_id] = -100

            return {
                'input_ids': input_ids,
                'attention_mask': encodings.attention_mask.squeeze(),
                'labels': labels
            }

        elif self.mode == 'vision':
            image = item['image']
            if image is None: image = Image.new('RGB', (224, 224), color='black')

            prompt = f"Question: {question} Context: {context} Choices: {options} Answer:"
            full_text = f"{prompt} {answer_text}"

            inputs = self.processor(
                images=image,
                text=full_text,
                return_tensors="pt",
                padding="max_length",
                max_length=CONFIG['max_length'],
                truncation=True
            )

            input_ids = inputs.input_ids.squeeze()
            labels = input_ids.clone()

            prompt_ids = self.processor.tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=CONFIG['max_length']
            ).input_ids.squeeze()

            if len(prompt_ids) < len(labels):
                labels[:len(prompt_ids)] = -100

            labels[input_ids == self.processor.tokenizer.pad_token_id] = -100

            return {
                'pixel_values': inputs.pixel_values.squeeze(),
                'input_ids': input_ids,
                'attention_mask': inputs.attention_mask.squeeze(),
                'labels': labels
            }
raw_dataset = load_dataset("derek-thomas/ScienceQA")
train_data = raw_dataset['train']
val_data = raw_dataset['validation']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-1028f23e353fbe(…):   0%|          | 0.00/377M [00:00<?, ?B/s]

data/validation-00000-of-00001-6c7328ff6(…):   0%|          | 0.00/126M [00:00<?, ?B/s]

data/test-00000-of-00001-f0e719df791966f(…):   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

In [3]:

from transformers import DataCollatorForLanguageModeling

print(f"\n=== Loading {CONFIG['text_model']} ===")

tokenizer = AutoTokenizer.from_pretrained(CONFIG['text_model'])
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

text_model = AutoModelForCausalLM.from_pretrained(
    CONFIG['text_model'],
    torch_dtype=torch.float16
).to(device)

peft_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
)
text_model = get_peft_model(text_model, peft_config)

train_dataset_text = ScienceQADataset(train_data, tokenizer, None, mode='text')
val_dataset_text = ScienceQADataset(val_data.select(range(50)), tokenizer, None, mode='text')

training_args = TrainingArguments(
    output_dir=f"{CONFIG['output_dir']}/text_model",
    per_device_train_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=4,
    learning_rate=CONFIG['lr'],
    num_train_epochs=CONFIG['epochs'],
    fp16=True,
    logging_steps=50,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=text_model,
    args=training_args,
    train_dataset=train_dataset_text,
    eval_dataset=val_dataset_text,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

print("Starting Text Training...")
trainer.train()

text_model.save_pretrained(f"{CONFIG['output_dir']}/text_model_final")
tokenizer.save_pretrained(f"{CONFIG['output_dir']}/text_model_final")
text_model.save_pretrained(f"{CONFIG['output_dir']}/text_model_final")
del text_model, trainer
torch.cuda.empty_cache()


=== Loading Qwen/Qwen2.5-1.5B-Instruct ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Starting Text Training...


Step,Training Loss
50,1.3983
100,0.9369
150,0.82
200,0.6936
250,0.6788
300,0.6544
350,0.628
400,0.5923
450,0.5838
500,0.5869


In [4]:

import os
import shutil
from google.colab import drive


drive.mount('/content/drive')


drive_folder = "/content/drive/MyDrive/ScienceQA_Project"
drive_model_path = f"{drive_folder}/text_model_final"

local_dest_path = "./scienceqa_finetuned/text_model_final"

print(f"Restoring model from {drive_model_path}...")

if os.path.exists(drive_model_path):
    if os.path.exists(local_dest_path):
        shutil.rmtree(local_dest_path)
    shutil.copytree(drive_model_path, local_dest_path)
    print(f"✅ Success! Model restored to {local_dest_path}")
    print("You can now run the Evaluation cells.")
else:
    print(f"❌ Error: No saved model found at {drive_model_path}")

Mounted at /content/drive
Restoring model from /content/drive/MyDrive/ScienceQA_Project/text_model_final...
✅ Success! Model restored to ./scienceqa_finetuned/text_model_final
You can now run the Evaluation cells.


In [5]:

import gc
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

if 'text_model' in globals(): del text_model
if 'trainer' in globals(): del trainer
gc.collect()
torch.cuda.empty_cache()

print(f"\n=== Loading {CONFIG['vision_model']} ===")

vision_processor = AutoProcessor.from_pretrained(CONFIG['vision_model'])
vision_model = AutoModelForCausalLM.from_pretrained(CONFIG['vision_model'])

vision_model.gradient_checkpointing_enable()
vision_model.to(device)

grad_accum_steps = 2
micro_batch_size = 2

train_data_vis = train_data.filter(lambda x: x['image'] is not None)
train_dataset_vis = ScienceQADataset(train_data_vis, None, vision_processor, mode='vision')
train_loader = DataLoader(
    train_dataset_vis,
    batch_size=micro_batch_size,
    shuffle=True,
    pin_memory=True,
    num_workers=2
)

optimizer = AdamW(vision_model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=50,
    num_training_steps=len(train_loader) * CONFIG['epochs']
)
scaler = GradScaler()


print(f"Starting Vision Training (Batch: {micro_batch_size} | Accum: {grad_accum_steps})...")
vision_model.train()

for epoch in range(CONFIG['epochs']):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    optimizer.zero_grad()

    for step, batch in enumerate(progress_bar):

        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with autocast():
            outputs = vision_model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / grad_accum_steps
        scaler.scale(loss).backward()

        if (step + 1) % grad_accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() * grad_accum_steps
        progress_bar.set_postfix(loss=loss.item() * grad_accum_steps)

vision_model.save_pretrained(f"{CONFIG['output_dir']}/vision_model_final")
vision_processor.save_pretrained(f"{CONFIG['output_dir']}/vision_model_final")
print("✓ Vision Model Saved")

del vision_model, optimizer, scaler
gc.collect()
torch.cuda.empty_cache()


=== Loading microsoft/git-base-textvqa ===


preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Filter:   0%|          | 0/12726 [00:00<?, ? examples/s]

  scaler = GradScaler()


Starting Vision Training (Batch: 2 | Accum: 2)...


  with autocast():
Epoch 1: 100%|██████████| 3109/3109 [18:55<00:00,  2.74it/s, loss=1.18]


✓ Vision Model Saved


In [6]:
import torch
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
from peft import PeftModel
from tqdm import tqdm
import gc

gc.collect()
torch.cuda.empty_cache()

print("\n=== Loading Fine-Tuned Models ===")

base_text_model = AutoModelForCausalLM.from_pretrained(
    CONFIG['text_model'],
    torch_dtype=torch.float16,
    device_map="auto"
)
text_model = PeftModel.from_pretrained(base_text_model, f"{CONFIG['output_dir']}/text_model_final")
text_tokenizer = AutoTokenizer.from_pretrained(f"{CONFIG['output_dir']}/text_model_final")
if text_tokenizer.pad_token is None: text_tokenizer.pad_token = text_tokenizer.eos_token
text_model.eval()

vision_model = AutoModelForCausalLM.from_pretrained(f"{CONFIG['output_dir']}/vision_model_final").to(device)
vision_processor = AutoProcessor.from_pretrained(f"{CONFIG['output_dir']}/vision_model_final")
vision_model.eval()

def format_options(choices):
    labels = ['A', 'B', 'C', 'D', 'E', 'F']
    return " ".join([f"({labels[i]}) {c}" for i, c in enumerate(choices)])

def get_best_match_index(prediction, choices):
    pred = prediction.lower().strip()
    scores = []
    for i, choice in enumerate(choices):
        choice_clean = choice.lower().strip()
        if pred == choice_clean: return i
        pred_tokens = set(pred.split())
        choice_tokens = set(choice_clean.split())
        overlap = len(pred_tokens.intersection(choice_tokens)) / len(choice_tokens) if choice_tokens else 0
        scores.append(overlap)
    return np.argmax(scores)

all_predictions = []
all_labels = []
all_subjects = []
all_grades = []
all_has_image = []
all_has_text = []

eval_subset = val_data.select(range(200))

print(f"\nRunning Inference on {len(eval_subset)} samples...")

with torch.no_grad():
    for item in tqdm(eval_subset):
        # Collect Metadata
        all_labels.append(item['answer'])
        all_subjects.append(item['subject'])
        all_grades.append(item['grade'])

        has_image = item['image'] is not None
        has_text = bool(item['hint'])
        all_has_image.append(has_image)
        all_has_text.append(has_text)


        question = item['question']
        choices = item['choices']
        hint = item['hint'] if item['hint'] else ""
        options = format_options(choices)

        if has_image:
            prompt = f"Question: {question} Context: {hint} Choices: {options} Answer:"
            inputs = vision_processor(images=item['image'], text=prompt, return_tensors="pt").to(device)
            generated_ids = vision_model.generate(
                pixel_values=inputs.pixel_values,
                input_ids=inputs.input_ids,
                max_new_tokens=10,
                repetition_penalty=1.2,
                do_sample=False
            )
            prediction = vision_processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

        else:
            messages = [{"role": "user", "content": f"Question: {question}\nContext: {hint}\nOptions: {options}"}]
            text_input = text_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = text_tokenizer(text_input, return_tensors="pt").to(device)
            generated_ids = text_model.generate(
                **inputs,
                max_new_tokens=10,
                repetition_penalty=1.2,
                do_sample=False
            )
            prediction = text_tokenizer.decode(generated_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

        pred_idx = get_best_match_index(prediction, choices)
        all_predictions.append(pred_idx)

predictions = np.array(all_predictions)
labels = np.array(all_labels)
subjects = np.array(all_subjects)
grades = np.array(all_grades)
has_image_arr = np.array(all_has_image)
has_text_arr = np.array(all_has_text)

results = {}

results['Overall'] = (predictions == labels).mean() * 100

for subject in ['natural science', 'social science', 'language science']:
    mask = subjects == subject
    if mask.sum() > 0:
        acc = (predictions[mask] == labels[mask]).mean() * 100
        results[subject.upper()[:3]] = acc

context_types = [
    ('TXT', has_text_arr & ~has_image_arr),
    ('IMG', has_image_arr & ~has_text_arr),
    ('NO', ~has_image_arr & ~has_text_arr),
    ('TXT+IMG', has_image_arr & has_text_arr)
]

for key, mask in context_types:
    if mask.sum() > 0:
        acc = (predictions[mask] == labels[mask]).mean() * 100
        results[key] = acc
    else:
        results[key] = 0.0

grade_nums = []
for g in grades:
    try:
        grade_nums.append(int(g.replace('grade', '')))
    except:
        grade_nums.append(0)
grade_nums = np.array(grade_nums)

for key, condition in [('G1-6', (grade_nums >= 1) & (grade_nums <= 6)),
                       ('G7-12', (grade_nums >= 7) & (grade_nums <= 12))]:
    if condition.sum() > 0:
        results[key] = (predictions[condition] == labels[condition]).mean() * 100
    else:
        results[key] = 0.0

print("\n" + "="*60)
print("EVALUATION RESULTS (Modular Approach C)")
print("="*60)
print(f"Overall: {results['Overall']:.2f}%")
print("-" * 60)
print(f"Subject: NAT={results.get('NAT', 0):.2f}% | SOC={results.get('SOC', 0):.2f}% | LAN={results.get('LAN', 0):.2f}%")
print(f"Context: TXT={results.get('TXT', 0):.2f}% | IMG={results.get('IMG', 0):.2f}% | NO={results.get('NO', 0):.2f}% | TXT+IMG={results.get('TXT+IMG', 0):.2f}%")
print(f"Grade:   G1-6={results.get('G1-6', 0):.2f}% | G7-12={results.get('G7-12', 0):.2f}%")
print("="*60)


with open(f"{CONFIG['output_dir']}/evaluation_results.json", 'w') as f:
    json.dump(results, f, indent=2)
print(f"✅ Results saved to {CONFIG['output_dir']}/evaluation_results.json")


=== Loading Fine-Tuned Models ===

Running Inference on 200 samples...


  0%|          | 0/200 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 200/200 [01:00<00:00,  3.30it/s]


EVALUATION RESULTS (Modular Approach C)
Overall: 57.00%
------------------------------------------------------------
Subject: NAT=57.48% | SOC=33.33% | LAN=69.57%
Context: TXT=91.11% | IMG=22.73% | NO=71.67% | TXT+IMG=34.25%
Grade:   G1-6=57.04% | G7-12=56.90%
✅ Results saved to ./scienceqa_finetuned/evaluation_results.json





In [None]:

import shutil
import os
from google.colab import drive

print("Mounting Google Drive...")
drive.mount('/content/drive')

drive_folder = "/content/drive/MyDrive/ScienceQA_Project"
os.makedirs(drive_folder, exist_ok=True)

print(f"Saving models to {drive_folder}...")


source_text = f"{CONFIG['output_dir']}/text_model_final"
dest_text = f"{drive_folder}/text_model_final"

if os.path.exists(source_text):
    if os.path.exists(dest_text):
        shutil.rmtree(dest_text)
    shutil.copytree(source_text, dest_text)
    print(f"✅ Text model saved to: {dest_text}")
else:
    print(f"⚠️ Source text model not found at {source_text}")

source_vis = f"{CONFIG['output_dir']}/vision_model_final"
dest_vis = f"{drive_folder}/vision_model_final"

if os.path.exists(source_vis):
    if os.path.exists(dest_vis):
        shutil.rmtree(dest_vis)
    shutil.copytree(source_vis, dest_vis)
    print(f"✅ Vision model saved to: {dest_vis}")
else:
    print(f"⚠️ Source vision model not found at {source_vis}")

print("\nBackup Complete!")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saving models to /content/drive/MyDrive/ScienceQA_Project...
✅ Text model saved to: /content/drive/MyDrive/ScienceQA_Project/text_model_final
✅ Vision model saved to: /content/drive/MyDrive/ScienceQA_Project/vision_model_final

Backup Complete!
