# Evaluation of PaliGemma on MathVerse

In [1]:
from transformers import AutoProcessor, AutoModel, AutoModelForVision2Seq
import torch

from PIL import Image
import requests

In [2]:
import os

model_id = "google/paligemma-3b-mix-224"

# SECURITY: access tokens must never be hardcoded in a notebook. The token that
# was previously committed in this cell has to be considered leaked and should be
# revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable instead. When the
# variable is unset, `token=None` is passed and the library is expected to fall
# back to the credentials stored by `huggingface-cli login`.
hf_token = os.environ.get("HF_TOKEN")

processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
model = AutoModelForVision2Seq.from_pretrained(model_id, token=hf_token)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

#### Load MathVerse Dataset

In [2]:
from datasets import load_dataset

# Load the "testmini" configuration of MathVerse and switch the output format
# to torch, so that images are returned as tensors.
ds = load_dataset("AI4Math/MathVerse", "testmini").with_format("torch")

In [3]:
# Select the "testmini" split; the cells below work on this split.
ds_testmini = ds["testmini"]

In [4]:
# Display the first record to see which fields a sample carries.
ds_testmini[0]

{'sample_index': '1',
 'problem_index': '1',
 'problem_version': 'Text Dominant',
 'question': 'As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, point D is on AB and point E is on AC, DE parallel BC, then the size of angle CED is ()\nChoices:\nA:40°\nB:60°\nC:120°\nD:140°',
 'image': tensor([[255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         ...,
         [255, 255, 160,  ..., 227, 182, 255],
         [255, 255,  89,  ...,  72, 255, 255],
         [193,   0,   0,  ..., 255, 255, 255]], dtype=torch.uint8),
 'answer': 'D',
 'question_type': 'multi-choice',
 'metadata': {'split': 'testmini',
  'source': 'GeoQA',
  'subject': 'Plane Geometry',
  'subfield': 'Angle'},
 'query_wo': 'Please directly answer the question and provide the correct option letter, e.g., A, B, C, D.\nQuestion: As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle

In [6]:
import torch
from torchvision.transforms import ToPILImage, ToTensor

# The images do not share a common shape (some are 2-D, others carry a trailing
# channel axis), which is problematic for preparing batches.
# Print the shapes of the first seven samples to illustrate this.
for i, entry_ in zip(range(7), ds_testmini):
    print(entry_['image'].shape)

torch.Size([93, 137])
torch.Size([93, 137])
torch.Size([93, 137])
torch.Size([289, 424, 4])
torch.Size([688, 964, 4])
torch.Size([132, 211])
torch.Size([132, 211])


In [72]:
# Scan every sample and report images with an unexpected layout: anything with
# more than three dimensions, or a 3-D image whose last axis is neither 3 nor 4.
# 2-D images and 3-/4-channel images are accepted silently.
for i, entry_ in enumerate(ds_testmini):
    shapes = entry_['image'].shape
    if len(shapes) > 3:
        print("More than 3 dimensions")
        continue
    if len(shapes) == 3 and shapes[2] not in (3, 4):
        print(f"Shape for entry {i} : {shapes}")

In [7]:
# Converters between torch tensors and PIL images, reused by the cells below.
to_pil, to_tensor = ToPILImage(), ToTensor()

In [21]:
# Sample 4 is stored channels-last: (height x width x n_channels) with n_channels=4.
# Move the channel axis to the front (4 x height x width) before building the
# PIL image, then convert to RGB to end up with three channels.
image_4channels = ds_testmini[4]['image'].permute(2, 0, 1)
pil_image = to_pil(image_4channels)
image_4channels_reshaped = to_tensor(pil_image.convert('RGB'))
print(f"Shape: {image_4channels_reshaped.shape}")

Shape: torch.Size([3, 688, 964])


In [22]:
# Sample 0 is a 2-D image of shape (height, width) with no channel axis.
# Going through PIL and converting to RGB yields a (3, height, width) tensor.
image_no_channels = ds_testmini[0]['image']
pil_image = to_pil(image_no_channels)
image_no_channels_reshaped = to_tensor(pil_image.convert('RGB'))
print(f"Shape: {image_no_channels_reshaped.shape}")

Shape: torch.Size([3, 93, 137])


In [36]:
import torchvision

# Check that a channels-first RGB tensor can be resized to 224 x 224.
torchvision.transforms.functional.resize(
    image_4channels_reshaped,
    size=(224, 224),
).shape

torch.Size([3, 224, 224])

In [60]:
def preprocess(element, height=224, width=224):
    """Turn a sample's image into a fixed-size, channels-first RGB tensor.

    Args:
        element: Mapping holding an 'image' tensor. A 3-D image whose last
            axis has 3 or 4 entries is treated as channels-last and permuted;
            any other image is passed to the PIL conversion unchanged.
        height: Image height after resizing.
        width: Image width after resizing.

    Returns:
        The same ``element`` object, with its 'image' entry replaced by the
        resized RGB tensor.
    """
    image = element['image']

    # Channels-last (height x width x n_channels) -> channels-first.
    is_channels_last = len(image.shape) == 3 and image.shape[-1] in (3, 4)
    if is_channels_last:
        image = image.permute(2, 0, 1)

    # Round-trip through PIL to obtain a three-channel RGB tensor.
    rgb_image = to_tensor(to_pil(image).convert('RGB'))

    # Bring every image to the same spatial size.
    element['image'] = torchvision.transforms.functional.resize(rgb_image, size=(height, width))
    return element

In [61]:
# Apply `preprocess` to every sample of the split. The result is bound to a new
# name, so `ds_testmini` itself is not reassigned.
ds_preprocess = ds_testmini.map(preprocess)

Map:   0%|          | 0/3940 [00:00<?, ? examples/s]

**Prepare batch**

In [68]:
from torch.utils.data import DataLoader

# Shuffle with an explicitly seeded generator so that the batch order is the
# same after every Restart Kernel -> Run All; an unseeded shuffle would make
# the batches (and anything derived from them) irreproducible.
shuffle_rng = torch.Generator().manual_seed(0)
dataloader = DataLoader(ds_preprocess, batch_size=32, shuffle=True, generator=shuffle_rng)

In [69]:
# Pull a single batch from the loader and print it to inspect its structure;
# the loop is left after the first iteration.
for batch_ in dataloader:
    print(batch_)
    break

{'sample_index': ['1051', '2893', '1219', '2074', '3319', '1425', '2286', '283', '2787', '2209', '1782', '2007', '3696', '2192', '2481', '3831', '1807', '3', '2993', '2099', '9', '3118', '840', '1293', '237', '3133', '428', '1886', '75', '1527', '3443', '1191'], 'problem_index': ['211', '579', '244', '415', '664', '285', '458', '57', '558', '442', '357', '402', '740', '439', '497', '767', '362', '1', '599', '420', '2', '624', '168', '259', '48', '627', '86', '378', '15', '306', '689', '239'], 'problem_version': ['Text Dominant', 'Vision Intensive', 'Vision Dominant', 'Vision Dominant', 'Vision Dominant', 'Vision Only', 'Text Dominant', 'Vision Intensive', 'Text Lite', 'Vision Dominant', 'Text Lite', 'Text Lite', 'Text Dominant', 'Text Lite', 'Text Dominant', 'Text Dominant', 'Text Lite', 'Vision Intensive', 'Vision Intensive', 'Vision Dominant', 'Vision Dominant', 'Vision Intensive', 'Vision Only', 'Vision Intensive', 'Text Lite', 'Vision Intensive', 'Vision Intensive', 'Text Dominant'