In [1]:
from datasets import load_dataset

ds = load_dataset("vidore/arxivqa_test_subsampled")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = ds["test"].take(2)

# from PIL import Image
# from datasets import Dataset, DatasetDict

# # Create images
# images = [
#     Image.new("RGB", (16, 16), color="black"),
# ]

# # Corresponding queries
# queries = [
#     "Is attention really all you need?",
# ]

# # Combine images and queries into a list of dictionaries
# data = {"image": images, "query": queries}

# ds = Dataset.from_dict(data)


In [3]:
ds

Dataset({
    features: ['query', 'image', 'image_filename', 'options', 'answer', 'page', 'model', 'prompt', 'source'],
    num_rows: 2
})

In [28]:
from torchvision.transforms import ToPILImage
import matplotlib.pyplot as plt
import torch
import numpy

to_pil_image = ToPILImage()

def plot_image(img):
    # Convert tensor or NumPy array to PIL image if necessary
    if isinstance(img, torch.Tensor) or isinstance(img, numpy.ndarray):
        img = to_pil_image(img)
    
    # Display the image using matplotlib
    plt.axis("off")  # Turn off axes for cleaner visualization
    plt.imshow(img)  # Show the image
    plt.show()       # Render the plot


In [4]:
from typing import List, Optional, Tuple, Union, cast
from PIL.Image import Image


texts_query: List[Union[str, None]] = []
images: List[Image] = []

# Parse the examples.
for example in ds:
    query = example.get("query")
    texts_query.append(query)

    image = example.get("image")
    if image is None:
        raise ValueError("Image is None - This collator does not support None images yet.")
    images.append(cast(Image, image))

In [15]:
from collator import VisualRetrieverCollator
from colpali_engine.models import ColIdefics3, ColIdefics3Processor

processor = ColIdefics3Processor.from_pretrained("vidore/colSmol-256M")
vsc = VisualRetrieverCollator(processor, 256)
# processor.image_processor.do_image_splitting = False  # For Simplicity

preprocessed_inputs = vsc(ds)

In [16]:
for k, v in preprocessed_inputs.items():
    print(k)
    print(v.shape)
    print(v.dtype)

doc_pixel_values
torch.Size([2, 13, 3, 512, 512])
torch.float32
doc_pixel_attention_mask
torch.Size([2, 13, 512, 512])
torch.float32
doc_input_ids
torch.Size([2, 870])
torch.float32
doc_attention_mask
torch.Size([2, 870])
torch.float32
query_input_ids
torch.Size([2, 39])
torch.float32
query_attention_mask
torch.Size([2, 39])
torch.float32


In [71]:
doc_input_ids = preprocessed_inputs["doc_input_ids"]
doc_input_tokens = processor.tokenizer.decode(doc_input_ids[0])
print("Doc Input tokens:", doc_input_tokens)

print()
query_input_ids = preprocessed_inputs["query_input_ids"]
query_input_tokens = processor.tokenizer.decode(query_input_ids[0])
print("Query Input tokens:", query_input_tokens)

Doc Input tokens: <|im_start|>User: Describe the image.<fake_token_around_image><row_1_col_1><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_image><row_1_col_2><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><

In [72]:
from colpali_engine.models import ColIdefics3, ColIdefics3Processor

model = ColIdefics3.from_pretrained(
        "vidore/colSmol-256M",
        torch_dtype=torch.float16,
        device_map="mps",
        attn_implementation= None
    ).eval()

In [73]:
import torch

# Set device to MPS
device = torch.device('mps')


# Move input tensors to MPS device
query_input_ids = preprocessed_inputs["query_input_ids"].to(device)
query_attention_mask = preprocessed_inputs["query_attention_mask"].to(device)

# Perform inference with the model
with torch.no_grad():
    query_outputs = model(input_ids=query_input_ids, attention_mask=query_attention_mask)
    # Feed only kwargs with 'doc_' prefix
    doc_outputs = model(**{k[4:]: v.to(torch.int32).to(device) for k, v in preprocessed_inputs.items() if k.startswith("doc")})


In [74]:
query_outputs.shape

torch.Size([2, 39, 128])

In [75]:
doc_outputs.shape

torch.Size([2, 870, 128])

In [85]:
import torch.nn.functional as F

scores = torch.einsum("bnd,csd->bcns", query_outputs, doc_outputs).max(dim = 3)[0].sum(dim = -1)

print(scores)

pos_scores = torch.diag(scores)

# # ColPali Implementation of Neg Scores
# negative_scores = scores - torch.eye(scores.shape[0], device = scores.device) * 1e6
# negative_scores = negative_scores.max(dim = 0)[0]

# loss = F.softplus(negative_scores - pos_scores).mean()

# Self
mask = torch.eye(scores.shape[0], device = scores.device).bool()
s_masked = scores.masked_fill(mask, float('-inf'))
neg_scores = s_masked.max(dim = 0)[0]

loss = F.softplus(neg_scores - pos_scores).mean()

print(pos_scores)
print(neg_scores)
print(loss)

tensor([[24.5469, 14.6641],
        [15.3516, 19.5938]], device='mps:0', dtype=torch.float16)
tensor([24.5469, 19.5938], device='mps:0', dtype=torch.float16)
tensor([15.3516, 14.6641], device='mps:0', dtype=torch.float16)
tensor(0.0037, device='mps:0', dtype=torch.float16)


In [49]:
from peft import LoraConfig, PeftModel, get_peft_model

model=ColIdefics3.from_pretrained("vidore/ColSmolVLM-Instruct-256M-base", torch_dtype=torch.float16, attn_implementation="eager")


In [50]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [51]:
model

ColIdefics3(
  (model): Idefics3Model(
    (vision_model): Idefics3VisionTransformer(
      (embeddings): Idefics3VisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): Idefics3Encoder(
        (layers): ModuleList(
          (0-11): 12 x Idefics3EncoderLayer(
            (self_attn): Idefics3VisionAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): Idefics3VisionMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features=768, out_features=3072

In [44]:
for name, module in model.named_modules():
    print(name)



model
model.vision_model
model.vision_model.embeddings
model.vision_model.embeddings.patch_embedding
model.vision_model.embeddings.position_embedding
model.vision_model.encoder
model.vision_model.encoder.layers
model.vision_model.encoder.layers.0
model.vision_model.encoder.layers.0.self_attn
model.vision_model.encoder.layers.0.self_attn.k_proj
model.vision_model.encoder.layers.0.self_attn.v_proj
model.vision_model.encoder.layers.0.self_attn.q_proj
model.vision_model.encoder.layers.0.self_attn.out_proj
model.vision_model.encoder.layers.0.layer_norm1
model.vision_model.encoder.layers.0.mlp
model.vision_model.encoder.layers.0.mlp.activation_fn
model.vision_model.encoder.layers.0.mlp.fc1
model.vision_model.encoder.layers.0.mlp.fc2
model.vision_model.encoder.layers.0.layer_norm2
model.vision_model.encoder.layers.1
model.vision_model.encoder.layers.1.self_attn
model.vision_model.encoder.layers.1.self_attn.k_proj
model.vision_model.encoder.layers.1.self_attn.v_proj
model.vision_model.encoder

In [45]:
peft_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    init_lora_weights="gaussian",
    bias="none",
    task_type="FEATURE_EXTRACTION",
    target_modules=r"(.*(model.text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(linear).*$)", 
    modules_to_save = ["linear"]
)

model = get_peft_model(model, peft_config)

In [46]:
model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): ColIdefics3(
      (model): Idefics3Model(
        (vision_model): Idefics3VisionTransformer(
          (embeddings): Idefics3VisionEmbeddings(
            (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
            (position_embedding): Embedding(1024, 768)
          )
          (encoder): Idefics3Encoder(
            (layers): ModuleList(
              (0-11): 12 x Idefics3EncoderLayer(
                (self_attn): Idefics3VisionAttention(
                  (k_proj): Linear(in_features=768, out_features=768, bias=True)
                  (v_proj): Linear(in_features=768, out_features=768, bias=True)
                  (q_proj): Linear(in_features=768, out_features=768, bias=True)
                  (out_proj): Linear(in_features=768, out_features=768, bias=True)
                )
                (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
              

In [47]:
for param in model.base_model.model.linear.parameters():
    # print(param.dtype)
    print(param.requires_grad)

False
False
True
True


In [20]:
from transformers import TrainingArguments

training_args = TrainingArguments(
        output_dir = None,
        num_train_epochs = 1,
        # max_steps = 10,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=1,
        weight_decay=0.01,
        fp16=True, 
        optim = "paged_adamw_8bit"
    )

print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=F

In [35]:
from datasets import load_dataset

base_url = "https://huggingface.co/datasets/vidore/colpali_train_set/resolve/main/data/"
data_files = {"train": base_url + "train-00005-of-00082.parquet", "test": base_url + "test-00000-of-00001.parquet"}

# Load only the first 1,000 rows of the training split
train_subset = load_dataset("parquet", data_files=data_files)


Generating train split: 1442 examples [00:01, 1063.87 examples/s]
Generating test split: 500 examples [00:00, 1203.03 examples/s]


In [36]:
train_subset

DatasetDict({
    train: Dataset({
        features: ['image', 'image_filename', 'query', 'answer', 'source', 'options', 'page', 'model', 'prompt', 'answer_type'],
        num_rows: 1442
    })
    test: Dataset({
        features: ['image', 'image_filename', 'query', 'answer', 'source', 'options', 'page', 'model', 'prompt', 'answer_type'],
        num_rows: 500
    })
})