In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
# Switch the kernel's working directory to the finetune project root.
# NOTE(review): presumably this is what makes the `qwenvl` package importable
# in later cells and anchors any relative paths — confirm. The absolute path
# is machine-specific.
os.chdir('/projects/cft_vlm/finetune')

In [4]:
import copy
import os
from qwenvl.train.argument import *
import torch


def set_processor(processor_args, processor):
  """Return a deep copy of ``processor`` configured from ``processor_args``.

  The copy is taken *before* any setting is applied, so the ``processor``
  object passed in is left unmodified; callers must use the return value.

  Args:
    processor_args: Object exposing the attributes ``padding_side``,
      ``model_max_length``, ``image_max_pixels``, ``image_min_pixels``,
      ``shortest_edge``, ``video_max_pixels``, ``video_min_pixels`` and
      ``video_default_to_square``.
    processor: Object exposing ``tokenizer``, ``image_processor`` and
      ``video_processor`` attributes. The image and video processors must
      each carry a mutable ``size`` mapping.

  Returns:
    A new, independently owned processor with the settings applied.
  """
  # Copy first: configuring the caller's object and only then copying it
  # would leak the new settings back into the caller's instance.
  configured = copy.deepcopy(processor)

  tokenizer = configured.tokenizer
  img_processor = configured.image_processor
  vid_processor = configured.video_processor

  # Tokenizer settings.
  tokenizer.padding_side = processor_args.padding_side
  tokenizer.model_max_length = processor_args.model_max_length

  # Image processor settings.
  img_processor.max_pixels = processor_args.image_max_pixels
  img_processor.min_pixels = processor_args.image_min_pixels
  img_processor.size["shortest_edge"] = processor_args.shortest_edge

  # Video processor settings. The pixel bounds are stored under the
  # ``*_frame_pixels`` attribute names; the same ``shortest_edge`` value is
  # shared with the image processor.
  vid_processor.max_frame_pixels = processor_args.video_max_pixels
  vid_processor.min_frame_pixels = processor_args.video_min_pixels
  vid_processor.size['shortest_edge'] = processor_args.shortest_edge
  vid_processor.default_to_square = processor_args.video_default_to_square

  return configured


# Import `transformers` explicitly instead of relying on
# `from qwenvl.train.argument import *` to bring the module name into scope.
import transformers

# Instantiate every argument group with its defaults (same construction order
# as a single tuple assignment would use).
model_args = ModelArguments()
data_args = DataArguments()
training_args = TrainingArguments()
proc_args = ProcessingArguments()

# Load the processor that matches the configured base model, requesting the
# fast implementation.
processor = transformers.AutoProcessor.from_pretrained(
    model_args.model_name_or_path,
    use_fast=True,
)

In [12]:
# Show the video processor's current configuration as the cell output.
processor.video_processor

Qwen2VLVideoProcessor {
  "_valid_kwargs_names": [
    "do_convert_rgb",
    "do_resize",
    "size",
    "size_divisor",
    "default_to_square",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_pad",
    "do_center_crop",
    "crop_size",
    "data_format",
    "input_data_format",
    "device",
    "min_pixels",
    "max_pixels",
    "patch_size",
    "temporal_patch_size",
    "merge_size"
  ],
  "crop_size": null,
  "data_format": "channels_first",
  "default_to_square": false,
  "device": null,
  "do_center_crop": null,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_pad": null,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Qwen2VLImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "input_data_format": null,
  "max_frame_pixels": 150528,
  "max_pixels": 12845056,
  "merge_size

In [None]:
import pathlib
from qwenvl.data.data_qwen import make_supervised_data_module

# Apply the ProcessingArguments settings to the processor before the data
# module is built from it.
processor = set_processor(proc_args, processor)
torch.set_num_threads(1)
data_args.dataset_use = "openpmc_validation"
data_module = make_supervised_data_module(processor=processor, data_args=data_args, proc_args=proc_args, num_proc=32)

# These imports stay after the data module is built so that import-time side
# effects run in the same order as before.
from qwenvl.train.trainer import Trainer
from transformers import Qwen2_5_VLForConditionalGeneration

os.makedirs(training_args.output_dir, exist_ok=True)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    attn_implementation='eager',
    torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
)

# torch.distributed.get_rank() raises when no default process group has been
# initialised (e.g. a plain single-process notebook kernel). Treat that case
# as the main process so the parameter summary is still printed.
is_main_process = (
    not (torch.distributed.is_available() and torch.distributed.is_initialized())
    or torch.distributed.get_rank() == 0
)
if is_main_process:
  model.visual.print_trainable_parameters()
  model.model.print_trainable_parameters()

# use_cache is switched off for training and restored before the final save.
model.config.use_cache = False

if training_args.gradient_checkpointing:
  # Make the embedding outputs require grad: use the model's own helper when
  # it exists, otherwise install an equivalent forward hook.
  if hasattr(model, "enable_input_require_grads"):
    model.enable_input_require_grads()
  else:
    def make_inputs_require_grad(module, input, output):
      output.requires_grad_(True)
    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

trainer = Trainer(
    model=model, processing_class=processor, args=training_args, **data_module
)

# Resume when the output directory already contains at least one
# "checkpoint-*" entry; otherwise start a fresh run.
if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
  print("checkpoint found, resume training")
  trainer.train(resume_from_checkpoint=True)
else:
  trainer.train()

trainer.save_state()
processor.save_pretrained(training_args.output_dir)

model.config.use_cache = True

# NOTE(review): `safe_save_model_for_hf_trainer` is not imported in any visible
# cell; presumably it is supplied by `from qwenvl.train.argument import *` —
# confirm and import it explicitly from its defining module.
safe_save_model_for_hf_trainer(
    trainer=trainer, output_dir=training_args.output_dir)

Dataset /projects/cft_vlm/datasets/openpmc/data/dataset/validation already processed with the same processor and video args.
