# Requirements

In [None]:
!pip install -U bitsandbytes
!pip install qwen-vl-utils
!pip install git+https://github.com/huggingface/transformers
!pip install peft
!pip install lightning

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import random
import os
import gc
import lightning as L
from tqdm import tqdm
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from torch.utils.data import Dataset, DataLoader
from transformers import BitsAndBytesConfig, Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Constants

In [None]:
def seed_everything(seed: int = 42) -> None:
    """Seed the global RNGs of NumPy, Python's ``random`` and PyTorch.

    Also writes ``PYTHONHASHSEED`` into ``os.environ``. No CUDA/cuDNN
    determinism flags are configured here.

    Args:
        seed: Value handed to every seeding function.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    # Same seeding order as before: NumPy, then `random`, then torch.
    for apply_seed in (np.random.seed, random.seed, torch.manual_seed):
        apply_seed(seed)


# Seed once at notebook start with the default seed.
seed_everything()

In [None]:
def clear():
    """Release unreachable Python objects, then hand cached CUDA memory back.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed before the CUDA cache is emptied; in the
    opposite order their memory would stay in the cache until the next call.
    """
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# When True, a new 250-sample test subset is drawn from ChartQA, saved and zipped.
# The subset used for evaluation is loaded from the Kaggle input directory either way.
CREATE_NEW_TEST_DATASET = False
# Hugging Face Hub repository that the fine-tuned model is pushed to and later loaded from.
REPO = 'sashaaadance/qwen2-vl-fine-tune'

# Data

In [None]:
# Load the ChartQA dataset; the 'train', 'val' and 'test' splits are used further down.
data = load_dataset('HuggingFaceM4/ChartQA')

In [None]:
# Preview the image of the first training example.
data['train'][0]['image']

In [None]:
# Question and label of the same example (the label is indexed with [0] when building training samples).
data['train'][0]['query'], data['train'][0]['label']

Save a 250-sample test subset with an `id` column so that the same examples can be reused across runs.

In [None]:
if CREATE_NEW_TEST_DATASET:
    test_data = data['test'].add_column('id', list(range(len(data['test']))))
    # only 250 samples
    test_data_250 = test_data.select(
        np.random.choice(list(range(len(test_data))), size=250)
    )
    test_data_250.save_to_disk('test_data_250')
    !zip -r /kaggle/working/test_data_250.zip /kaggle/working/test_data_250

In [None]:
# Load the previously saved 250-sample test subset from the Kaggle input directory.
test_data_250 = load_from_disk('/kaggle/input/test-dataset-250/kaggle/working/test_data_250')

# Model quantization

In [None]:
# Load the base Qwen2-VL-2B-Instruct checkpoint with bitsandbytes 4-bit quantization;
# float16 is used both as the compute dtype and as the torch_dtype passed to from_pretrained.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'Qwen/Qwen2-VL-2B-Instruct',
    torch_dtype=torch.float16,
    quantization_config=quant_config,
)

In [None]:
# Display the loaded model's repr.
model

# Inference before fine-tuning

In [None]:
class DatasetQA(Dataset):
    """Inference dataset that turns one row into ready-to-generate model inputs.

    Each item is the output of ``processor(...)`` for a single-turn chat
    containing the row's image and its question (prefixed with an instruction).

    Args:
        data: Indexable collection of rows with ``'image'`` and ``'query'`` fields.
        processor: Object providing ``apply_chat_template`` and a ``__call__``
            that builds model inputs from text and images.
        prompt_prefix: Text placed in front of every question. Defaults to the
            instruction that was previously hard-coded.
    """

    def __init__(self, data, processor, prompt_prefix=' Make your answer as neat as possible '):
        self.data = data
        self.processor = processor
        self.prompt_prefix = prompt_prefix

    def __getitem__(self, item):
        """Return the processor output for row ``item``."""
        # Fetch the row once; repeated lookups would load the image again each time.
        sample = self.data[item]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": sample['image']},
                    {"type": "text", "text": self.prompt_prefix + sample['query']},
                ],
            }
        ]
        # Render the chat as text and leave the assistant turn open for generation.
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors='pt',
        )
        return inputs

    def __len__(self):
        """Number of rows in the wrapped data."""
        return len(self.data)

In [None]:
# Processor matching the base checkpoint; it is used to build prompts and to decode generations.
processor = AutoProcessor.from_pretrained('Qwen/Qwen2-VL-2B-Instruct')
# only 250 samples
test_dataset = DatasetQA(test_data_250, processor=processor)

In [None]:
# Generate an answer for every test sample, one sample at a time.
outputs = []
for inputs in tqdm(test_dataset):
    inputs = inputs.to('cuda')
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        full_ids[prompt_ids.shape[0]:]
        for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded_batch = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # Each item holds a single sample, so the first decoded string is the answer.
    output_text = decoded_batch[0]
    outputs.append(output_text)

In [None]:
# Attach the generated answers to the test subset as a 'predictions' column.
test_dataset_infered = test_data_250.add_column('predictions', outputs)

In [None]:
# Inspect one row together with its prediction.
test_dataset_infered[4]

Saving

In [None]:
# Persist the baseline (pre-fine-tuning) predictions and zip the saved directory.
test_dataset_infered.save_to_disk('test_dataset_infered')
!zip -r /kaggle/working/test_dataset_infered.zip /kaggle/working/test_dataset_infered

# Fine-tuning

In [None]:
class DatasetQA_FN(Dataset):
    """Fine-tuning dataset yielding chat-formatted text plus the raw image.

    Tokenisation is deferred to the collate function; each item is a plain
    dict with the keys ``'text_w_gt'``, ``'text_wo_gt'``, ``'image'`` and
    ``'ground_truth'``.

    Args:
        data: Indexable collection of rows with ``'image'``, ``'query'`` and
            ``'label'`` fields, where ``label[0]`` is used as the answer.
        processor: Object providing ``apply_chat_template``.
    """

    def __init__(self, data, processor):
        self.data = data
        self.processor = processor

    def __getitem__(self, item):
        """Build the training sample for row ``item``."""
        # Fetch the row once instead of re-reading it (and its image) for every field.
        sample = self.data[item]
        ground_truth = sample['label'][0]

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": sample['image']},
                {"type": "text", "text": sample['query']},
            ],
        }
        assistant_turn = {
            "role": "assistant",
            "content": [
                {"type": "text", "text": ground_truth},
            ],
        }
        messages = [user_turn, assistant_turn]

        # Full conversation including the answer. No generation prompt is
        # appended: the answer is already present, and an extra open assistant
        # header after it would become part of the training target.
        text_w_gt = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        # Prompt only. The template expects a list of messages, so pass a
        # one-element list rather than the bare message dict.
        text_wo_gt = self.processor.apply_chat_template(
            messages[:1], tokenize=False, add_generation_prompt=True
        )
        image_inputs, _ = process_vision_info(messages)

        # processor is in collate function
        return {
            'text_w_gt': text_w_gt,
            'text_wo_gt': text_wo_gt,
            'image': image_inputs[0],
            'ground_truth': ground_truth,
        }

    def __len__(self):
        """Number of rows in the wrapped data."""
        return len(self.data)

In [None]:
def collate_fn(samples):
    """Batch fine-tuning samples into padded model inputs with labels.

    Args:
        samples: List of dicts as produced by ``DatasetQA_FN`` (the keys
            ``'image'`` and ``'text_w_gt'`` are read).

    Returns:
        The output of the module-level ``processor`` with an added ``'labels'``
        entry: a copy of ``input_ids`` where padding positions are set to -100
        so they are ignored by the loss.
    """
    images = [sample['image'] for sample in samples]
    texts = [sample['text_w_gt'] for sample in samples]

    # No truncation: the text contains the image placeholder tokens, and cutting
    # the sequence to a fixed length can drop some of them, leaving input_ids
    # inconsistent with the image features passed alongside.
    batch = processor(text=texts, images=images, padding=True, return_tensors='pt')

    labels = batch['input_ids'].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    batch['labels'] = labels

    return batch

In [None]:
# Reload the 4-bit quantized base checkpoint and its processor for fine-tuning
# (same settings as the load used for the baseline inference).
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'Qwen/Qwen2-VL-2B-Instruct',
    torch_dtype=torch.float16,
    quantization_config=quant_config,
)
processor = AutoProcessor.from_pretrained('Qwen/Qwen2-VL-2B-Instruct')

In [None]:
def find_linear_layers(model, num_modules=-1, exclude=('lm_head',)):
    """Collect the names of ``torch.nn.Linear`` submodules of ``model``.

    Args:
        model: Module whose ``named_modules()`` are scanned.
        num_modules: If positive, keep only the last ``num_modules`` names in
            traversal order; otherwise return all matches.
        exclude: Iterable of substrings; any module whose name contains one of
            them is skipped. ``None`` is treated as "exclude nothing".

    Returns:
        List of module names, in ``named_modules()`` order.
    """
    # Immutable default plus None handling avoids a shared mutable default argument.
    excluded_keywords = tuple(exclude) if exclude is not None else ()

    module_names = [
        name
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
        and not any(keyword in name for keyword in excluded_keywords)
    ]

    if num_modules > 0:
        module_names = module_names[-num_modules:]

    return module_names

In [None]:
# Use the last 100 linear layers (lm_head excluded by default) as LoRA targets and show how many were found.
modules = find_linear_layers(model, num_modules=100)
len(modules)

In [None]:
# LoRA adapter settings applied to the linear layers selected above.
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=modules,
    init_lora_weights='gaussian',
)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Free memory left over from the wrapping steps.
clear()

In [None]:
# NOTE(review): this import sits outside the notebook's import cell; consider moving it there.
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the training callback later pushes to REPO.
notebook_login()

In [None]:
class QwentModule(L.LightningModule):
    """Lightning wrapper around the model with its train/validation data.

    Args:
        config: Mapping read with ``.get``; the keys ``'batch_size'`` and
            ``'lr'`` are used by this class.
        model: Model called as ``model(**batch)``; its output must expose ``.loss``.
        train_dataset: Dataset served by ``train_dataloader``.
        val_dataset: Dataset served by ``val_dataloader``.
    """

    def __init__(self, config, model, train_dataset, val_dataset):
        super().__init__()
        self.config = config
        self.model = model
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = config.get('batch_size')

    def _build_loader(self, dataset, shuffle):
        """Create a DataLoader with the settings shared by both splits."""
        return DataLoader(
            dataset,
            collate_fn=collate_fn,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=4,
        )

    def training_step(self, batch, batch_idx):
        """Run a forward pass, log ``train_loss`` and return the loss."""
        # Free cached memory before every step.
        clear()
        loss = self.model(**batch).loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx, dataset_idx=0):
        """Run a forward pass in eval mode, log ``val_loss`` and return the loss."""
        clear()
        self.model.eval()
        loss = self.model(**batch).loss
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """AdamW over this module's parameters with the configured learning rate."""
        return torch.optim.AdamW(self.parameters(), lr=self.config.get('lr'))

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._build_loader(self.val_dataset, shuffle=False)


In [None]:
# Fine-tune on subsets: the first 2000 training rows and the first 300 validation rows.
train_dataset = DatasetQA_FN(data['train'].select(list(range(2000))), processor)
val_dataset = DatasetQA_FN(data['val'].select(list(range(300))), processor)

# Training hyperparameters: QwentModule reads 'lr' and 'batch_size'; the Trainer cell reads
# 'max_epochs', 'check_val_every_n_epoch', 'gradient_clip_val' and 'accumulate_grad_batches'.
config = {
    'max_epochs': 10,
    'check_val_every_n_epoch': 1,
    'gradient_clip_val': 1.0,
    'accumulate_grad_batches': 8,
    'lr': 3e-4,
    'batch_size': 1,
    'num_nodes': 1,  # NOTE(review): not read by any code visible in this notebook
    'warmup_steps': 50,  # NOTE(review): not read by any code visible here; no scheduler is configured
    'result_path': '/kaggle/working/',  # NOTE(review): not read by any code visible in this notebook
    'verbose': True,  # NOTE(review): not read by any code visible in this notebook
}

# Wrap the LoRA model (moved to the GPU) and both datasets in the Lightning module.
model_module = QwentModule(config, model.to('cuda'), train_dataset, val_dataset)

In [None]:
from lightning.pytorch.callbacks import Callback
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

class PushToHubCallback(Callback):
    """Lightning callback that uploads the model to ``REPO`` during training.

    The model is pushed after every training epoch and once more when training
    ends; at the end the processor is pushed as well, if one can be found.

    Args:
        processor: Optional processor to upload when training ends. When
            omitted, the callback falls back to a ``processor`` attribute on
            the Lightning module and then to a module-level ``processor``.
    """

    def __init__(self, processor=None):
        super().__init__()
        self.processor = processor

    def _resolve_processor(self, pl_module):
        """Return the first available processor, or ``None`` if there is none."""
        candidates = (
            self.processor,
            # QwentModule defines no `processor` attribute, so direct attribute
            # access would raise AttributeError; look it up defensively instead.
            getattr(pl_module, 'processor', None),
            globals().get('processor'),
        )
        for candidate in candidates:
            if candidate is not None:
                return candidate
        return None

    def on_train_epoch_end(self, trainer, pl_module):
        """Push the current model weights after each training epoch."""
        pl_module.model.push_to_hub(
            REPO,
            commit_message=f'Fine tunning, epoch {trainer.current_epoch}'
        )

    def on_train_end(self, trainer, pl_module):
        """Push the processor (when available) and the final model."""
        processor_to_push = self._resolve_processor(pl_module)
        if processor_to_push is not None:
            processor_to_push.push_to_hub(
                REPO,
                commit_message='Training done'
            )
        pl_module.model.push_to_hub(
            REPO,
            commit_message='Training done'
        )

# Early stopping on the 'val_loss' value logged in validation_step (lower is better).
early_stop_callback = EarlyStopping(monitor='val_loss', verbose=False, mode='min')

In [None]:
# Free memory, then configure a single-GPU Trainer from the config dict.
clear()
trainer = L.Trainer(
        accelerator='gpu',
        devices=[0],
        max_epochs=config.get('max_epochs'),
        accumulate_grad_batches=config.get('accumulate_grad_batches'),
        check_val_every_n_epoch=config.get('check_val_every_n_epoch'),
        gradient_clip_val=config.get('gradient_clip_val'),
        precision='16-mixed',
        # Only 5 validation batches are evaluated per validation run.
        limit_val_batches=5,
        # Skip the sanity-check validation pass before training starts.
        num_sanity_val_steps=0,
        callbacks=[PushToHubCallback(), early_stop_callback],
)

# Dataloaders come from the module's train_dataloader/val_dataloader methods.
trainer.fit(model_module)

# Inference after tuning

In [None]:
# Load the fine-tuned weights that the training callback pushed to REPO,
# quantized to 4-bit in the same way as the base model.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
# `local_files_only=True` was dropped: the repository was only uploaded during
# training, so it must be allowed to download; otherwise a fresh run cannot load it.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    REPO,
    torch_dtype=torch.float16,
    quantization_config=quant_config,
)

In [None]:
# Reload the same 250-sample test subset and wrap it for inference with the fine-tuned model.
test_data_250 = load_from_disk('/kaggle/input/test-dataset-250/kaggle/working/test_data_250')
test_dataset = DatasetQA(test_data_250, processor=processor)

In [None]:
# Free memory, then generate an answer for every test sample with the fine-tuned model.
clear()
outputs = []
for inputs in tqdm(test_dataset):
    inputs = inputs.to('cuda')
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        full_ids[prompt_ids.shape[0]:]
        for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded_batch = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # Each item holds a single sample, so the first decoded string is the answer.
    output_text = decoded_batch[0]
    outputs.append(output_text)

In [None]:
# Attach the fine-tuned model's answers to the test subset as a 'predictions' column.
test_dataset_infered = test_data_250.add_column('predictions', outputs)

Save

In [None]:
# Persist the post-fine-tuning predictions and zip the saved directory.
test_dataset_infered.save_to_disk('test_dataset_infered_after_fn')
!zip -r /kaggle/working/test_dataset_infered_after_fn.zip /kaggle/working/test_dataset_infered_after_fn