In [2]:
from transformers import BitsAndBytesConfig, LlavaNextForConditionalGeneration, AutoProcessor
from datasets import load_dataset
import torch
import json
from huggingface_hub import notebook_login

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
if torch.cuda.is_available():
    print("PyTorch is connected to GPU.")
    print(f"GPU Device Name: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.current_device()}")
else:
    print("PyTorch is not connected to GPU.")

PyTorch is connected to GPU.
GPU Device Name: NVIDIA H100 NVL
Number of GPUs available: 4
Current GPU: 0


In [4]:
MAX_LENGTH = 4096
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
REPO_ID = "VaibhavMal/llava_v1.6-7b-ASIDataset"

In [5]:
processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "right" # during training, one always uses padding on the right

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


In [6]:
USE_LORA = False
USE_QLORA = True

## Load model

# Three options for training, from the lowest precision training to the highest precision training:
# - QLora
# - Standard Lora
# - Full fine-tuning
if USE_QLORA or USE_LORA:
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
        )
    model = LlavaNextForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
    )
else:
    # for full fine-tuning, we can speed up the model using Flash Attention
    # only available on certain devices, see https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
    model = LlavaNextForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",
    )

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s]


In [7]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    multimodal_keywords = ['multi_modal_projector', 'vision_model']
    for name, module in model.named_modules():
        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
            continue
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=find_all_linear_names(model),
    init_lora_weights="gaussian",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [8]:
from torch.utils.data import Dataset
from typing import Any, Dict
import random

class LlavaDataset(Dataset):
    """
    PyTorch Dataset for LLaVa. This class takes a HuggingFace Dataset as input.

    Each row, consists of image path(png/jpg/jpeg) and ground truth data (json/jsonl/txt).
    """

    def __init__(
        self,
        dataset_name_or_path: str,
        split: str = "train",
    ):
        super().__init__()

        self.split = split

        self.dataset = load_dataset(dataset_name_or_path, split=self.split)
        self.dataset_length = len(self.dataset)

        self.answer_token_sequences = []
        self.query_list = []
        for sample in self.dataset:
            if "answers" in sample:
                # assert isinstance(sample["answers"], list)
                self.answer_token_sequences.append(sample["answers"])
                self.query_list.append(sample["query"])


    def __len__(self) -> int:
        return self.dataset_length

    def __getitem__(self, idx: int) -> Dict:
        """
        Returns one item of the dataset.

        Returns:
            image : the original Receipt image
            target_sequence : tokenized ground truth sequence
        """
        sample = self.dataset[idx]

        # inputs
        image = sample["image"]
        en_query = self.query_list[idx]
        # target_sequence = random.choice(self.answer_token_sequences[idx]) # can be more than one, e.g., DocVQA Task 1
        target_sequence = self.answer_token_sequences[idx]
        return image, en_query, target_sequence

In [9]:
train_dataset = LlavaDataset("VaibhavMal/AirQualty_imageConv",  split="train")
val_dataset = LlavaDataset("VaibhavMal/AirQualty_imageConv", split="test")

In [10]:
counter = 0
for idx in range(len(train_dataset)):
    image, en_query, target_sequence = train_dataset[idx]
    print("Question:", en_query, "--- Answer:", target_sequence)
    counter +=1
    if counter == 10:
        break

Question: Analyze the provided Markov Transition Field of Nitrogen Oxide night time data. Average value on those four nights is 1822, 1887, 1793 and 1650. Estimate the expected average Nitrogen Oxide value for the subsequent night. --- Answer: 1553
Question: Analyze the provided Markov Transition Field of Nitrogen Oxide four day time data. Average value on those four days is 704, 874, 1172 and 1415. Estimate the expected average Nitrogen Oxide value for the subsequent day. --- Answer: 990
Question: Analyze the provided Markov Transition Field of Nitrogen Oxide night time data. Average value on those four nights is 1488, 1264, 1162 and 1474. Estimate the expected average Nitrogen Oxide value for the subsequent night. --- Answer: 1471
Question: Analyze the provided Markov Transition Field of Nitrogen Oxide night time data. Average value on those four nights is 1477, 1546, 1795 and 1730. Estimate the expected average Nitrogen Oxide value for the subsequent night. --- Answer: 1642
Question

In [11]:
def train_collate_fn(examples):
    images = []
    texts = []
    for example in examples:
        image, en_query, ground_truth = example
        images.append(image)
        prompt = f"[INST] <image>\n{en_query} [\INST] {ground_truth}"
        texts.append(prompt)

    batch = processor(text=texts, images=images, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")

    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    batch["labels"] = labels

    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    pixel_values = batch["pixel_values"]
    image_sizes = batch["image_sizes"]
    labels = batch["labels"]

    return input_ids, attention_mask, pixel_values, image_sizes, labels

def eval_collate_fn(examples):
    # we only feed the prompt to the model
    images = []
    texts = []
    answers = []
    for example in examples:
        image, en_query, ground_truth = example
        images.append(image)
        prompt = f"[INST] <image>\n{en_query} [\INST] {ground_truth}"
        texts.append(prompt)
        answers.append(ground_truth)

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    pixel_values = batch["pixel_values"]
    image_sizes = batch["image_sizes"]

    return input_ids, attention_mask, pixel_values, image_sizes, answers

In [12]:
import lightning as L
from torch.utils.data import DataLoader
import re
from nltk import edit_distance
import numpy as np


class LlavaModelPLModule(L.LightningModule):
    """
    A PyTorch Lightning module for training and validating a multimodal model that processes images and text.

    Attributes:
        config (dict): Configuration dictionary containing model hyperparameters and settings.
        processor (object): A processor object for handling text and image pre-processing.
        model (torch.nn.Module): The model to be trained and evaluated.

    Methods:
        training_step(batch, batch_idx):
            Executes a single training step, computing the loss and logging it.
        
        validation_step(batch, batch_idx, dataset_idx=0):
            Executes a single validation step, generating predictions, comparing them to ground truth, and logging the normalized edit distance.
        
        configure_optimizers():
            Sets up the optimizer and optionally, learning rate scheduler for the training process.
        
        train_dataloader():
            Returns a DataLoader for the training dataset.
        
        val_dataloader():
            Returns a DataLoader for the validation dataset.
    """
    def __init__(self, config, processor, model):
        super().__init__()
        self.config = config
        self.processor = processor
        self.model = model

        self.batch_size = config.get("batch_size")

    def training_step(self, batch, batch_idx):
        """
        Performs a single step of training.

        Args:
            batch (tuple): A tuple containing input_ids, attention_mask, pixel_values, image_sizes, and labels.
            batch_idx (int): The index of the current batch.

        Returns:
            torch.Tensor: The computed loss for the batch.
        """

        input_ids, attention_mask, pixel_values, image_sizes, labels = batch

        outputs = self.model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            pixel_values=pixel_values,
                            image_sizes=image_sizes,
                            labels=labels
                          )
        loss = outputs.loss

        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx, dataset_idx=0):
        """
        Performs a single step of validation, generating predictions and computing the normalized edit distance.

        Args:
            batch (tuple): A tuple containing input_ids, attention_mask, pixel_values, image_sizes, and answers.
            batch_idx (int): The index of the current batch.
            dataset_idx (int, optional): Index of the dataset in case of multiple datasets. Defaults to 0.

        Returns:
            list: A list of normalized edit distances between predictions and ground truth answers.
        """

        input_ids, attention_mask, pixel_values, image_sizes, answers = batch

        # autoregressively generate token IDs
        generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                       pixel_values=pixel_values, image_sizes=image_sizes, max_new_tokens=MAX_LENGTH)
        # turn them back into text, chopping of the prompt
        # important: we don't skip special tokens here, because we want to see them in the output
        predictions = self.processor.batch_decode(generated_ids[:, input_ids.size(1):], skip_special_tokens=True)

        scores = []
        for pred, answer in zip(predictions, answers):
            pred = re.sub(r"(?:(?<=>) | (?=</s_))", "", pred)
            scores.append(edit_distance(pred, answer) / max(len(pred), len(answer)))

            if self.config.get("verbose", False) and len(scores) == 1:
                print(f"Prediction: {pred}")
                print(f"    Answer: {answer}")
                print(f" Normed ED: {scores[0]}")

        self.log("val_edit_distance", np.mean(scores))

        return scores

    def configure_optimizers(self):
        """
        Configures the optimizer for training.

        Returns:
            torch.optim.Optimizer: The optimizer for training.
        """
        # you could also add a learning rate scheduler if you want
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.config.get("lr"))

        return optimizer

    def train_dataloader(self):
        """
        Returns the DataLoader for the training dataset.

        Returns:
            DataLoader: The DataLoader for the training dataset.
        """
        return DataLoader(train_dataset, collate_fn=train_collate_fn, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        """
        Returns the DataLoader for the validation dataset.

        Returns:
            DataLoader: The DataLoader for the validation dataset.
        """
        return DataLoader(val_dataset, collate_fn=eval_collate_fn, batch_size=self.batch_size, shuffle=False, num_workers=4)

In [13]:
config = {"max_epochs": 2,
          # "val_check_interval": 0.2, # how many times we want to validate during an epoch
          "check_val_every_n_epoch": 1,
          "gradient_clip_val": 1.0,
          "accumulate_grad_batches": 8,
          "lr": 1e-4,
          "batch_size": 1,
          # "seed":2022,
          "num_nodes": 1,
          "warmup_steps": 50,
          "result_path": "./result",
          "verbose": True,
          "num_workers": 4
}

model_module = LlavaModelPLModule(config, processor, model)

In [14]:
from lightning.pytorch.callbacks import Callback
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from huggingface_hub import HfApi

api = HfApi()

class PushToHubCallback(Callback):
    def on_train_epoch_end(self, trainer, pl_module):
        print(f"Pushing model to the hub, epoch {trainer.current_epoch}")
        pl_module.model.push_to_hub(REPO_ID,
                                    commit_message=f"Training in progress, epoch {trainer.current_epoch}")
    def on_train_end(self, trainer, pl_module):
        print(f"Pushing model to the hub after training")
        pl_module.processor.push_to_hub(REPO_ID,
                                    commit_message=f"Training done")
        pl_module.model.push_to_hub(REPO_ID,
                                    commit_message=f"Training done")

early_stop_callback = EarlyStopping(monitor="val_edit_distance", patience=3, verbose=False, mode="min")

In [15]:
from huggingface_hub import login
# Log in to Hugging Face Hub
login(token="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")

In [16]:
%%time

trainer = L.Trainer(
        default_root_dir="/home/vaibhav/LLMs-TimeSeries/software/Checkpoint_img_model_v3/",  # Set the local director
        accelerator="gpu",
        devices=[0],
        max_epochs=config.get("max_epochs"),
        accumulate_grad_batches=config.get("accumulate_grad_batches"),
        check_val_every_n_epoch=config.get("check_val_every_n_epoch"),
        gradient_clip_val=config.get("gradient_clip_val"),
        precision="16-mixed",
        limit_val_batches=5,
        num_sanity_val_steps=0,
        # logger=wandb_logger,
        callbacks=[
            # PushToHubCallback(),
            early_stop_callback],
)

trainer.fit(model_module)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/vaibhav/miniconda3/envs/vai_llama/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA H100 NVL') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.

Epoch 0: 100%|██████████| 130/130 [01:22<00:00,  1.58it/s, v_num=1]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


Prediction: .4 
    Answer: 1294
 Normed ED: 1.0


/home/vaibhav/miniconda3/envs/vai_llama/lib/python3.11/site-packages/lightning/pytorch/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: .4 
    Answer: 1892
 Normed ED: 1.0


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: 
    Answer: 1117
 Normed ED: 1.0


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: .5 
    Answer: 895
 Normed ED: 1.0


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: .6 
    Answer: 1357
 Normed ED: 1.0
Epoch 1: 100%|██████████| 130/130 [01:22<00:00,  1.58it/s, v_num=1]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: \] 
    Answer: 1294
 Normed ED: 1.0


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: \* 
    Answer: 1892
 Normed ED: 1.0


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: 
    Answer: 1117
 Normed ED: 1.0


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: 
    Answer: 895
 Normed ED: 1.0


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prediction: \* 
    Answer: 1357
 Normed ED: 1.0
Epoch 1: 100%|██████████| 130/130 [01:24<00:00,  1.54it/s, v_num=1]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 130/130 [01:32<00:00,  1.40it/s, v_num=1]
CPU times: user 2min 2s, sys: 1min 2s, total: 3min 4s
Wall time: 3min 6s


In [17]:
# Save the Hugging Face model
model.save_pretrained("/home/vaibhav/LLMs-TimeSeries/software/Checkpoint_img_model_v3")