### Install Dependencies

In [1]:
!pip install -U "transformers>=4.39.0" -q
!pip install peft bitsandbytes -q
!pip install -U "trl>=0.8.3" -q
!pip install trl -q
!pip install peft -q
!pip install -U bitsandbytes -q

In [2]:
import torch
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig
# from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

### Load the model (4-bit quantized)

In [3]:
# Hugging Face Hub id of the base LLaVA checkpoint; the same id is passed to the model,
# tokenizer and processor `from_pretrained` calls in this notebook.
model_id = "llava-hf/llava-1.5-13b-hf"

In [None]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub; training is configured with push_to_hub=True and
# the notebook later calls trainer.push_to_hub().
notebook_login()

In [4]:
# Load the weights in 4-bit precision and run the quantized layers' computation in float16,
# matching the torch_dtype the model is loaded with and the fp16=True training setting.
# If no compute dtype is given, bitsandbytes uses float32, which is slower with float16 inputs.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [5]:
# Load the base checkpoint in float16 with the quantization config applied.
llava_load_kwargs = {
    "quantization_config": quantization_config,
    "torch_dtype": torch.float16,
}
model = LlavaForConditionalGeneration.from_pretrained(model_id, **llava_load_kwargs)

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/77.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

### Create a chat template and set up the tokenizer and processor

In [6]:
# Jinja chat template: a fixed system sentence, then every message rendered as "USER: " or
# "ASSISTANT: " followed by its text items, with "<image>" emitted for image items.
# User turns end with a single space; all other turns end with the EOS token.
LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""

In [7]:
# Load both preprocessing objects for the checkpoint, then make the processor use the
# tokenizer that carries the custom chat template.
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor.tokenizer = tokenizer

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

### Create a DataCollator

In [12]:
class LLavaDataCollator:
    """Collate chat examples (``messages`` + ``images``) into one processor batch.

    Each example is expected to be a mapping with a ``"messages"`` entry (rendered
    through the tokenizer's chat template) and an ``"images"`` sequence, of which
    only the first image is used.

    Examples that cannot be rendered, or that carry no image, are reported and
    skipped so that ``texts`` and ``images`` always stay aligned. The original
    implementation appended ``None`` for image-less examples, which made the
    processor call fail for the whole batch.
    """

    def __init__(self, processor):
        """Store the processor; its ``tokenizer`` attribute renders the chat template."""
        self.processor = processor

    def __call__(self, examples):
        """Build a padded batch with ``labels`` from a list of examples.

        Args:
            examples: iterable of mappings with ``"messages"`` and ``"images"``.

        Returns:
            The processor output with an added ``"labels"`` entry: a copy of
            ``input_ids`` in which padding positions are replaced by -100
            (the index conventionally ignored by the loss).

        Raises:
            ValueError: if no example in the batch is usable.
            Exception: whatever the processor raises is re-raised after the
                offending texts/images have been printed for debugging.
        """
        texts = []
        images = []

        for idx, example in enumerate(examples):
            try:
                text = self.processor.tokenizer.apply_chat_template(
                    example["messages"],
                    tokenize=False,
                    add_generation_prompt=False,
                )
                example_images = example["images"]
                if len(example_images) == 0:
                    # Skip instead of appending None: a None entry makes the
                    # processor reject the entire batch.
                    print(f"Warning: No image found in example {idx}; skipping it. Check your dataset.")
                    continue
                image = example_images[0]
            except Exception as e:
                # Best-effort: report the broken example and keep the rest of the batch.
                print(f"Error processing example {idx}: {example}")
                print(f"Exception: {e}")
                continue

            # Append both together so the two lists can never get out of step.
            texts.append(text)
            images.append(image)

        if not texts:
            raise ValueError(
                "No usable examples in batch: every example failed preprocessing or had no image."
            )

        try:
            batch = self.processor(
                text=texts,
                images=images,
                return_tensors="pt",
                padding=True,
            )

            labels = batch["input_ids"].clone()
            pad_token_id = self.processor.tokenizer.pad_token_id
            if pad_token_id is not None:
                # Exclude padding positions from the training targets.
                labels[labels == pad_token_id] = -100
            batch["labels"] = labels

        except Exception as e:
            print("Error occurred while creating the batch:")
            print(f"Texts: {texts}")
            print(f"Images: {images}")
            print(f"Exception: {e}")
            raise  # Re-raise the exception to stop the training process for debugging

        return batch
# Collator instance handed to the trainer; it wraps the processor configured earlier in the notebook.
data_collator = LLavaDataCollator(processor)

### Load the Dataset

In [9]:
import json

# Read the formatted conversation records from the Kaggle input dataset.
with open('/kaggle/input/flipkart-grid-dataset/formatted_label.json', 'r') as label_file:
    data = json.load(label_file)

In [10]:
import json
from PIL import Image
import os
from tqdm import tqdm

# Re-read the records so this cell always starts from the raw image paths, even when re-run.
with open('/kaggle/input/flipkart-grid-dataset/formatted_label.json', 'r') as label_file:
    data = json.loads(label_file.read())

def load_and_encode_image(image_path):
    """Open ``image_path`` and return it as an RGB PIL image, or ``None`` on failure.

    Despite the name, no encoding happens here: the function only decodes the
    file and converts it to RGB.

    Args:
        image_path: path of the image file to open.

    Returns:
        The converted image, or ``None`` if opening/converting raised (the error
        is printed rather than propagated).
    """
    try:
        # The context manager closes the underlying file deterministically, also for
        # multi-frame formats and when convert() fails; convert() returns an
        # independent image that stays valid after the file is closed.
        with Image.open(image_path) as source_image:
            return source_image.convert('RGB')
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None

# Swap the first image path of every record for the decoded PIL image (in place).
# Records without images are left alone; records whose image fails to load keep their path.
for record in tqdm(data, desc="Processing Images"):
    record_images = record.get("images")
    if not record_images:
        continue
    loaded_image = load_and_encode_image(record_images[0])
    if loaded_image:
        record_images[0] = loaded_image

Processing Images: 100%|██████████| 129/129 [01:05<00:00,  1.98it/s]


In [11]:
# Inspect one formatted example (index 88): alternating user/assistant messages, where each
# message's content is a list of text and image items.
data[88]

{'messages': [{'content': [{'index': None,
     'text': 'What is the name of the product?\n',
     'type': 'text'},
    {'index': 0, 'text': None, 'type': 'image'}],
   'role': 'user'},
  {'content': [{'index': None,
     'text': 'GARNIER NATURALS COMPLETE VITAMIN C WASH',
     'type': 'text'}],
   'role': 'assistant'},
  {'content': [{'index': None,
     'text': 'What is the barcode of the product?\n',
     'type': 'text'}],
   'role': 'user'},
  {'content': [{'index': None, 'text': '890195261005208', 'type': 'text'}],
   'role': 'assistant'},
  {'content': [{'index': None,
     'text': 'What is the quantity size of the product?\n',
     'type': 'text'}],
   'role': 'user'},
  {'content': [{'index': None, 'text': '100 g', 'type': 'text'}],
   'role': 'assistant'},
  {'content': [{'index': None,
     'text': 'What is the weight of the product?\n',
     'type': 'text'}],
   'role': 'user'},
  {'content': [{'index': None, 'text': '100 g', 'type': 'text'}],
   'role': 'assistant'},
  {'co

In [12]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for validation; the fixed random_state keeps the split reproducible.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=41)

### Set the Training Arguments

In [13]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output, logging and publishing
    output_dir="/kaggle/working/",
    report_to="tensorboard",
    logging_steps=5,
    push_to_hub=True,
    # Optimisation schedule
    learning_rate=1.4e-5,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    # Memory / precision
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
    # Keep the raw "messages"/"images" fields available to the custom collator
    remove_unused_columns=False,
)

### Set the LoRA config

In [14]:
# Names of the linear projections that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "down_proj",
    "up_proj",
]

# Rank-8 adapters with alpha 16 and 5% dropout; bias terms are not trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

### Create the SFTTrainer object

In [None]:
# Wire the quantized model, LoRA config, datasets and custom collator into the trainer.
trainer = SFTTrainer(
    model=model,
    peft_config=lora_config,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    # Dataset preparation is skipped, so the collator receives the raw records;
    # the text field name is passed through unchanged.
    dataset_text_field="text",
    dataset_kwargs={"skip_prepare_dataset": True},
)

In [None]:
#For custom trainer
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(
#     train_data,
#     batch_size=training_args.per_device_train_batch_size,
#     shuffle=False, 
#     collate_fn=data_collator,
# )

In [None]:
# class CustomSFTTrainer(SFTTrainer):
#     def get_train_dataloader(self):
#         return train_dataloader 

In [None]:
# trainer = CustomSFTTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=None,  # Set to None since we're using a custom DataLoader
#     eval_dataset=val_data,
#     peft_config=lora_config,
#     dataset_text_field="text",  # Need a dummy field
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     dataset_kwargs={"skip_prepare_dataset": True},
# )

### Load and configure TensorBoard for logging

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/llava-1.5-13b-hf-ft-mix-vsft

### Start the training!

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

### Push the model to the HF Hub

In [None]:
# Publish the trainer's outputs to the Hugging Face Hub (requires the earlier Hub login).
trainer.push_to_hub()

### Download Files

In [None]:
# Switch to the training output directory so the archive is created there.
%cd /kaggle/working

In [None]:
# Recursively archive everything under /kaggle/working into file.zip.
!zip -r file.zip /kaggle/working

In [None]:
# List the current directory, e.g. to confirm that file.zip was created.
!ls

In [None]:
from IPython.display import FileLink
# Display a link to the archive so it can be downloaded from the notebook UI.
FileLink(r'file.zip')

### Model Inference

In [None]:
from transformers import AutoProcessor
import torch
from PIL import Image
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

# Fresh processor for inference, using the same chat template string as the training section.
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-13b-hf")
LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""
processor.tokenizer.chat_template = LLAVA_CHAT_TEMPLATE

# NOTE(review): `config` is not referenced again in the visible notebook.
config = PeftConfig.from_pretrained("astro189/working")
# Attach the adapter from the Hub repo to `model`, the base model object created in the
# model-loading cell earlier in this notebook (this cell depends on that kernel state).
model_new = PeftModel.from_pretrained(model, "astro189/working")

In [None]:
def generate_response(image_path, prompt):
    """Answer ``prompt`` about the image at ``image_path`` with the fine-tuned model.

    Fixes over the original implementation:
    * the rendered prompt now ends with ``ASSISTANT:``, mirroring the training
      format, so the model continues with an answer instead of having to emit
      the role marker itself;
    * the processor outputs are moved to the model's device and floating-point
      tensors are cast to float16 (the dtype the base model was loaded with);
    * only the newly generated tokens are decoded, so the answer can never be
      polluted by prompt text;
    * the image file is opened in a context manager and converted to RGB, as in
      the training data loader.

    Args:
        image_path: path of the image file to query.
        prompt: the user's question.

    Returns:
        The generated answer with surrounding whitespace stripped.
    """
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": None},
            ],
        }
    ]

    text = processor.tokenizer.apply_chat_template(messages, tokenize=False)
    # The chat template has no generation-prompt branch, so open the assistant turn by hand.
    if not text.rstrip().endswith("ASSISTANT:"):
        text = text + "ASSISTANT:"

    model_inputs = processor(
        images=image,
        text=text,
        return_tensors="pt",
        padding=True,
    ).to(model_new.device, torch.float16)

    with torch.no_grad():
        outputs = model_new.generate(
            **model_inputs,
            max_new_tokens=128,
            do_sample=False,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = model_inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    response = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response.strip()

In [None]:
# Ask a single question about a local image and show the question/answer pair.
image_path, prompt = "image.jpg", "What is the name of the product?"
response = generate_response(image_path=image_path, prompt=prompt)
print(f"Question: {prompt}\nResponse: {response}")