In [None]:
# !pip install -q -U git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets accelerate wandb

## Library and Config

In [1]:
import os
import torch
import datasets
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pathlib import Path

from transformers import AutoTokenizer
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration


from peft import LoraConfig, get_peft_model

device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm
2024-04-24 10:50:07.743310: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Dataset

In [2]:
images_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/train2017/").glob("*.jpg"))
labels = pd.read_csv("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/label-nocleaning.csv")

images = []

for name_file in tqdm(labels['image']):
    temp_str = name_file.split("/")
    if temp_str[0] == "train2017":
        images.append(str(images_path[0].parent / (temp_str[-1] + ".jpg")))
        # Image.open(str(images_path[0].parent) / temp_str[-1]+".jpg")

# images = [Image.open(str(images_path[0].parent / (path.split("/")[-1]+".jpg") )) for path in labels['image']]

100%|██████████| 360216/360216 [00:01<00:00, 358107.21it/s]


In [3]:
labels #+ Display data in dataframe

Unnamed: 0,image,captions,split_id,split
0,train2017/000000373716,ผู้หญิงสวมเสื้อแขนยาวสีขาวและเด็กนั่งเล่นกับสุ...,0,train
1,train2017/000000373716,สาวคนนึงกำลังพาเด็กมานั่งเล่นอยู่ภายในสนามหญ้า...,0,train
2,train2017/000000373716,ภาพขาวดำ ผู้หญิงนั่งบนพื้นอุ้มเด็กบนตัก ข้าง ๆ...,0,train
3,train2017/000000196888,สีน้ำตาลตัวเล็กกำลังกินอาหารอยู่บนจานกระดาษสีข...,1,train
4,train2017/000000196888,นกน้อยตัวหนึ่งกำลังจิกกินเศษอาหารที่วางทิ้งไว้...,1,train
...,...,...,...,...
360211,val2017/000000338219,รถจักรยานยนต์จำนวนมากที่จอดอยู่ตรงบริเวณพื้นที...,119285,val
360212,val2017/000000338219,รถมอเตอร์ไซค์จอดอยู่บนพื้นหญ้าหลายคัน ด้านหลัง...,119285,val
360213,val2017/000000376093,ผู้หญิงใส่เสื้อสีม่วงยืนอยู่ข้างกับผู้ชายใส่เส...,119286,val
360214,val2017/000000376093,คนที่เทน้ำใส่แก้วอยู่บนโต๊ะมีพิซซาวางอยู่บนโต๊...,119286,val


In [4]:
select_labels = labels[labels['split'] == "train" ]['captions']
select_labels

0         ผู้หญิงสวมเสื้อแขนยาวสีขาวและเด็กนั่งเล่นกับสุ...
1         สาวคนนึงกำลังพาเด็กมานั่งเล่นอยู่ภายในสนามหญ้า...
2         ภาพขาวดำ ผู้หญิงนั่งบนพื้นอุ้มเด็กบนตัก ข้าง ๆ...
3         สีน้ำตาลตัวเล็กกำลังกินอาหารอยู่บนจานกระดาษสีข...
4         นกน้อยตัวหนึ่งกำลังจิกกินเศษอาหารที่วางทิ้งไว้...
                                ...                        
345157    แมว 2 ตัวที่อยู่ในรถของเจ้าของจอดอยู่ตรงพื้นที...
345158    แมว 2 ตัว นั่งอยู่ในรถยนต์ ด้านนอกมีภูเขาและท้...
345159    รถยนต์คันสีดำอยู่ข้างกับเรือลำสีแดงใกล้จะกลับส...
345160    คนจำนวนหนึ่งที่กำลังเดินข้ามสะพานอยู่ที่สวนสาธ...
345161    รถสองคันจอดอยู่ใต้สะพานที่มีน้ำเล็กน้อย มีเรือ...
Name: captions, Length: 345044, dtype: object

In [5]:
dataset = datasets.Dataset.from_dict({"image": images, "text": select_labels})
dataset = dataset.train_test_split(test_size=0.8, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 69008
    })
    test: Dataset({
        features: ['image', 'text'],
        num_rows: 276036
    })
})

In [6]:
from torch.utils.data import Dataset, DataLoader

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        im = Image.open(item['image'])
        encoding = self.processor(text=item['text'], images=im, padding="max_length", return_tensors="pt", max_length=40)
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding

# def collate_fn(batch):
#     # pad the input_ids and attention_mask
#     processed_batch = {}
#     for key in batch[0].keys():
#         if key != "text":
#             processed_batch[key] = torch.stack([example[key] for example in batch])
#         else:
#             text_inputs = processor.tokenizer(
#                 [example["text"] for example in batch], padding=True, return_tensors="pt"
#             )
#             processed_batch["input_ids"] = text_inputs["input_ids"]
#             processed_batch["attention_mask"] = text_inputs["attention_mask"]
#     return processed_batch

## Model

In [9]:
#. Init regular Blip-2 model 
model = LlavaNextForConditionalGeneration.from_pretrained(
            "xtuner/llava-llama-3-8b-v1_1-hf",
            device_map="auto",
            torch_dtype=torch.float16
        ).cuda()
processor = LlavaNextProcessor.from_pretrained("xtuner/llava-llama-3-8b-v1_1-hf")

#. Use Typhoon 7b as tokenizer
tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")


You are using a model of type llava_llama to instantiate a model of type llava_next. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 9/9 [00:01<00:00,  6.03it/s]
Some weights of LlavaNextForConditionalGeneration were not initialized from the model checkpoint at xtuner/llava-llama-3-8b-v1_1-hf and are newly initialized: ['model.image_newline', 'model.language_model.lm_head.weight', 'model.language_model.model.embed_tokens.weight', 'model.language_model.model.layers.0.input_layernorm.weight', 'model.language_model.model.layers.0.mlp.down_proj.weight', 'model.language_model.model.layers.0.mlp.gate_proj.weight', 'model.language_model.model.layers.0.mlp.up_proj.weight', 'model.language_model.model.layers.0.post_attention_layernorm.weight', 'model.language_model.model.layers.0.self_attn.k_proj.weight', 'model.language_model.model.layers.0.self_attn.o_proj.weight', 'model.language_model.model.layers.0.self_attn.q_proj.we

In [10]:
tokenizer

CamembertTokenizerFast(name_or_path='airesearch/wangchanberta-base-att-spm-uncased', vocab_size=25005, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<s>NOTUSED', '</s>NOTUSED', '<_>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>NOTUSED", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>NOTUSED", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<s>", rstrip=False, lstrip=False, single_word=Fal

In [11]:
model.config.text_config.vocab_size = len(tokenizer)
model.language_model.resize_token_embeddings(len(tokenizer))

processor.tokenizer = tokenizer
model.config.eos_token_id = 6

# Let's define the LoraConfig
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj"]
)

# Parameter effective fine-tuning
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 9,961,472 || all params: 7,015,557,120 || trainable%: 0.1420


In [12]:
batch_size = 4
train_dataset = ImageCaptioningDataset(dataset['train'], processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = ImageCaptioningDataset(dataset['test'], processor)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [13]:
print(f" Number of Iteration per Epoch = {len(train_dataloader)}")

 Number of Iteration per Epoch = 17252


In [18]:
torch.cuda.empty_cache()


## Training

In [14]:
experiment_name = "model_llava_LLama3"

optimizer = torch.optim.AdamW(model.parameters(), lr=6e-4, weight_decay=1e-2)

for epoch in range(2):
    print("Epoch:", epoch)
    model.train()
    
    training_loss = 0
    validate_loss = 0
    
    idx = 0
    
    for batch in tqdm(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device, torch.float16)
        
        outputs = model(
            input_ids,
            pixel_values=pixel_values,
            labels=input_ids
        )

        loss = outputs.loss
        training_loss += loss.item()
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
    
    print("Training loss", training_loss / len(train_dataloader))
    print("-" * 60)
    
    save_dir = os.path.join(experiment_name, str(epoch))
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)

Epoch: 0


  0%|          | 0/17252 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 92.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 26.00 MiB is free. Including non-PyTorch memory, this process has 23.54 GiB memory in use. Of the allocated memory 23.06 GiB is allocated by PyTorch, and 161.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [46]:
processor

LlavaNextProcessor:
- image_processor: LlavaNextImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": {
    "height": 336,
    "width": 336
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_grid_pinpoints": [
    [
      336,
      672
    ],
    [
      672,
      336
    ],
    [
      672,
      672
    ],
    [
      1008,
      336
    ],
    [
      336,
      1008
    ]
  ],
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "LlavaNextImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.0039215686

In [65]:
idx = 8
res = processor(text=dataset['train'].select([idx])['text'][0], 
          images=Image.open(dataset['train'].select([idx])['image'][0]), padding="max_length", return_tensors="pt", max_length=20)

In [66]:
res['input_ids'].shape, res['pixel_values'].shape

(torch.Size([1, 25]), torch.Size([1, 5, 3, 336, 336]))

In [21]:
IM = Image.open(dataset['test'].select([0])['image'][0])
LABEL = dataset['test'].select([0])['text'][0]

In [23]:
processor(text=LABEL, images=IM).keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_sizes'])

In [None]:
#. Litter bit test
im = dataset['test'].select([500,501])['image']

In [None]:
pixel_values = processor(im, return_tensors="pt").to(device, torch.float16).pixel_values
model.eval()

outputs = model.generate(pixel_values=pixel_values)
print(outputs)

In [None]:
generated_caption = processor.batch_decode(outputs, skip_special_tokens=True)
print(type(generated_caption),generated_caption[0:5])

## Save model

In [None]:
model.save_pretrained("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/model/model_blip2(F)_Typhoon")

## Test model

### Prepare data to test

In [3]:
# Load data to test 2 path
#? 1. test/food/
#? 2. test/travel/
#? 3. test2017/

import datasets
from PIL import Image
from pathlib import Path

images_test = []

image1_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test2017").glob("*.jpg"))
image2_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test_kaggle/food").glob("*.jpg"))
image3_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test_kaggle/travel").glob("*.jpg"))

image1_path.extend(image2_path)
image1_path.extend(image3_path)

merge_path = image1_path
data_size = len(merge_path)


for i in tqdm(image1_path, desc="Load image to List"):
    images_test.append(Image.open(i))

data_testset = datasets.Dataset.from_dict({"image": images_test })

print(f"row in submission : {data_size}")

Load image to List: 100%|██████████| 40670/40670 [00:03<00:00, 12160.86it/s]


row in submission : 40670


In [4]:
torch.cuda.empty_cache()
model_load = LlavaNextForConditionalGeneration.from_pretrained(
            "xtuner/llava-llama-3-8b-v1_1-hf",
            load_in_8bit=True,
            device_map="auto",
            torch_dtype=torch.float16,
        )
processor = LlavaNextProcessor.from_pretrained("xtuner/llava-llama-3-8b-v1_1-hf")

#. Use Typhoon 7b as tokenizer
tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model_load.config.text_config.vocab_size = len(tokenizer)
model_load.language_model.resize_token_embeddings(len(tokenizer))
processor.tokenizer = tokenizer

#. Set ID Token
model_load.config.eos_token_id = 6

#. Adapter path
model_load.load_adapter("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/model/model_blip2(F)_Typhoon")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
2024-04-23 22:45:14.198929: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
outputs = []

for i in tqdm(range(0,data_size,40), desc="Genarated Answer Vector : "):
    im = data_testset.select([i])['image'][0]
    pixel_values = processor(im, return_tensors="pt").to(device, torch.float16).pixel_values
    model_load.eval()
    outputs.append(model_load.generate(pixel_values=pixel_values))

print(outputs)

In [None]:
generated_caption = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(generated_caption)

## Submission

In [5]:
submission = pd.read_csv("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/resource/sample_submission.csv")

In [6]:
test_images = submission.image_id.tolist()
test_images = ["data/"+im + ".jpg" for im in test_images]


batch = 200
test_prediction = []

for i in tqdm(range(0, len(test_images), batch)):
    images = [Image.open(im) for im in test_images[i: i+batch]]
    pixel_values = processor(images, return_tensors="pt").to("cuda", torch.float16).pixel_values

    outputs = model_load.generate(pixel_values=pixel_values)
    generated_caption = processor.batch_decode(outputs, skip_special_tokens=True)

    test_prediction.extend(generated_caption)

100%|██████████| 244/244 [47:44<00:00, 11.74s/it]


In [9]:
test_prediction

['คนคนคนกำลังกำลังกำลังกำลังอยู่บนโต๊ะ',
 'คนกำลังกำลังอยู่ตรงถนนมีคนกำลังอยู่ตรงถนน',
 'ที่กำลังสีเหลืองอยู่บนพื้น',
 'คนคนกำลังอยู่บนโต๊ะสีเขียววางอยู่บนโต๊ะ',
 'คนคนคนคนคนคนคนคนคนคนคนคนคนคนคนคนคนคน',
 'คนคนกำลังอยู่บนถนน',
 'คนกำลังกำลังกำลังกำลังกำลังกำลังกำลังอยู่ตรงสนามหญ้า',
 'แมวสีน้ำตาลกำลังกำลังกำลังกำลังกำลังกำลังอยู่บนเตียง',
 'คนคนกำลังกำลังกำลังกำลังวางอยู่บนโต๊ะ',
 'จอดอยู่บนถนนมีต้นไม้สีเขียวจอดอยู่บนถนน',
 'คนคนกำลังสีแดงกำลังอยู่บนถนน',
 'คนกำลังกำลังกำลังอยู่บนถนน',
 'คนคนคนคนกำลังกำลังกำลังอยู่ตรง Насеคนกำลังอยู่',
 'ราฟราฟสีเขียวกำลังยืนอยู่บนพื้นพื้นพื้นพื้นพื้นพื้น',
 'ขนมปังสีน้ำตาลสีเหลืองและสีเหลืองวางอยู่บนโต๊ะ',
 'คนคนคนกำลังกำลังกำลังสีน้ำตาลสีน้ำตาลและสีแดง',
 '2 คนกำลังอยู่ตรงถนน',
 'คนคนกำลังกำลังกำลังอยู่ตรงถนน',
 'คนคนคนกำลังกำลังอยู่ตรงถนน',
 'คนคนคนหนึ่งกำลังเล่นเทนนิส',
 'คนกำลังกำลังกำลังกำลังอยู่ตรงถนน',
 'คนคนกำลังกำลังกำลังกำลังอยู่ตรงห้อง',
 'คนคนกำลังกำลังกำลังอยู่ตรงถนน',
 'ที่จอดอยู่บนถนนมีต้นไม้อยู่บนถนน',
 'คนคนคนกำลังสีน้ำตาลสีน้ำตาลสีน้ำ

In [14]:
submission['caption'] = test_prediction
submission.head()

Unnamed: 0,image_id,caption
0,test2017/000000160477,คนคนคนกำลังกำลังกำลังกำลังอยู่บนโต๊ะ
1,test2017/000000386306,คนกำลังกำลังอยู่ตรงถนนมีคนกำลังอยู่ตรงถนน
2,test2017/000000502273,ที่กำลังสีเหลืองอยู่บนพื้น
3,test2017/000000480896,คนคนกำลังอยู่บนโต๊ะสีเขียววางอยู่บนโต๊ะ
4,test2017/000000228698,คนคนคนคนคนคนคนคนคนคนคนคนคนคนคนคนคนคน


In [15]:
submission.to_csv('/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/resource/gg.csv', index=False)