In [None]:
# !pip install -q -U git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets accelerate wandb

## Library and Config

In [1]:
import os
import torch
import datasets
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt


from transformers import AutoTokenizer
from transformers import Blip2Processor, Blip2ForConditionalGeneration


from peft import LoraConfig, get_peft_model

#. Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

#. Typhoon-7B tokenizer (slow implementation, use_fast=False), used below to measure caption lengths
tokenizer = AutoTokenizer.from_pretrained("scb10x/typhoon-7b",use_fast=False)
#. Register '[PAD]' as the padding token; the cell displays the call's return value
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

  from .autonotebook import tqdm as notebook_tqdm


1

## Dataset

### Load data from MS COCO 2017

In [2]:
!wget -P /home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data  http://images.cocodataset.org/zips/test2017.zip
!wget -P /home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data  http://images.cocodataset.org/zips/train2017.zip
#. Download file from COCO dataset 

!unzip -q /home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test2017.zip
!unzip -q /home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/train2017.zip
#. unzip file from COCO dataset 

!rm /home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test2017.zip
!rm /home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/train2017.zip
#. Delete zip file


--2024-04-26 14:24:50--  http://images.cocodataset.org/zips/test2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.47.76, 3.5.27.22, 3.5.25.202, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.47.76|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6646970404 (6.2G) [application/zip]
Saving to: ‘/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test2017.zip’


2024-04-26 14:37:34 (8.31 MB/s) - ‘/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test2017.zip’ saved [6646970404/6646970404]

--2024-04-26 14:37:34--  http://images.cocodataset.org/zips/train2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.53.65, 52.217.90.20, 52.216.43.25, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.53.65|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19336861798 (18G) [application/zip]
Saving to: ‘/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/train2

### Reduce data size by filtering on caption token count

In [None]:
#. Every .jpg found in the train2017 folder, and the caption table
images_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/train2017/").glob("*.jpg"))
labels = pd.read_csv("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/labelv2.csv")
#. Author's note: a third label file ("Label3") is reportedly better, but its paths are not cleaned

#. For each label row that points into train2017, build the full path of its .jpg
#. (the folder is taken from the first globbed file)
images = [
    str(images_path[0].parent / (parts[-1] + ".jpg"))
    for parts in (entry.split("/") for entry in tqdm(labels['image']))
    if parts[0] == "train2017"
]

In [None]:
labels #+ Display the labels DataFrame

In [None]:
def _caption_token_count(caption):
    """Return how many input ids the tokenizer produces for one caption."""
    return len(tokenizer(caption)["input_ids"])

#. Token count of every caption
labels['tokens'] = labels['captions'].apply(_caption_token_count)

#. Keep only captions below the fixed maximum length of 30 tokens
labels = labels[labels['tokens'] < 30]

In [None]:
labels['image'] #+ Display the image column

In [None]:
#. Captions of the rows assigned to the "train" split.
#. Note: rows are not de-duplicated by image here.
is_train_row = labels['split'] == "train"
select_labels = labels.loc[is_train_row, 'captions']
select_labels

In [None]:
#. Root folder that the relative image names in `labels` are resolved against
training_path  = "/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data"
images_filter = []
database = list(labels[labels['split'] == "train" ]['image'])
#. Set copy of `database`: `x in list` rescanned the whole list for every row (quadratic)
train_image_keys = set(database)

#. Collect the full .jpg path of every label row whose image belongs to the train split
for i in tqdm(list(labels['image'])):
    name_parts = i.split("/")
    #. "<folder>/<file name without extension>", the form stored in the image column
    temp = name_parts[-2] + "/" + name_parts[-1].split(".")[0]
    if temp in train_image_keys:
        images_filter.append(training_path+"/"+i+".jpg")

print("count data image to train",len(images_filter))

In [None]:
#. Pair each training image path with its caption in a Hugging Face dataset
dataset = datasets.Dataset.from_dict({"image": images_filter, "text": select_labels})
dataset = dataset.train_test_split(test_size=0.85, seed=42) #! Adjust the training-set size here: test_size=0.85 leaves 15% of the rows for training
dataset

#! Not recommended: in practice the training data should not be picked by random sampling like this

In [None]:
from torch.utils.data import Dataset, DataLoader

#. Class for Dataset 
class ImageCaptioningDataset(Dataset):
    """Map-style dataset yielding processor image features plus the raw caption.

    Each row of `dataset` must provide an 'image' entry that `Image.open` can
    read and a 'text' entry holding the caption. `processor` is called with
    the opened image and must return a mapping of tensors.
    """

    def __init__(self, dataset, processor):
        self.processor = processor
        self.dataset = dataset

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed features of row `idx` together with its caption."""
        row = self.dataset[idx]
        picture = Image.open(row['image'])
        features = self.processor(images=picture, padding="max_length", return_tensors="pt")

        #? squeeze() removes the batch dimension added by the processor
        sample = {}
        for name, tensor in features.items():
            sample[name] = tensor.squeeze()

        sample["text"] = row["text"]
        return sample

def collate_fn(batch):
    """Merge a list of dataset samples into one batch dictionary.

    Tensor entries are stacked along a new leading dimension. The raw "text"
    captions are tokenized together with the module-level `processor`'s
    tokenizer (padded to the longest caption in the batch) and returned as
    "input_ids" and "attention_mask".
    """
    collated = {}
    for field in batch[0].keys():
        if field == "text":
            captions = [sample["text"] for sample in batch]
            tokenized = processor.tokenizer(captions, padding=True, return_tensors="pt")
            collated["input_ids"] = tokenized["input_ids"]
            collated["attention_mask"] = tokenized["attention_mask"]
            continue
        collated[field] = torch.stack([sample[field] for sample in batch])

    return collated

## Model

In [None]:

#. Checkpoint names used in this cell
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b-coco"
TYPHOON_CHECKPOINT = "scb10x/typhoon-7b"

#. Load the pretrained BLIP-2 model (image encoder with an OPT text model, COCO checkpoint)
#. from the Hugging Face hub: weights loaded in 8-bit, float16 dtype, automatic device placement
model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_8bit=True,
)
processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)

#. Typhoon-7B tokenizer for Thai text (author's note: about 35k tokens), with '[PAD]' as padding token
tokenizer = AutoTokenizer.from_pretrained(TYPHOON_CHECKPOINT, use_fast=False)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
#. Make the language model's vocabulary size match the Typhoon tokenizer
model.config.text_config.vocab_size = len(tokenizer)
model.language_model.resize_token_embeddings(len(tokenizer))

#. Use the Typhoon tokenizer for all text handled by the processor, and its EOS id for the model
processor.tokenizer = tokenizer
model.config.eos_token_id = tokenizer.eos_token_id

#. LoRA settings: rank 16, alpha 32, dropout 0.05, no bias training,
#. adapters attached to the modules named q_proj and k_proj.
#. NOTE(review): the resized token embeddings are not listed in target_modules or
#. modules_to_save, so presumably they are not trained - confirm this is intended.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj"]
)

#. Wrap the model for parameter-efficient fine-tuning and report the trainable parameter count
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [None]:
batch_size = 14

#. Both loaders share the same shuffling, batch size and collation function
loader_options = {"shuffle": True, "batch_size": batch_size, "collate_fn": collate_fn}

train_dataset = ImageCaptioningDataset(dataset['train'], processor)
test_dataset = ImageCaptioningDataset(dataset['test'], processor)

train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [None]:
#. Number of batches the training loader yields per epoch
print(f" Number of Iteration per Epoch = {len(train_dataloader)}")

## Training

In [None]:
experiment_name = "model_blip2(H)_Typhoon"
#. Folder that receives one adapter checkpoint per epoch

optimizer = torch.optim.AdamW(model.parameters(), lr=6e-4, weight_decay=1e-2)
#. Optimizer

EPOCH = 10

for epoch in range(EPOCH):
    print("Epoch:", epoch+1)
    model.train()

    training_loss = 0

    #! TODO: evaluate during training as well, and add a validation set

    for batch in tqdm(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        attention_mask = batch.pop("attention_mask").to(device)
        pixel_values = batch.pop("pixel_values").to(device, torch.float16)

        #. Exclude padded positions from the loss: label -100 is ignored by the
        #. language-model loss, and the attention mask keeps the model from
        #. attending to padding.
        #. NOTE(review): with padding excluded, the model only learns where to stop
        #. if every caption ends with an EOS token - confirm the tokenizer appends one.
        target_ids = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=target_ids
        )

        loss = outputs.loss
        training_loss += loss.item()
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

    #. Mean loss over the batches of this epoch
    print("Training loss", training_loss / len(train_dataloader))
    print("-" * 60)

    #? save_pretrained on the PEFT-wrapped model; reload on top of the pretrained base model
    save_dir = os.path.join(experiment_name, str(epoch))
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)

In [None]:
#. Quick sanity check on two held-out samples.
#. The dataset's image column holds file paths, so open the files here:
#. the processor call in the next cell is given `im` directly.
im = [Image.open(path) for path in dataset['test'].select([500,501])['image']]

In [None]:
#. Preprocess the sample images and move the pixel tensor to the device as float16
pixel_values = processor(im, return_tensors="pt").to(device, torch.float16).pixel_values
model.eval()

#. Generate caption token ids with the default generation settings and show the raw ids
outputs = model.generate(pixel_values=pixel_values)
print(outputs)

In [None]:
#. Decode the generated ids to text, dropping special tokens, and show the first two captions
generated_caption = processor.batch_decode(outputs, skip_special_tokens=True)
print(type(generated_caption),generated_caption[0:2])

## Save model

In [None]:
#. Save the PEFT-wrapped model (after the last epoch) under the project's model folder
model.save_pretrained("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/model/model_blip2(F)_Typhoon")

## Test model

### Prepare data to test

In [None]:
# Collect the test images from 3 folders
#? 1. test/food/
#? 2. test/travel/
#? 3. test2017/

import datasets
from PIL import Image
from pathlib import Path

image1_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test2017").glob("*.jpg"))
image2_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test/food").glob("*.jpg"))
image3_path = list(Path("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test/travel").glob("*.jpg"))

#. Merge in the same order as before: test2017, then food, then travel
image1_path.extend(image2_path)
image1_path.extend(image3_path)

merge_path = image1_path
data_size = len(merge_path)

#. Keep only the file paths. Calling Image.open on every file up front left one
#. open file handle per test image; with an Image feature the dataset decodes a
#. file only when a row is read.
images_test = [str(path) for path in merge_path]

data_testset = datasets.Dataset.from_dict({"image": images_test}).cast_column("image", datasets.Image())

print(f"row in submission : {data_size}")

In [None]:
#. Release cached GPU memory before loading a second copy of the base model
torch.cuda.empty_cache()
#. Same base checkpoint and loading options as the training model (8-bit, float16, auto device map)
model_load = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b-coco",
            load_in_8bit=True,
            device_map="auto",
            torch_dtype=torch.float16,
        )

#. Load the processor and the Typhoon tokenizer, with '[PAD]' as padding token
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")
tokenizer = AutoTokenizer.from_pretrained("scb10x/typhoon-7b",use_fast=False)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

#. Resize the language model's vocabulary to the tokenizer, as done before training
model_load.config.text_config.vocab_size = len(tokenizer)
model_load.language_model.resize_token_embeddings(len(tokenizer))
processor.tokenizer = tokenizer

#. End-of-sequence token id for the model config.
#. NOTE(review): hard-coded to 6 here, while the training cell uses tokenizer.eos_token_id - confirm they agree.
model_load.config.eos_token_id = 6

#. Load the fine-tuned adapter from the checkpoint folder named "8".
#. NOTE(review): this folder is "model_blip2(c)_Typhoon", not the "model_blip2(H)_Typhoon"
#. written by the training cell - confirm this is the intended run.
model_load.load_adapter("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/model_blip2(c)_Typhoon/8")

In [None]:
#? Smoke test on one sample only; this is not the submission run
outputs = []
from PIL import Image

for sample_index in tqdm(range(3,4), desc="Genarated Answer Vector : "):
    sample_image = data_testset.select([sample_index])['image'][0]
    features = processor(sample_image, return_tensors="pt").to(device, torch.float16)

    model_load.eval()
    #. Beam search with 5 beams; no 2-gram may repeat within a caption
    generated_ids = model_load.generate(
        pixel_values=features.pixel_values,
        num_beams=5,
        no_repeat_ngram_size=2,
    )
    outputs.append(generated_ids)

print(outputs)

In [None]:
#. Caption one specific test2017 image
im = Image.open("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/test2017/000000360936.jpg")

#. Preprocess, then generate with 5-beam search; no 4-gram may repeat within the caption
pixel_values = processor(im, return_tensors="pt").to(device, torch.float16).pixel_values
model_load.eval()
outputs = model_load.generate(pixel_values=pixel_values,
                              num_beams=5,
                              no_repeat_ngram_size=4,)
#. Display the image as the cell output
im


In [None]:
#. Decode the generated ids to text, dropping special tokens
generated_caption = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(generated_caption)

In [None]:
#. Display the image currently bound to `im`
im

## Submission

In [None]:
#. Sample submission file: supplies the image_id list and the caption column filled in below
submission = pd.read_csv("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/resource/sample_submission.csv")

In [None]:
#. Full .jpg path of every image listed in the submission file
test_images = submission.image_id.tolist()
test_images = ["/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/data/"+im + ".jpg" for im in test_images]


batch = 58
test_prediction = []

#. Make sure inference mode is active even if the earlier test cells were skipped
model_load.eval()

#. Caption the test images in chunks of `batch` files
for i in tqdm(range(0, len(test_images), batch) ,desc="Load submission : "):
    images = [Image.open(image_file) for image_file in test_images[i: i+batch]]
    #. Use the shared `device` like the other inference cells, not a hard-coded "cuda"
    pixel_values = processor(images, return_tensors="pt").to(device, torch.float16).pixel_values

    #. Beam search with 5 beams; no 4-gram may repeat within a caption
    outputs = model_load.generate(pixel_values=pixel_values,
                                 num_beams=5,
                                 no_repeat_ngram_size=4,)

    generated_caption = processor.batch_decode(outputs, skip_special_tokens=True)

    test_prediction.extend(generated_caption)

In [None]:
#. Preview only the first predictions; a bare list would print every caption
test_prediction[:10]

In [None]:
#. Display the submission DataFrame
submission

In [None]:
#. Number of generated captions, to compare with the number of submission rows
len(test_prediction)

In [None]:
#. Summary statistics of the submission DataFrame
submission.describe()

In [None]:
#. Write the predictions back with one positional .iloc call on the frame itself.
#. The chained form (submission['caption'].iloc[3:] = ...) assigns through an
#. intermediate Series and is not guaranteed to update `submission`.
#. The first three rows keep the values loaded from the sample file.
caption_col = submission.columns.get_loc('caption')
submission.iloc[3:, caption_col] = test_prediction[3:]

In [None]:
#. Characters that must not appear in a caption: Latin letters and common punctuation
DONTWANT = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.!@#$%^&*()_+[]:;\'\"?"
unwanted_chars = set(DONTWANT)

def _has_unwanted_char(caption):
    """Return True when the caption contains any character from DONTWANT."""
    #. str() lets a missing caption (NaN) be checked too instead of raising
    return any(char in unwanted_chars for char in str(caption))

#. Check every row (the row count was previously hard-coded as 48673) and replace
#. offending captions with "ไม่ทราบ" ("unknown") in a single .loc assignment,
#. avoiding chained assignment on submission['caption'][i]
has_unwanted = submission['caption'].map(_has_unwanted_char)
submission.loc[has_unwanted, 'caption'] = "ไม่ทราบ"

In [None]:
#. Write the filtered submission to gg6.csv without the index column
submission.to_csv('/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/resource/gg6.csv', index=False,encoding="utf-8")

In [None]:
#. Summary of the image_id column
submission['image_id'].describe()

In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
df2 = pd.read_csv("/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/resource/gg6.csv")

#. Keep only rows whose image_id starts with "test". A boolean mask replaces the
#. per-row df2.drop(i) loop, which rebuilt the whole frame for every removed row.
is_test_row = df2['image_id'].astype(str).str[:4] == "test"
df2 = df2[is_test_row]

df2.describe()

In [None]:

#. Write the rows kept above to gg7.csv without the index column
df2.to_csv('/home/hpcnc/cloud/SuperAI/Hack-Image-Caption/resource/gg7.csv', index=False,encoding="utf-8")
        

In [None]:
#. Display the final filtered DataFrame
df2