# Installing Dependencies

In [None]:
!pip install -q datasets
!pip install -q git+https://github.com/huggingface/transformers.git@add-model-idefics
!pip install -q bitsandbytes sentencepiece accelerate loralib
!pip install -q -U git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[0m  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mgit checkout -q add-model-idefics[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mgi

# Importing dependencies

In [None]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from PIL import Image
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig
from torchvision import transforms as transforms
import torch

In [None]:
# Authenticate against the Hugging Face Hub (renders an interactive login widget).
# The stored token is needed later: the processor is loaded with
# `use_auth_token=True` and the trained adapter is pushed with `push_to_hub`.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Dataset

In [None]:
class CocoCaptionDataset:
    """Builds train/validation splits of an image-caption dataset for IDEFICS.

    Every batch is converted lazily (through ``set_transform``) into processor
    outputs whose ``labels`` are the prompt's own ``input_ids``.

    Args:
        model_ckpt: Hub id of the checkpoint whose processor is used.
        dataset_name: Hub id of the dataset; its ``train`` split must expose
            ``url`` and ``caption`` columns.
        test_size: Size of the held-out validation split, forwarded to
            ``train_test_split``.
        seed: Optional seed forwarded to ``train_test_split`` so the held-out
            split can be reproduced. ``None`` keeps the previous unseeded split.
        device: Device the processed batches are moved to.
    """

    def __init__(
        self,
        model_ckpt="HuggingFaceM4/idefics-9b-instruct",
        dataset_name="cat-state/mscoco-1st-caption",
        test_size=0.0002,
        seed=None,
        device="cuda",
    ):
        self.model_ckpt = model_ckpt
        self.dataset_name = dataset_name
        self.test_size = test_size
        self.seed = seed
        self.device = device
        self.processor = AutoProcessor.from_pretrained(self.model_ckpt, use_auth_token=True)

    def load_data(self):
        """Load the dataset and return ``(train_data, val_data)``.

        The validation data is carved out of the dataset's ``train`` split.
        """
        data = load_dataset(self.dataset_name)
        split = data["train"].train_test_split(test_size=self.test_size, seed=self.seed)
        return split["train"], split["test"]

    def img_convert_to_rgb(self, image):
        """Return ``image`` as RGB, compositing non-RGB images onto white.

        Images already in RGB mode are returned unchanged (same object).
        """
        if image.mode == "RGB":
            return image
        image_rgba = image.convert("RGBA")
        # Composite over a white canvas so transparent regions become white.
        background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
        alpha_composite = Image.alpha_composite(background, image_rgba)
        return alpha_composite.convert("RGB")

    def transform_data(self, example):
        """Turn a batch of ``url``/``caption`` columns into model inputs.

        Args:
            example: Mapping with parallel ``url`` and ``caption`` sequences.

        Returns:
            The processor output moved to ``self.device``, with ``labels`` set
            to the very same object as ``input_ids``.
        """
        image_processor = self.processor.image_processor
        img_size = image_processor.image_size

        # NOTE(review): gen_data installs this transform on the validation split
        # too, so evaluation images are randomly cropped as well.
        img_transform = transforms.Compose([
            self.img_convert_to_rgb,
            transforms.RandomResizedCrop(
                (img_size, img_size),
                scale=(0.9, 1.0),
                interpolation=transforms.InterpolationMode.BICUBIC,
            ),
            transforms.ToTensor(),
            transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
        ])

        # One [image_url, text] prompt per row; "</s>" terminates the answer.
        prompts = [
            [url, f"Question: Explain the picture. Answer: {caption}</s>"]
            for url, caption in zip(example["url"], example["caption"])
        ]

        inputs = self.processor(prompts, transform=img_transform, return_tensors="pt").to(self.device)
        inputs["labels"] = inputs["input_ids"]

        return inputs

    def gen_data(self):
        """Return ``(train, val)`` datasets with ``transform_data`` applied on access."""
        train_dataset, val_dataset = self.load_data()
        train_dataset.set_transform(self.transform_data)
        val_dataset.set_transform(self.transform_data)
        return train_dataset, val_dataset


# Model Training

In [None]:
class ImageCaptioningModel:
    """LoRA fine-tuning pipeline for the 4-bit quantized IDEFICS captioner."""

    def __init__(self):
        """Prepare the train/validation datasets and remember the checkpoint id."""
        dataset_builder = CocoCaptionDataset()
        train_split, val_split = dataset_builder.gen_data()
        self.train_data = train_split
        self.val_data = val_split
        self.model_ckpt = "HuggingFaceM4/idefics-9b-instruct"

    def load_model(self):
        """Load the base checkpoint with 4-bit NF4 quantization and fp16 compute."""
        quantization = BitsAndBytesConfig(
            llm_int8_skip_modules=["lm_head", "embed_tokens"],
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            load_in_4bit=True,
        )
        return IdeficsForVisionText2Text.from_pretrained(
            self.model_ckpt,
            device_map="auto",
            quantization_config=quantization,
        )

    def create_lora_model(self, model):
        """Wrap ``model`` with LoRA adapters on the q/k/v projections.

        Prints the trainable-parameter summary and returns the wrapped model.
        """
        adapter_settings = {
            "r": 16,
            "lora_alpha": 32,
            "target_modules": ["q_proj", "k_proj", "v_proj"],
            "lora_dropout": 0.05,
            "bias": "none",
        }
        peft_model = get_peft_model(model, LoraConfig(**adapter_settings))
        peft_model.print_trainable_parameters()
        return peft_model

    def set_training_args(self):
        """Build the ``TrainingArguments`` for the 100-step fine-tuning run."""
        hyperparameters = {
            # Output location and checkpoint retention.
            "output_dir": "idefics-mscoco-captioner",
            "save_total_limit": 1,
            "load_best_model_at_end": True,
            # Optimisation.
            "learning_rate": 2e-4,
            "optim": "paged_adamw_8bit",
            "fp16": True,
            "max_steps": 100,
            # Batching.
            "per_device_train_batch_size": 2,
            "per_device_eval_batch_size": 2,
            "gradient_accumulation_steps": 8,
            "dataloader_pin_memory": False,
            # Evaluate, save and log on the same 50-step cadence.
            "evaluation_strategy": "steps",
            "save_strategy": "steps",
            "eval_steps": 50,
            "save_steps": 50,
            "logging_steps": 50,
            # Keep every column produced by the dataset transform.
            "remove_unused_columns": False,
            "label_names": ["labels"],
        }
        return TrainingArguments(**hyperparameters)

    def train_and_push_to_hub(self):
        """Fine-tune the LoRA model and publish the adapter to the Hub."""
        base_model = self.load_model()
        peft_model = self.create_lora_model(base_model)
        trainer = Trainer(
            model=peft_model,
            args=self.set_training_args(),
            train_dataset=self.train_data,
            eval_dataset=self.val_data,
        )
        trainer.train()
        peft_model.push_to_hub("idefics-mscoco-captioner", private=False)

In [None]:
if __name__ == "__main__":
    imagecaptioningmodel = ImageCaptioningModel()
    imagecaptioningmodel.train_and_push_to_hub()



Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 19,750,912 || all params: 8,949,438,736 || trainable%: 0.22069442098698333


Step,Training Loss,Validation Loss
50,1.0264,0.724259
100,0.7127,0.705179


adapter_model.bin:   0%|          | 0.00/79.2M [00:00<?, ?B/s]

# Inference

In [None]:
def check_inference(model, processor, prompts, max_new_tokens=50, device="cuda"):
    """Generate a completion for ``prompts`` and print the first decoded sequence.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        processor: Processor exposing ``tokenizer``, ``__call__`` and ``batch_decode``.
        prompts: Prompt passed to the processor unchanged (here: image URL plus text).
        max_new_tokens: Upper bound on the number of generated tokens.
        device: Device the processed inputs are moved to before generation.

    Returns:
        None. The decoded text is printed, not returned.
    """
    tokenizer = processor.tokenizer

    # Forbid the image placeholder tokens from appearing in the generated text.
    # Computed unconditionally so `bad_words_ids` is always bound before use.
    bad_words = ["<image>", "<fake_token_around_image>"]
    bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

    eos_token_id = tokenizer.convert_tokens_to_ids("</s>")

    inputs = processor(prompts, return_tensors="pt").to(device)
    generated_ids = model.generate(
        **inputs,
        eos_token_id=[eos_token_id],
        bad_words_ids=bad_words_ids,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
    )
    # Only the first sequence of the batch is reported.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)

In [None]:
# Reload the base checkpoint for inference with the same 4-bit NF4 settings
# that ImageCaptioningModel.load_model uses for training.
bnb_config = BitsAndBytesConfig(
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

model = IdeficsForVisionText2Text.from_pretrained(
    "HuggingFaceM4/idefics-9b-instruct",
    device_map="auto",
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Processor of the same checkpoint as the model; `use_auth_token=True` relies on
# the Hub login performed earlier in the notebook.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b-instruct", use_auth_token=True)



In [None]:
from peft import PeftConfig, PeftModel

# Directory holding the fine-tuned LoRA adapter (Colab working directory).
adapter_dir = "/content/idefics-mscoco-captioner"

config = PeftConfig.from_pretrained(adapter_dir)
# Attach the adapter weights to the quantized base model loaded above.
model = PeftModel.from_pretrained(model, adapter_dir)

## Sample Image:

Let's run prediction with the fine-tuned, quantized model on the image below, which shows lightning striking over a mountain lake. \\
<img src="https://cdn.pixabay.com/photo/2018/01/14/23/12/nature-3082832_1280.jpg" width="400"/>

In [None]:
# Sample image to caption.
url = "https://cdn.pixabay.com/photo/2018/01/14/23/12/nature-3082832_1280.jpg"

# Same question template as in training, with the answer left for the model to fill in.
prompts = [url, "Question: Explain the picture. Answer:"]

check_inference(model, processor, prompts, max_new_tokens=50)



Question: Explain the picture. Answer: A mountain lake with lightning striking the mountain in the background.
