## Finetuning Multimodal LLM IDEFICS-9B LLM on Pokemon Data

### Install Required Packages

In [1]:
!pip install -q datasets
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes sentencepiece accelerate loralib

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.7 

### Import Libraries

In [2]:
import torch
import torchvision.transforms as transforms
from transformers import IdeficsForVisionText2Text, Trainer, TrainingArguments, AutoProcessor, BitsAndBytesConfig
from datasets import load_dataset
from peft import get_peft_model, LoraConfig
from PIL import Image

### Use GPU if Available, else use CPU

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### Quantize Pre-Trained LLM Model to 4-Bit Precision for Efficiency

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_skip_modules=["embed_tokens", "lm_head"],
    bnb_4bit_compute_dtype=torch.float16,
)

pretrained_model = "HuggingFaceM4/idefics-9b"
model = IdeficsForVisionText2Text.from_pretrained(pretrained_model, quantization_config=quantization_config, device_map="auto")
model_processor = AutoProcessor.from_pretrained(pretrained_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/99.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/705M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Define Function for Inferencing LLM Model

In [5]:
def generate_text(model,model_processor, prompts):
    input=model_processor(prompts, return_tensors="pt").to(device)
    generated_token_ids = model.generate(
        **input,
        max_new_tokens=50,
    )
    generated_text =model_processor.batch_decode(generated_token_ids, skip_special_tokens=True)[0]
    print(generated_text)

### Define Functions for Pre-Processing Data

In [6]:
def ensure_rgb(image):
  if image.mode == "RGB":
    return image
  else:
    return Image.alpha_composite(Image.new("RGBA", image.convert("RGBA").size, (255,255,255)), image.convert("RGBA")).convert("RGB")

def preprocess_data(data):
    img_size = model_processor.image_processor.image_size
    img_transform = transforms.Compose([
        ensure_rgb,
        transforms.RandomResizedCrop((img_size, img_size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean=model_processor.image_processor.image_mean, std=model_processor.image_processor.image_std)
    ])
    prompts = [
        [
            data['image_url'][i],
            f"Question: What's this? Answer: This is {data['name']}. {data['caption'][i].split('.')[0]}</s>",
        ]
        for i in range(len(data['caption']))
    ]
    processed_data = model_processor(prompts, transform=img_transform, return_tensors="pt").to(device)
    processed_data["labels"] = processed_data["input_ids"]
    return processed_data

### Output of LLM Model before Training on New Data

In [7]:
test_prompt = [
    "https://i.ebayimg.com/images/g/hwUAAOSwqa5jYYg1/s-l1600.jpg",
    "Question: What's this? Answer:",
]
generate_text(model,model_processor, test_prompt)

Question: What's this? Answer: Bidoof. I'm not sure if it's a Pokemon or a Pokemon card.


### Load and Split Data Into Training and Validation

In [8]:
data = load_dataset("TheFusion21/PokemonCards")

# problematic_urls = ["https://images.pokemontcg.io/mcd18/1_hires.png","https://images.pokemontcg.io/mcd14/5_hires.png"]
# data = data.filter(lambda example: example['image_url'] not in problematic_urls)

data = data["train"].train_test_split(test_size=0.002)
train_data = data["train"]
validation_data = data["test"]
train_data.set_transform(preprocess_data)
validation_data.set_transform(preprocess_data)

Downloading readme:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.28M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

### Apply LoRA (Low Rank Adaptation) to LLM Model to Enhance Adaptability to New Data

In [9]:
lora_config = LoraConfig(
    lora_dropout = 0.05,
    lora_alpha = 24,
    r = 12,
    bias="none",
    target_modules = ["q_proj", "k_proj", "v_proj"]
)

model = get_peft_model(model, lora_config)

### Define Training Arguments

In [10]:
training_config = TrainingArguments(
    output_dir="Finetuned_Model",
    fp16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=5,
    save_total_limit=4,
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=20,
    max_steps=20,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=5,
    remove_unused_columns=False,
    label_names=["labels"],
    dataloader_pin_memory=False,
    load_best_model_at_end = False,
)

In [11]:
trainer = Trainer(
    model = model,
    args = training_config,
    train_dataset = train_data,
    eval_dataset = validation_data
)

### Train Model

In [12]:
trainer.train()

Step,Training Loss,Validation Loss
10,1.8401,1.31703
20,1.0733,0.992612


TrainOutput(global_step=20, training_loss=1.7181349277496338, metrics={'train_runtime': 137.2306, 'train_samples_per_second': 1.457, 'train_steps_per_second': 0.146, 'total_flos': 954562717815168.0, 'train_loss': 1.7181349277496338, 'epoch': 0.02})

### Output of LLM Model After Training on New Data

In [13]:
generate_text(model,model_processor, test_prompt)

Question: What's this? Answer: This is ['Bidoof', 'Bidoof']. A Basic Pokemon Card of type Grass with the title Bidoof and 50 HP of rarity Common from the set Unbroken Bonds and the flavor
