In [None]:
!pip install -q datasets
!pip install git+https://github.com/huggingface/transformers.git
!pip install -q bitsandbytes sentencepiece accelerate loralib
!pip install -q -U git+https://github.com/huggingface/peft.git

In [None]:
!pip install Pillow
!pip install matplotlib
from PIL import Image
import matplotlib.pyplot as plt
image = Image.open('train/png/two_col_81284.png')
print(image)
plt.imshow(image)

In [None]:
import pandas as pd
# Read the training annotations into a DataFrame.
train_json = pd.read_json("train/train_augmented.json")

In [None]:
# Preview only the first rows instead of rendering the whole annotations frame.
train_json.head()

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from PIL import Image
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig
import torchvision.transforms as transforms

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# checkpoint = "HuggingFaceM4/tiny-random-idefics"
checkpoint = "HuggingFaceM4/idefics-9b"

# 4-bit NF4 quantization with double quantization and float16 compute.
# `lm_head` and `embed_tokens` are listed as skipped modules because they
# can't be quantized properly.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)

processor = AutoProcessor.from_pretrained(checkpoint, use_auth_token=False)

# Drop the `quantization_config` argument to load the original, unquantized model.
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# Print the loaded model's repr to inspect its structure.
print(model)

In [None]:
def check_inference(model, processor, prompts, max_new_tokens=50, bad_words=None):
    """Generate a completion for `prompts` and print the decoded text.

    Args:
        model: Object exposing `generate(...)` and a `device` attribute.
        processor: Callable that encodes `prompts`; must also expose
            `.tokenizer` and `.batch_decode(...)`.
        prompts: Passed unchanged to `processor`.
        max_new_tokens: Forwarded to `model.generate`.
        bad_words: Optional list of token strings that generation must not
            produce. Defaults to the image placeholder tokens `<image>` and
            `<fake_token_around_image>`. Pass an empty list to disable the
            restriction.

    Returns:
        None. The first decoded sequence is printed.
    """
    tokenizer = processor.tokenizer

    if bad_words is None:
        bad_words = ["<image>", "<fake_token_around_image>"]
    # Always bind `bad_words_ids`; an empty list means "no restriction".
    if len(bad_words) > 0:
        bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids
    else:
        bad_words_ids = None

    eos_token = "</s>"
    eos_token_id = tokenizer.convert_tokens_to_ids(eos_token)

    # Place the inputs on the device the model reports, so the function does not
    # depend on a notebook-level `device` variable being defined.
    inputs = processor(prompts, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **inputs,
        eos_token_id=[eos_token_id],
        bad_words_ids=bad_words_ids,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)

In [None]:
# Quick smoke test on the local sample chart opened earlier as `image`.
# (The unused `url` assignment that used to sit here was dropped: the prompt
# never referenced it and `url` is reassigned before its next use.)
prompts = [
    # "Instruction: provide an answer to the question. Use the image to answer.\n",
    image,
    "Question: What is this image about ? Answer:",
]
check_inference(model, processor, prompts, max_new_tokens=5)

In [None]:
# Check generation before fine-tuning, this time with an image referenced by URL.

url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Mona_Lisa_%28copy%2C_Hermitage%29.jpg/224px-Mona_Lisa_%28copy%2C_Hermitage%29.jpg"
prompts = [
    url,
    "Question: Who is the lady in the painting ? Answer:",
]
check_inference(model, processor, prompts, max_new_tokens=100)
# Treat this output as the pre-fine-tuning baseline: check how specific and how repetitive the answer is.

In [None]:
def convert_to_rgb(image):
    """Return `image` in RGB mode, flattening any transparency onto white.

    Images that are already RGB are returned unchanged (the same object).
    Anything else is converted to RGBA, alpha-composited over a white canvas
    of the same size, and then converted to RGB. A plain `image.convert("RGB")`
    is avoided because it produces a wrong background for transparent images.
    """
    if image.mode != "RGB":
        rgba = image.convert("RGBA")
        white_canvas = Image.new("RGBA", rgba.size, (255, 255, 255))
        image = Image.alpha_composite(white_canvas, rgba).convert("RGB")
    return image


In [None]:
def ds_transforms(example_batch):
    """Encode a batch of raw rows into model inputs with labels.

    Reads the `query`, `imgname` and `label` columns of `example_batch`
    (indexed as column -> sequence), opens each image from
    `train/png/<imgname>`, builds one `[image, text]` prompt per row and runs
    the notebook-level `processor` on them with a resize/normalize transform.

    Returns:
        The processor output moved to `device`, with `labels` set to the same
        tensor as `input_ids`.
    """
    image_size = processor.image_processor.image_size
    image_mean = processor.image_processor.image_mean
    image_std = processor.image_processor.image_std

    image_transform = transforms.Compose([
        convert_to_rgb,
        transforms.RandomResizedCrop((image_size, image_size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std),
    ])

    prompts = []
    rows = zip(example_batch['query'], example_batch['imgname'], example_batch['label'])
    for query, img_name, label in rows:
        # We split the captions to avoid having very long examples, which would require more GPU ram during training
        caption = query.split(".")[0]
        # The directory and file name need a separator between them: plain
        # "train/png" + name yields "train/png<name>", unlike the sample path
        # "train/png/two_col_81284.png" opened earlier in this notebook.
        # Stripping a leading "/" keeps names that already carry one working.
        image_path = "train/png/" + img_name.lstrip("/")
        image = Image.open(image_path)
        prompts.append(
            [
                image,
                # NOTE(review): this template puts `label` in the answer slot and the
                # truncated `query` after it, which looks inherited from a captioning
                # example. Confirm it is the intended training text for this dataset.
                f"Question: What's on the picture? Answer: This is {label}. {caption}",
            ],
        )

    inputs = processor(prompts, transform=image_transform, return_tensors="pt").to(device)

    # Labels are the input ids themselves.
    inputs["labels"] = inputs["input_ids"]

    return inputs

In [None]:
!pip install sklearn
from sklearn.model_selection import train_test_split
train_data = train_json
train_ds, eval_ds = train_test_split(train_data, test_size=0.002)
train_ds
eval_ds
train_ds.set_transform(ds_transforms) 
eval_ds.set_transform(ds_transforms)

In [None]:
# Part of the checkpoint id after the "/", used to name the training output directory.
model_name = checkpoint.split("/")[1]

# LoRA adapter settings, applied to the modules named q_proj / k_proj / v_proj.
config = LoraConfig(
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model with the adapters; `model` now refers to the PEFT-wrapped model.
model = get_peft_model(model, config)


In [None]:
# Report the trainable-parameter count of the PEFT-wrapped model.
model.print_trainable_parameters()


In [None]:
training_args = TrainingArguments(
    output_dir=f"{model_name}-output",
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # 2 samples per device x 8 accumulation steps = 16 samples per optimizer step per device.
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    save_total_limit=3,
    # NOTE(review): recent transformers releases name this argument `eval_strategy`;
    # rename it if the installed version rejects `evaluation_strategy`.
    evaluation_strategy="steps",
    save_strategy="steps",
    # save_steps is kept a multiple of eval_steps, which load_best_model_at_end relies on.
    save_steps=40,
    eval_steps=20,
    logging_steps=20,
    max_steps=40,
    # Keep the raw columns: the dataset transform reads `query`, `imgname` and `label` by name.
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    load_best_model_at_end=True,
    # The string "none" disables all logging integrations. Passing `None` does not:
    # transformers 4.x treats it as "all installed integrations".
    report_to="none",
    optim="paged_adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)

trainer.train()