# **1.Install required dependencies**

In [None]:
!pip install transformers datasets
!pip install accelerate -U



In [None]:
!pip install transformers datasets evaluate -q
!pip install jiwer -q

# **2.Prepare and Preprocess the Dataset**

In [None]:
from datasets import load_dataset

ds = load_dataset("lambdalabs/pokemon-blip-captions")
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 833
    })
})

In [None]:
ds = ds["train"].train_test_split(test_size=0.1,seed=42)
train_ds = ds["train"]
test_ds = ds["test"]

#**2.1 Dataset Subset of 400 Training Examples**

In [None]:
#Part2
ds = ds["train"].train_test_split(test_size=0.1,seed=42)
train_subset_400 = train_ds.select(range(400))
test_data = ds["test"]

# **2.2 Load The Model**

In [None]:
from transformers import AutoProcessor

modelName = "microsoft/git-base"
processor = AutoProcessor.from_pretrained(modelName)

In [None]:
#setting for part1
def transforms(example_batch):
    images = [x for x in example_batch["image"]]
    captions = [x for x in example_batch["text"]]
    inputs = processor(images=images, text=captions, padding="max_length")
    inputs.update({"labels": inputs["input_ids"]})
    return inputs


train_ds.set_transform(transforms)
test_ds.set_transform(transforms)

In [None]:
#setting for part2
def transforms(example_batch):
    images = [x for x in example_batch["image"]]
    captions = [x for x in example_batch["text"]]
    inputs = processor(images=images, text=captions, padding="max_length")
    inputs.update({"labels": inputs["input_ids"]})
    return inputs


train_subset_400.set_transform(transforms)
test_data.set_transform(transforms)

In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(modelName)

# **3.Evaluation Metrics**

In [None]:
from evaluate import load


wer = load("wer")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
    wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels)
    return {"wer_score": wer_score}

# **4.Training**

In [None]:
import torch
from transformers import TrainingArguments, Trainer

model_name = modelName.split("/")[1]


args = TrainingArguments(
    f"{model_name}-pokemon",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=6,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir='logs',
    remove_unused_columns=False,
    logging_steps=50,
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)

trainer.train()
trainer.save_model("./modelOriginal")

Epoch,Training Loss,Validation Loss,Wer Score
1,8.4057,5.678971,17.436409
2,4.9419,3.517781,6.169576
3,3.191,2.811973,6.19202


Checkpoint destination directory git-base-pokemon/checkpoint-94 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory git-base-pokemon/checkpoint-188 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory git-base-pokemon/checkpoint-282 already exists and is non-empty.Saving will proceed but saved results may be invalid.


# **5.Inference**

In [None]:
from PIL import Image
import requests

url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png"
image = Image.open(requests.get(url, stream=True).raw)
image

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

In [None]:
inferenceArgs1 = {
    "temperature": 1.0,
    "max_length": 100,
    "do_sample":True,
}

inferenceArgs2 = {
    "temperature": 0.0,
    "max_length": 100,
    "do_sample":False,
}
inferenceArgs3 = {
    "temperature": 0.7,
    "max_length": 100,
    "do_sample":True,
}

In [None]:
#Inference on parameters ; temperature=1.0, do_sample=True .

Generated_IDs1 = model.generate(pixel_values=pixel_values, **inferenceArgs1)
generated_caption1 = processor.batch_decode(Generated_IDs1, skip_special_tokens=True)[0]
print("Generated Caption Case 1: ", generated_caption1)


Generated Caption Case 1:  a cartoon character is sitting down with his hands on his hips


In [None]:
#Inference on parameters ; temperature=0.0, do_sample=False .
Generated_IDs2 = model.generate(pixel_values=pixel_values, **inferenceArgs2)
generated_caption2 = processor.batch_decode(Generated_IDs2, skip_special_tokens=True)[0]
print("Generated Caption Case 2: ", generated_caption2)

Generated Caption Case 2:  a cartoon character with a big smile on his face


In [None]:
#Inference on parameters ; temperature=0.7, do_sample=True .
Generated_IDs3 = model.generate(pixel_values=pixel_values, **inferenceArgs3)
generated_caption3 = processor.batch_decode(Generated_IDs3, skip_special_tokens=True)[0]
print("Generated Caption Case 3: ", generated_caption3)

Generated Caption Case 3:  a drawing of a purple and purple cartoon character


# **Part 2: Impact of Quantity on ML Models**

In [None]:
# Trainer for subset with 400 examples
trainer_subset_400 = Trainer(
    model,
    args,
    train_dataset=train_subset_400,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)

# Train the model on subset with 400 examples
trainer_subset_400.train()
trainer_subset_400.save_model("./modelSubset")

Epoch,Training Loss,Validation Loss,Wer Score
1,2.0581,1.2169,4.899002
2,0.8974,0.565359,5.908978
3,0.5016,0.418934,5.905237


Checkpoint destination directory git-base-pokemon/checkpoint-50 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory git-base-pokemon/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory git-base-pokemon/checkpoint-150 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model2 = AutoModelForCausalLM.from_pretrained("./modelSubset")
model2 = model2.to(device)
inputs = processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values
pixel_values = pixel_values.to(device)
pixel_values = pixel_values.type(next(model.parameters()).dtype)

In [None]:
inferenceArgs1Subset = {
    "temperature": 1.0,
    "max_length": 100,
    "do_sample":True,
}

inferenceArgs2Subset = {
    "temperature": 0.0,
    "max_length": 100,

}
inferenceArgs3Subset = {
    "temperature": 0.7,
    "max_length": 100,
    "do_sample":True,
}

In [None]:
#Inference on parameters ; temperature=1.0, do_sample=True .

Generated_IDs1Subset = model2.generate(pixel_values=pixel_values, **inferenceArgs1Subset)
generated_caption1Subset = processor.batch_decode(Generated_IDs1Subset, skip_special_tokens=True)[0]
print("Generated Caption Case 1 For The Subset of 400 Training Examples: ", generated_caption1Subset)


Generated Caption Case 1 For The Subset of 400 Training Examples:  a pink and blue cartoon character flying through the air


In [None]:
#Inference on parameters ; temperature=0.0, do_sample=False .
Generated_IDs2Subset = model2.generate(pixel_values=pixel_values, **inferenceArgs2Subset)
generated_caption2Subset = processor.batch_decode(Generated_IDs2Subset, skip_special_tokens=True)[0]
print("Generated Caption Case 2 For The Subset of 400 Training Examples: ", generated_caption2Subset)

Generated Caption Case 2 For The Subset of 400 Training Examples:  a pink and blue cartoon character with a blue tail


In [None]:
#Inference on parameters ; temperature=0.7, do_sample=True .
Generated_IDs3Subset = model2.generate(pixel_values=pixel_values, **inferenceArgs3Subset)
generated_caption3Subset = processor.batch_decode(Generated_IDs3Subset, skip_special_tokens=True)[0]
print("Generated Caption Case 3 For The Subset of 400 Training Examples: ", generated_caption3Subset)

Generated Caption Case 3 For The Subset of 400 Training Examples:  a cartoon character with a big smile on his face


#**Part 3: Impact of Quality on ML Models**

In [None]:
#Part3
train_ds = ds["train"]
exclude_idx = []
exclude_words = ["pink", "blue", "dragon", "pokemon"]

for index, instance in enumerate(train_ds):
    caption = instance["text"]
    if any(word in caption for word in exclude_words):
        exclude_idx.append(index)

print("Number of excluded items: {}".format(len(exclude_idx)))

train_ds_excluded = train_ds.select([i for i in range(len(train_ds)) if i not in exclude_idx])

Number of excluded items: 286


In [None]:
train_ds_excluded.set_transform(transforms)

In [None]:

trainer_excluded = Trainer(
    model,
    args,
    train_dataset=train_ds_excluded,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)
trainer_excluded.train()
trainer_excluded.save_model("./modelExcluded")

Epoch,Training Loss,Validation Loss,Wer Score
1,No log,0.036115,1.381546
2,0.014400,0.033795,3.399002
3,0.006900,0.031338,4.15212


Checkpoint destination directory git-base-pokemon/checkpoint-49 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory git-base-pokemon/checkpoint-98 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory git-base-pokemon/checkpoint-147 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model3 = AutoModelForCausalLM.from_pretrained("./modelExcluded")
model3 = model3.to(device)
inputs = processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values
pixel_values = pixel_values.to(device)
pixel_values = pixel_values.type(next(model.parameters()).dtype)

In [None]:
inferenceArgs1Excluded = {
    "temperature": 1.0,
    "max_length": 100,
    "do_sample":True,
}

inferenceArgs2Excluded = {
    "temperature": 0.0,
    "max_length": 100,

}
inferenceArgs3Excluded = {
    "temperature": 0.7,
    "max_length": 100,
    "do_sample":True,
}

In [None]:
#Inference on parameters ; temperature=1.0, do_sample=True .

Generated_IDs1Excluded = model3.generate(pixel_values=pixel_values, **inferenceArgs1Excluded)
generated_caption1Excluded = processor.batch_decode(Generated_IDs1Excluded, skip_special_tokens=True)[0]
print("Generated Caption Case 1 After Removing Training Examples Containing The Words; Pink, Blue, Dragon, And Pokemon : ", generated_caption1Excluded)


Generated Caption Case 1 After Removing Training Examples Containing The Words; Pink, Blue, Dragon, And Pokemon :  a drawing of a purple and black cartoon character


In [None]:
#Inference on parameters ; temperature=0.0, do_sample=False .
Generated_IDs2Excluded = model3.generate(pixel_values=pixel_values, **inferenceArgs2Excluded)
generated_caption2Excluded = processor.batch_decode(Generated_IDs2Excluded, skip_special_tokens=True)[0]
print("Generated Caption Case 2 After Removing Training Examples Containing The Words; Pink, Blue, Dragon, And Pokemon : ", generated_caption2Excluded)

Generated Caption Case 2 After Removing Training Examples Containing The Words; Pink, Blue, Dragon, And Pokemon :  a cartoon character with a big smile on his face




In [None]:
#Inference on parameters ; temperature=0.7, do_sampleTrue .
Generated_IDs3Excluded = model3.generate(pixel_values=pixel_values, **inferenceArgs3Excluded)
generated_caption3Excluded = processor.batch_decode(Generated_IDs3Excluded, skip_special_tokens=True)[0]
print("Generated Caption Case 3 After Removing Training Examples Containing The Words; Pink, Blue, Dragon, And Pokemon : ", generated_caption3Excluded)

Generated Caption Case 3 After Removing Training Examples Containing The Words; Pink, Blue, Dragon, And Pokemon :  a drawing of a purple and black cartoon character
