# Implement the multi-modal visual-question answering Model

https://www.kdnuggets.com/implementing-multimodal-models-with-hugging-face-transformers

This one is really good!

In [1]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import requests


def load_image(image_url):
    return Image.open(requests.get(image_url, stream=True).raw).convert('RGB')

Load the BLIP model and the processor to process image data

In [2]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

Ask a question about the image

In [6]:
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = load_image(image_url)

# Example question
question = "Where are the cats sleeping?"

# Preparing input and generating answer output
inputs = processor(image, question, return_tensors="pt")
out = model.generate(**inputs)
answer = processor.decode(out[0], skip_special_tokens=True)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Where are the cats sleeping?
Answer: couch


### Radiology example

In [7]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset
from PIL import Image
import requests
import io

dataset = load_dataset("flaviagiammarino/vqa-rad")
dataset

README.md:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(…)-00000-of-00001-eb8844602202be60.parquet:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

(…)-00000-of-00001-e5bc3d208bb4deeb.parquet:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1793 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/451 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 1793
    })
    test: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 451
    })
})

start up GPU

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    images = examples["image"]  
    inputs = processor(images=images, text=questions, padding="max_length", truncation=True, return_tensors="pt")
   
    targets = processor(text=examples["answer"], padding="max_length", truncation=True, return_tensors="pt")
    inputs['labels'] = targets['input_ids']
   
    return {k: v.to(device) for k, v in inputs.items()}


processed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=4 if device.type == "cpu" else 1,
)

Map:   0%|          | 0/1793 [00:00<?, ? examples/s]

Map:   0%|          | 0/451 [00:00<?, ? examples/s]

In [10]:
training_args = TrainingArguments(
    output_dir="./vqa_blip_rad_finetuned",
    learning_rate=5e-5,
    per_device_train_batch_size=8 if device.type == "cuda" else 4,  
    per_device_eval_batch_size=8 if device.type == "cuda" else 4,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=device.type == "cuda",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    tokenizer=processor,
)


trainer.train()



  0%|          | 0/675 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


KeyboardInterrupt: 

In [None]:
test_example = dataset['test'][2]
test_image = test_example['image']
test_question = test_example['question']


inputs = processor(test_image, test_question, return_tensors="pt").to(device)
with torch.no_grad():
    out = model.generate(**inputs)
answer = processor.decode(out[0], skip_special_tokens=True)


print(f"Question: {test_question}")
print(f"Answer: {answer}")
print(f"Ground Truth: {test_example['answer']}")

In [None]:
model.save_pretrained("./vqa_blip_rad_finetuned")
processor.save_pretrained("./vqa_blip_rad_finetuned")

### Text to speech

In [2]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ImportError: 
SpeechT5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
