# Install Required packages

In [1]:
pip install gtts



In [2]:
pip install gradio



# Import Libraries

In [3]:
import torch
import requests
from PIL import Image
from gtts import gTTS
import gradio as gr
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import BlipProcessor, BlipForConditionalGeneration , BlipForQuestionAnswering

# Create APP Component Classes

In [4]:
class TranslateText:
    """Arabic <-> English translator backed by the mBART-50 many-to-many checkpoint."""

    # Checkpoint used for both the model and its tokenizer.
    MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
    # mBART-50 language codes of the two supported languages.
    ARABIC = "ar_AR"
    ENGLISH = "en_XX"

    def __init__(self):
        # Initialize model and tokenizer
        self.model = MBartForConditionalGeneration.from_pretrained(self.MODEL_NAME)
        self.tokenizer = MBart50TokenizerFast.from_pretrained(self.MODEL_NAME)

    def _translate(self, text, src_lang, tgt_lang):
        """Translate ``text`` from ``src_lang`` to ``tgt_lang`` (mBART-50 language codes)."""
        # The tokenizer must know the source language before it encodes the text.
        self.tokenizer.src_lang = src_lang
        encoded = self.tokenizer(text, return_tensors="pt")
        generated_tokens = self.model.generate(
            **encoded,
            # The forced first token is the target-language code.
            forced_bos_token_id=self.tokenizer.lang_code_to_id[tgt_lang],
        )
        return self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    def get_translation(self, text, ar_en=False, en_ar=False):
        """Translate ``text`` between Arabic and English.

        Args:
            text: The text to translate.
            ar_en: When true, translate Arabic -> English.
            en_ar: Accepted so call sites can state the direction explicitly.
                English -> Arabic is the default whenever ``ar_en`` is false,
                so this flag does not change the result.

        Returns:
            The list produced by the tokenizer's ``batch_decode`` (one decoded
            string per generated sequence).
        """
        if ar_en:
            return self._translate(text, self.ARABIC, self.ENGLISH)
        return self._translate(text, self.ENGLISH, self.ARABIC)

In [5]:
class ImageCaption:
    """BLIP-based image captioning and visual question answering (VQA)."""

    def __init__(self):
        """Load the BLIP processor/model pairs for captioning and for VQA."""
        # Captioning checkpoint
        self.caption_processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-large"
        )
        self.caption_model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large"
        )
        # Visual question answering checkpoint
        self.vqa_processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-vqa-base"
        )
        self.vqa_model = BlipForQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-base"
        )

    def get_caption(self, image):
        """Return a caption for ``image``, generated from the text prompt "a photo of"."""
        prompt = "a photo of"
        model_inputs = self.caption_processor(image, prompt, return_tensors="pt")
        token_ids = self.caption_model.generate(**model_inputs)
        # Only the first generated sequence is decoded.
        return self.caption_processor.decode(token_ids[0], skip_special_tokens=True)

    def ask_question(self, image, question):
        """Return the VQA model's answer to ``question`` about ``image``."""
        model_inputs = self.vqa_processor(image, question, return_tensors="pt")
        token_ids = self.vqa_model.generate(**model_inputs)
        # Only the first generated sequence is decoded.
        return self.vqa_processor.decode(token_ids[0], skip_special_tokens=True)

In [6]:
class TextToAudio:
    """Text-to-speech helper built on gTTS."""

    def get_audio(self, text, output_file_name="output.mp3", lang="ar"):
        """Synthesize ``text`` to an MP3 file.

        Args:
            text: The text to speak.
            output_file_name: Path the MP3 is written to. An existing file at
                that path is overwritten.
            lang: Language code passed to gTTS. Defaults to Arabic ("ar").

        Returns:
            ``output_file_name`` on success, or ``None`` when ``text`` is empty
            or speech generation fails (the error is printed, not raised).
        """
        # Nothing to speak: skip the gTTS call entirely.
        if not text or not text.strip():
            print("Error generating audio: no text to speak")
            return None

        try:
            # Generate the speech using gTTS
            tts = gTTS(text=text, lang=lang)

            # Save the audio to an mp3 file
            tts.save(output_file_name)

            # Return the file path
            return output_file_name

        except Exception as e:
            # Best effort: callers treat None as "no audio available".
            print(f"Error generating audio: {e}")
            return None

In [7]:
class QuestionGenerator:
    """Suggests short questions about an image caption using Phi-3.5-mini-instruct."""

    # Checkpoint used for both the model and its tokenizer.
    MODEL_NAME = "microsoft/Phi-3.5-mini-instruct"

    def __init__(self):
        # Set seed to 0
        torch.random.manual_seed(0)
        # Load model. Fall back to CPU so construction does not fail on
        # machines without a CUDA device.
        # NOTE(review): trust_remote_code=True allows code shipped in the model
        # repository to run locally; keep only if that repository is trusted.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME,
            device_map="cuda" if torch.cuda.is_available() else "cpu",
            torch_dtype="auto",
            trust_remote_code=True,
        )
        # Load Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        # Text-generation pipeline, built on first use and then reused.
        self._pipe = None

    def _get_pipeline(self):
        """Return the text-generation pipeline, building it on first use."""
        pipe = getattr(self, "_pipe", None)
        if pipe is None:
            pipe = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
            )
            self._pipe = pipe
        return pipe

    @staticmethod
    def _build_messages(caption):
        """Build the chat prompt: system instructions, two worked examples, then ``caption``."""
        return [
            {"role": "system", "content": "You are a model designed to generate thoughtful and precise questions based on image captions. Your task is to generate two questions related to the image described by the caption.The questions should ask about very simple, specific details that a question-answering model can respond to in one or two words. Keep these two questions short and straightforward to avoid confusion.Output only the questions."},
            {"role": "user", "content": "Here is an image caption: 'A person riding a red bicycle in a busy city street.'"},
            {"role": "assistant", "content": 'What color is the bicycle? \n Is the person wearing a helmet?\n'
            },
            {"role": "user", "content": "Here is an image caption: 'A dog playing in a garden with a ball.'"},
            {"role": "assistant", "content": 'What color is the dog?\n Are there any trees in the garden?\n'
            },
            {"role": "user", "content": f"Here is an image caption: '{caption}'"},
        ]

    def suggest_questions(self, caption):
        """Generate questions about the image described by ``caption``.

        Args:
            caption: English caption of the image.

        Returns:
            A list of question strings, one per non-blank line of the model
            output, with surrounding whitespace removed.
        """
        messages = self._build_messages(caption)
        # Generation arguments: sampling is disabled and only the newly
        # generated text (not the prompt) is returned.
        generation_args = {
            "max_new_tokens": 500,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
        }
        # Generate the output
        output = self._get_pipeline()(messages, **generation_args)
        # The model writes one question per line; drop blank lines and padding
        # so callers never receive empty "questions".
        stripped = (line.strip() for line in output[0]['generated_text'].split('\n'))
        return [question for question in stripped if question]

# Create APP Class

In [8]:
class APP:
    """Ties together captioning, translation, question suggestion, VQA and text-to-speech."""

    def __init__(self):
        # Initialize all the required components
        self.translator = TranslateText()
        self.caption_model = ImageCaption()
        self.question_generator = QuestionGenerator()
        self.tts = TextToAudio()

    def generate_caption(self, image):
        """Caption ``image`` and prepare Arabic text, audio and suggested questions.

        Args:
            image: The image to describe (passed to the captioning model).

        Returns:
            A tuple ``(translated_caption, speech, translated_questions)``:
            the Arabic caption string, the value returned by the text-to-speech
            component for that caption, and a list holding one translator
            result (itself a list) per suggested question.
        """
        # Step 1: Generate image caption
        caption = self.caption_model.get_caption(image)

        # Step 2: Translate caption to Arabic
        translated_caption = self.translator.get_translation(caption, en_ar=True)

        # Step 3: Generate Arabic speech from caption
        speech = self.tts.get_audio(translated_caption[0])

        # Step 4: Generate questions from caption
        suggested_questions = self.question_generator.suggest_questions(caption)

        # Step 5: Translate suggested questions to Arabic. Blank entries (for
        # example from empty lines in the generator output) are skipped so
        # they are not translated as if they were questions.
        translated_questions = [
            self.translator.get_translation(question.strip(), en_ar=True)
            for question in suggested_questions
            if question and question.strip()
        ]

        return translated_caption[0], speech, translated_questions

    def answer_question(self, image, question):
        """Answer an Arabic ``question`` about ``image`` and return the answer in Arabic."""
        # Step 1: Translate the question from Arabic to English. The translator
        # returns a list, so take its single string.
        question_en = self.translator.get_translation(question, ar_en=True)[0]

        # Step 2: Answer the question based on the image
        answer = self.caption_model.ask_question(image, question_en)

        # Step 3: Translate the answer to Arabic
        translated_answer = self.translator.get_translation(answer, en_ar=True)
        return translated_answer[0]


In [9]:
# Instantiate the app. APP.__init__ constructs the translator, the captioning/VQA
# component, the question generator and the text-to-speech helper.
app = APP()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Create Gradio interface

In [11]:
import gradio as gr

# Function to generate the caption, speech, and questions
def generate_caption(image, num_questions=2):
    """Gradio callback: caption ``image`` and return values for the output components.

    Args:
        image: The uploaded image.
        num_questions: Number of suggested-question values to return. The
            default of 2 matches the two suggested-question textboxes bound to
            this callback.

    Returns:
        A tuple ``(caption, audio, question_1, ..., question_N)`` with exactly
        ``2 + num_questions`` items. Missing questions are padded with empty
        strings. On failure the same number of items is returned, filled with
        error placeholders and ``None`` for the audio.
    """
    try:
        # Get the translated caption, speech, and suggested questions
        translated_caption, speech, suggested_questions = app.generate_caption(image)

        # Each suggested question arrives wrapped in a list; unwrap it and keep
        # only as many as there are output slots.
        questions = [q[0] for q in suggested_questions if q][:num_questions]

        # Pad with empty strings so every output component receives a value
        questions += [""] * (num_questions - len(questions))

        # Return caption text, audio file, and each question separately
        return (translated_caption, speech, *questions)

    except Exception as e:
        # Report the failure instead of discarding it, then return placeholders
        print(f"Error generating caption: {e}")
        return (
            "Error generating caption.",
            None,
            *(f"Error: Couldn't generate question {i}" for i in range(1, num_questions + 1)),
        )

# Function to answer the user question
def answer_question(image, user_question):
    """Gradio callback: return the app's answer, or an "Error: ..." string on failure."""
    try:
        return app.answer_question(image, user_question)
    except Exception as err:
        return "Error: " + str(err)

# Function to populate the user question textbox
def set_user_question(question):
    """Return ``question`` unchanged so a click handler can copy it into another component."""
    return question

# Create the Gradio interface.
# Layout, top to bottom: image upload, "Generate Caption" button, caption text
# and audio, two suggested questions (each with a button that copies it into the
# question box), the user's question, "Get Answer" button, and the answer.
with gr.Blocks() as demo:
    gr.Markdown("<h1 align='center'>Image Captioning and Question Answering App (Arabic)</h1>")

    # Image input; type="pil" requests the image as a PIL image
    with gr.Row():
        image_input = gr.Image(label="Upload an Image", type="pil")

    with gr.Row():
        generate_btn = gr.Button("Generate Caption")

    # Caption text and its spoken version
    with gr.Row():
        caption_output = gr.Textbox(label="Caption", lines=2)
        audio_output = gr.Audio(label="Caption Audio")

    # Create buttons for each question
    with gr.Row():
        question_btn1 = gr.Button("Suggested Question 1")
        question_output1 = gr.Textbox(label="Suggested Question", lines=1)
        question_btn2 = gr.Button("Suggested Question 2")
        question_output2 = gr.Textbox(label="Suggested Question", lines=1)


    # Free-form question typed (or copied from a suggestion) by the user
    with gr.Row():
        user_question = gr.Textbox(label="Ask a Question (in Arabic)")

    with gr.Row():
        answer_btn = gr.Button("Get Answer")

    with gr.Row():
        answer_output = gr.Textbox(label="Answer (Arabic)", lines=2)

    # Function bindings: the values returned by generate_caption are assigned
    # to these four components in order.
    generate_btn.click(
        generate_caption,
        inputs=[image_input],
        outputs=[caption_output, audio_output, question_output1, question_output2]
    )

    # Set user question when a button is clicked
    question_btn1.click(set_user_question, inputs=question_output1, outputs=user_question)
    question_btn2.click(set_user_question, inputs=question_output2, outputs=user_question)


    # Answer the user's question about the currently uploaded image
    answer_btn.click(answer_question, inputs=[image_input, user_question], outputs=[answer_output])

# Launch the interface
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e747f3ba9980b4fb49.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


