In [36]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

class ImageCaptioner:
    """Generate a text caption for an image with a BLIP captioning checkpoint.

    The processor and model are loaded once in the constructor, so a single
    instance can be reused to caption many images.
    """

    # Checkpoint used when the caller does not ask for a different one.
    DEFAULT_MODEL_NAME = "Salesforce/blip-image-captioning-large"

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """Load the processor and the model for ``model_name``.

        Args:
            model_name: Checkpoint identifier handed to ``from_pretrained`` for
                both the processor and the model, so the two always match.
        """
        self.model_name = model_name
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)

    def caption_image(self, img_path: str) -> str:
        """Return an unconditional caption for the image stored at ``img_path``.

        Args:
            img_path: Path of the image file to caption.

        Returns:
            The decoded caption with special tokens removed.
        """
        # The context manager closes the underlying file as soon as the RGB
        # copy exists, instead of leaving that to garbage collection.
        with Image.open(img_path) as source_image:
            raw_image = source_image.convert('RGB')

        inputs = self.processor(raw_image, return_tensors="pt")
        generated_ids = self.model.generate(**inputs)

        # Only one image was passed in, so only the first sequence is decoded.
        return self.processor.decode(generated_ids[0], skip_special_tokens=True)


In [37]:
!pip install groq



In [38]:
from groq import Groq

In [39]:
class TranslatorBot:
    """English-to-Arabic translator backed by a Groq chat-completion model."""

    # Model id used when the caller does not choose another one.
    DEFAULT_MODEL = "llama3-70b-8192"

    def __init__(self, api_key, model: str = DEFAULT_MODEL):
        """Create the Groq client and store the translation instructions.

        Args:
            api_key: Groq API key forwarded to the ``Groq`` client.
            model: Identifier of the chat model used for every translation.
        """
        self.client = Groq(api_key=api_key)
        self.model = model
        # System message sent ahead of every user message.
        self.instructions = {
            "role": "system",
            "content": "You are a specialized translator. Your task is to convert any given sentence or word from English into Arabic only. You must provide clear and accurate translations in Arabic, ensuring that words are properly spaced and the sentences are easy to understand. No other languages are allowed. Focus on clarity and avoid merging words together."
        }

    def translate_to_arabic(self, text: str) -> str:
        """Translate ``text`` to Arabic and return the model's reply.

        Args:
            text: The text to translate.

        Returns:
            The reply of the first completion choice with surrounding
            whitespace removed, or ``""`` when ``text`` is empty or blank
            (no request is sent in that case).
        """
        # With nothing to translate, a chat model would answer with filler
        # text rather than a translation, so skip the request entirely.
        if not text or not text.strip():
            return ""

        chat_completion = self.client.chat.completions.create(
            messages=[
                self.instructions,
                {
                    "role": "user",
                    "content": text,
                }
            ],
            model=self.model,
        )
        reply = chat_completion.choices[0].message.content
        # Guard against a missing reply so callers always receive a string.
        return (reply or "").strip()

In [40]:
!pip install gTTS



In [41]:
from gtts import gTTS
from IPython.display import Audio
import io

In [42]:
def process_image_to_speech(img_path: str, api_key: str, captioner=None, translator=None):
    """Caption an image, translate the caption to Arabic and return it as audio.

    Args:
        img_path: Path of the image to describe.
        api_key: Groq API key; only used when ``translator`` is not supplied.
        captioner: Optional object with a ``caption_image(img_path)`` method.
            Pass an existing ``ImageCaptioner`` to avoid reloading the model on
            every call. A new one is created when omitted.
        translator: Optional object with a ``translate_to_arabic(text)``
            method. A new ``TranslatorBot(api_key)`` is created when omitted.

    Returns:
        An ``IPython.display.Audio`` object, set to autoplay, holding the
        synthesized Arabic speech.

    Raises:
        ValueError: If the translation step produced no text to speak.
    """
    # Build the cheap client first so a bad key fails before the model loads.
    if translator is None:
        translator = TranslatorBot(api_key)
    if captioner is None:
        captioner = ImageCaptioner()

    caption = captioner.caption_image(img_path)
    arabic_translation = translator.translate_to_arabic(caption)

    # Fail with an explicit message instead of letting the TTS step reject
    # empty text on its own.
    if not arabic_translation or not arabic_translation.strip():
        raise ValueError(
            f"No Arabic text was produced for {img_path!r} (caption: {caption!r})"
        )

    tts = gTTS(text=arabic_translation, lang='ar')

    audio_buffer = io.BytesIO()
    tts.write_to_fp(audio_buffer)

    # getvalue() returns the whole buffer regardless of the stream position.
    return Audio(audio_buffer.getvalue(), autoplay=True)



In [45]:
import os
from getpass import getpass

img_path = '/content/image3.jpg'

# Secrets must never be stored in the notebook: read the key from the
# environment and fall back to an interactive prompt that does not echo it.
# NOTE: a key that was ever committed in plain text should be treated as
# compromised and revoked in the Groq console.
api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

process_image_to_speech(img_path, api_key)

