In [None]:
!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q gTTS

import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
from PIL import Image
import re
import numpy as np
from gtts import gTTS

# Vision-language model: LLaVA 1.5 (7B) loaded with 4-bit weights and
# float16 compute, wrapped in an image-to-text pipeline.
model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
)

# Speech model: Whisper "base", placed on the GPU when one is available.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")
model = whisper.load_model("base", device=DEVICE)

def img2txt(input_text, input_image):
    """Describe an image with the LLaVA pipeline, optionally guided by a prompt.

    Args:
        input_text: The user's question as a string. Anything that is not a
            non-blank string (for example the ``('', '', None)`` tuple that
            ``transcribe`` returns on failure) selects a generic
            "describe this image" prompt instead.
        input_image: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The text the model produced after the ``ASSISTANT:`` marker, or
        ``"No response found."`` when the marker is missing or nothing
        follows it.
    """
    if isinstance(input_text, str) and input_text.strip():
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
        """ + input_text
    else:
        # No usable question was supplied: fall back to a generic description.
        prompt_instructions = """
        Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?,
        What is in the image , Give me the information of the image
        """

    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    # Use a context manager so the underlying file handle is released once
    # the pipeline has consumed the image.
    with Image.open(input_image) as image:
        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    if not outputs:
        return "No response found."

    generated_text = outputs[0]["generated_text"] or ""
    # DOTALL lets '.' span line breaks; without it a multi-paragraph answer
    # would be cut off at the first newline.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    if match:
        answer = match.group(1).strip()
        if answer:
            return answer
    return "No response found."


!pip install openai-whisper

# working code

import whisper

# Loaded Whisper models keyed by name, so the weights are loaded once per
# process instead of on every transcription request.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(name="medium"):
    """Return the Whisper model called *name*, loading it on first use."""
    if name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[name] = whisper.load_model(name)
    return _WHISPER_MODEL_CACHE[name]


def transcribe(audio_path):
    """Transcribe an audio file to English text with Whisper.

    Args:
        audio_path: Path to the recording, or ``None`` / a blank string when
            nothing was recorded.

    Returns:
        The transcribed text (``str``) on success. On missing input or on a
        load/decode failure, the tuple ``('', '', None)`` is returned; callers
        in this file rely on that tuple to detect "no transcription".

    Note:
        The audio goes through ``whisper.pad_or_trim`` before decoding, so
        only a fixed-length window of the recording is transcribed.
    """
    # Nothing recorded: bail out before touching the model at all.
    if audio_path is None or (isinstance(audio_path, str) and not audio_path.strip()):
        return '', '', None  # Return empty strings and None audio file

    # Reuse the cached model rather than reloading the weights on every call.
    model = _get_whisper_model("medium")

    # Load and preprocess the audio file
    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
    except Exception as e:
        print(f"Error loading or processing audio file: {e}")
        return '', '', None

    # Create the log-mel spectrogram on the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Language detection is intentionally skipped: decoding is pinned to
    # English below, so a detection pass would only be discarded work.
    options = whisper.DecodingOptions(language='en', fp16=False)  # Adjust options as needed

    try:
        result = whisper.decode(model, mel, options)
    except Exception as e:
        print(f"Error decoding the audio: {e}")
        return '', '', None

    return result.text



# NOTE: do not run `pip install whisper` here. The PyPI distribution named
# `whisper` is an unrelated project, not OpenAI Whisper; the speech model used
# in this notebook comes from the `openai-whisper` install done earlier.


def text_to_speech(text, file_path):
    """Synthesize *text* as English speech with gTTS and save it to *file_path*.

    Returns:
        ``file_path``, unchanged, so the caller can hand it straight to an
        audio component.
    """
    gTTS(text=text, lang='en', slow=False).save(file_path)
    return file_path

def process_inputs(audio_path, image_path):
    """Run the full voice + image pipeline for one Gradio request.

    Args:
        audio_path: Path of the recorded question, or ``None``.
        image_path: Path of the uploaded image, or ``None``.

    Returns:
        A 3-tuple ``(transcript, answer, answer_audio_path)`` matching the
        three output components of the interface.
    """
    transcription = transcribe(audio_path)

    # transcribe() returns a ('', '', None) tuple when there is no usable
    # audio. Show an empty transcript in the textbox instead of that tuple.
    speech_to_text_output = transcription if isinstance(transcription, str) else ""

    if image_path:
        # Pass the raw transcription result through: img2txt uses a generic
        # prompt when it does not receive a usable question.
        chatgpt_output = img2txt(transcription, image_path)
    else:
        chatgpt_output = "No image provided."

    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")
    return speech_to_text_output, chatgpt_output, processed_audio_path

# Gradio UI: microphone recording + image in; transcript, model answer and
# spoken answer out.
iface = gr.Interface(
    title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
    description="Upload an image and interact via voice input and audio response.",
    fn=process_inputs,
    inputs=[
        gr.Audio(type="filepath", sources=["microphone"]),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(value="Temp3.mp3"),
    ],
)

# debug=True keeps the cell running so errors and logs stay visible.
iface.launch(debug=True)


  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Using torch 2.3.1+cu121 (cuda)
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://0beae8324feb5af2ca.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://0beae8324feb5af2ca.gradio.live


