In [None]:
# Install necessary packages
!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q pyttsx3 webrtcvad

import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import numpy as np
import re
from PIL import Image
import pyttsx3
import webrtcvad
import wave
import os

# Setup for quantization: ask bitsandbytes to load the weights in 4-bit and
# to use float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Initialize the image-to-text pipeline. This runs when the cell executes;
# the quantization config is forwarded to the model loader via model_kwargs.
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})

# Define functions
def img2txt(input_text, input_image):
    """Ask the image-to-text pipeline about an image and return its answer.

    Args:
        input_text: The user's question. Any value that is not a non-blank
            string (for example the tuple ``transcribe`` returns on failure)
            selects a generic "describe the image" prompt instead.
        input_image: Path or file object accepted by ``PIL.Image.open``.

    Returns:
        The text generated after the ``ASSISTANT:`` marker, or a fallback
        message when nothing usable was generated.
    """
    if isinstance(input_text, str) and input_text.strip():
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
        """ + input_text
    else:
        prompt_instructions = """
        Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?,
        Describe me the image , What is this image about ? , Can you explain me the image
        """

    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    # Keep the image file open only for the duration of the pipeline call.
    with Image.open(input_image) as image:
        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # An empty/None result has no first element to inspect.
    generated_text = outputs[0]["generated_text"] if outputs else ""
    if not generated_text:
        return "No response generated."

    # re.DOTALL lets '.' span newlines so multi-line answers are not cut off
    # at the end of their first line.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return "No response found."

# Loaded Whisper models keyed by name, so repeated requests reuse one instance.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(name="base"):
    """Return the Whisper model called *name*, loading it only on first use."""
    if name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[name] = whisper.load_model(name)
    return _WHISPER_MODEL_CACHE[name]


def transcribe(audio_path):
    """Transcribe a recording to English text with Whisper.

    Args:
        audio_path: Path of the audio file; ``None`` or ``''`` means that no
            recording was supplied.

    Returns:
        The decoded text (``str``) on success. When there is no audio, or
        loading/decoding fails, the tuple ``('', '', None)`` is returned
        instead, so callers have to check the type of the result.
    """
    if not audio_path:
        return '', '', None

    # Reuse the model between calls instead of reloading it per request.
    model = _get_whisper_model("base")

    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
    except Exception as e:
        print(f"Error loading or processing audio file: {e}")
        return '', '', None

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # The language is fixed to English below, so no language-detection pass
    # is run: its result would never be used.
    options = whisper.DecodingOptions(language='en', fp16=False)

    try:
        result = whisper.decode(model, mel, options)
    except Exception as e:
        print(f"Error decoding the audio: {e}")
        return '', '', None

    return result.text

def process_audio(input_audio_path, output_audio_path, gender, pitch, speed):
    """Copy a WAV file, keeping only the frames WebRTC VAD classifies as speech.

    Args:
        input_audio_path: WAV file to read; must be 16 kHz, mono, 16-bit PCM.
        output_audio_path: Destination WAV file for the retained frames.
        gender: Unused; accepted so existing call sites keep working.
        pitch: Unused; accepted so existing call sites keep working.
        speed: Unused; accepted so existing call sites keep working.

    Raises:
        ValueError: If the input is not 16 kHz, mono, 16-bit audio.
    """
    # Voice activity detection setup
    vad = webrtcvad.Vad(1)  # Mode 1 is less aggressive VAD

    # Read input audio
    with wave.open(input_audio_path, 'rb') as wf:
        sample_rate = wf.getframerate()
        audio_channels = wf.getnchannels()
        audio_sample_width = wf.getsampwidth()
        audio_data = wf.readframes(wf.getnframes())

    # Explicit raises instead of assert: assertions are stripped under -O.
    if sample_rate != 16000:
        raise ValueError("Audio sampling rate must be 16 kHz")
    if audio_channels != 1:
        raise ValueError("Audio must be mono")
    if audio_sample_width != 2:
        raise ValueError("Audio must be 16-bit PCM")

    frame_duration = 30  # ms
    frame_bytes = int(sample_rate * frame_duration / 1000) * audio_sample_width

    # Only whole frames are classified: the VAD rejects a frame whose length
    # is not exactly 10/20/30 ms, so a trailing partial frame is dropped.
    speech_frames = []
    for start in range(0, len(audio_data) - frame_bytes + 1, frame_bytes):
        frame = audio_data[start:start + frame_bytes]
        if vad.is_speech(frame, sample_rate):
            speech_frames.append(frame)

    # Save cleaned audio
    with wave.open(output_audio_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(audio_sample_width)
        wf.setframerate(16000)
        wf.writeframes(b''.join(speech_frames))

def text_to_speech(text, file_path, gender='male', pitch=1.0, speed=1.0):
    """Render *text* to an audio file with pyttsx3.

    Args:
        text: The text to speak.
        file_path: Where the synthesized audio is saved.
        gender: ``'male'`` picks the first installed voice, anything else the
            second. NOTE(review): this assumes the driver lists a male voice
            first and a female voice second -- confirm on the target system.
        pitch: Unused; pitch is not adjusted by this function.
        speed: Multiplier applied to a base rate of 200.
    """
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')

    preferred_index = 0 if gender == 'male' else 1
    if voices:
        # Fall back to the last available voice rather than raising
        # IndexError when fewer voices are installed than expected.
        voice = voices[min(preferred_index, len(voices) - 1)].id
        engine.setProperty('voice', voice)

    engine.setProperty('rate', int(200 * speed))  # Speed control
    engine.save_to_file(text, file_path)
    engine.runAndWait()

# Create the Gradio interface
def process_inputs(audio_path, image_path, voice_gender, text_prompt, pitch, speed):
    """Answer a typed or spoken question about an image and speak the reply.

    Args:
        audio_path: Path of the recorded question, or ``None``.
        image_path: Path of the uploaded image, or ``None``.
        voice_gender: Voice selection forwarded to ``text_to_speech``.
        text_prompt: Typed question; when non-empty it is used instead of the
            recording.
        pitch: Forwarded to ``process_audio`` and ``text_to_speech``.
        speed: Forwarded to ``process_audio`` and ``text_to_speech``.

    Returns:
        A ``(transcription, answer_text, answer_audio_path)`` tuple matching
        the three output components of the interface.
    """
    transcription = ""
    if text_prompt:
        prompt = text_prompt
    else:
        prompt = transcribe(audio_path)
        # transcribe() returns a tuple instead of text when it fails; only a
        # real string can be shown in the "Speech to Text" box.
        if isinstance(prompt, str):
            transcription = prompt

    if image_path:
        chatgpt_output = img2txt(prompt, image_path)
    else:
        chatgpt_output = "No image provided."

    # VAD cleanup needs a recording. Its output file is not consumed by the
    # steps below, so a failure here must not abort the response.
    if audio_path:
        temp_audio_path = "Temp.wav"
        try:
            process_audio(audio_path, temp_audio_path, voice_gender, pitch, speed)
        except Exception as e:
            print(f"Skipping voice activity detection: {e}")

    processed_audio_path = "Temp.mp3"
    text_to_speech(chatgpt_output, processed_audio_path, gender=voice_gender, pitch=pitch, speed=speed)

    return transcription, chatgpt_output, processed_audio_path

# Define the Gradio interface
# Input components are listed in the same order as the parameters of
# process_inputs(audio_path, image_path, voice_gender, text_prompt, pitch,
# speed), because their values are passed to the function positionally.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
        gr.Dropdown(choices=["male", "female"], value="male", label="Select Voice Gender"),
        gr.Textbox(label="Type your prompt or question", lines=2, placeholder="Type here..."),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, label="Pitch", value=1.0),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, label="Speed", value=1.0),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        # No initial value: the handler returns the audio file path per call.
        gr.Audio(type="filepath", label="Generated Audio"),
    ],
    title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
    description="Upload an image and interact via voice input or type a prompt. Receive audio responses with customizable voice features."
)

# Launch the interface
iface.launch(debug=True)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m60.0 MB/s[0m eta [36m0:00

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://f1f6cf993e7b1ca4d6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 321, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1935, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1520, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 8

Keyboard interruption in main thread... closing server.


KeyboardInterrupt: 

In [None]:
# Install necessary packages
!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q gTTS

import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import numpy as np
import re
from PIL import Image
from gtts import gTTS
import datetime
import os

# Setup for quantization: ask bitsandbytes to load the weights in 4-bit and
# to use float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Initialize the image-to-text pipeline. This runs when the cell executes;
# the quantization config is forwarded to the model loader via model_kwargs.
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})

# Define functions
def img2txt(input_text, input_image):
    """Ask the image-to-text pipeline about an image and return its answer.

    Args:
        input_text: The user's question. Any value that is not a non-blank
            string (for example the tuple ``transcribe`` returns on failure)
            selects a generic "describe the image" prompt instead.
        input_image: Path or file object accepted by ``PIL.Image.open``.

    Returns:
        The text generated after the ``ASSISTANT:`` marker, or a fallback
        message when nothing usable was generated.
    """
    # Generate prompt
    if isinstance(input_text, str) and input_text.strip():
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
        """ + input_text
    else:
        prompt_instructions = """
        Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
        """

    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    # Keep the image file open only for the duration of the pipeline call.
    with Image.open(input_image) as image:
        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # An empty/None result has no first element to inspect.
    generated_text = outputs[0]["generated_text"] if outputs else ""
    if not generated_text:
        return "No response generated."

    # re.DOTALL lets '.' span newlines so multi-line answers are not cut off
    # at the end of their first line.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return "No response found."

# Loaded Whisper models keyed by name, so repeated requests reuse one instance.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(name="base"):
    """Return the Whisper model called *name*, loading it only on first use."""
    if name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[name] = whisper.load_model(name)
    return _WHISPER_MODEL_CACHE[name]


def transcribe(audio_path):
    """Transcribe a recording to English text with Whisper.

    Args:
        audio_path: Path of the audio file; ``None`` or ``''`` means that no
            recording was supplied.

    Returns:
        The decoded text (``str``) on success. When there is no audio, or
        loading/decoding fails, the tuple ``('', '', None)`` is returned
        instead, so callers have to check the type of the result.
    """
    if not audio_path:
        return '', '', None  # Empty strings and no audio file

    # Reuse the model between calls instead of reloading it per request.
    model = _get_whisper_model("base")

    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
    except Exception as e:
        print(f"Error loading or processing audio file: {e}")
        return '', '', None

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # The language is fixed to English below, so no language-detection pass
    # is run: its result would never be used.
    options = whisper.DecodingOptions(language='en', fp16=False)

    try:
        result = whisper.decode(model, mel, options)
    except Exception as e:
        print(f"Error decoding the audio: {e}")
        return '', '', None

    return result.text

def text_to_speech(text, file_path, gender='male'):
    """Synthesize *text* to an audio file with gTTS and return the file path.

    Args:
        text: The text to speak.
        file_path: Where the synthesized audio is saved.
        gender: Accepted for interface compatibility only. It has no effect:
            the gTTS call below takes no voice argument, so the previous
            per-gender branches (which all assigned ``None`` to an unused
            variable) were removed.

    Returns:
        ``file_path``, unchanged, for convenient chaining.
    """
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path

# Create the Gradio interface
def process_inputs(audio_path, image_path, voice_gender):
    """Transcribe a spoken question, answer it about the image, speak the reply.

    Args:
        audio_path: Path of the recorded question, or ``None``.
        image_path: Path of the uploaded image, or ``None``.
        voice_gender: Voice selection forwarded to ``text_to_speech``.

    Returns:
        A ``(transcription, answer_text, answer_audio_path)`` tuple matching
        the three output components of the interface.
    """
    speech_to_text_output = transcribe(audio_path)

    if image_path:
        # The raw result is forwarded: img2txt recognises the failure tuple
        # and falls back to its generic prompt.
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    processed_audio_path = text_to_speech(chatgpt_output, "Temp.mp3", gender=voice_gender)

    # transcribe() returns a tuple when it fails; the text box needs a string.
    transcription = speech_to_text_output if isinstance(speech_to_text_output, str) else ""

    return transcription, chatgpt_output, processed_audio_path

# Define the Gradio interface
# Input components are listed in the same order as the parameters of
# process_inputs(audio_path, image_path, voice_gender).
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
        gr.Dropdown(choices=["male", "female"], value="male", label="Select Voice Gender"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        # No initial value: "Temp.mp3" does not exist until the handler has
        # run once, and the handler returns the file path on every call.
        gr.Audio(type="filepath", label="Generated Audio"),
    ],
    title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True)


In [None]:
# Install necessary packages
!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q pyttsx3 webrtcvad

import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import numpy as np
import re
from PIL import Image
import pyttsx3
import webrtcvad
import wave
import os

# Setup for quantization: ask bitsandbytes to load the weights in 4-bit and
# to use float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Initialize the image-to-text pipeline. This runs when the cell executes;
# the quantization config is forwarded to the model loader via model_kwargs.
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})

# Define functions
def img2txt(input_text, input_image):
    """Ask the image-to-text pipeline about an image and return its answer.

    Args:
        input_text: The user's question. Any value that is not a non-blank
            string (a blank string, ``None``, or the tuple ``transcribe``
            returns on failure) selects a generic "describe the image"
            prompt instead of being concatenated into the prompt.
        input_image: Path or file object accepted by ``PIL.Image.open``.

    Returns:
        The text generated after the ``ASSISTANT:`` marker, or a fallback
        message when nothing usable was generated.
    """
    if isinstance(input_text, str) and input_text.strip():
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
        """ + input_text
    else:
        prompt_instructions = """
        Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?,
        Describe me the image , What is this image about ? , Can you explain me the image
        """

    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    # Keep the image file open only for the duration of the pipeline call.
    with Image.open(input_image) as image:
        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # An empty/None result has no first element to inspect.
    generated_text = outputs[0]["generated_text"] if outputs else ""
    if not generated_text:
        return "No response generated."

    # re.DOTALL lets '.' span newlines so multi-line answers are not cut off
    # at the end of their first line.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return "No response found."

# Loaded Whisper models keyed by name, so repeated requests reuse one instance.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(name="base"):
    """Return the Whisper model called *name*, loading it only on first use."""
    if name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[name] = whisper.load_model(name)
    return _WHISPER_MODEL_CACHE[name]


def transcribe(audio_path):
    """Transcribe a recording to English text with Whisper.

    Args:
        audio_path: Path of the audio file; ``None`` or ``''`` means that no
            recording was supplied.

    Returns:
        The decoded text (``str``) on success. When there is no audio, or
        loading/decoding fails, the tuple ``('', '', None)`` is returned
        instead, so callers have to check the type of the result.
    """
    if not audio_path:
        return '', '', None

    # Reuse the model between calls instead of reloading it per request.
    model = _get_whisper_model("base")

    try:
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
    except Exception as e:
        print(f"Error loading or processing audio file: {e}")
        return '', '', None

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # The language is fixed to English below, so no language-detection pass
    # is run: its result would never be used.
    options = whisper.DecodingOptions(language='en', fp16=False)

    try:
        result = whisper.decode(model, mel, options)
    except Exception as e:
        print(f"Error decoding the audio: {e}")
        return '', '', None

    return result.text

def process_audio(input_audio_path, output_audio_path, gender, pitch, speed):
    """Copy a WAV file, keeping only the frames WebRTC VAD classifies as speech.

    Args:
        input_audio_path: WAV file to read; must be 16 kHz, mono, 16-bit PCM.
        output_audio_path: Destination WAV file for the retained frames.
        gender: Unused; accepted so existing call sites keep working.
        pitch: Unused; accepted so existing call sites keep working.
        speed: Unused; accepted so existing call sites keep working.

    Raises:
        ValueError: If the input is not 16 kHz, mono, 16-bit audio.
    """
    # Voice activity detection setup
    vad = webrtcvad.Vad(1)  # Mode 1 is less aggressive VAD

    # Read input audio
    with wave.open(input_audio_path, 'rb') as wf:
        sample_rate = wf.getframerate()
        audio_channels = wf.getnchannels()
        audio_sample_width = wf.getsampwidth()
        audio_data = wf.readframes(wf.getnframes())

    # Explicit raises instead of assert: assertions are stripped under -O.
    if sample_rate != 16000:
        raise ValueError("Audio sampling rate must be 16 kHz")
    if audio_channels != 1:
        raise ValueError("Audio must be mono")
    if audio_sample_width != 2:
        raise ValueError("Audio must be 16-bit PCM")

    frame_duration = 30  # ms
    frame_bytes = int(sample_rate * frame_duration / 1000) * audio_sample_width

    # Only whole frames are classified: the VAD rejects a frame whose length
    # is not exactly 10/20/30 ms, so a trailing partial frame is dropped.
    speech_frames = []
    for start in range(0, len(audio_data) - frame_bytes + 1, frame_bytes):
        frame = audio_data[start:start + frame_bytes]
        if vad.is_speech(frame, sample_rate):
            speech_frames.append(frame)

    # Save cleaned audio
    with wave.open(output_audio_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(audio_sample_width)
        wf.setframerate(16000)
        wf.writeframes(b''.join(speech_frames))

def text_to_speech(text, file_path, gender='male', pitch=1.0, speed=1.0):
    """Render *text* to an audio file with pyttsx3.

    Args:
        text: The text to speak.
        file_path: Where the synthesized audio is saved.
        gender: ``'male'`` picks the first installed voice, anything else the
            second. NOTE(review): this assumes the driver lists a male voice
            first and a female voice second -- confirm on the target system.
        pitch: Unused; pitch is not adjusted by this function.
        speed: Multiplier applied to a base rate of 200.
    """
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')

    preferred_index = 0 if gender == 'male' else 1
    if voices:
        # Fall back to the last available voice rather than raising
        # IndexError when fewer voices are installed than expected.
        voice = voices[min(preferred_index, len(voices) - 1)].id
        engine.setProperty('voice', voice)

    engine.setProperty('rate', int(200 * speed))  # Speed control
    engine.save_to_file(text, file_path)
    engine.runAndWait()

# Create the Gradio interface
def process_inputs(audio_path, image_path, voice_gender, text_prompt, pitch, speed):
    """Answer a typed or spoken question about an image and speak the reply.

    Args:
        audio_path: Path of the recorded question, or ``None``.
        image_path: Path of the uploaded image, or ``None``.
        voice_gender: Voice selection forwarded to ``text_to_speech``.
        text_prompt: Typed question; when non-empty it is used instead of the
            recording.
        pitch: Forwarded to ``process_audio`` and ``text_to_speech``.
        speed: Forwarded to ``process_audio`` and ``text_to_speech``.

    Returns:
        A ``(transcription, answer_text, answer_audio_path)`` tuple matching
        the three output components of the interface.
    """
    transcription = ""
    if text_prompt:
        prompt = text_prompt
    else:
        prompt = transcribe(audio_path)
        # transcribe() returns a tuple instead of text when it fails. Replace
        # it with a blank string, which img2txt treats as "no question" and
        # answers with its generic description prompt.
        if not isinstance(prompt, str):
            prompt = ""
        transcription = prompt

    if image_path:
        chatgpt_output = img2txt(prompt, image_path)
    else:
        chatgpt_output = "No image provided."

    # VAD cleanup needs a recording. Its output file is not consumed by the
    # steps below, so a failure here must not abort the response.
    if audio_path:
        temp_audio_path = "Temp.wav"
        try:
            process_audio(audio_path, temp_audio_path, voice_gender, pitch, speed)
        except Exception as e:
            print(f"Skipping voice activity detection: {e}")

    processed_audio_path = "Temp.mp3"
    text_to_speech(chatgpt_output, processed_audio_path, gender=voice_gender, pitch=pitch, speed=speed)

    return transcription, chatgpt_output, processed_audio_path

# Define the Gradio interface
# Input components are listed in the same order as the parameters of
# process_inputs(audio_path, image_path, voice_gender, text_prompt, pitch,
# speed), because their values are passed to the function positionally.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
        gr.Dropdown(choices=["male", "female"], value="male", label="Select Voice Gender"),
        gr.Textbox(label="Type your prompt or question", lines=2, placeholder="Type here..."),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, label="Pitch", value=1.0),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, label="Speed", value=1.0),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(type="filepath", label="Generated Audio"),
    ],
    title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
    description="Upload an image and interact via voice input or type a prompt. Receive audio responses with customizable voice features."
)

# Launch the interface
iface.launch(debug=True)


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://617a6d320d29c357b4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 321, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1935, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1520, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 8