In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [2]:
from transformers import pipeline
from huggingface_hub import login
from diffusers import StableDiffusionPipeline
import gradio as gr
import torch

# Log in to Hugging Face (replace with your token)
login("")

# Load Hugging Face models
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base")

# Load Stable Diffusion model using diffusers
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda" if torch.cuda.is_available() else "cpu")

# Speech-to-text function
def transcribe_audio(audio_file):
    try:
        result = speech_to_text(audio_file)
        transcription = result["text"]
        return transcription
    except Exception as e:
        return f"Error in transcription: {str(e)}"

# Text-to-image function
def generate_image_from_text(text):
    try:
        image = text_to_image(text).images[0]  # Generate one image
        return image
    except Exception as e:
        return f"Error in image generation: {str(e)}"

# Combined processing function
def process_audio_and_generate_image(audio_file):
    transcription = transcribe_audio(audio_file)
    if "Error" in transcription:
        return None, transcription

    image = generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image

    return image, transcription

# Gradio interface
iface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Textbox(label="Transcription")
    ],
    title="Speech-to-Text and Image Generation",
    description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
)

# Launch the interface
iface.launch(share=True)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fa9b6da076b4ade1dc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


