In [1]:
!pip install -q transformers torch datasets soundfile torchaudio pillow scipy gradio


In [4]:
from transformers import AutoModel, AutoConfig

# List of 10 core Transformer architectures
core_model_names = [
    "bert-base-uncased", "gpt2", "t5-small", "roberta-base",
    "distilbert-base-uncased", "google/vit-base-patch16-224",
    "facebook/bart-base", "google/electra-small-discriminator",
    "albert-base-v2", "microsoft/deberta-v3-small"
]

print("--- Verifying Core Architectures ---")
for name in core_model_names:
    try:
        # Loading config to save memory, but verifies the model is accessible
        config = AutoConfig.from_pretrained(name)
        print(f"‚úÖ Architecture Verified: {name}")
    except Exception as e:
        print(f"‚ùå Error loading {name}: {e}")

--- Verifying Core Architectures ---
‚úÖ Architecture Verified: bert-base-uncased
‚úÖ Architecture Verified: gpt2
‚úÖ Architecture Verified: t5-small
‚úÖ Architecture Verified: roberta-base
‚úÖ Architecture Verified: distilbert-base-uncased
‚úÖ Architecture Verified: google/vit-base-patch16-224
‚úÖ Architecture Verified: facebook/bart-base
‚úÖ Architecture Verified: google/electra-small-discriminator
‚úÖ Architecture Verified: albert-base-v2
‚úÖ Architecture Verified: microsoft/deberta-v3-small


In [7]:
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")


print("Loading Speech Model (Text-to-Audio)...")
tts_pipe = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)

print("Loading Music Model (Music Generation)...")
music_pipe = pipeline("text-to-audio", model="facebook/musicgen-small", device=device)

print("Loading VQA Model (Questioning Images)...")
vqa_pipe = pipeline("visual-question-answering", model="dandelin/vilt-b32-mlm", device=device)

print("Loading Captioning Model (Image-to-Text)...")
caption_pipe = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

print("Loading Audio Classifier...")
audio_classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593", device=device)



Using device: GPU
Loading Speech Model (Text-to-Audio)...


Device set to use cuda:0


Loading Music Model (Music Generation)...


Device set to use cuda:0


Loading VQA Model (Questioning Images)...


Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-mlm and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.1.bias', 'classifier.1.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Loading Captioning Model (Image-to-Text)...


Device set to use cuda:0


Loading Audio Classifier...


Device set to use cuda:0


In [6]:
import gradio as gr
import numpy as np
import scipy.io.wavfile

# --- Logic Functions ---

def process_speech(text):
    out = tts_pipe(text)
    return (out["sampling_rate"], out["audio"].T)

def process_music(prompt):
    # Generates a short clip for testing
    out = music_pipe(prompt, forward_params={"max_new_tokens": 256})
    return (out["sampling_rate"], out["audio"].T)

def process_vision(img, question):
    # Task 1: Generate a description
    caption = caption_pipe(img)[0]['generated_text']
    # Task 2: Answer specific question
    answer = "N/A"
    if question:
        res = vqa_pipe(img, question=question, top_k=1)
        answer = res[0]['answer']
    return caption, answer

def process_classify(audio_path):
    if audio_path is None: return "Please record/upload audio"
    label = audio_classifier(audio_path)
    return {l['label']: l['score'] for l in label}

# --- Build the UI ---

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ü§ñ 15-Model AI Playground")

    with gr.Tab("üéôÔ∏è Speech (TTS)"):
        t_in = gr.Textbox(label="Enter Text", value="Transformers are powerful models.")
        s_out = gr.Audio(label="AI Voice")
        gr.Button("Generate").click(process_speech, t_in, s_out)

    with gr.Tab("üéµ Music Gen"):
        m_in = gr.Textbox(label="Music Style", placeholder="Lo-fi hip hop with piano")
        m_out = gr.Audio(label="AI Music")
        gr.Button("Compose").click(process_music, m_in, m_out)

    with gr.Tab("üñºÔ∏è Vision (VQA/Caption)"):
        with gr.Row():
            i_in = gr.Image(type="pil", label="Upload Image")
            with gr.Column():
                q_in = gr.Textbox(label="Ask about the image")
                c_out = gr.Textbox(label="AI Caption")
                a_out = gr.Textbox(label="Answer")
        gr.Button("Analyze").click(process_vision, [i_in, q_in], [c_out, a_out])

    with gr.Tab("üéß Audio Classifier"):
        aud_in = gr.Audio(type="filepath", label="Record Audio")
        lab_out = gr.Label(label="Detected Sounds")
        gr.Button("Identify").click(process_classify, aud_in, lab_out)

# Launch with a shareable link
demo.launch(share=True, debug=True)

  with gr.Blocks(theme=gr.themes.Soft()) as demo:


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://454e85133a0649002c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://454e85133a0649002c.gradio.live


