In [None]:
!pip install gradio torch transformers pillow huggingface_hub bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.0


In [None]:
pip install transformers_stream_generator

Collecting transformers_stream_generator
  Downloading transformers-stream-generator-0.0.5.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: transformers_stream_generator
  Building wheel for transformers_stream_generator (setup.py) ... [?25l[?25hdone
  Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12426 sha256=f900a400446ae26a3db425452f8ddbe0eaf531c01856cdbf5998dd09488b14c6
  Stored in directory: /root/.cache/pip/wheels/23/e8/f0/b3c58c12d1ffe60bcc8c7d121115f26b2c1878653edfca48db
Successfully built transformers_stream_generator
Installing collected packages: transformers_stream_generator
Successfully installed transformers_stream_generator-0.0.5


In [None]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, BlipForConditionalGeneration
from PIL import Image

# Device setup: use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Code Generation Model (Lightweight) ---
# Tokenizer and causal LM are loaded from the same checkpoint name.
code_model_name = "deepseek-ai/DeepSeek-R1-distill-qwen-1.5b"
code_tokenizer = AutoTokenizer.from_pretrained(code_model_name)
code_model = AutoModelForCausalLM.from_pretrained(code_model_name)
code_model = code_model.to(device)

# --- Image-to-Text Model (BLIP) ---
# The processor prepares image inputs and decodes generated caption ids.
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = AutoProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name)
blip_model = blip_model.to(device)

# --- Code Generation Function ---
def generate_code(prompt, temperature, top_p, max_new_tokens):
    """Generate Python code for a natural-language task description.

    Args:
        prompt: Task description inserted into the template
            "Write a Python function to <prompt>:". None, empty, or
            whitespace-only values are rejected with a hint message.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        max_new_tokens: Upper bound on newly generated tokens; coerced to
            ``int`` because UI sliders may deliver a float.

    Returns:
        The decoded model output (the prompt followed by the completion),
        or a hint message when no usable prompt was supplied.
    """
    # Reject None / "" / whitespace-only prompts before touching the model.
    if prompt is None or not prompt.strip():
        return "Please provide a code prompt."

    full_prompt = f"Write a Python function to {prompt.strip()}:\n"
    inputs = code_tokenizer(full_prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = code_model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            top_p=top_p,
            do_sample=True
        )

    code = code_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return code

# --- Image Description Function ---
def generate_image_description(image, temperature, top_p, max_new_tokens):
    """Caption an image with the BLIP model.

    Args:
        image: Image to describe (the UI supplies a PIL image), or None.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        max_new_tokens: Upper bound on newly generated tokens; coerced to
            ``int`` because UI sliders may deliver a float.

    Returns:
        The decoded caption string, or a hint message when no image was given.
    """
    if image is None:
        return "Please upload an image."

    inputs = blip_processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = blip_model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            top_p=top_p,
            # Sampling must be enabled for temperature/top_p to have any
            # effect; this also matches generate_code.
            do_sample=True,
        )

    description = blip_processor.batch_decode(output, skip_special_tokens=True)[0]
    return description

# --- Gradio Interface ---
# Build the two-tab UI. Components are created inside nested context managers,
# so the order of statements below determines the on-screen layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Multimodal Gradio App (T4 Friendly)")

    # Generation controls shared by both tabs; each button passes them to its handler.
    with gr.Row():
        temperature = gr.Slider(0.1, 1.0, value=0.7, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top P")
        max_new_tokens = gr.Slider(10, 500, value=100, label="Max New Tokens")

    # Tab 1: upload an image and caption it via generate_image_description.
    with gr.Tab("🖼️ Image Description"):
        image_input = gr.Image(type="pil", label="Upload Image")
        description_output = gr.Textbox(label="Generated Description")
        describe_button = gr.Button("Generate Description")
        describe_button.click(
            fn=generate_image_description,
            inputs=[image_input, temperature, top_p, max_new_tokens],
            outputs=description_output
        )

    # Tab 2: describe a task in text and generate code via generate_code.
    with gr.Tab("💻 Code Generation"):
        code_prompt = gr.Textbox(label="Code Prompt (e.g., 'sort a list of numbers')")
        code_output = gr.Textbox(label="Generated Code", lines=12)
        code_button = gr.Button("Generate Code")
        code_button.click(
            fn=generate_code,
            inputs=[code_prompt, temperature, top_p, max_new_tokens],
            outputs=code_output
        )

# Launch app (starts serving the Blocks UI defined above)
demo.launch()


In [None]:
!pip install -q transformers accelerate gradio

import gradio as gr
import torch
from PIL import Image
from transformers import (
    AutoProcessor, AutoModelForCausalLM,
    AutoTokenizer
)

# Device: use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Load Qwen-VL-Chat for Image+Text ---
# trust_remote_code=True runs Python files shipped in the model repository.
# NOTE(review): no revision is pinned, so updated remote code is pulled on
# each fresh download (the recorded output warns about this) — consider pinning.
# NOTE(review): the recorded download log lists ten weight shards of roughly
# 1.7-2.0 GB each; confirm the model fits on the target GPU at default precision.
vl_model_name = "Qwen/Qwen-VL-Chat"
vl_processor = AutoProcessor.from_pretrained(vl_model_name, trust_remote_code=True)
vl_model = AutoModelForCausalLM.from_pretrained(vl_model_name, trust_remote_code=True)
vl_model = vl_model.to(device)
vl_model.eval()

# --- Load DeepSeek for Text-to-Text ---
# Tokenizer and causal LM come from the same checkpoint; the model is put in eval mode.
text_model_name = "deepseek-ai/DeepSeek-R1-distill-qwen-1.5b"
text_tokenizer = AutoTokenizer.from_pretrained(text_model_name, trust_remote_code=True)
text_model = AutoModelForCausalLM.from_pretrained(text_model_name, trust_remote_code=True)
text_model = text_model.to(device)
text_model.eval()

# --- Image + Question Inference ---
def qwen_vl_chat_infer(image, question, temperature=0.7, top_p=0.9, max_new_tokens=128):
    """Answer a question about an image with the vision-language model.

    Args:
        image: Image to ask about (the UI supplies a PIL image), or None.
        question: The user's question. None, empty, or whitespace-only
            values are rejected with a hint message.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        max_new_tokens: Upper bound on newly generated tokens; coerced to
            ``int`` because UI sliders may deliver a float.

    Returns:
        The decoded assistant answer with the prompt removed, or a hint
        message when the image or question is missing.
    """
    # `question` may be None as well as blank; check before calling .strip().
    if image is None or question is None or not question.strip():
        return "Please upload an image and enter a question."

    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n{question}\n<image>\n<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    # NOTE(review): confirm the processor loaded for this checkpoint accepts
    # `text=` / `images=` and returns an "input_ids" entry.
    inputs = vl_processor(text=prompt, images=image, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = vl_model.generate(
            **inputs,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=int(max_new_tokens)
        )

    # Keep only the newly generated tokens. Splitting the decoded text on the
    # "<|im_start|>assistant" marker is unreliable because decoding with
    # skip_special_tokens=True is expected to remove that marker, which would
    # leave the whole prompt in the answer.
    answer_ids = outputs[:, prompt_length:]
    response = vl_processor.batch_decode(answer_ids, skip_special_tokens=True)[0]
    return response.strip()

# --- Text-to-Text Inference ---
def generate_text(prompt, temperature=0.7, top_p=0.9, max_new_tokens=128):
    """Generate a sampled continuation of ``prompt`` with the text model.

    Args:
        prompt: Input text. None, empty, or whitespace-only values are
            rejected with a hint message.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        max_new_tokens: Upper bound on newly generated tokens; coerced to
            ``int`` because UI sliders may deliver a float.

    Returns:
        The decoded model output (the prompt followed by the continuation),
        or a hint message when no usable prompt was supplied.
    """
    # `prompt` may be None as well as blank; check before calling .strip().
    if prompt is None or not prompt.strip():
        return "Please enter a prompt."

    inputs = text_tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = text_model.generate(
            **inputs,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=int(max_new_tokens)
        )

    return text_tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- Gradio App ---
# Build the two-tab UI. Components are created inside nested context managers,
# so the order of statements below determines the on-screen layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Multimodal App: Qwen-VL + DeepSeek (T4-Friendly)")

    # Generation controls are used by BOTH tabs, so they sit above the tabs
    # rather than inside the image tab, where they would be out of sight
    # while the text tab is open.
    with gr.Row():
        temperature = gr.Slider(0.1, 1.0, value=0.7, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top P")
        # step=1 keeps the token budget a whole number.
        max_new_tokens = gr.Slider(10, 512, value=128, step=1, label="Max New Tokens")

    with gr.Tabs():
        # Tab 1: ask a question about an uploaded image (qwen_vl_chat_infer).
        with gr.Tab(" Image + Question"):
            with gr.Row():
                with gr.Column():
                    image_input = gr.Image(type="pil", label="Upload Image")
                    question_input = gr.Textbox(label="Ask a question about the image")
                    submit_btn = gr.Button("Generate Answer")
                with gr.Column():
                    output_text = gr.Textbox(label="Model Answer", lines=10)
            submit_btn.click(
                fn=qwen_vl_chat_infer,
                inputs=[image_input, question_input, temperature, top_p, max_new_tokens],
                outputs=output_text
            )

        # Tab 2: free-form text generation (generate_text).
        with gr.Tab(" Text-to-Text"):
            with gr.Row():
                with gr.Column():
                    text_prompt = gr.Textbox(label="Enter your prompt")
                    text_submit = gr.Button("Generate Text")
                with gr.Column():
                    text_output = gr.Textbox(label="Generated Text", lines=10)
            text_submit.click(
                fn=generate_text,
                inputs=[text_prompt, temperature, top_p, max_new_tokens],
                outputs=text_output
            )

# Start serving the Blocks UI defined above.
demo.launch()


visual.py:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-VL-Chat:
- visual.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


qwen_generation_utils.py:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-VL-Chat:
- qwen_generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-VL-Chat:
- visual.py
- qwen_generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/79.9k [00:00<?, ?B/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

pytorch_model-00003-of-00010.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00006-of-00010.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00001-of-00010.bin:   0%|          | 0.00/1.96G [00:00<?, ?B/s]

pytorch_model-00004-of-00010.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

pytorch_model-00008-of-00010.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00010.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00005-of-00010.bin:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

pytorch_model-00007-of-00010.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/83.4k [00:00<?, ?B/s]

pytorch_model-00009-of-00010.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

pytorch_model-00010-of-00010.bin:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

RuntimeError: Data processing error: CAS service error : Error : single flight error: Real call failed: ReqwestMiddlewareError(Reqwest(reqwest::Error { kind: Request, url: "https://transfer.xethub.hf.co/xorbs/default/9614c7683f226a9eaa50e06c12bb2dd6e09837b8307402f1e7f06bfa0cc4ed09?X-Xet-Signed-Range=bytes%3D0-57854528&Expires=1749133088&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly90cmFuc2Zlci54ZXRodWIuaGYuY28veG9yYnMvZGVmYXVsdC85NjE0Yzc2ODNmMjI2YTllYWE1MGUwNmMxMmJiMmRkNmUwOTgzN2I4MzA3NDAyZjFlN2YwNmJmYTBjYzRlZDA5P1gtWGV0LVNpZ25lZC1SYW5nZT1ieXRlcyUzRDAtNTc4NTQ1MjgiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3NDkxMzMwODh9fX1dfQ__&Signature=N1saA0H-UP0h~wLQH0ka16JX6De81C4bxNnAy2adpCKf97CFJOn6eSG4rJp7PRexSsZe98y0RdAxz0k6Qlc75JUjnAbcEvoqPYzkrfA67x8GmDPvN6Bq5JMFco3ehIororPk-VvtMBiot9bIOBlJQfoPcKv~OX3dqvh80VUsSndAmH2TRnBTDJyXuwbUIh~PYeu4oiAxvSOwZnorD6tSkTP8xojKDnEPDNPdOYZ5cBIehK7Ajz5bPFuhzg~2iaWvpK9KwOEMCnmNSxE4qtlAke4JNpGHnlFlevH2~Mc4G-oVNo2hhMdwnXvj514Q5g~UbIuQQl50PXhTadKVUl5BtA__&Key-Pair-Id=K2L8F4GPSG1IFC", source: hyper_util::client::legacy::Error(SendRequest, hyper::Error(ChannelClosed)) }))