## **LiquidAI/LFM2-VL-1.6B (live cam) 🔥📷**

LFM2-VL-1.6B is Liquid AI’s advanced multimodal vision-language model, engineered to efficiently process text and variable-resolution images using a hybrid backbone that combines the LFM2-1.2B language tower and a 400M shape-optimized SigLIP2 NaFlex vision encoder. Optimized for low-latency and edge AI applications, it offers user-tunable speed-quality tradeoffs and native resolution handling up to 512×512 pixels, along with dynamic image token mapping for high accuracy without distortion. With a context length of 32,768 tokens, support for instruction following, and rapid inference—over twice as fast as comparable models—LFM2-VL-1.6B is ideal for lightweight agentic flows and custom fine-tuning, making it a versatile choice for real-time multimodal deployments.

`accelerator: 1 X NVIDIA T4*`

<img src="https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/1SUIQM5T2kGN7YgV2ZHS5.png" width="700"/>


*notebook by: [prithivMLmods](https://huggingface.co/prithivMLmods)*

### **Install packages**

In [None]:
%%capture
!pip install git+https://github.com/huggingface/transformers.git \
             git+https://github.com/huggingface/accelerate.git \
             git+https://github.com/huggingface/peft.git \
             transformers-stream-generator huggingface_hub albumentations \
             pyvips-binary qwen-vl-utils sentencepiece opencv-python docling-core \
             python-docx torchvision safetensors matplotlib num2words

!pip install xformers requests pymupdf hf_xet spaces pyvips pillow gradio \
             einops torch fpdf timm av decord bitsandbytes reportlab
# Hold tight, this will take around 2-3 minutes.

### **Run Demo (live-cam) App**

In [None]:
import spaces
import json
import math
import os
import traceback
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple
import re
import time
from threading import Thread
from io import BytesIO
import uuid
import tempfile

import gradio as gr
import requests
import torch
from PIL import Image
import numpy as np

from transformers import (
    AutoProcessor,
    AutoModelForImageTextToText,
)

# --- Constants and Model Setup ---
# NOTE(review): MAX_INPUT_TOKEN_LENGTH is not referenced anywhere in the
# visible code; it is kept so existing references elsewhere keep working.
MAX_INPUT_TOKEN_LENGTH = 4096
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment diagnostics: printed once at start-up so a misconfigured
# runtime (no GPU, wrong CUDA build) is visible before the model loads.
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("current device:", torch.cuda.current_device())
    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

print("Using device:", device)


# --- Model Loading ---
MODEL_ID = "LiquidAI/LFM2-VL-1.6B"

# Use half precision only when running on a GPU. On the CPU fallback path
# float16 is typically slow and, depending on the torch build, some ops may
# not be implemented for it, so full precision is the safe choice there.
MODEL_DTYPE = torch.float16 if device.type == "cuda" else torch.float32

print(f"Loading model: {MODEL_ID}")
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=MODEL_DTYPE,
).to(device).eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
print("Model loaded successfully.")


# --- Core Application Logic ---
@spaces.GPU
def generate_caption_for_frame(
    image: Optional[Image.Image],
    prompt_input: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float
) -> str:
    """Run model inference on a single webcam frame and return the caption.

    Args:
        image: The current frame, or ``None`` when no frame is available yet.
        prompt_input: The user's instruction for the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; values <= 0 disable sampling.
        top_p: Nucleus-sampling probability mass (used only when sampling).
        top_k: Top-k sampling cutoff (used only when sampling).
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        The generated caption, or a short status message when the frame or
        the prompt is missing.
    """
    # The stream may deliver None before the webcam has produced a frame.
    if image is None:
        return "Waiting for webcam feed..."
    if not prompt_input or not prompt_input.strip():
        return "Please enter a prompt to start captioning."

    # --- Prepare inputs for LiquidAI LFM2-VL ---

    # The processor expects a chat format. "<image>" is the placeholder the
    # processor is expected to expand into the image tokens.
    messages = [
        {"role": "user", "content": f"<image>\n{prompt_input}"}
    ]

    # Render the chat template to text, then tokenize the text and
    # preprocess the image together.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        text=prompt,
        images=image.convert("RGB"),
        return_tensors="pt",
    ).to(device)

    # --- Generate the response ---
    do_sample = temperature > 0
    generation_kwargs = {
        # UI sliders may deliver floats; generate() needs an integer here.
        "max_new_tokens": int(max_new_tokens),
        "repetition_penalty": repetition_penalty,
        "do_sample": do_sample,
    }
    if do_sample:
        # Sampling-only knobs are passed only when sampling is enabled, so
        # greedy decoding does not receive flags it would ignore.
        generation_kwargs.update(
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
        )

    with torch.no_grad():
        out = model.generate(**inputs, **generation_kwargs)

    # generate() returns the prompt tokens followed by the new tokens. Drop
    # the prompt by position instead of splitting the decoded text on a role
    # marker, which would truncate any caption containing "assistant\n".
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = out[:, prompt_length:]

    response = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
    return response.strip()


# --- Gradio UI Definition ---
def create_gradio_interface():
    """Assemble the live-captioning Gradio app and return the Blocks object.

    Layout: a header, then one row with two equal columns. The left column
    holds the prompt, the streaming webcam image, the generation settings and
    a clear button; the right column holds the read-only caption box.
    """
    custom_css = """
    .main-container { max-width: 1200px; margin: 0 auto; }
    .footer-credit { text-align: center; margin-top: 1em; }
    """
    header_html = """
        <div class="title" style="text-align: center">
            <h1>Webcam Captioning with LFM2-VL-1.6B 📷</h1>
            <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
                Enable your webcam and provide a prompt to get real-time scene descriptions.
            </p>
        </div>
        """

    def clear_output_handler():
        # Replaces the caption text with a fixed confirmation message.
        return "Captions cleared."

    with gr.Blocks(theme="bethecloud/storj_theme", css=custom_css) as demo:
        gr.HTML(header_html)

        with gr.Row():
            # --- Inputs (left) ---
            with gr.Column(scale=1):
                prompt_box = gr.Textbox(
                    value="Describe what you see in this video frame.",
                    placeholder="✦︎ What is happening in the scene?",
                    label="Captioning Prompt",
                )
                # Webcam-only source with streaming enabled gives the live feed.
                webcam_feed = gr.Image(
                    sources=['webcam'],
                    streaming=True,
                    type="pil",
                    height=400,
                    label="Webcam Feed",
                )

                with gr.Accordion("Advanced Settings", open=False):
                    token_slider = gr.Slider(
                        label="Max New Tokens",
                        minimum=16, maximum=1024, step=8, value=512,
                    )
                    temperature_slider = gr.Slider(
                        label="Temperature",
                        minimum=0.1, maximum=2.0, step=0.1, value=0.7,
                    )
                    top_p_slider = gr.Slider(
                        label="Top-p (nucleus sampling)",
                        minimum=0.05, maximum=1.0, step=0.05, value=0.9,
                    )
                    top_k_slider = gr.Slider(
                        label="Top-k",
                        minimum=1, maximum=100, step=1, value=50,
                    )
                    penalty_slider = gr.Slider(
                        label="Repetition penalty",
                        minimum=1.0, maximum=2.0, step=0.05, value=1.1,
                    )

                clear_btn = gr.Button("🗑️ Clear Output", variant="secondary")

            # --- Outputs (right) ---
            with gr.Column(scale=1):
                caption_box = gr.Textbox(
                    label="Live Caption",
                    placeholder="Captions will appear here...",
                    lines=20,
                    interactive=False,
                    show_copy_button=True,
                )
                gr.Markdown("*notebook by*: [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")

        # --- Event wiring ---
        # The stream event calls generate_caption_for_frame for incoming
        # webcam frames; argument order must match that function's signature.
        caption_inputs = [
            webcam_feed,
            prompt_box,
            token_slider,
            temperature_slider,
            top_p_slider,
            top_k_slider,
            penalty_slider,
        ]
        webcam_feed.stream(
            fn=generate_caption_for_frame,
            inputs=caption_inputs,
            outputs=[caption_box],
        )

        clear_btn.click(
            fn=clear_output_handler,
            outputs=[caption_box],
        )

    return demo

if __name__ == "__main__":
    # Build the UI, enable the request queue (better handling of multiple
    # users and streaming), then launch with a public share link and
    # errors surfaced in the UI.
    demo = create_gradio_interface()
    demo.queue()
    demo.launch(show_error=True, share=True)