In [None]:
!pip uninstall -y transformers
!pip install transformers==4.44.2 einops timm

In [None]:
!pip install transformers einops timm

In [None]:
import os
import cv2
import torch
import numpy as np
from tqdm import tqdm
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# ================= THE CORRECTED HUGGING FACE BYPASS PATCH =================
import importlib

import transformers.dynamic_module_utils as dyn_utils

# Florence-2's remote modeling code imports `flash_attn`, which is optional and
# often not installed. The patch below lets model loading proceed without it.
#
# If this cell is re-run, `dyn_utils.check_imports` is already our wrapper; reuse
# the function it wraps instead of wrapping the wrapper again.
original_check = getattr(
    dyn_utils.check_imports, "_unpatched", dyn_utils.check_imports
)


def custom_check_imports(filename):
    """Run the original import check, tolerating a missing ``flash_attn`` only.

    Args:
        filename: Path of the dynamically loaded module file to check.

    Returns:
        Whatever the original ``check_imports`` returns; if that check fails
        solely because of ``flash_attn``, the result of
        ``dyn_utils.get_relative_imports(filename)`` instead.

    Raises:
        ImportError: If the original check fails for any reason other than
            ``flash_attn``, including when ``flash_attn`` is reported together
            with another module that really cannot be imported.
    """
    try:
        return original_check(filename)
    except ImportError as err:
        if "flash_attn" not in str(err):
            raise
        # The error message can name several missing packages at once. Verify
        # every other import of the file so that only flash_attn is ignored.
        for module_name in dyn_utils.get_imports(filename):
            if module_name == "flash_attn":
                continue
            try:
                importlib.import_module(module_name)
            except ImportError:
                # Surface the original, more informative error.
                raise err from None
        return dyn_utils.get_relative_imports(filename)


# Remember the unpatched function so a later re-run can find it (see above).
custom_check_imports._unpatched = original_check
dyn_utils.check_imports = custom_check_imports
# =========================================================================

# ================= CONFIGURATION =================
# Source video read frame by frame, and the annotated video written by main().
INPUT_VIDEO = "/kaggle/input/datasets/gonoszgonosz/rat-test-video/test.mp4"
OUTPUT_VIDEO = "/kaggle/working/Florence2_Video_Output.mp4"

# Hugging Face model id, the Florence-2 task token, and the free-text target.
# main() sends TASK_PROMPT + TEXT_INPUT as the prompt for every frame.
MODEL_ID = "microsoft/Florence-2-large"
TASK_PROMPT = "<REFERRING_EXPRESSION_SEGMENTATION>"
# NOTE(review): the leading space becomes part of the prompt text handed to the
# processor — confirm it is intentional rather than a leftover separator.
TEXT_INPUT = " rat"

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# =================================================

def apply_overlay(image, mask, color=(0, 255, 255), alpha=0.5):
    """Return a copy of ``image`` tinted with ``color`` wherever ``mask == 1``.

    Args:
        image: Image array to tint; it is not modified.
        mask: Array indexed like the image's pixels; entries equal to 1 mark
            the region to tint.
        color: Tint color. The default is yellow in OpenCV's BGR channel order.
        alpha: Weight of the tint in the blend; the image gets ``1 - alpha``.

    Returns:
        A new array equal to ``image`` outside the mask and to the weighted
        blend of ``image`` and ``color`` inside it.
    """
    selected = mask == 1
    result = image.copy()
    # Blend the whole frame against a solid-color canvas, then keep only the
    # blended pixels that fall inside the mask.
    tint = np.full_like(image, color)
    mixed = cv2.addWeighted(image, 1 - alpha, tint, alpha, 0)
    result[selected] = mixed[selected]
    return result

def main():
    """Segment the prompted object in every frame of INPUT_VIDEO with Florence-2.

    For each frame the model is prompted with ``TASK_PROMPT + TEXT_INPUT``. The
    polygons parsed from its answer are rasterized into a mask and outlined on
    the frame, the mask is tinted via ``apply_overlay``, a caption is drawn, and
    the result is appended to OUTPUT_VIDEO.

    Raises:
        OSError: If INPUT_VIDEO cannot be opened or OUTPUT_VIDEO cannot be
            created.
    """
    # Half precision is meant for GPU inference; use full precision on CPU,
    # where float16 kernels may be missing or very slow.
    dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

    cap = cv2.VideoCapture(INPUT_VIDEO)
    out = None
    pbar = None
    try:
        # Validate both video endpoints before the expensive model load.
        if not cap.isOpened():
            raise OSError(f"Could not open input video: {INPUT_VIDEO}")

        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # The container did not report a usable frame rate (0 or NaN);
            # fall back to a common default so the writer stays valid.
            fps = 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (w, h))
        if not out.isOpened():
            raise OSError(f"Could not open output video for writing: {OUTPUT_VIDEO}")

        print("--- LOADING FLORENCE-2 VLM ---")
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, torch_dtype=dtype, trust_remote_code=True
        ).to(device).eval()

        print("--- STARTING FLORENCE-2 VIDEO INFERENCE ---")
        # An unknown frame count is reported as a non-positive number; let tqdm
        # show a plain counter in that case.
        pbar = tqdm(total=total_frames if total_frames > 0 else None)

        prompt = TASK_PROMPT + TEXT_INPUT

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes frames as BGR; the processor is given an RGB image.
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image_rgb)

            # 1. VLM Inference
            inputs = processor(
                text=prompt, images=pil_image, return_tensors="pt"
            ).to(device, dtype)

            with torch.no_grad():
                generated_ids = model.generate(
                    input_ids=inputs["input_ids"],
                    pixel_values=inputs["pixel_values"],
                    max_new_tokens=1024,
                    do_sample=False,
                    num_beams=3
                )

            # Special tokens are kept because the post-processor parses them.
            generated_text = processor.batch_decode(
                generated_ids, skip_special_tokens=False
            )[0]
            parsed_answer = processor.post_process_generation(
                generated_text, task=TASK_PROMPT, image_size=(w, h)
            )

            # 2. Parse Polygons
            pred_mask = np.zeros((h, w), dtype=np.uint8)
            polygons_dict = parsed_answer.get(TASK_PROMPT, {})
            polygons_list = polygons_dict.get(
                'polygons', polygons_dict.get('Polygons', [])
            )

            for obj_polys in polygons_list:
                for poly in obj_polys:
                    # A polygon needs at least three (x, y) pairs.
                    if len(poly) < 6:
                        continue
                    flat = np.asarray(poly).reshape(-1)
                    # Drop a dangling coordinate so the pairing below cannot fail.
                    flat = flat[: flat.size - flat.size % 2]
                    poly_np = flat.reshape(-1, 2).astype(np.int32)
                    # Draw the polygon mask
                    cv2.fillPoly(pred_mask, [poly_np], 1)
                    # Draw a stark outline to show the VLM's anchor points
                    cv2.polylines(
                        frame, [poly_np], isClosed=True,
                        color=(0, 255, 255), thickness=2
                    )

            # 3. Apply Visuals
            res_frame = apply_overlay(frame, pred_mask)
            cv2.putText(
                res_frame, "Florence-2: Zero-Shot Text-to-Polygon", (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2
            )

            out.write(res_frame)
            pbar.update(1)
    finally:
        # Always release the video handles and the progress bar, even when
        # loading or inference fails part-way through.
        cap.release()
        if out is not None:
            out.release()
        if pbar is not None:
            pbar.close()

    print(f"--- DONE. SAVED TO {OUTPUT_VIDEO} ---")

# Run the pipeline only when this file/cell is executed directly, not on import.
if __name__ == "__main__":
    main()