In [1]:
!pip install gradio av torch transformers diffusers opencv-python-headless moviepy Pillow


Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting av
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.3 (from gradio)
  Downloading gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecti

In [1]:
import gradio as gr
import av
import numpy as np
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel, BlipProcessor, BlipForConditionalGeneration
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif
import cv2
from moviepy.editor import VideoFileClip
from PIL import Image
import os

# Choose the compute device up front so both models are configured consistently for it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision is only requested when a GPU is present; on CPU fall back to
# float32 so the diffusion pipeline remains usable.
pipeline_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load video captioning model (BLIP for better captions)
print("Loading BLIP model for better captions...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load AnimateDiff video generator (motion adapter + Stable Diffusion base checkpoint)
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=pipeline_dtype)
model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=pipeline_dtype)
scheduler = DDIMScheduler.from_pretrained(
    model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1
)
pipe.scheduler = scheduler
pipe.enable_vae_slicing()
if device.type == "cuda":
    # Model CPU offload shuttles sub-models between CPU and the accelerator,
    # so it is only enabled when an accelerator actually exists.
    pipe.enable_model_cpu_offload()
else:
    pipe.to(device)

# The captioning model lives on the same device that generate_caption sends its inputs to.
caption_model.to(device)

# Helper function to generate captions using BLIP
def generate_caption(video_path, sample_fps=0.5, max_frames=21):
    """Caption a video by running BLIP on a sparse sample of its frames.

    Args:
        video_path: Path of the video file to read.
        sample_fps: Frame rate passed to ``clip.iter_frames``; the default of
            0.5 matches the previously hard-coded sampling rate.
        max_frames: Maximum number of sampled frames to caption; the default
            of 21 matches the previously hard-coded limit.

    Returns:
        All per-frame captions joined with single spaces. The string is empty
        when no frame could be sampled.
    """
    from itertools import islice

    print(f"Processing video: {video_path}")
    clip = VideoFileClip(video_path)
    try:
        # islice stops after max_frames without decoding any further frames.
        frames = list(islice(clip.iter_frames(fps=sample_fps), max_frames))
    finally:
        # Always release the underlying reader, even if decoding fails.
        clip.close()

    # Convert frames to PIL images and generate captions
    captions = []
    for frame in frames:
        pil_image = Image.fromarray(frame)
        inputs = processor(pil_image, return_tensors="pt").to(device)
        outputs = caption_model.generate(**inputs)
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        captions.append(caption)

    # Combine captions into a single summary
    combined_caption = " ".join(captions)
    print(f"Generated captions: {captions}")
    return combined_caption

# Helper: Generate AI video based on caption
def generate_ai_video(caption):
    """Run the AnimateDiff pipeline on ``caption`` and export the result as a GIF.

    The pipeline is called with a CPU generator seeded with 42, 16 frames,
    25 inference steps and a guidance scale of 7.5.

    Args:
        caption: Text prompt handed to the pipeline.

    Returns:
        The path of the GIF file that was written ("ai_generated_video.gif").
    """
    seeded_generator = torch.Generator("cpu").manual_seed(42)
    pipeline_kwargs = {
        "prompt": caption,
        "negative_prompt": "bad quality, worse quality",
        "num_frames": 16,
        "guidance_scale": 7.5,
        "num_inference_steps": 25,
        "generator": seeded_generator,
    }
    result = pipe(**pipeline_kwargs)

    # Only the first entry of the returned frames is exported.
    first_clip_frames = result.frames[0]
    gif_path = "ai_generated_video.gif"
    export_to_gif(first_clip_frames, gif_path)
    return gif_path

# Function to overlay text using OpenCV
def overlay_text_on_video(input_video_path, caption, output_video_path):
    """Draw ``caption`` onto every frame of a video and write the result.

    The caption is word-wrapped to the frame width (minus a 40 px margin) and
    drawn in white starting 20 px from the left edge, near the bottom of the
    frame. The output is encoded with the 'mp4v' FourCC at the input's frame
    size and frame rate.

    Args:
        input_video_path: Path of the video to read.
        caption: Text to draw on each frame.
        output_video_path: Path of the video file to write.

    Raises:
        IOError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_video_path)
    out = None
    try:
        if not cap.isOpened():
            # Fail loudly instead of silently writing an empty/invalid output file.
            raise IOError(f"Could not open input video: {input_video_path}")

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        # Keep the fractional frame rate (e.g. 29.97); truncating to int changes
        # the playback speed of the written file.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps != fps or fps < 0:
            # 0, NaN or negative: the container did not report a usable rate.
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

        # Define font properties
        font = cv2.FONT_HERSHEY_COMPLEX
        font_scale = 0.5
        font_color = (255, 255, 255)  # White text
        thickness = 2
        line_height = 10  # extra pixels added below each drawn line
        padding = 15  # used with line_height to place the first line

        def wrap_text(text, max_width):
            """Greedily split ``text`` into lines no wider than ``max_width`` pixels."""
            lines = []
            current_line = ""
            for word in text.split():
                test_line = f"{current_line} {word}".strip()
                text_size = cv2.getTextSize(test_line, font, font_scale, thickness)[0]
                # Only break when there is something to flush; a single word wider
                # than max_width stays on its own line instead of emitting "".
                if text_size[0] > max_width and current_line:
                    lines.append(current_line)
                    current_line = word
                else:
                    current_line = test_line
            if current_line:
                lines.append(current_line)
            return lines

        wrapped_text = wrap_text(caption, width - 40)

        # Text layout does not depend on frame content, so compute every line's
        # baseline once instead of re-measuring the text for each frame.
        placements = []
        y_position = height - 50 - (len(wrapped_text) - 1) * (line_height + padding)
        for line in wrapped_text:
            text_height = cv2.getTextSize(line, font, font_scale, thickness)[0][1]
            placements.append((line, y_position))
            y_position += text_height + line_height

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            for line, line_y in placements:
                cv2.putText(frame, line, (20, line_y), font, font_scale, font_color, thickness)

            out.write(frame)
    finally:
        # Release native handles even when reading or writing fails midway.
        cap.release()
        if out is not None:
            out.release()

# Produce a copy of the input video with the caption drawn on it
def sora_generate_video(caption, input_video_path):
    """Write a captioned copy of ``input_video_path`` on a best-effort basis.

    Args:
        caption: Text to overlay on the video.
        input_video_path: Path of the source video.

    Returns:
        "generated_video.mp4" on success, or None if the overlay step raised
        any exception (the error is printed rather than propagated).
    """
    destination = "generated_video.mp4"
    try:
        overlay_text_on_video(input_video_path, caption, destination)
    except Exception as err:
        print(f"Error during video generation: {err}")
        return None
    return destination

# Gradio callback: caption the upload, then build both output videos
def process_video(input_video):
    """Run the full pipeline for one uploaded video.

    Args:
        input_video: Path of the uploaded video as supplied by the Gradio
            video input.

    Returns:
        A tuple ``(captioned_video_path, caption, ai_video_path)`` where
        ``captioned_video_path`` may be None if the overlay step failed.

    Raises:
        gr.Error: If no video was supplied.
    """
    if not input_video:
        # Surface a readable message in the UI instead of an opaque failure
        # from the video reader when the form is submitted empty.
        raise gr.Error("Please upload a video before submitting.")

    input_video_path = input_video
    print(f"Uploaded video path: {input_video_path}")

    # Generate video caption
    caption = generate_caption(input_video_path)
    print(f"Caption generated: {caption}")

    # Generate an AI synthetic video based on caption
    ai_generated_video_path = generate_ai_video(caption)

    # Generate a copy of the input video with the caption drawn on it
    generated_video_path = sora_generate_video(caption, input_video_path)
    return generated_video_path, caption, ai_generated_video_path

# Make sure the uploaded_videos directory is present before the app starts
os.makedirs("uploaded_videos", exist_ok=True)

# Gradio Interface: one uploaded video in; captioned video, caption text and
# synthetic video out (same order as the tuple returned by process_video)
input_component = gr.Video(label="Upload Input Video")
output_components = [
    gr.Video(label="Generated Video with Captions"),
    gr.Textbox(label="Generated Captions"),
    gr.Video(label="AI Generated Synthetic Video"),
]
interface = gr.Interface(
    fn=process_video,
    inputs=input_component,
    outputs=output_components,
    title="Rare-Human Action Video Generator",
    description="Upload a video, generate captions for the actions, and create a rare-human action video.",
)

# debug=True keeps the cell running so errors and logs stay visible in the notebook
interface.launch(debug=True)


Loading BLIP model for better captions...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

The config attributes {'motion_activation_fn': 'geglu', 'motion_attention_bias': False, 'motion_cross_attention_dim': None} were passed to MotionAdapter, but are not expected and will be ignored. Please verify your config.json configuration file.


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3a32a0134de1842466.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Uploaded video path: /tmp/gradio/c55114b53691f728b8767d1a1ae35b13e9a9db34f83f0593a465d54b329448a7/input_video_11.mp4
Processing video: /tmp/gradio/c55114b53691f728b8767d1a1ae35b13e9a9db34f83f0593a465d54b329448a7/input_video_11.mp4





Generated captions: ['a room with a guitar and a television', 'a man playing guitar in a living room']
Caption generated: a room with a guitar and a television a man playing guitar in a living room


  0%|          | 0/25 [00:00<?, ?it/s]





Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://3a32a0134de1842466.gradio.live


