In [1]:
# Install necessary dependencies
# NOTE(review): none of these installs pin a version, so a fresh run may pull
# different releases than the ones this notebook was executed with. The next
# cell imports `moviepy.editor` -- confirm the installed moviepy still ships it.
!pip install gradio moviepy transformers torch torchvision torchaudio -q
# Clone the reference repository into ./sora_model.
# NOTE(review): the code cell visible below does not import anything from
# sora_model or CLIP -- confirm these two steps are still needed.
!git clone https://github.com/lyogavin/train_your_own_sora.git sora_model
!pip install git+https://github.com/openai/CLIP.git -q

# Change to the sora_model directory, install its requirements, then return
# to the parent directory so later relative paths resolve from there.
%cd sora_model
!pip install -r requirements.txt
%cd ..

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCloning into 'sora_model'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (70/7

/content


In [9]:
import gradio as gr
import cv2
import numpy as np
from moviepy.editor import VideoFileClip
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import os

# Load BLIP model for captioning
# `processor` and `model` are module-level globals read by generate_caption().
print("Loading BLIP model for better captions...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Use the GPU when one is available, otherwise fall back to CPU; the same
# `device` is used later to move the processor outputs next to the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Helper function to generate captions using BLIP
def generate_caption(video_path):
    """Caption sampled frames of a video with BLIP and join the results.

    Frames are sampled at 0.5 fps (one frame every two seconds of video) and
    at most 21 frames are kept. Each kept frame is captioned independently
    using the module-level ``processor``, ``model`` and ``device``.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A single string made of the per-frame captions joined with spaces
        (an empty string when no frame could be sampled).
    """
    print(f"Processing video: {video_path}")
    clip = VideoFileClip(video_path)
    try:
        frames = []
        for i, frame in enumerate(clip.iter_frames(fps=0.5)):
            if i > 20:
                break
            frames.append(frame)
    finally:
        # Always release the underlying reader; this function runs once per
        # request, so an unclosed clip would leak on every call.
        clip.close()

    # Convert frames to PIL images and generate captions
    captions = []
    for frame in frames:
        pil_image = Image.fromarray(frame)
        inputs = processor(pil_image, return_tensors="pt").to(device)
        outputs = model.generate(**inputs)
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        captions.append(caption)

    # Combine captions into a single summary
    combined_caption = " ".join(captions)
    print(f"Generated captions: {captions}")
    return combined_caption

# Function to overlay wrapped caption text on every frame using OpenCV
def overlay_text_on_video(input_video_path, caption, output_video_path):
    """Write a copy of a video with ``caption`` drawn on every frame.

    The caption is word-wrapped to the frame width (minus a 40 px margin) and
    drawn in white near the bottom-left of each frame. The output is encoded
    with the ``mp4v`` fourcc at the source frame size and frame rate.

    Args:
        input_video_path: Path of the video to read.
        caption: Text to draw; an empty caption draws nothing.
        output_video_path: Path of the video file to create.

    Raises:
        IOError: If the input video cannot be opened or the output writer
            cannot be created.
    """
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open input video: {input_video_path}")

    out = None
    try:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        # Keep the fractional frame rate (e.g. 29.97); truncating it to an int
        # makes the output drift in duration relative to the source.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Unknown/invalid rate reported by the container (also catches NaN).
            # NOTE(review): 30 fps is an arbitrary fallback -- confirm.
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise IOError(f"Could not open output video for writing: {output_video_path}")

        # Define font properties
        font = cv2.FONT_HERSHEY_COMPLEX
        font_scale = 0.5
        font_color = (255, 255, 255)  # White text
        thickness = 2
        line_height = 10  # extra pixels added below each rendered line
        padding = 15  # used with line_height to place the first line

        def wrap_text(text, max_width):
            """Greedily split ``text`` into lines no wider than ``max_width`` px."""
            words = text.split()
            lines = []
            current_line = ""
            for word in words:
                test_line = f"{current_line} {word}".strip()
                text_size = cv2.getTextSize(test_line, font, font_scale, thickness)[0]
                if text_size[0] > max_width:
                    # Skip the flush when nothing has been accumulated yet
                    # (first word alone is too wide) to avoid an empty line.
                    if current_line:
                        lines.append(current_line)
                    current_line = word
                else:
                    current_line = test_line
            if current_line:
                lines.append(current_line)
            return lines

        wrapped_text = wrap_text(caption, width - 40)  # 20 px margin on each side

        # The text layout is identical for every frame, so measure it once
        # instead of re-measuring each line for each frame.
        placements = []
        y_position = height - 50 - (len(wrapped_text) - 1) * (line_height + padding)
        for line in wrapped_text:
            text_height = cv2.getTextSize(line, font, font_scale, thickness)[0][1]
            placements.append((line, (20, y_position)))
            y_position += text_height + line_height

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            for line, origin in placements:
                cv2.putText(frame, line, origin, font, font_scale, font_color, thickness)

            out.write(frame)
    finally:
        # Release both handles even if reading, drawing, or writing failed.
        cap.release()
        if out is not None:
            out.release()

# Render the captioned copy of the input video; failures are reported as None
def sora_generate_video(caption, input_video_path):
    """Create ``generated_video.mp4`` with ``caption`` overlaid on the input.

    Args:
        caption: Text to draw onto the video frames.
        input_video_path: Path of the source video.

    Returns:
        The output video path on success, or ``None`` if any exception was
        raised while producing it (the error is printed, not re-raised).
    """
    output_video_path = "generated_video.mp4"
    try:
        overlay_text_on_video(input_video_path, caption, output_video_path)
    except Exception as e:
        print(f"Error during video generation: {e}")
        return None
    return output_video_path

# Gradio callback: caption the uploaded video and render the captioned copy
def process_video(input_video):
    """Handle one Gradio submission.

    Args:
        input_video: Path of the uploaded video as passed in by the Gradio
            video input, or ``None``/empty when nothing was uploaded.

    Returns:
        A ``(generated_video_path, caption)`` tuple. ``generated_video_path``
        is ``None`` when no video was uploaded or when rendering failed.
    """
    # Submitting the form without a file passes no path; bail out with a
    # readable message instead of failing inside the video reader.
    if not input_video:
        print("No video uploaded.")
        return None, "No video uploaded."

    input_video_path = input_video
    print(f"Uploaded video path: {input_video_path}")

    # Generate video caption
    caption = generate_caption(input_video_path)
    print(f"Caption generated: {caption}")

    # Generate a video placeholder with captions
    generated_video_path = sora_generate_video(caption, input_video_path)
    return generated_video_path, caption

# Ensure the uploaded_videos directory exists
# NOTE(review): nothing else in this cell reads or writes "uploaded_videos";
# confirm whether this directory is still needed.
os.makedirs("uploaded_videos", exist_ok=True)

# Gradio Interface
# One video input is passed to process_video(); its two return values fill
# the output video player and the captions textbox, in that order.
interface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload Input Video"),
    outputs=[
        gr.Video(label="Generated Video"),
        gr.Textbox(label="Generated Captions")
    ],
    title="Rare-Human Action Video Generator",
    description="Upload a video, generate captions for the actions, and create a rare-human action video."
)

# debug=True keeps this cell running so errors and logs stay visible in the
# notebook; interrupt the kernel to stop the server.
interface.launch(debug=True)


Loading BLIP model for better captions...
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://b081a0d546b5a3add9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Uploaded video path: /tmp/gradio/e95fdc5e0b03f7b03541db7dc2c42b956644291d58bb0787bd25c526c7fa7648/input_video_1.mp4
Processing video: /tmp/gradio/e95fdc5e0b03f7b03541db7dc2c42b956644291d58bb0787bd25c526c7fa7648/input_video_1.mp4





Generated captions: ['a bed in a room with blue curtains', 'a bedroom with a bed and a window', 'a man sitting on a bed in a room', 'a man is lying on a bed in a room', 'a man standing in front of a bed']
Caption generated: a bed in a room with blue curtains a bedroom with a bed and a window a man sitting on a bed in a room a man is lying on a bed in a room a man standing in front of a bed





Uploaded video path: /tmp/gradio/75a67c7f53e95705c994437040c2024959c3457f0812e34daa745138d5f6471e/input_video_2.mp4
Processing video: /tmp/gradio/75a67c7f53e95705c994437040c2024959c3457f0812e34daa745138d5f6471e/input_video_2.mp4





Generated captions: ['a man is sitting in a chair and talking to a cat', 'a man sitting at a table with a laptop', 'a man in a black shirt is standing in a kitchen', 'a man is standing in front of a counter', 'a man is cooking in a kitchen with a light on']
Caption generated: a man is sitting in a chair and talking to a cat a man sitting at a table with a laptop a man in a black shirt is standing in a kitchen a man is standing in front of a counter a man is cooking in a kitchen with a light on





Uploaded video path: /tmp/gradio/b35de978686479b92b316e49b01893d33af2756643f3ef984430224de14cdc0d/input_video_3.mp4
Processing video: /tmp/gradio/b35de978686479b92b316e49b01893d33af2756643f3ef984430224de14cdc0d/input_video_3.mp4





Generated captions: ['a refrigerator and a refrigerator freezer in a kitchen', 'a woman in a red jacket is standing in front of a refrigerator', 'a refrigerator with a door open and a light on', 'a woman standing in front of a refrigerator', 'a woman in a red shirt is standing in front of a refrigerator', 'a woman in a red sweatshirt is standing in front of a refrigerator']
Caption generated: a refrigerator and a refrigerator freezer in a kitchen a woman in a red jacket is standing in front of a refrigerator a refrigerator with a door open and a light on a woman standing in front of a refrigerator a woman in a red shirt is standing in front of a refrigerator a woman in a red sweatshirt is standing in front of a refrigerator





Uploaded video path: /tmp/gradio/0af648b5777c0aaa6b32667cc8b83c8391fe5344a672e71e9cefb5978591bfa1/input_video_4.mp4
Processing video: /tmp/gradio/0af648b5777c0aaa6b32667cc8b83c8391fe5344a672e71e9cefb5978591bfa1/input_video_4.mp4





Generated captions: ['a man in a blue shirt', 'a man and woman standing in a room', 'a man standing in front of a green wall', 'a man in a white hat and a green wall', 'a man in a blue shirt and a woman in a white hat', 'a man in a white hat and a man in a blue shirt']
Caption generated: a man in a blue shirt a man and woman standing in a room a man standing in front of a green wall a man in a white hat and a green wall a man in a blue shirt and a woman in a white hat a man in a white hat and a man in a blue shirt





Uploaded video path: /tmp/gradio/6bdb9b43087725543ad675b5b8fe20c589346454476e67734b5c0cb221f56424/input_video_5.mp4
Processing video: /tmp/gradio/6bdb9b43087725543ad675b5b8fe20c589346454476e67734b5c0cb221f56424/input_video_5.mp4





Generated captions: ['a man in a green and black suit sitting on a bed', 'a woman in a green and black outfit sitting on a bed', 'a woman in a green and black outfit sitting on a bed', 'a man is sitting on a bed in a room', 'a man in a room with a bed and a television', 'a woman is sitting on a bed in a room']
Caption generated: a man in a green and black suit sitting on a bed a woman in a green and black outfit sitting on a bed a woman in a green and black outfit sitting on a bed a man is sitting on a bed in a room a man in a room with a bed and a television a woman is sitting on a bed in a room






Uploaded video path: /tmp/gradio/61e14451355bcbc79f887f02129da350b657716bdef99286f826a3d5f1b26971/input_video_6.mp4
Processing video: /tmp/gradio/61e14451355bcbc79f887f02129da350b657716bdef99286f826a3d5f1b26971/input_video_6.mp4
Generated captions: ['a man is sitting in the back seat of a car', 'a man is sitting in a car with a bottle', 'a man sitting in a car with a beer', 'a man is seen in the video, and is seen on the screen', 'a man is driving a car with a cell']
Caption generated: a man is sitting in the back seat of a car a man is sitting in a car with a bottle a man sitting in a car with a beer a man is seen in the video, and is seen on the screen a man is driving a car with a cell






Uploaded video path: /tmp/gradio/87ec9c67e69128503262bb0540ffd15c06b232e8d6956f4028e9d7f1c6c070ae/input_video_7.mp4
Processing video: /tmp/gradio/87ec9c67e69128503262bb0540ffd15c06b232e8d6956f4028e9d7f1c6c070ae/input_video_7.mp4
Generated captions: ['a man is walking up the stairs in a house', 'a man is standing on the stairs and looking at a dog', 'a woman is standing in a doorway with her dog', 'a video of a man in a room with a camera', 'a woman is standing on the stairs in a room']
Caption generated: a man is walking up the stairs in a house a man is standing on the stairs and looking at a dog a woman is standing in a doorway with her dog a video of a man in a room with a camera a woman is standing on the stairs in a room





Uploaded video path: /tmp/gradio/6f12ad9431152392bbbc27c1c36d6f2dea84c8661d93a998f7d06eb73dbd1bf5/input_video_8.mp4
Processing video: /tmp/gradio/6f12ad9431152392bbbc27c1c36d6f2dea84c8661d93a998f7d06eb73dbd1bf5/input_video_8.mp4





Generated captions: ['a woman is sitting on the floor in a kitchen', 'a man is playing a video game in a kitchen', 'a woman is cleaning a kitchen with a hose', 'a man is standing in a room with a door', 'a man sitting on a toilet in a bathroom']
Caption generated: a woman is sitting on the floor in a kitchen a man is playing a video game in a kitchen a woman is cleaning a kitchen with a hose a man is standing in a room with a door a man sitting on a toilet in a bathroom






Uploaded video path: /tmp/gradio/3dd21a498b9a567b4cc91b43aac5b68335a5d10e3885fd9b4bdb7194ca566694/input_video_9.mp4
Processing video: /tmp/gradio/3dd21a498b9a567b4cc91b43aac5b68335a5d10e3885fd9b4bdb7194ca566694/input_video_9.mp4
Generated captions: ['a man and woman sitting on a couch', 'a woman sitting on a couch', 'a woman sitting on a couch', 'a woman sitting on a couch', 'a woman sitting on a couch', 'a woman sitting on a couch']
Caption generated: a man and woman sitting on a couch a woman sitting on a couch a woman sitting on a couch a woman sitting on a couch a woman sitting on a couch a woman sitting on a couch





Uploaded video path: /tmp/gradio/57182cc8ea69bcca133dfeb7454f9f70a0759b7f5f3c715a7ce913d6aedb44ee/input_video_10.mp4
Processing video: /tmp/gradio/57182cc8ea69bcca133dfeb7454f9f70a0759b7f5f3c715a7ce913d6aedb44ee/input_video_10.mp4





Generated captions: ['a man is seen in the dark with his dog', 'the video shows the man being taken from the back of a car', 'a man is seen in the image of a man in a car', 'a man is seen in the video, and the video is still showing him', 'the video shows the man being taken from the back of a car']
Caption generated: a man is seen in the dark with his dog the video shows the man being taken from the back of a car a man is seen in the image of a man in a car a man is seen in the video, and the video is still showing him the video shows the man being taken from the back of a car





Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://b081a0d546b5a3add9.gradio.live


