In [None]:
import cv2
import os

# Your videos and their descriptions
videos = ['2.5.mp4', 'face.mp4']
descriptions = ['a doctor is talking to an old lady - her patient',
                'A patient is lying in an operating room. His face is being filmed.']

# Directory to save extracted frames (optional)
frames_dir = "extracted_frames"
os.makedirs(frames_dir, exist_ok=True)

def extract_frames(video_path, video_index, frame_rate=1):
    """
    Extract frames from video at `frame_rate` frames per second.
    Returns list of frame file paths.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video {video_path}")
        return []

    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps // frame_rate) if fps > 0 else 30  # default 30 if unknown

    frame_paths = []
    frame_count = 0
    saved_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            frame_filename = f"{frames_dir}/video{video_index}_frame{saved_count}.jpg"
            cv2.imwrite(frame_filename, frame)
            frame_paths.append(frame_filename)
            saved_count += 1
        frame_count += 1

    cap.release()
    return frame_paths

# For each video, extract frames and print paired description
video_data = []
for i, video_path in enumerate(videos):
    print(f"Processing {video_path} ...")
    frames = extract_frames(video_path, i)
    desc = descriptions[i]
    video_data.append({
        'video_path': video_path,
        'description': desc,
        'frames': frames
    })

# Example output:
for v in video_data:
    print(f"Video: {v['video_path']}")
    print(f"Description: {v['description']}")
    print(f"Extracted frames count: {len(v['frames'])}")
    print("---")


Processing 2.5.mp4 ...
Processing face.mp4 ...
Video: 2.5.mp4
Description: a doctor is talking to an old lady - her patient
Extracted frames count: 5
---
Video: face.mp4
Description: A patient is lying in an operating room. His face is being filmed.
Extracted frames count: 22
---


In [None]:
video_data

[{'video_path': '2.5.mp4',
  'description': 'a doctor is talking to an old lady - her patient',
  'frames': ['extracted_frames/video0_frame0.jpg',
   'extracted_frames/video0_frame1.jpg',
   'extracted_frames/video0_frame2.jpg',
   'extracted_frames/video0_frame3.jpg',
   'extracted_frames/video0_frame4.jpg']},
 {'video_path': 'face.mp4',
  'description': 'A patient is lying in an operating room. His face is being filmed.',
  'frames': ['extracted_frames/video1_frame0.jpg',
   'extracted_frames/video1_frame1.jpg',
   'extracted_frames/video1_frame2.jpg',
   'extracted_frames/video1_frame3.jpg',
   'extracted_frames/video1_frame4.jpg',
   'extracted_frames/video1_frame5.jpg',
   'extracted_frames/video1_frame6.jpg',
   'extracted_frames/video1_frame7.jpg',
   'extracted_frames/video1_frame8.jpg',
   'extracted_frames/video1_frame9.jpg',
   'extracted_frames/video1_frame10.jpg',
   'extracted_frames/video1_frame11.jpg',
   'extracted_frames/video1_frame12.jpg',
   'extracted_frames/video

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import os

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
def clip_similarity(frame_path, text):
    image = Image.open(frame_path).convert("RGB")
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        similarity = logits_per_image.item()
    return similarity


In [None]:
from tqdm import tqdm

for video in video_data:
    desc = video['description']
    frame_scores = []
    print(f"Scoring frames of video '{video['video_path']}' against text '{desc}'")

    for frame_path in tqdm(video['frames']):
        sim = clip_similarity(frame_path, desc)
        frame_scores.append((frame_path, sim))

    # Sort frames by similarity descending
    frame_scores.sort(key=lambda x: x[1], reverse=True)

    # Save or print top scoring frames info
    top_n = 5
    print(f"Top {top_n} frames for '{desc}':")
    for i in range(top_n):
        print(f"{frame_scores[i][0]} — similarity: {frame_scores[i][1]:.4f}")

    # Attach scores back to video data for downstream usage
    video['frame_scores'] = frame_scores


Scoring frames of video '2.5.mp4' against text 'a doctor is talking to an old lady - her patient'


100%|██████████| 5/5 [00:01<00:00,  3.27it/s]


Top 5 frames for 'a doctor is talking to an old lady - her patient':
extracted_frames/video0_frame2.jpg — similarity: 26.9253
extracted_frames/video0_frame0.jpg — similarity: 26.1620
extracted_frames/video0_frame3.jpg — similarity: 26.1570
extracted_frames/video0_frame4.jpg — similarity: 25.9427
extracted_frames/video0_frame1.jpg — similarity: 25.8353
Scoring frames of video 'face.mp4' against text 'A patient is lying in an operating room. His face is being filmed.'


100%|██████████| 22/22 [00:00<00:00, 36.32it/s]

Top 5 frames for 'A patient is lying in an operating room. His face is being filmed.':
extracted_frames/video1_frame9.jpg — similarity: 27.8086
extracted_frames/video1_frame1.jpg — similarity: 27.5639
extracted_frames/video1_frame20.jpg — similarity: 27.5557
extracted_frames/video1_frame15.jpg — similarity: 27.4929
extracted_frames/video1_frame21.jpg — similarity: 27.4711





In [None]:
def frame_path_to_timestamp(frame_path, frame_rate=1):
    # e.g. extracted_frames/video0_frame10.jpg
    basename = os.path.basename(frame_path)
    parts = basename.split('_')
    frame_number = int(parts[1].replace('frame','').replace('.jpg',''))
    timestamp_sec = frame_number / frame_rate
    return timestamp_sec

for video in video_data:
    top_frames = video.get('frame_scores', [])[:5]
    print(f"Timestamps for top matching frames in {video['video_path']}:")
    for frame_path, score in top_frames:
        ts = frame_path_to_timestamp(frame_path)
        print(f"{frame_path} at ~{ts:.1f} sec (score: {score:.4f})")


Timestamps for top matching frames in 2.5.mp4:
extracted_frames/video0_frame2.jpg at ~2.0 sec (score: 26.9253)
extracted_frames/video0_frame0.jpg at ~0.0 sec (score: 26.1620)
extracted_frames/video0_frame3.jpg at ~3.0 sec (score: 26.1570)
extracted_frames/video0_frame4.jpg at ~4.0 sec (score: 25.9427)
extracted_frames/video0_frame1.jpg at ~1.0 sec (score: 25.8353)
Timestamps for top matching frames in face.mp4:
extracted_frames/video1_frame9.jpg at ~9.0 sec (score: 27.8086)
extracted_frames/video1_frame1.jpg at ~1.0 sec (score: 27.5639)
extracted_frames/video1_frame20.jpg at ~20.0 sec (score: 27.5557)
extracted_frames/video1_frame15.jpg at ~15.0 sec (score: 27.4929)
extracted_frames/video1_frame21.jpg at ~21.0 sec (score: 27.4711)


In [None]:
# Example timestamps (in seconds) and duration (e.g., 5 seconds around the key frame)
scene_segments = {
    "patient_face": [(0, 20)],  # from 10 to 15 seconds in video X
    "doctor_old_lady": [(20, 24)],  # from 25 to 30 seconds in video Y
}


In [None]:
from moviepy.editor import VideoFileClip, concatenate_videoclips

# Example videos where scenes are located
videos = {
    "patient_face": "face.mp4",
    "doctor_old_lady": "2.5.mp4"
}

# Corresponding time segments (start, end) in seconds
# Example timestamps (in seconds) and duration (e.g., 5 seconds around the key frame)
scene_segments = {
    "patient_face": [(5, 15)],  # from 5 sec to 15 sec in face.mp4
    "doctor_old_lady": [(1, 3)],  # from 1 sec to 3 sec in 2.5.mp4
}


clips_to_concatenate = []

for scene, segments in scene_segments.items():
    video_path = videos[scene]
    print(f"Processing {scene} from {video_path}")
    video = VideoFileClip(video_path)

    for start, end in segments:
        print(f"Cutting segment: {start} to {end} seconds")
        clip = video.subclip(start, end)
        clips_to_concatenate.append(clip)

# Concatenate all clips into final video
final_video = concatenate_videoclips(clips_to_concatenate)

# Export final video
final_video.write_videofile("final_compiled_video.mp4", codec="libx264", audio_codec="aac")


Processing patient_face from face.mp4
Cutting segment: 5 to 15 seconds
Processing doctor_old_lady from 2.5.mp4
Cutting segment: 1 to 3 seconds


chunk:  52%|█████▏    | 92/177 [11:39<10:45,  7.60s/it, now=None]

Moviepy - Building video final_compiled_video.mp4.
MoviePy - Writing audio in final_compiled_videoTEMP_MPY_wvf_snd.mp4



chunk:   0%|          | 0/265 [00:00<?, ?it/s, now=None][A
chunk:  17%|█▋        | 45/265 [00:00<00:00, 439.34it/s, now=None][A
chunk:  39%|███▉      | 103/265 [00:00<00:00, 516.21it/s, now=None][A
chunk:  58%|█████▊    | 155/265 [00:00<00:00, 206.34it/s, now=None][A
chunk:  80%|████████  | 212/265 [00:00<00:00, 279.62it/s, now=None][A
chunk:  52%|█████▏    | 92/177 [11:39<10:46,  7.61s/it, now=None]

MoviePy - Done.
Moviepy - Writing video final_compiled_video.mp4




t:   0%|          | 0/360 [00:00<?, ?it/s, now=None][A
t:   6%|▌         | 22/360 [00:00<00:01, 218.18it/s, now=None][A
t:  12%|█▏        | 44/360 [00:00<00:01, 201.79it/s, now=None][A
t:  18%|█▊        | 65/360 [00:00<00:01, 201.19it/s, now=None][A
t:  24%|██▍       | 86/360 [00:00<00:02, 100.95it/s, now=None][A
t:  28%|██▊       | 101/360 [00:01<00:03, 71.67it/s, now=None][A
t:  31%|███       | 112/360 [00:01<00:03, 62.94it/s, now=None][A
t:  34%|███▎      | 121/360 [00:01<00:04, 57.45it/s, now=None][A
t:  36%|███▌      | 129/360 [00:01<00:04, 52.72it/s, now=None][A
t:  38%|███▊      | 136/360 [00:01<00:04, 49.55it/s, now=None][A
t:  39%|███▉      | 142/360 [00:02<00:04, 46.32it/s, now=None][A
t:  41%|████      | 147/360 [00:02<00:04, 45.25it/s, now=None][A
t:  42%|████▏     | 152/360 [00:02<00:04, 45.97it/s, now=None][A
t:  44%|████▎     | 157/360 [00:02<00:04, 43.12it/s, now=None][A
t:  45%|████▌     | 162/360 [00:02<00:04, 40.95it/s, now=None][A
t:  46%|████▋     |

Moviepy - Done !
Moviepy - video ready final_compiled_video.mp4


# Another test

In [8]:
!pip install transformers accelerate torchvision pillow opencv-python pillow torch torchvision transformers accelerate



In [9]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import os
import cv2
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration


In [10]:
def generate_caption(image_path):
    image = Image.open(image_path).convert('RGB')
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(**inputs)

    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption


def extract_key_frames(video_path, num_frames=5):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open {video_path}")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * total_frames / num_frames) for i in range(num_frames)]

    frame_paths = []
    for idx, frame_num in enumerate(indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if ret:
            frame_path = f"extracted_frames/{os.path.basename(video_path).split('.')[0]}_frame{idx}.jpg"
            cv2.imwrite(frame_path, frame)
            frame_paths.append(frame_path)
    cap.release()
    return frame_paths



# Setup BLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Folder to save frames
os.makedirs("extracted_frames", exist_ok=True)


In [11]:
video_path_1 = "1.3.mp4"  # or "2.5.mp4" etc.
video_path_2 = "2.5.mp"
video_path_3 = "face.mp4"

video_paths = [video_path_1, video_path_2, video_path_3]

# Extract 5 key frames
frame_paths = extract_key_frames(video_path, num_frames=5)

# Caption each frame
captions = []
for frame_path in frame_paths:
    caption = generate_caption(frame_path)
    print(f"{frame_path} => {caption}")
    captions.append(caption)

# Merge and summarize
unique_captions = list(set(captions))
scene_description = " ".join(unique_captions)

print("\n🎬 Final Scene Description:")
print(scene_description)


extracted_frames/1_frame0.jpg => a blurry view of a hospital hallway
extracted_frames/1_frame1.jpg => a medical worker in scrubs his face with a mask
extracted_frames/1_frame2.jpg => a woman in scrubs is getting her bed
extracted_frames/1_frame3.jpg => a man in a hospital bed with a nurse
extracted_frames/1_frame4.jpg => a person in a hospital bed with a blood drip

🎬 Final Scene Description:
a person in a hospital bed with a blood drip a medical worker in scrubs his face with a mask a blurry view of a hospital hallway a man in a hospital bed with a nurse a woman in scrubs is getting her bed


# Another test

In [12]:
import os
import cv2
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# ---- Setup ----
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

os.makedirs("extracted_frames", exist_ok=True)

# ---- Functions ----
def extract_key_frames(video_path, num_frames=5):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open {video_path}")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * total_frames / num_frames) for i in range(num_frames)]

    frame_paths = []
    for idx, frame_num in enumerate(indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if ret:
            base = os.path.basename(video_path).split('.')[0]
            frame_path = f"extracted_frames/{base}_frame{idx}.jpg"
            cv2.imwrite(frame_path, frame)
            frame_paths.append(frame_path)
    cap.release()
    return frame_paths

def generate_caption(image_path):
    image = Image.open(image_path).convert('RGB')
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)

# ---- Video List ----
video_paths = ["1.3.mp4", "2.5.mp4", "face.mp4"]

# ---- Process Each Video ----
for video_path in video_paths:
    print(f"\n🔍 Processing video: {video_path}")

    try:
        frame_paths = extract_key_frames(video_path, num_frames=5)
    except Exception as e:
        print(f"Failed to extract frames: {e}")
        continue

    captions = []
    for frame_path in frame_paths:
        caption = generate_caption(frame_path)
        print(f"{frame_path} => {caption}")
        captions.append(caption)

    unique_captions = list(set(captions))
    scene_description = " ".join(unique_captions)

    print("\n🎬 Final Scene Description:")
    print(scene_description)



🔍 Processing video: 1.3.mp4
extracted_frames/1_frame0.jpg => a blurry view of a hospital hallway
extracted_frames/1_frame1.jpg => a medical worker in scrubs his face with a mask
extracted_frames/1_frame2.jpg => a woman in scrubs is getting her bed
extracted_frames/1_frame3.jpg => a man in a hospital bed with a nurse
extracted_frames/1_frame4.jpg => a person in a hospital bed with a blood drip

🎬 Final Scene Description:
a person in a hospital bed with a blood drip a medical worker in scrubs his face with a mask a blurry view of a hospital hallway a man in a hospital bed with a nurse a woman in scrubs is getting her bed

🔍 Processing video: 2.5.mp4
extracted_frames/2_frame0.jpg => a woman in a hospital bed with a patient
extracted_frames/2_frame1.jpg => a woman in a hospital bed with a patient
extracted_frames/2_frame2.jpg => a woman in a hospital bed with a patient
extracted_frames/2_frame3.jpg => a woman in a hospital bed with a patient
extracted_frames/2_frame4.jpg => a woman in a h

In [13]:
!pip install sentence-transformers




In [14]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')  # fast & accurate


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
# Your generated descriptions per video
video_descriptions = {
    "2.5.mp4": "a woman in a hospital bed with a patient",
    "face.mp4": "a woman in a hospital bed with a tube",
    "1.3.mp4": "a person in a hospital bed with a blood drip a medical worker in scrubs his face with a mask a blurry view of a hospital hallway a man in a hospital bed with a nurse a woman in scrubs is getting her bed"
}

# User prompt
user_prompt = "firstly i want to show how the doctor is talking to the patient, then i want to show a patient lying on the bed with a tube"

# Break user prompt into ordered tasks (manual or use NLP parser)
prompt_segments = [
    "how the doctor is talking to the patient",
    "a patient lying on the bed with a tube"
]


In [16]:
# Embed prompt segments
prompt_embeddings = model.encode(prompt_segments, convert_to_tensor=True)

# Embed video descriptions
video_names = list(video_descriptions.keys())
video_texts = list(video_descriptions.values())
video_embeddings = model.encode(video_texts, convert_to_tensor=True)

# Compute cosine similarity
import torch
matches = []
for i, prompt_emb in enumerate(prompt_embeddings):
    cosine_scores = util.cos_sim(prompt_emb, video_embeddings)[0]
    best_idx = torch.argmax(cosine_scores).item()
    best_video = video_names[best_idx]
    best_score = cosine_scores[best_idx].item()

    matches.append({
        "prompt_segment": prompt_segments[i],
        "matched_video": best_video,
        "matched_caption": video_descriptions[best_video],
        "score": best_score
    })


In [17]:
for match in matches:
    print(f"Prompt: {match['prompt_segment']}")
    print(f"Matched Video: {match['matched_video']}")
    print(f"Caption: {match['matched_caption']}")
    print(f"Similarity: {match['score']:.4f}")
    print("---")


Prompt: how the doctor is talking to the patient
Matched Video: 1.3.mp4
Caption: a person in a hospital bed with a blood drip a medical worker in scrubs his face with a mask a blurry view of a hospital hallway a man in a hospital bed with a nurse a woman in scrubs is getting her bed
Similarity: 0.4856
---
Prompt: a patient lying on the bed with a tube
Matched Video: face.mp4
Caption: a woman in a hospital bed with a tube
Similarity: 0.7587
---


In [None]:
!pip install moviepy opencv-python torch torchvision pillow sentence-transformers transformers accelerate


[0mCollecting moviepy
  Using cached moviepy-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting opencv-python
  Using cached opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting torch
  Using cached torch-2.7.1-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Using cached torchvision-0.22.1-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting pillow
  Using cached pillow-11.2.1-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (8.9 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Using cached accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting imageio<3.0,>=2.5 (from moviepy)
  Using cached imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Using cach

In [20]:
import os
import cv2
from PIL import Image
import torch
from moviepy.editor import VideoFileClip, concatenate_videoclips
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer, util

# ========== SETUP ==========
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
text_model = SentenceTransformer("all-MiniLM-L6-v2")

os.makedirs("extracted_frames", exist_ok=True)

# ========== STEP 1: Extract Key Frames ==========
def extract_key_frames(video_path, num_frames=3):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open {video_path}")
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * total_frames / num_frames) for i in range(num_frames)]
    paths = []
    for idx, frame_num in enumerate(indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if ret:
            base = os.path.basename(video_path).split('.')[0]
            path = f"extracted_frames/{base}_frame{idx}.jpg"
            cv2.imwrite(path, frame)
            paths.append(path)
    cap.release()
    return paths

# ========== STEP 2: Caption Frames ==========
def caption_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output = blip_model.generate(**inputs)
    return blip_processor.decode(output[0], skip_special_tokens=True)

# ========== STEP 3: Build Descriptions for All Videos ==========
def get_video_description(video_path, num_frames=3):
    frame_paths = extract_key_frames(video_path, num_frames)
    captions = [caption_image(fp) for fp in frame_paths]
    return " ".join(list(set(captions)))  # unique, merged

# ========== STEP 4: Match Prompt to Videos ==========
def match_prompt_segments(prompt_segments, video_descriptions):
    matches = []
    prompt_embs = text_model.encode(prompt_segments, convert_to_tensor=True)
    video_names = list(video_descriptions.keys())
    video_caps = list(video_descriptions.values())
    video_embs = text_model.encode(video_caps, convert_to_tensor=True)

    for i, p_emb in enumerate(prompt_embs):
        sim = util.cos_sim(p_emb, video_embs)[0]
        best_idx = torch.argmax(sim).item()
        matches.append({
            "prompt": prompt_segments[i],
            "matched_video": video_names[best_idx],
            "caption": video_caps[best_idx],
            "score": sim[best_idx].item()
        })
    return matches

# ========== STEP 5: Extract Segments and Concatenate ==========
def cut_and_merge(matches, duration=5, output="final_compiled_video.mp4"):
    clips = []
    for match in matches:
        video_path = match["matched_video"]
        clip = VideoFileClip(video_path)
        end = min(duration, clip.duration)
        clips.append(clip.subclip(0, end))  # from start for simplicity
    final = concatenate_videoclips(clips)
    final.write_videofile(output, codec="libx264", audio_codec="aac")

# ========== MAIN ==========
if __name__ == "__main__":
    # List of videos
    video_paths = ["1.3.mp4", "2.5.mp4", "face.mp4"]

    # Step 1–3: Get descriptions
    video_descriptions = {}
    for vp in video_paths:
        print(f"\n🔍 Generating caption for {vp}")
        try:
            desc = get_video_description(vp, num_frames=3)
            video_descriptions[vp] = desc
            print(f"📝 {vp} => {desc}")
        except Exception as e:
            print(f"⚠️ Error with {vp}: {e}")

    # User textual prompt (structure)
    prompt = "firstly i want to show how the doctor is talking to the patient, then i want to show a patient lying on the bed with a tube"
    prompt_segments = [
        "how the doctor is talking to the patient",
        "a patient lying on the bed with a tube"
    ]

    # Step 4: Match
    print("\n🔗 Matching prompt segments to videos...")
    matches = match_prompt_segments(prompt_segments, video_descriptions)

    for m in matches:
        print(f"\nPrompt: {m['prompt']}")
        print(f"Matched: {m['matched_video']}")
        print(f"Caption: {m['caption']}")
        print(f"Score: {m['score']:.4f}")

    # Step 5: Build final video
    print("\n🎬 Cutting and merging matched clips...")
    cut_and_merge(matches, duration=5)
    print("✅ Final video saved as final_compiled_video.mp4")



🔍 Generating caption for 1.3.mp4
📝 1.3.mp4 => a man in a hospital bed with a nurse a blurry view of a hospital hallway a person in scrubs a patient ' s arm

🔍 Generating caption for 2.5.mp4
📝 2.5.mp4 => a woman in a hospital bed with a patient

🔍 Generating caption for face.mp4
📝 face.mp4 => a woman in a hospital bed with a tube

🔗 Matching prompt segments to videos...

Prompt: how the doctor is talking to the patient
Matched: 1.3.mp4
Caption: a man in a hospital bed with a nurse a blurry view of a hospital hallway a person in scrubs a patient ' s arm
Score: 0.5195

Prompt: a patient lying on the bed with a tube
Matched: face.mp4
Caption: a woman in a hospital bed with a tube
Score: 0.7587

🎬 Cutting and merging matched clips...


chunk:  99%|█████████▊| 218/221 [00:35<00:00, 396.91it/s, now=None]

Moviepy - Building video final_compiled_video.mp4.
MoviePy - Writing audio in final_compiled_videoTEMP_MPY_wvf_snd.mp4



chunk:   0%|          | 0/221 [00:00<?, ?it/s, now=None][A
chunk:  37%|███▋      | 81/221 [00:00<00:00, 805.84it/s, now=None][A
chunk:  73%|███████▎  | 162/221 [00:00<00:00, 488.49it/s, now=None][A
chunk:  99%|█████████▊| 218/221 [00:00<00:00, 459.87it/s, now=None][A
chunk:  99%|█████████▊| 218/221 [00:36<00:00, 396.91it/s, now=None]

MoviePy - Done.
Moviepy - Writing video final_compiled_video.mp4




t:   0%|          | 0/300 [00:00<?, ?it/s, now=None][A
t:   2%|▏         | 7/300 [00:00<00:04, 59.80it/s, now=None][A
t:   4%|▍         | 13/300 [00:00<00:08, 34.04it/s, now=None][A
t:   6%|▌         | 17/300 [00:00<00:10, 27.54it/s, now=None][A
t:   7%|▋         | 21/300 [00:00<00:11, 24.20it/s, now=None][A
t:   8%|▊         | 24/300 [00:00<00:12, 22.45it/s, now=None][A
t:   9%|▉         | 27/300 [00:01<00:14, 18.70it/s, now=None][A
t:  10%|▉         | 29/300 [00:01<00:16, 16.91it/s, now=None][A
t:  11%|█         | 32/300 [00:01<00:14, 18.41it/s, now=None][A
t:  11%|█▏        | 34/300 [00:01<00:14, 18.67it/s, now=None][A
t:  12%|█▏        | 36/300 [00:01<00:15, 17.55it/s, now=None][A
t:  13%|█▎        | 38/300 [00:01<00:15, 16.63it/s, now=None][A
t:  13%|█▎        | 40/300 [00:01<00:15, 17.06it/s, now=None][A
t:  14%|█▍        | 43/300 [00:02<00:13, 19.18it/s, now=None][A
t:  15%|█▌        | 46/300 [00:02<00:13, 19.16it/s, now=None][A
t:  16%|█▋        | 49/300 [00:02<

Moviepy - Done !
Moviepy - video ready final_compiled_video.mp4
✅ Final video saved as final_compiled_video.mp4


# Video descriptions


In [27]:
import os
import cv2
import json
import torch
from PIL import Image
from datetime import datetime
from moviepy.editor import VideoFileClip
from transformers import BlipProcessor, BlipForConditionalGeneration

# ---- Setup ----
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

os.makedirs("extracted_frames", exist_ok=True)

# ---- Frame Extraction ----
def extract_key_frames(video_path, num_frames=5):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open {video_path}")
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * total_frames / num_frames) for i in range(num_frames)]
    frame_paths = []
    for idx, frame_num in enumerate(indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if ret:
            base = os.path.basename(video_path).split('.')[0]
            path = f"extracted_frames/{base}_frame{idx}.jpg"
            cv2.imwrite(path, frame)
            frame_paths.append(path)
    cap.release()
    return frame_paths

# ---- Caption Generation ----
def generate_caption(image_path):
    image = Image.open(image_path).convert('RGB')
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)

# ---- Utility: Extract File Info ----
def get_file_info(video_path):
    stat = os.stat(video_path)
    size = stat.st_size
    created = datetime.utcfromtimestamp(stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S UTC')
    return size, created

# ---- Generate Video Description Object ----
def describe_video(video_path, num_frames=5):
    try:
        clip = VideoFileClip(video_path)
        duration = clip.duration
        width, height = clip.size
        fps = clip.fps
        has_audio = clip.audio is not None
        video_format = video_path.split('.')[-1]
    except Exception as e:
        raise RuntimeError(f"Video metadata extraction failed: {e}")

    # Metadata
    file_name = os.path.basename(video_path)
    file_path = os.path.abspath(video_path)
    file_size, created_at = get_file_info(video_path)

    # Captions
    frame_paths = extract_key_frames(video_path, num_frames=num_frames)
    frame_captions = [generate_caption(fp) for fp in frame_paths]

    # Dummy tagging (improvement idea: use keyword extraction or CLIP)
    tags = list(set(word for cap in frame_captions for word in cap.lower().split() if word.isalpha()))
    tags = tags[:5]  # limit

    # Build structured JSON
    json_obj = {
        "type": "video",
        "metadata": {
            "fileName": file_name,
            "filePath": file_path,
            "fileSize": file_size,
            "createdAt": created_at,
            "description": " ".join(set(frame_captions[:2])),
            "tags": tags
        },
        "duration": duration,
        "resolution": {"width": width, "height": height},
        "frameRate": fps,
        "hasAudio": has_audio,
        "videoFormat": video_format,
        "contentAnalysis": {
            "contentOverview": " ".join(set(frame_captions)),
            "actionIntroduction": frame_captions[0],
            "timeBoundDetails": [
                {
                    "detailStartTime": round(i * (duration / num_frames), 2),
                    "detailEndTime": round((i + 1) * (duration / num_frames), 2),
                    "detailDescription": frame_captions[i],
                    "detailConfidence": round(0.8 + 0.02 * (num_frames - i) / num_frames, 2)  # fake confidence
                }
                for i in range(len(frame_captions))
            ],
            "detectedObjects": tags[:5],
            "detectedScenes": list(set(["indoor" if "room" in c or "bed" in c else "outdoor" for c in frame_captions])),
            "estimatedMood": "neutral"
        }
    }

    return json_obj


In [30]:
video_paths = ["Chocolate2.MP4", "Chocolate3.MP4", "Business_center.MP4"]

all_descriptions = []
for path in video_paths:
    print(f"Processing: {path}")
    try:
        desc = describe_video(path)
        all_descriptions.append(desc)
        with open(f"{os.path.splitext(os.path.basename(path))[0]}_description.json", "w") as f:
            json.dump(desc, f, indent=2)
    except Exception as e:
        print(f"Failed: {e}")


Processing: Chocolate2.MP4
Processing: Chocolate3.MP4
Processing: Business_center.MP4


# Try with audio

In [24]:
!pip install openai-whisper git+https://github.com/openai/whisper moviepy


Collecting git+https://github.com/openai/whisper
  Cloning https://github.com/openai/whisper to /tmp/pip-req-build-qhpidnsb
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper /tmp/pip-req-build-qhpidnsb
  Resolved https://github.com/openai/whisper to commit dd985ac4b90cafeef8712f2998d62c59c3e62d22
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [



In [25]:
import os
import cv2
import whisper
import torch
import json
from PIL import Image
from moviepy.editor import VideoFileClip
from transformers import BlipProcessor, BlipForConditionalGeneration
from datetime import datetime

# Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs("extracted_frames", exist_ok=True)

# Load models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
whisper_model = whisper.load_model("base")  # or 'small'/'medium'

# --------- Frame Extraction ----------
def extract_key_frames(video_path, num_frames=5):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * total / num_frames) for i in range(num_frames)]
    paths = []
    for idx, f in enumerate(indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, f)
        ret, frame = cap.read()
        if ret:
            name = os.path.splitext(os.path.basename(video_path))[0]
            path = f"extracted_frames/{name}_frame{idx}.jpg"
            cv2.imwrite(path, frame)
            paths.append(path)
    cap.release()
    return paths

# --------- Visual Captioning ----------
def generate_caption(img_path):
    image = Image.open(img_path).convert("RGB")
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = blip_model.generate(**inputs)
    return blip_processor.decode(out[0], skip_special_tokens=True)

# --------- Audio Transcription ----------
def transcribe_audio(video_path):
    temp_audio = "temp_audio.wav"
    video = VideoFileClip(video_path)
    video.audio.write_audiofile(temp_audio, verbose=False, logger=None)
    result = whisper_model.transcribe(temp_audio)
    os.remove(temp_audio)
    return result["text"], result.get("segments", [])

# --------- File Metadata ----------
def get_file_info(path):
    stat = os.stat(path)
    return stat.st_size, datetime.utcfromtimestamp(stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S UTC')

# --------- Combined Description ----------
def describe_video_multimodal(video_path, num_frames=5):
    # File metadata
    file_name = os.path.basename(video_path)
    file_path = os.path.abspath(video_path)
    file_size, created_at = get_file_info(video_path)
    clip = VideoFileClip(video_path)
    width, height = clip.size
    duration = clip.duration
    fps = clip.fps
    has_audio = clip.audio is not None
    fmt = file_name.split(".")[-1]

    # Visual description
    frame_paths = extract_key_frames(video_path, num_frames)
    visual_captions = [generate_caption(p) for p in frame_paths]

    # Audio transcription
    audio_text, audio_segments = transcribe_audio(video_path) if has_audio else ("", [])

    # Merge content
    combined_desc = ". ".join(set(visual_captions)) + (". Spoken: " + audio_text if audio_text else "")

    return {
        "type": "video",
        "metadata": {
            "fileName": file_name,
            "filePath": file_path,
            "fileSize": file_size,
            "createdAt": created_at,
            "description": combined_desc[:150] + "...",
            "tags": list(set(word for cap in visual_captions for word in cap.lower().split() if word.isalpha()))[:5]
        },
        "duration": duration,
        "resolution": {"width": width, "height": height},
        "frameRate": fps,
        "hasAudio": has_audio,
        "videoFormat": fmt,
        "contentAnalysis": {
            "contentOverview": combined_desc,
            "actionIntroduction": visual_captions[0],
            "visualCaptions": visual_captions,
            "audioTranscript": audio_text,
            "audioSegments": audio_segments,
            "estimatedMood": "neutral"
        }
    }


100%|███████████████████████████████████████| 139M/139M [00:04<00:00, 31.0MiB/s]


In [26]:
video_paths = ["1.3.mp4", "2.5.mp4", "face.mp4"]

for path in video_paths:
    print(f"\nProcessing: {path}")
    try:
        result = describe_video_multimodal(path)
        with open(f"{os.path.splitext(os.path.basename(path))[0]}_multimodal_description.json", "w") as f:
            json.dump(result, f, indent=2)
        print("✅ Done:", result['metadata']['description'])
    except Exception as e:
        print("❌ Error:", e)



Processing: 1.3.mp4
✅ Done: a person in a hospital bed with a blood drip. a medical worker in scrubs his face with a mask. a blurry view of a hospital hallway. a man in a hospita...

Processing: 2.5.mp4
✅ Done: a woman in a hospital bed with a patient...

Processing: face.mp4
✅ Done: a woman in a hospital bed with a tube. Spoken:  Там же велосипеда. А вот этоendek<|ru|>...
