# In [None]:
import cv2
import numpy as np
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, decode_predictions
from transformers import BertTokenizer, BertModel
import torch
from moviepy.editor import VideoFileClip

# Load the pretrained models once, at import time: an ImageNet classifier
# (MobileNetV2 with its classification head) and a BERT tokenizer/encoder
# pair that share the same checkpoint name.
_BERT_CHECKPOINT = 'bert-base-uncased'

model_mobilenet = MobileNetV2(include_top=True, weights='imagenet')
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model_bert = BertModel.from_pretrained(_BERT_CHECKPOINT)

def generate_captions(video_path):
    """Label every frame of a video with its top-1 ImageNet class name.

    Args:
        video_path: Path of the video file, opened with ``cv2.VideoCapture``.

    Returns:
        A list holding one class-name string per decoded frame, in frame
        order. The list is empty when no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    captions = []

    # try/finally so the capture handle is released even if inference raises.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes frames in BGR channel order, while the ImageNet
            # weights expect RGB input; swap the channels before preprocessing.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Resize to the 224x224 input size and add a batch dimension.
            frame = cv2.resize(frame, (224, 224))
            frame_array = preprocess_input(frame)
            frame_array = np.expand_dims(frame_array, axis=0)

            # Keep only the class name of the single best prediction.
            preds = model_mobilenet.predict(frame_array)
            decoded = decode_predictions(preds, top=1)[0][0][1]
            captions.append(decoded)
    finally:
        cap.release()
    return captions

def calculate_similarity(captions, description, threshold=0.8):
    """Return the indices of captions that are similar to a description.

    Each text is embedded with the module-level BERT model by taking the
    hidden state at the first token position, and a caption matches when the
    cosine similarity between its embedding and the description's embedding
    is strictly greater than ``threshold``.

    Args:
        captions: Sequence of caption strings, one per frame.
        description: Text describing the activity to look for.
        threshold: Minimum (exclusive) cosine similarity for a match.
            Defaults to 0.8.

    Returns:
        A list of the positions in ``captions`` whose caption matched, in
        increasing order.
    """
    def embed(text):
        # Hidden state at position 0 of the encoded sequence.
        encoded = tokenizer(text, return_tensors='pt')
        return model_bert(**encoded).last_hidden_state[:, 0, :]

    matching_indices = []
    # Consecutive frames usually share a caption; remember the verdict per
    # distinct caption so each one goes through BERT only once.
    verdict_by_caption = {}

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        description_embedding = embed(description)

        for i, caption in enumerate(captions):
            if caption not in verdict_by_caption:
                cos_sim = torch.nn.functional.cosine_similarity(
                    description_embedding, embed(caption)
                )
                verdict_by_caption[caption] = cos_sim.item() > threshold
            if verdict_by_caption[caption]:
                matching_indices.append(i)

    return matching_indices

def clip_video(video_path, output_path, frame_indices, fps=30):
    """Write the part of a video spanned by the given frame indices.

    The clip starts at the earliest index and ends just after the latest
    one; frames in between are kept even if their indices are not listed.

    Args:
        video_path: Path of the source video.
        output_path: Path of the video file to write (encoded with libx264).
        frame_indices: Non-empty sequence of zero-based frame indices.
        fps: Frame rate used to convert frame indices to seconds.

    Raises:
        ValueError: If ``frame_indices`` is empty or ``fps`` is not positive.
    """
    if len(frame_indices) == 0:
        raise ValueError("frame_indices must contain at least one frame index")
    if fps <= 0:
        raise ValueError("fps must be a positive number")

    # min/max rather than first/last so unsorted input still yields a valid span.
    start_time = min(frame_indices) / fps
    end_time = (max(frame_indices) + 1) / fps

    source = VideoFileClip(video_path)
    try:
        # The "+ 1 frame" end can land slightly past the end of the file.
        if source.duration is not None:
            end_time = min(end_time, source.duration)
        video = source.subclip(start_time, end_time)
        video.write_videofile(output_path, codec='libx264')
    finally:
        # Release the underlying reader even when writing fails.
        source.close()

def _probe_fps(video_path):
    """Return the frame rate OpenCV reports for a video, or None if unknown."""
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
    finally:
        cap.release()
    # A non-positive (or NaN) value means the rate could not be determined.
    return fps if fps > 0 else None


def process_video(video_path, description, output_path):
    """Caption a video, find frames matching a description, and clip them.

    Runs ``generate_captions`` on the video, selects matching frames with
    ``calculate_similarity``, and, when any frame matched, writes the span
    from the first to the last matching frame with ``clip_video``. Progress
    messages are printed along the way.

    Args:
        video_path: Path of the input video.
        description: Text describing the activity to look for.
        output_path: Path where the clipped video is written.
    """
    print("Generating captions...")
    captions = generate_captions(video_path)
    print("Calculating similarities...")
    matching_indices = calculate_similarity(captions, description)

    if not matching_indices:
        print("No matching activities found.")
        return

    print("Clipping video...")
    # Frame indices only map to the right timestamps when converted with the
    # video's real frame rate; fall back to clip_video's default if unknown.
    fps = _probe_fps(video_path)
    if fps is None:
        clip_video(video_path, output_path, matching_indices)
    else:
        clip_video(video_path, output_path, matching_indices, fps=fps)
    print(f"Video successfully saved to {output_path}")

# Example usage. The call is guarded so that importing this file does not
# start processing a video as a side effect; running it as a script (or in a
# notebook, where __name__ is "__main__") behaves as before.
video_path = 'input_video.mp4'
activity_description = 'dog playing'
output_video_path = 'output_video.mp4'
if __name__ == "__main__":
    process_video(video_path, activity_description, output_video_path)
