In [1]:
import cv2
import os
import time
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration

# Checkpoint identifiers and target device, gathered in one place so they are easy to swap
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
T5_CHECKPOINT = "t5-small"
DEVICE = "cpu"

# Image captioning: the processor prepares images and decodes generated tokens,
# the model produces the caption token ids
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model_img_captioning = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
model_img_captioning = model_img_captioning.to(DEVICE)

# Summarization: tokenizer and model used to condense the per-frame captions
tokenizer_t5 = T5Tokenizer.from_pretrained(T5_CHECKPOINT)
model_t5 = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)
model_t5 = model_t5.to(DEVICE)


def generate_caption(pil_image):
    """Caption a single image with the module-level BLIP processor and model.

    Args:
        pil_image: The image to caption; it is handed to ``processor`` unchanged.

    Returns:
        The decoded caption (special tokens stripped), or the sentinel string
        "Unable to process image." if any step raises. The error is printed
        rather than propagated, so callers must compare against the sentinel.
    """
    fallback = "Unable to process image."
    try:
        encoded = processor(pil_image, return_tensors="pt")
        # Only the first generated sequence is decoded
        first_sequence = model_img_captioning.generate(**encoded)[0]
        return processor.decode(first_sequence, skip_special_tokens=True)
    except Exception as err:
        print(f"Error generating caption: {err}")
    return fallback


# Path to the folder containing videos
folder_path = '/Users/kristinakuznetsova/Downloads/UBnormal/Scene2'

# Define path to store frames (modify this path to your desired location)
frames_folder = "/Users/kristinakuznetsova/Downloads/frames"
os.makedirs(frames_folder, exist_ok=True)

# Get list of video files in the folder.
# - The extensions include the leading dot, so a name that merely ends in the
#   letters "mp4" (e.g. "notes_mp4") is not mistaken for a video.
# - Matching is case-insensitive so "CLIP.MP4" is picked up as well.
# - Directories are skipped, and the result is sorted because os.listdir()
#   returns entries in arbitrary order; sorting keeps runs reproducible.
video_extensions = ('.mp4', '.avi', '.mkv')
video_files = sorted(
    f for f in os.listdir(folder_path)
    if f.lower().endswith(video_extensions)
    and os.path.isfile(os.path.join(folder_path, f))
)

# Number of frames to skip between samples when the container does not report
# a usable frame rate (roughly one second at a typical 30 fps).
FALLBACK_SAMPLE_INTERVAL = 30


def _frames_between_samples(fps):
    """Return how many frames to advance between captioned frames.

    The aim is about one sampled frame per second of video.

    Args:
        fps: Frame rate as reported by ``cap.get(cv2.CAP_PROP_FPS)``.

    Returns:
        A positive integer. ``int(fps)`` for normal frame rates, 1 when the
        frame rate is below one frame per second, and
        ``FALLBACK_SAMPLE_INTERVAL`` when the reported value is unusable
        (zero, negative, NaN or infinite). The result is never 0, so it is
        always safe as the right-hand side of ``%``.
    """
    if not (0 < fps < float("inf")):  # also False for NaN
        return FALLBACK_SAMPLE_INTERVAL
    return max(1, int(fps))


# Process and generate captions for each video
all_video_captions = []
for video_file in video_files:
    video_path = os.path.join(folder_path, video_file)
    cap = cv2.VideoCapture(video_path)
    video_captions = []
    try:
        if not cap.isOpened():
            print(f"Could not open video, skipping: '{video_file}'")
            continue

        # Sample roughly one frame per second
        sample_interval = _frames_between_samples(cap.get(cv2.CAP_PROP_FPS))
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % sample_interval == 0:
                # OpenCV delivers BGR; convert to RGB before building the PIL image
                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                caption = generate_caption(pil_image)
                # generate_caption() signals failure with this sentinel string
                if caption != "Unable to process image.":
                    video_captions.append(caption)
                    print(f"{caption} - '{video_file}'")

                    # Save the frame with a filename based on video and frame number
                    frame_filename = f"{os.path.splitext(video_file)[0]}_{frame_count}.jpg"
                    frame_path = os.path.join(frames_folder, frame_filename)
                    # imwrite reports failure through its return value, not an exception
                    if not cv2.imwrite(frame_path, frame):
                        print(f"Could not save frame to '{frame_path}'")

            # Count every decoded frame, sampled or not
            frame_count += 1
    finally:
        # Always free the capture handle, even if captioning raised
        cap.release()

    # Generate summary from the collected captions for this video
    if video_captions:
        # De-duplicate while keeping first-seen order; a set would reorder the
        # captions differently between kernel runs and make the summary unstable.
        unique_captions = list(dict.fromkeys(video_captions))
        input_text = "summarize: " + " ".join(unique_captions)
        inputs = tokenizer_t5(input_text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = model_t5.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
        all_video_captions.append(f"{video_file}: {summary}")
        print(f"Summary for {video_file}: {summary}")

# Print all summaries
print("\nAll video summaries:")
for video_summary in all_video_captions:
    print(video_summary)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


a street with a white line on the road - 'normal_scene_2_scenario_5.mp4'




a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a person walking down a street in a city - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_scene_2_scenario_5.mp4'
a street with a white line on the road - 'normal_scene_2_scenario_5.mp4'
a street with a white line on the road - 'normal_scene_2_scenario_5.mp4'
a street with a white line on the road - 'normal_scene_2_scenario_5.mp4'
a street with a person walking down it - 'normal_

In [10]:
# Show the video filenames collected by the directory listing in the first cell.
# NOTE(review): the saved output below names a different video than the captions
# printed under the first cell, so the cells were presumably executed against
# different folders — restart the kernel and run all cells to confirm.
video_files

['abnormal_scene_4_scenario_1_fire.mp4']

In [11]:
# Show the collected "<video file>: <summary>" strings, one entry per video
# for which at least one caption was produced.
all_video_captions

['abnormal_scene_4_scenario_1_fire.mp4: a fire is burning in a room with couches a fire burning in a room with couches and chairs a fire burning in a room with couches and chairs a fire burning in a room with couches and chairs a fire burning in the middle of a room.']