
To generate a Python script from a Jupyter notebook, use the following command:

    `jupyter nbconvert --to script your_notebook.ipynb`


In [None]:
from transformers import (
    pipeline, AutoProcessor,
    AutoModelForImageTextToText, AutoTokenizer, LlavaForConditionalGeneration,
    LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration,
)
from PIL import Image
import requests
import os
import torch

import av
import numpy as np
from huggingface_hub import hf_hub_download
import sentencepiece


In [None]:
# Report whether CUDA is visible to torch, then pin the computation to the CPU.
# To pick the GPU automatically instead, use:
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cuda_status = "cuda available" if torch.cuda.is_available() else "cuda not available"
print(cuda_status)
device = torch.device("cpu")
print(f"device: {device}")

In [None]:

# Identifier of the checkpoint that the model and processor cells load via
# `from_pretrained`. (The local model directory is defined in a separate cell.)
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

In [None]:
# Directory in which a local copy of the model is expected.
local_model_path = os.path.expanduser("~/myProject_LLM/model/")

# The local copy counts as present only when every one of these files exists.
model_files = ["config.json", "pytorch_model.bin"]
required_paths = [os.path.join(local_model_path, name) for name in model_files]
model_exists = all(map(os.path.exists, required_paths))
print(f"Model files exist: {model_exists}")

In [None]:
# Load the checkpoint named by `model_id` with float16 weights, then place the
# model on `device`.
# NOTE(review): `device` is set to CPU elsewhere in this notebook; float16 on a
# CPU may be slow or hit unsupported ops — confirm this dtype is intended.
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
model = model.to(device)


In [None]:
# Load the processor from the same checkpoint id as the model.
# `use_fast=False` is passed through to `from_pretrained` (presumably selecting
# the non-fast tokenizer/image-processor implementation — confirm in the
# transformers documentation).
processor = LlavaNextVideoProcessor.from_pretrained(model_id, use_fast=False)

In [None]:
# Local media files (three camera clips and one still image) that can be
# described. A hosted sample clip could be fetched instead with:
#   hf_hub_download(repo_id="raushan-testing-hf/videos-test",
#                   filename="sample_demo_1.mp4", repo_type="dataset")
video_path1, video_path2, video_path3, image_path = map(
    os.path.expanduser,
    (
        "~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-26-315.mp4",
        "~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-54-126.mp4",
        "~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-08-26-159.mp4",
        "~/myProject_LLM/myDocs/picture.jpg",
    ),
)

# The media file (image or video) that the following cells describe.
file_path = video_path1

In [None]:
# Supported extensions for images and videos
image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tiff"}
video_extensions = {".mp4", ".avi", ".mov", ".mkv"}

# Classify the selected file by its lower-cased extension.
ext = os.path.splitext(file_path)[1].lower()
if ext in image_extensions:
    media_type = "image"
elif ext in video_extensions:
    media_type = "video"
else:
    media_type = None

# Content of the single user turn: a list of dicts, each tagged with a "type"
# ("text", "image" or "video"). The text instruction always comes first.
user_content = [
    {"type": "text", "text": "Beschreibe das Media auf Deutsch mit folgender Struktur: 1. Hauptmotiv, 2. Hintergrund, 3. Bewegungen, 4. Anzahl unterschiedliche Personen, 5. Ist es ein Einbruch?"}
]

# Attach the media file when its type is recognised; otherwise only report it
# and leave the conversation text-only.
if media_type is None:
    print(f"Unsupported file type: {file_path}")
else:
    user_content.append({"type": media_type, "path": file_path})

# Chat history that is later passed to `processor.apply_chat_template`.
conversation = [{"role": "user", "content": user_content}]


In [None]:
# Render the conversation through the processor's chat template with the
# generation prompt appended (no tokenize/return_tensors arguments here).
# No later visible cell reads `prompt`; it is useful for inspecting the
# formatted prompt.
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

In [None]:
# Build the model inputs from the conversation in one step: apply the chat
# template, tokenize, and return PyTorch tensors as a dict-like batch.
# `num_frames=8` is forwarded to the processor (the notebook samples 8 frames
# per video; raise it for longer clips).
# The batch is moved to the model's device so that `model.generate` does not
# fail with a device mismatch when the model runs on something other than CPU.
inputs = processor.apply_chat_template(
    conversation,
    num_frames=8,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

In [None]:
# Run generation on the prepared inputs, producing at most 300 new tokens.
output = model.generate(**inputs, max_new_tokens=300)

In [None]:
# Turn the generated token ids back into text, dropping special tokens and
# cleaning up tokenization spaces.
decoded_output = processor.batch_decode(
    output,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

# Show each decoded sequence on its own line.
for generated_text in decoded_output:
    print(generated_text)

In [1]:
import os


In [2]:
from moviepy.editor import VideoFileClip


In [7]:
def reduce_frames(input_video_path, output_video_path, target_fps, target_codec):
    """Re-encode a video at a lower frame rate.

    Args:
        input_video_path: Path of the video file to read.
        output_video_path: Path the resampled video is written to.
        target_fps: Frame rate of the output video.
        target_codec: Video codec passed to ``write_videofile`` (e.g. "mpeg4").
            Audio is always written with the "aac" codec.

    Returns:
        Tuple ``(input_fps, output_fps)``: the frame rate reported by the
        source clip and by the resampled clip.

    Raises:
        Whatever the clip operations raise. Both clips are closed before the
        exception propagates.
    """
    # Load the video file
    video = VideoFileClip(input_video_path)
    # Bound before the try block so the cleanup below never hits an unbound
    # name when resampling fails (which would mask the real error).
    video_reduced = None

    try:
        video_fps = video.fps
        print(f"Input FPS: {video_fps}")

        # Resample the video to the target FPS
        video_reduced = video.set_fps(target_fps)
        video_reduced_fps = video_reduced.fps
        print(f"Reduced FPS: {video_reduced_fps}")

        # Write the output video file with the specified FPS
        video_reduced.write_videofile(output_video_path, codec=target_codec, audio_codec="aac", fps=target_fps)

        print(f"Output video saved to: {output_video_path}")
    finally:
        # Ensure resources are properly released, even on failure
        video.close()
        if video_reduced is not None:
            video_reduced.close()

    return video_fps, video_reduced_fps

# Example: re-encode the first camera clip at 1 frame per second with the
# "mpeg4" codec and report the frame rate before and after.
target_fps = 1
target_codec = "mpeg4"
input_video_path1 = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-26-315.mp4")
output_video_path1 = os.path.expanduser("~/myProject_LLM/myDocs/RED1_BoschSmartCameras_11-04-2025_21-05-26-315.mp4")

in_fps, out_fps = reduce_frames(
    input_video_path=input_video_path1,
    output_video_path=output_video_path1,
    target_fps=target_fps,
    target_codec=target_codec,
)
print(f"FPS : {in_fps}, {out_fps}")

Input FPS: 29.97002997002997
Reduced FPS: 1
Moviepy - Building video /home/gabriel/myProject_LLM/myDocs/RED1_BoschSmartCameras_11-04-2025_21-05-26-315.mp4.
MoviePy - Writing audio in RED1_BoschSmartCameras_11-04-2025_21-05-26-315TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /home/gabriel/myProject_LLM/myDocs/RED1_BoschSmartCameras_11-04-2025_21-05-26-315.mp4



                                                            

Moviepy - Done !
Moviepy - video ready /home/gabriel/myProject_LLM/myDocs/RED1_BoschSmartCameras_11-04-2025_21-05-26-315.mp4
Output video saved to: /home/gabriel/myProject_LLM/myDocs/RED1_BoschSmartCameras_11-04-2025_21-05-26-315.mp4
FPS : 29.97002997002997, 1




In [None]:
import cv2
import os
import numpy as np
from skimage.metrics import structural_similarity as compare_ssim


video_path1 = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-26-315.mp4")
video_path2 = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-54-126.mp4")
video_path3 = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-08-26-159.mp4")

input_video_path = video_path3
output_frame_path = os.path.expanduser("~/myProject_LLM/myDocs/frames/")

# Sensitivity (tune this!): percentage of pixels that must differ from the
# previous frame for a frame to be considered relevant.
threshold = 30

frame_diffs = []  # not populated in this cell
prev_frame = None
relevant_frames = []
frame_id = 0

# --- Pass 1: collect the indices of frames that changed enough ---
cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    # Without this check a missing/unreadable video silently yields no frames.
    raise IOError(f"Could not open video: {input_video_path}")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Compare blurred grayscale frames to reduce sensitivity to pixel noise.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        gray = cv2.GaussianBlur(gray, (5, 5), 0)

        if prev_frame is not None:
            diff = cv2.absdiff(gray, prev_frame)
            non_zero_count = np.count_nonzero(diff)

            # Relevant when more than `threshold` percent of the pixels changed.
            if non_zero_count > threshold * gray.size / 100:
                relevant_frames.append(frame_id)

        prev_frame = gray
        frame_id += 1
finally:
    cap.release()

# Print the list built above (the previously printed name `frame_indices` is
# not defined anywhere in this cell).
print("Relevant frame indices:", relevant_frames)

# --- Pass 2: write the relevant frames to the output directory ---
os.makedirs(output_frame_path, exist_ok=True)

# Set lookup keeps the per-frame membership test constant-time.
relevant_frame_set = set(relevant_frames)

# Re-open the video and walk through it again from the first frame.
cap = cv2.VideoCapture(input_video_path)
frame_id = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_id in relevant_frame_set:
            filename = os.path.join(output_frame_path, f"frame_{frame_id}.jpg")
            # cv2.imwrite reports failure through its return value.
            if cv2.imwrite(filename, frame):
                print(f"Gespeichert: {filename}")
            else:
                print(f"Nicht gespeichert (Schreibfehler): {filename}")

        frame_id += 1
finally:
    cap.release()



Relevant frame indices: [44, 61, 121, 181, 241, 298, 299, 300, 301, 337, 338, 339, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 365, 366, 367, 368, 369, 370, 374, 376, 378, 380, 384, 386, 387, 388, 389, 390, 391, 421, 481, 526, 528, 541, 601, 661, 721, 781, 841, 901, 961, 1021, 1081, 1141, 1201, 1261, 1321, 1381]
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_44.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_61.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_121.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_181.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_241.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_298.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_299.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_300.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_301.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDoc

In [9]:
import cv2
import os
import numpy as np
from skimage.metrics import structural_similarity as compare_ssim


video_path1 = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-26-315.mp4")
video_path2 = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-54-126.mp4")
video_path3 = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-08-26-159.mp4")

input_video_path = video_path3
output_frame_path = os.path.expanduser("~/myProject_LLM/myDocs/frames/")

# A frame is relevant when its SSIM score against the previous frame falls
# below this value (1.0 would mean the two frames are identical).
ssim_threshold = 0.95

frame_diffs = []  # not populated in this cell
prev_frame = None
relevant_frames = []
frame_id = 0

# --- Pass 1: collect the indices of frames that differ structurally ---
cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    # Without this check a missing/unreadable video silently yields no frames.
    raise IOError(f"Could not open video: {input_video_path}")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Compare blurred grayscale frames to reduce sensitivity to pixel noise.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        gray = cv2.GaussianBlur(gray, (5, 5), 0)

        if prev_frame is not None:
            # SSIM between the previous and the current frame; the second
            # return value (requested by full=True) is not used here.
            score, _ = compare_ssim(prev_frame, gray, full=True)

            if score < ssim_threshold:
                relevant_frames.append(frame_id)

        prev_frame = gray
        frame_id += 1
finally:
    cap.release()

print("Relevante Frames (SSIM < {:.2f}):".format(ssim_threshold), relevant_frames)

# --- Pass 2: write the relevant frames to the output directory ---
os.makedirs(output_frame_path, exist_ok=True)

# Set lookup keeps the per-frame membership test constant-time.
relevant_frame_set = set(relevant_frames)

# Re-open the video and walk through it again from the first frame.
cap = cv2.VideoCapture(input_video_path)
frame_id = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_id in relevant_frame_set:
            filename = os.path.join(output_frame_path, f"frame_{frame_id}.jpg")
            # cv2.imwrite reports failure through its return value.
            if cv2.imwrite(filename, frame):
                print(f"Gespeichert: {filename}")
            else:
                print(f"Nicht gespeichert (Schreibfehler): {filename}")

        frame_id += 1
finally:
    cap.release()


Relevante Frames (SSIM < 0.95): [275, 298, 299, 337, 356]
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_275.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_298.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_299.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_337.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_356.jpg


Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_60.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_120.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_166.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_226.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_286.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_346.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_406.jpg
Gespeichert: /home/gabriel/myProject_LLM/myDocs/frames/frame_466.jpg
