In [1]:
import io
import os

import requests
import torch
from PIL import Image
from transformers import pipeline, AutoProcessor, AutoModelForImageTextToText, AutoTokenizer, LlavaForConditionalGeneration

In [2]:

# Model ID handed to `from_pretrained` below, and a local directory intended for
# storing the model ("~" is expanded to the user's home directory).
# NOTE(review): local_model_path is not referenced by any cell shown here — either
# pass it on to the loader or remove it; confirm the intent.
model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
local_model_path = os.path.expanduser("~/myProject_LLM/model/")

In [5]:
# Load the checkpoint in half precision, then move the model to device 0.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
model = model.to(0)

# The matching processor comes from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_id)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
# Chat history in the structure `apply_chat_template` consumes: one entry per turn,
# whose "content" is a list of parts, each a dict with a "type" of "text" or "image".
conversation = [
    dict(
        role="user",
        content=[
            dict(type="text", text="What are these?"),
            dict(type="image"),
        ],
    ),
]

In [7]:
# Turn the chat history into the prompt that is later passed to the processor as `text`.
# add_generation_prompt=True is requested so the prompt ends where the assistant's
# reply begins (the decoded outputs below show "assistant" right after the question).
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

In [8]:
# URL of the sample image (COCO val2017) used for the first test run.
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"

In [9]:
# Download the sample image. The timeout keeps the cell from hanging indefinitely,
# and raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
image_response = requests.get(image_file, stream=True, timeout=30)
image_response.raise_for_status()
raw_image = Image.open(image_response.raw)
# Preprocess image and prompt together, then move the tensors to device 0 in
# float16, the dtype the model was loaded with.
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

In [10]:
# Deterministic decoding (do_sample=False), capped at 200 newly generated tokens.
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
# output[0][2:] drops the first two token IDs before decoding. The decoded text still
# contains the question ahead of the model's answer, as the cell output shows.
print(processor.decode(output[0][2:], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.




What are these?
assistant
These are two cats, one on the left and one on the right. They are lying on a pink blanket, which is placed on a couch. The cat on the left is sleeping, while the one on the right is resting.


In [11]:
# Path of the image to describe ("~" is expanded to the user's home directory).
# The loading cell below treats a value that is not an existing file as a URL.
image_path = os.path.expanduser("~/myProject_LLM/myDocs/picture.jpg")

In [12]:
# Load the image from a local path or, if no such file exists, from a URL.
# On any failure a message is printed and `image` is set to None so the
# inference cell that follows can skip its work.
def _load_verified_image(source):
    """Verify that `source` holds an image and return a freshly opened PIL image.

    `source` is either a filesystem path or the raw image bytes. `Image.verify()`
    leaves the image object unusable, so the data is opened twice: once for the
    check (closed right away) and once for the handle that is returned.
    """
    def _fresh_source():
        # Bytes get a new in-memory stream per open; a path can be reused as is.
        return io.BytesIO(source) if isinstance(source, bytes) else source

    with Image.open(_fresh_source()) as probe:
        probe.verify()  # Verify that the data is an image
    return Image.open(_fresh_source())


if os.path.exists(image_path):  # Local file
    try:
        image = _load_verified_image(image_path)
    except Exception as e:
        print(f"Error loading image: {e}")
        image = None
else:  # Assume it's a URL
    try:
        # The timeout prevents the cell from blocking forever on a dead server.
        response = requests.get(image_path, timeout=30)
        if response.status_code == 200:
            try:
                # Buffer the whole body: `response.raw` is a one-shot stream and
                # cannot be opened a second time once verify() has consumed it.
                image = _load_verified_image(response.content)
            except Exception as e:
                print(f"Error loading image: {e}")
                image = None
        else:
            print(f"Failed to fetch image. HTTP status code: {response.status_code}")
            image = None
    except requests.exceptions.MissingSchema:
        print(f"Invalid URL: {image_path}. Please provide a valid URL or local file path.")
        image = None
    except requests.exceptions.RequestException as e:
        # Connection errors, timeouts, malformed URLs, etc.
        print(f"Failed to fetch image: {e}")
        image = None

In [13]:
# Describe the loaded image with the model; skipped when loading failed earlier.
if image is not None:
    try:
        # Same pipeline as for the sample image: preprocess, generate, decode.
        inputs = processor(images=image, text=prompt, return_tensors='pt').to(0, torch.float16)
        output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
        print(processor.decode(output[0][2:], skip_special_tokens=True))
    except Exception as e:
        print(f"Error during model inference: {e}")
else:
    print("Image is not loaded properly.")


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.




What are these?
assistant
These are three young tigers, also known as cubs, lying down on the ground. They are in a natural setting, possibly a forest or a savanna, with a dirt path in the background. The cubs are close to each other, and they appear to be enjoying each other's company.


In [14]:
# Path of the video to describe ("~" is expanded to the user's home directory).
video_path = os.path.expanduser("~/myProject_LLM/myDocs/BoschSmartCameras_11-04-2025_21-05-26-315.mp4")

In [15]:
import torchvision.transforms as transforms
import cv2

In [16]:
# Function to perform inference on a video
def infer_video(model, video_path, frame_skip=10):
    """Describe every `frame_skip`-th frame of a video with the vision-language model.

    Frames are read sequentially with OpenCV until the reader reports that no
    further frame is available. The reported frame count is deliberately not used
    as the loop bound, so a missing or wrong count cannot cut the video short.
    Frames whose zero-based index is a multiple of `frame_skip` are converted to
    RGB PIL images and run through the notebook-level `processor` (together with
    the notebook-level `prompt`) and `model.generate`.

    Args:
        model: Model exposing `generate(**inputs, ...)`. The inputs are moved to
            device 0 in float16 before generation.
        video_path: Path of the video, passed to `cv2.VideoCapture`.
        frame_skip: Sampling stride; must be at least 1.

    Returns:
        Tuple `(frames, results)`: the sampled PIL images and, in the same order,
        the decoded text produced for each of them.

    Raises:
        ValueError: If `frame_skip` is smaller than 1.
        IOError: If the video cannot be opened.
    """
    # Reject a bad stride up front instead of failing mid-loop on a modulo by zero.
    if frame_skip < 1:
        raise ValueError(f"frame_skip must be >= 1, got {frame_skip}")

    # Open the video; `cap` gives access to its frames.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    results = []
    frames = []  # Sampled frames, kept so the caller can inspect them
    try:
        index = 0
        while True:
            # cap.read() returns (ret, frame): ret is False once no frame could be
            # read, otherwise frame holds the image data in OpenCV's BGR order.
            ret, frame = cap.read()
            if not ret:
                break
            if index % frame_skip == 0:
                # Convert BGR -> RGB and wrap the array in a PIL image for the processor.
                frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                frames.append(frame)
                print(f"Selected {len(frames)} frames for processing.")
                inputs = processor(images=frame, text=prompt, return_tensors='pt').to(0, torch.float16)
                output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
                # The first two token IDs are dropped before decoding, as in the
                # single-image cells of this notebook.
                results.append(processor.decode(output[0][2:], skip_special_tokens=True))
            index += 1
    finally:
        # Release the capture even if preprocessing or generation raised.
        cap.release()

    return frames, results

In [17]:
# Describe every 80th frame of the video; keeps the sampled frames and their descriptions.
video_frames, video_outputs  = infer_video(model, video_path, frame_skip=80)


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Selected 1 frames for processing.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Selected 2 frames for processing.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Selected 3 frames for processing.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Selected 4 frames for processing.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Selected 5 frames for processing.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Selected 6 frames for processing.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Selected 7 frames for processing.


In [18]:
# Print each frame description, numbered from 1 and followed by an 80-dash rule.
for i, output in enumerate(video_outputs):
    print(f"Output {i + 1}:\n{output.strip()}\n{'-' * 80}")

Output 1:
What are these?
assistant
These are outdoor furniture items, specifically patio furniture. They include a table with chairs, a bench, and a chair. The table is placed on a patio area with a brick floor, and the chairs are arranged around it. The bench is positioned next to the table, and the chair is placed on the opposite side of the table. The furniture appears to be designed for relaxation and socializing, suitable for a patio or garden setting.
--------------------------------------------------------------------------------
Output 2:
What are these?
assistant
These are outdoor furniture items, specifically patio furniture. They include a table with chairs, a bench, and a chair. The table is placed on a patio area with a brick floor, and the chairs are arranged around it. The bench is positioned next to the table, and the chair is placed on the opposite side of the table. The furniture appears to be designed for relaxation and socializing, suitable for a patio or garden se

In [19]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv

# Path of the .env file that ask_question() loads environment variables from.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
dotenv_path = '/home/gabriel/myProject/myvenv/.env'

def ask_question(question, video_outputs):
    """Ask an OpenAI-backed QA chain a question about the frame descriptions.

    Args:
        question: Question or instruction handed to the chain.
        video_outputs: Iterable of description strings, one per processed frame.

    Returns:
        The chain's response, or None if any step inside the try block fails
        (the error is printed rather than raised).
    """
    from langchain.schema import Document

    try:
        # Make the variables from the .env file visible through os.getenv
        load_dotenv(dotenv_path)

        # Number the descriptions from 1 and merge them into one text block
        numbered_outputs = []
        for position, description in enumerate(video_outputs, start=1):
            numbered_outputs.append(f"Output {position}:\n{description.strip()}")
        combined_text = "\n\n".join(numbered_outputs)

        # The chain expects its context as a list of Documents
        documents = [Document(page_content=combined_text)]

        # Fail early, with a clear message, when no key is configured
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is missing.")

        # Build the LLM and a 'stuff' QA chain, then run it on the documents
        qa_chain = load_qa_chain(
            llm=OpenAI(temperature=0, openai_api_key=api_key),
            chain_type='stuff',
        )
        return qa_chain.run(input_documents=documents, question=question)
    except Exception as e:
        print(f"Error in ask_question: {e}")
        return None

In [20]:
# Instruction sent to the LLM for condensing the per-frame descriptions into one summary.
# NOTE(review): the prompt text contains the typo "describtion"; it is left as is
# here because the string is sent to the model verbatim.
question = """Summarize the describtion of the video with following
text description of the frames of the video.
Please be precise and concise.
No repetition of the same information
if they are visible on different frames.
Formulate it in manner that is great to read"""

In [21]:
# Summarise the frame descriptions; the result is None if ask_question() hit an error.
video_description = ask_question(question, video_outputs)

  llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm=llm, chain_type='stuff')
  response = chain.run(input_documents=input_documents, question=question)


In [22]:
# Show the generated summary.
print(video_description)


The video shows a series of images taken from a camera mounted on a tripod in a backyard setting. The scene includes a white building with red shutters, a paved area with a table and chairs, and a garden with plants and trees. There are also people visible in the frames, including one standing near the building and another riding a scooter. The image quality is low, suggesting it may have been taken with a lower-quality camera or in poor lighting conditions.
