Reference: https://arxiv.org/pdf/2312.16171

In [3]:
import cv2
import os
from datetime import timedelta
from openai import OpenAI
import csv
import base64
from dotenv import load_dotenv
import pandas as pd

In [4]:
# Set up OpenAI client
def configure():
    """Load variables from a local ``.env`` file into the process environment."""
    # load_dotenv has to be *called*; a bare reference to the function is a no-op.
    load_dotenv()


# Load the .env file before building the client: the key is read from the
# environment right here, and OpenAI raises OpenAIError when no key is found.
configure()
MODEL = "gpt-4o-mini"
client = OpenAI(api_key=os.getenv('api_key'))

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [6]:
def extract_and_save_frames(video_path, output_folder):
    """Save one frame per elapsed second of a video as JPEG files.

    The first frame whose timestamp reaches each whole-second mark is written
    to ``output_folder`` as ``frame_<H_MM_SS>.jpg`` (the ``timedelta`` string
    with ``:`` replaced by ``_``).

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        output_folder: Directory for the JPEGs; created if it does not exist.

    Returns:
        The number of frames written.

    Raises:
        ValueError: If frames can be read but the reported frame rate is not
            a positive number.
    """
    os.makedirs(output_folder, exist_ok=True)

    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    frame_count = 0
    saved_count = 0
    next_second = 0  # next whole-second mark that still needs a frame

    try:
        while True:
            ret, frame = video.read()
            if not ret:
                break

            # `not fps > 0` also rejects NaN, which `fps <= 0` would let through.
            if not fps > 0:
                raise ValueError(
                    f"Cannot sample frames: invalid frame rate {fps!r} for {video_path!r}"
                )

            # Select by elapsed time instead of `frame_count % int(fps)`: the
            # modulo form divides by zero for fps < 1, and for fractional rates
            # (e.g. 29.97) it maps two frames to the same second so the later
            # one overwrites the earlier file.
            timestamp = frame_count / fps
            if timestamp >= next_second:
                whole_seconds = int(timestamp)
                timestamp_str = str(timedelta(seconds=whole_seconds))
                frame_filename = f"frame_{timestamp_str.replace(':', '_')}.jpg"
                frame_path = os.path.join(output_folder, frame_filename)
                cv2.imwrite(frame_path, frame)
                saved_count += 1
                print(f"Saved frame at {timestamp_str}")
                next_second = whole_seconds + 1

            frame_count += 1
    finally:
        # Release the capture even if decoding or writing raises.
        video.release()

    print(f"Total frames in video: {frame_count}")
    print(f"Frames saved (1 per second): {saved_count}")
    return saved_count

def encode_image(image_path):
    """Read a file in binary mode and return its contents as base64 text.

    NOTE(review): an identical ``encode_image`` is defined again later in this
    cell; that later definition is the one callers end up with.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

def process_image(image_path, prompt="Describe this image in one sentence."):
    """Ask the chat model for a description of one image file.

    The image is base64-encoded with ``encode_image`` and sent as a JPEG data
    URL, together with ``prompt``, through the module-level ``client`` using
    the model named by ``MODEL``.

    Args:
        image_path: Path of the image file to describe.
        prompt: User text sent alongside the image.

    Returns:
        The model's reply with surrounding whitespace removed, or ``""`` when
        the reply carries no text content.

    NOTE(review): ``process_image`` is defined a second time later in this
    cell; the later definition is the one that is in effect after the cell runs.
    NOTE(review): the system prompt contains typos ("obtacles", "dangeroud",
    "if the person if you"); left untouched because it is runtime text.
    """
    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that describes images accurately and concisely. The person that you are assisting is a blind person. You will say DANGER if the person if you detect any cars, obtacles, potholes, people, and other dangeroud things in close vicinity"},
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"}
                }
            ]}
        ],
        temperature=0.0,
    )

    # The message content is optional in the API response; calling .strip() on
    # None would abort the whole frame loop with AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

def process_frames(frames_folder, output_csv, max_frames=2000):
    """Describe extracted frames and write the results to a CSV file.

    Files named ``frame_<H_MM_SS>.jpg`` in ``frames_folder`` are sorted by
    name, the first ``max_frames`` are passed to ``process_image``, and one
    ``Timestamp, Description`` row per frame is written to ``output_csv``.

    Args:
        frames_folder: Directory containing the extracted frame JPEGs.
        output_csv: Path of the CSV file to (over)write.
        max_frames: Upper bound on the number of frames sent to the model.

    NOTE(review): ``process_frames`` is defined again later in this cell with
    a different default; the later definition is the one in effect.
    """
    prefix, suffix = 'frame_', '.jpg'

    # Keep only extractor output; any other file in the folder (e.g. .DS_Store)
    # has no timestamp to parse and must not be sent to the model.
    # Name order equals time order as long as the hours field has one digit.
    frames = sorted(
        name for name in os.listdir(frames_folder)
        if name.startswith(prefix) and name.endswith(suffix)
    )

    # UTF-8 so non-ASCII model output is writable on every platform and can be
    # read back by pandas, which defaults to UTF-8.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Timestamp', 'Description'])

        # Process only the first max_frames
        for frame_filename in frames[:max_frames]:
            frame_path = os.path.join(frames_folder, frame_filename)
            # 'frame_0_00_05.jpg' -> '0:00:05'. Splitting on every '_' and
            # taking element 1 would keep only the hours field.
            timestamp_str = frame_filename[len(prefix):-len(suffix)].replace('_', ':')

            description = process_image(frame_path)

            csvwriter.writerow([timestamp_str, description])

            print(f"Timestamp: {timestamp_str}")
            print(f"Description: {description}")
            print("---")

        if len(frames) > max_frames:
            print(f"Note: Only processed the first {max_frames} frames out of {len(frames)} total frames.")

def main(video_path, output_csv, max_frames=10):
    """Run the pipeline: extract frames from a video, then describe them.

    Args:
        video_path: Path of the input video.
        output_csv: Path of the CSV file that receives the descriptions.
        max_frames: Maximum number of extracted frames sent to the model.

    NOTE(review): ``main`` is defined again later in this cell (default
    ``max_frames=100``); that later definition is the one in effect.
    """
    configure()
    frames_folder = "extracted_frames"

    # Step 1: Extract frames
    print("Extracting frames from video...")
    extract_and_save_frames(video_path, frames_folder)

    # Step 2: Process frames with LLM
    print(f"Processing the first {max_frames} frames with LLM...")
    process_frames(frames_folder, output_csv, max_frames)

    print(f"Processing complete. Results saved to {output_csv}")
    # The pipeline ends here. A pasted copy of the frame-extraction loop used
    # to follow; it referenced an undefined `output_folder` and decoded the
    # video a second time.

def encode_image(image_path):
    """Return the base64 text form of the bytes stored at ``image_path``.

    NOTE(review): this repeats an identical definition from earlier in the
    same cell; being the later one, it is the definition in effect.
    """
    with open(image_path, "rb") as image_stream:
        payload = base64.b64encode(image_stream.read())
    return payload.decode("utf-8")

def process_image(image_path, prompt="Describe this image in one sentence."):
    """Send one image plus a text prompt to the chat model and return its reply.

    The file is base64-encoded with ``encode_image`` and embedded as a JPEG
    data URL; the request goes through the module-level ``client`` with the
    model named by ``MODEL``.

    Args:
        image_path: Path of the image file to describe.
        prompt: User text sent alongside the image.

    Returns:
        The reply text stripped of surrounding whitespace, or ``""`` when the
        reply has no text content.

    NOTE(review): this repeats an earlier definition in the same cell; being
    the later one, it is the definition in effect.
    NOTE(review): the system prompt contains typos ("obtacles", "dangeroud",
    "if the person if you"); left untouched because it is runtime text.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant that describes images accurately and concisely. The person that you are assisting is a blind person. You will say DANGER if the person if you detect any cars, obtacles, potholes, people, and other dangeroud things in close vicinity"}
    user_message = {"role": "user", "content": [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {
            "url": f"data:image/jpeg;base64,{encode_image(image_path)}"}
        }
    ]}

    response = client.chat.completions.create(
        model=MODEL,
        messages=[system_message, user_message],
        temperature=0.0,
    )

    # The message content is optional in the API response; guard against None
    # so one empty reply does not raise AttributeError and stop the frame loop.
    reply_text = response.choices[0].message.content
    if reply_text is None:
        return ""
    return reply_text.strip()

def process_frames(frames_folder, output_csv, max_frames=10):
    """Describe extracted frames and record the results in a CSV file.

    Files named ``frame_<H_MM_SS>.jpg`` in ``frames_folder`` are sorted by
    name; the first ``max_frames`` of them are passed to ``process_image`` and
    one ``Timestamp, Description`` row per frame is written to ``output_csv``.

    Args:
        frames_folder: Directory containing the extracted frame JPEGs.
        output_csv: Path of the CSV file to (over)write.
        max_frames: Upper bound on the number of frames sent to the model.

    NOTE(review): this repeats an earlier definition in the same cell (which
    defaults to 2000 frames); being the later one, it is the one in effect.
    """
    prefix, suffix = 'frame_', '.jpg'

    # Ignore anything that is not extractor output: such files carry no
    # timestamp to parse and should not be sent to the model.
    # Name order equals time order as long as the hours field has one digit.
    frames = sorted(
        entry for entry in os.listdir(frames_folder)
        if entry.startswith(prefix) and entry.endswith(suffix)
    )
    selected = frames[:max_frames]

    # UTF-8 so non-ASCII model output is writable on every platform and can be
    # read back by pandas, which defaults to UTF-8.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Timestamp', 'Description'])

        for frame_filename in selected:
            frame_path = os.path.join(frames_folder, frame_filename)
            # 'frame_0_00_05.jpg' -> '0:00:05'. Taking split('_')[1] would
            # keep only the hours field and label every row "0".
            timestamp_str = frame_filename[len(prefix):-len(suffix)].replace('_', ':')

            description = process_image(frame_path)

            csvwriter.writerow([timestamp_str, description])

            print(f"Timestamp: {timestamp_str}")
            print(f"Description: {description}")
            print("---")

        if len(frames) > max_frames:
            print(f"Note: Only processed the first {max_frames} frames out of {len(frames)} total frames.")

def main(video_path, output_csv, max_frames=100):
    """Extract frames from ``video_path`` and write their descriptions to ``output_csv``.

    Frames are written to the ``extracted_frames`` directory by
    ``extract_and_save_frames``; ``process_frames`` then describes at most
    ``max_frames`` of them.

    NOTE(review): this repeats an earlier ``main`` in the same cell; being the
    later one, it is the definition in effect.
    """
    frames_dir = "extracted_frames"

    # Stage 1: dump frames to disk.
    print("Extracting frames from video...")
    extract_and_save_frames(video_path, frames_dir)

    # Stage 2: describe the frames with the model.
    print(f"Processing the first {max_frames} frames with LLM...")
    process_frames(frames_dir, output_csv, max_frames)

    print(f"Processing complete. Results saved to {output_csv}")

In [None]:
# Usage

video_path = "./video.mp4"
output_csv = "./video_descriptions.csv"
max_frames_to_process = 100

# Pass the frame limit explicitly so that changing max_frames_to_process
# actually changes how many frames are sent to the model.
main(video_path, output_csv, max_frames_to_process)

In [1]:
def summarize_csv(csv_path, summary_path="summary.txt"):
    """Summarize the ``Description`` column of a CSV file with the chat model.

    Uses the module-level ``pd``, ``client`` and ``MODEL``, so the imports and
    client-setup cells must have been run first.

    Args:
        csv_path: Path of a CSV file with a ``Description`` column.
        summary_path: File the summary is written to.

    Returns:
        The summary text (also printed and saved to ``summary_path``).
    """
    # Load the CSV file
    df = pd.read_csv(csv_path)

    # Empty description cells are read back as NaN (a float), which str.join
    # rejects; drop them and make sure every remaining value is a string.
    descriptions = df['Description'].dropna().astype(str).tolist()
    combined_descriptions = " ".join(descriptions)

    # Create a prompt for the LLM
    prompt = f"Summarize the following descriptions: {combined_descriptions}"

    # Get the summary from the LLM
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes text accurately and concisely."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.0,
    )

    # The message content is optional in the API response; treat None as empty.
    summary = (response.choices[0].message.content or "").strip()

    print("Summary of CSV Descriptions:")
    print(summary)
    # Save the summary to a text file (UTF-8 so non-ASCII text is writable on
    # every platform).
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(summary)
    return summary

# Usage
csv_path = "./video_descriptions.csv"
# summarize_csv already prints the summary; binding the return value keeps the
# notebook from echoing the same text a second time as the cell's output.
summary = summarize_csv(csv_path)

NameError: name 'pd' is not defined