# importing libraries


In [None]:
import os
import glob
import re
import csv
import json
import cv2
import numpy as np
from PIL import Image
import base64
from io import BytesIO
from typing import Dict, List, Any, TypedDict, Literal

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, END

# Create output directories

In [None]:
# Every generated artifact is written below one output directory.
_OUTPUT_DIR = "output"
os.makedirs(_OUTPUT_DIR, exist_ok=True)

# CSV receives the descriptions of important frames; JSON receives the
# per-frame classification results.
output_csv_file = f"{_OUTPUT_DIR}/video_summary_results.csv"
output_json_file = f"{_OUTPUT_DIR}/frame_importance.json"

# Initialize the Gemini model

In [None]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is how the Gemini API key is supplied — confirm.
load_dotenv()

In [None]:
# Single chat-model instance shared by every model call in this file.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.1,
)

# Define graph state

In [None]:
# Define graph state
class GraphState(TypedDict):
    """State dictionary handed from node to node in the frame-processing graph.

    The keys mirror the initial state built by ``process_single_frame``.
    """

    # Path of the frame image on disk.
    frame_path: str
    # Encoded image payload and file name produced by feature extraction.
    frame_data: Dict[str, Any]
    # Numeric/boolean image features, or an ``"error"`` entry on failure.
    frame_features: Dict[str, Any]
    # Classification assigned by the importance-evaluation node.
    importance: Literal["important", "not_important"]
    # Explanation for the classification.
    reason: str
    # Extracted text / visual description for important frames.
    description: Dict[str, Any]
    # Name of the node to run next (or the graph's END marker).
    next_step: str


# Helper Functions

In [None]:
# Helper function to convert image to base64
def image_to_base64(image_path):
    """Re-encode an image file and return it as a base64 string.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``(base64_string, mime_type)`` tuple. The image is saved in its own
        detected format (JPEG when none is reported) and the MIME type is
        derived from that format. Returns ``(None, None)`` if the image
        cannot be opened or encoded; the error is printed, not raised.
    """
    try:
        # The context manager closes the underlying file handle as soon as
        # the image has been re-encoded, instead of leaving it open.
        with Image.open(image_path) as img:
            image_format = img.format if img.format else "JPEG"
            buffered = BytesIO()
            img.save(buffered, format=image_format)

        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        mime_type = f"image/{image_format.lower()}"
        return img_str, mime_type
    except Exception as e:
        print(f"Error converting image to base64: {str(e)}")
        return None, None

# Helper function to describe frames directly - NO RETRY
def describe_frame_directly(frame_path: str) -> Dict[str, Any]:
    """Ask the model to describe one frame, bypassing the graph workflow.

    Args:
        frame_path: Path of the frame image.

    Returns:
        A dict with ``image_name``, ``extracted_text`` and
        ``visual_description``. On success it also carries the model's
        ``raw_output``; if the image cannot be encoded it carries ``error``
        instead; if the model call or parsing fails, the text fields hold
        error messages. The model is called once, with no retry.
    """
    encoded_image, image_mime = image_to_base64(frame_path)
    frame_name = os.path.basename(frame_path)

    # Without an encoded image there is nothing to send to the model.
    if not encoded_image:
        return {
            "image_name": frame_name,
            "extracted_text": "Failed to process image",
            "visual_description": "Error occurred during image processing",
            "error": "Image conversion failed"
        }

    prompt = f"""
        You are an expert in multilingual document understanding.

        Your task is to extract and analyze text and informative visual elements from the given image.

        Rules:
          - Analyze the provided image to extract all textual content.
          - If text is in Arabic, copy it in Arabic and provide an English translation in quotes immediately after the Arabic text.
          - If text is entirely in English, copy it as is.
          - If text is primarily Arabic with some English words, copy the Arabic text and place the English words in quotes within the Arabic text.
          - Additionally, identify any informative visual elements in the image that convey data or information.
          - This specifically includes elements such as charts, diagrams, text tables, histograms, flowcharts, illustrations, or other visual representations of data.
          - Do not describe the general image design, background, or purely decorative elements.
          - Translate the visual description to Arabic and remove English after translation.
          - Structure your output as follows, presenting the image information in a clear vertical format:

        Image Name: {frame_name}
        Extracted Text: [Copied text according to language rules, with English translations/quoted English words]
        Visual Description: [Detailed description of any informative visual elements present. State 'None' if no such visual elements are found.]
    """

    try:
        image_part = {"type": "image_url", "image_url": {"url": f"data:{image_mime};base64,{encoded_image}"}}
        text_part = {"type": "text", "text": prompt}
        request = [HumanMessage(content=[text_part, image_part])]

        # Single model call - NO RETRY
        reply = model.invoke(request)
        output_text = reply.content.strip()

        def section(patterns, fallback):
            # Try the English label first, then the Arabic one; the first
            # pattern that matches supplies the section text.
            for pattern in patterns:
                found = re.search(pattern, output_text, re.DOTALL)
                if found:
                    return found.group(1).strip()
            return fallback

        return {
            "image_name": section(
                (r'Image Name:\s*(.*?)\s*Extracted Text:', r'اسم الصورة:\s*(.*?)\s*النص المستخرج:'),
                os.path.basename(frame_path),
            ),
            "extracted_text": section(
                (r'Extracted Text:\s*(.*?)\s*Visual Description:', r'النص المستخرج:\s*(.*?)\s*الوصف المرئي:'),
                "No text found",
            ),
            "visual_description": section(
                (r'Visual Description:\s*(.*)', r'الوصف المرئي:\s*(.*)'),
                "No visual description",
            ),
            "raw_output": output_text
        }
    except Exception as e:
        print(f"Error describing frame: {str(e)}")
        return {
            "image_name": os.path.basename(frame_path),
            "extracted_text": "Error occurred",
            "visual_description": f"Failed to analyze: {str(e)}",
            "raw_output": f"Error: {str(e)}"
        }

# Graph functions
def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it on first use only."""
    cascade = getattr(_get_face_cascade, "_cascade", None)
    if cascade is None:
        cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        # Cache on the function object so later frames reuse the classifier
        # instead of re-reading the XML file from disk.
        _get_face_cascade._cascade = cascade
    return cascade


def extract_frame_features(state: GraphState) -> GraphState:
    """Compute basic image features for the frame and encode it for the model.

    Reads ``state["frame_path"]`` and writes:
      * ``state["frame_features"]`` - dimensions, contrast (std of the gray
        image), brightness (mean of the gray image), ``dark_ratio`` (fraction
        of gray pixels below 30), summed per-channel colour variance and face
        detection results; or a dict with an ``"error"`` key on failure.
      * ``state["frame_data"]`` - the frame re-encoded as base64 JPEG plus its
        file name (file name only if extraction raised).
      * ``state["next_step"]`` - always ``"evaluate_importance"``.

    Returns:
        The same ``state`` object, updated in place.
    """
    frame_path = state["frame_path"]

    try:
        # Read the frame
        img = cv2.imread(frame_path)
        if img is None:
            state["frame_features"] = {"error": "Failed to load frame"}
            state["next_step"] = "evaluate_importance"
            return state

        # Only the spatial dimensions are needed; the channel count is not.
        height, width = img.shape[:2]

        # Grayscale image drives contrast, brightness and darkness measures
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        contrast = np.std(gray)
        brightness = np.mean(gray)

        # Ratio of very dark pixels (gray value below 30)
        dark_pixels = np.sum(gray < 30) / (height * width)

        # Sum of the per-channel variances
        color_variance = np.var(img.reshape(-1, 3), axis=0).sum()

        # Face detection (optional)
        faces = _get_face_cascade().detectMultiScale(gray, 1.1, 4)
        face_count = len(faces)

        # Compile features; numpy scalars are converted to plain floats
        state["frame_features"] = {
            "dimensions": {"height": height, "width": width},
            "contrast": float(contrast),
            "brightness": float(brightness),
            "dark_ratio": float(dark_pixels),
            "color_variance": float(color_variance),
            "has_faces": face_count > 0,
            "face_count": face_count,
        }

        # Convert image to base64 JPEG for sending to the model
        pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        buffered = BytesIO()
        pil_img.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

        state["frame_data"] = {
            "base64_image": img_str,
            "file_name": os.path.basename(frame_path)
        }
    except Exception as e:
        print(f"Error extracting frame features: {str(e)}")
        state["frame_features"] = {"error": f"Feature extraction failed: {str(e)}"}
        state["frame_data"] = {"file_name": os.path.basename(frame_path)}

    state["next_step"] = "evaluate_importance"
    return state

def _parse_importance_response(content):
    """Turn a model reply into an ``(importance, reason)`` pair.

    A JSON object embedded in the reply is preferred. Its ``importance``
    value is normalised (case/whitespace) and anything other than
    ``"important"`` maps to ``"not_important"``. Without JSON, the free text
    is scanned instead.

    Raises:
        ValueError: if the reply contains braces that are not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    text = content if isinstance(content, str) else str(content)

    json_match = re.search(r'({.*})', text.replace('\n', ' '))
    if json_match:
        result = json.loads(json_match.group(1))
        label = str(result.get("importance", "not_important")).strip().lower()
        importance = "important" if label == "important" else "not_important"
        reason = result.get("reason")
        return importance, (str(reason) if reason else "No reason provided")

    # Free-text fallback. "not_important" contains the substring "important",
    # so negative phrasings must be ruled out before accepting a match.
    lowered = text.lower()
    negated = any(
        marker in lowered
        for marker in ("not_important", "not important", "unimportant")
    )
    importance = "important" if ("important" in lowered and not negated) else "not_important"
    return importance, text


def evaluate_importance(state: GraphState) -> GraphState:
    """Classify the frame as ``"important"`` or ``"not_important"``.

    Frames whose ``dark_ratio`` exceeds 0.9, or whose feature extraction
    reported an error, are marked not important without calling the model.
    Otherwise the frame image is sent to the model once (no retry) and the
    reply is parsed. Any failure results in ``"not_important"``.

    Writes ``state["importance"]``, ``state["reason"]`` and
    ``state["next_step"]`` (``"describe_frame"`` for important frames,
    ``END`` otherwise) and returns the same ``state`` object.
    """

    # If frame is mostly black, classify it directly as unimportant
    if state["frame_features"].get("dark_ratio", 0) > 0.9:
        state["importance"] = "not_important"
        state["reason"] = "Frame is mostly black (over 90%)"
        state["next_step"] = END
        return state

    # Check if there was an error in feature extraction
    if "error" in state["frame_features"]:
        state["importance"] = "not_important"
        state["reason"] = f"Could not properly analyze frame: {state['frame_features']['error']}"
        state["next_step"] = END
        return state

    try:
        # Prepare message for model with image
        messages = [
            SystemMessage(content="""You are an expert in video summarization. Your task is to evaluate the importance of a video frame for inclusion in a video summary.

            Evaluate the frame and classify it as either "important" or "not_important" based on the following criteria:

            Important frames:
            - Contain essential information for the video
            - Show important events or scene changes
            - Contain important text or visual information
            - Represent key moments in the video

            Unimportant frames:
            - Black or single-color frames
            - Regular portrait shots unrelated to video content
            - Transitional or blurry frames
            - Frames very similar to previous ones

            Return a JSON containing:
            {
              "importance": "important" or "not_important",
              "reason": "reason for your classification"
            }
            """),
            HumanMessage(
                content=[
                    {"type": "text", "text": "Evaluate the importance of this video frame."},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{state['frame_data']['base64_image']}"}}
                ]
            )
        ]

        # Direct API call - NO RETRY
        response = model.invoke(messages)

        # A malformed reply must not abort the frame; fall back to
        # "not_important" and record why.
        try:
            state["importance"], state["reason"] = _parse_importance_response(response.content)
        except Exception as e:
            print(f"Error parsing importance response: {str(e)}")
            state["importance"] = "not_important"
            state["reason"] = f"Error processing response: {str(e)}"
    except Exception as e:
        print(f"Error evaluating importance: {str(e)}")
        state["importance"] = "not_important"
        state["reason"] = f"Failed to evaluate: {str(e)}"

    # Determine next step based on frame importance
    if state["importance"] == "important":
        state["next_step"] = "describe_frame"
    else:
        state["next_step"] = END

    return state

def describe_frame(state: GraphState) -> GraphState:
    """Graph node: extract text and a visual description from the frame.

    Sends the base64 image stored in ``state["frame_data"]`` to the model
    once (no retry) and stores the parsed sections in
    ``state["description"]``. On any failure the description holds error
    placeholders plus the exception text under ``"error"``. Always sets
    ``state["next_step"]`` to ``END`` and returns the same ``state`` object.
    """
    frame_path = state["frame_path"]

    try:
        frame_name = os.path.basename(frame_path)
        prompt = f"""
            You are an expert in multilingual document understanding.

            Your task is to extract and analyze text and informative visual elements from the given image.

            Rules:
              - Analyze the provided image to extract all textual content.
              - If text is in Arabic, copy it in Arabic and provide an English translation in quotes immediately after the Arabic text.
              - If text is entirely in English, copy it as is.
              - If text is primarily Arabic with some English words, copy the Arabic text and place the English words in quotes within the Arabic text.
              - Additionally, identify any informative visual elements in the image that convey data or information.
              - This specifically includes elements such as charts, diagrams, text tables, histograms, flowcharts, illustrations, or other visual representations of data.
              - Do not describe the general image design, background, or purely decorative elements.
              - Translate the visual description to Arabic if needed.
              - Structure your output as follows, presenting the image information in a clear vertical format:

            Image Name: {frame_name}
            Extracted Text: [Copied text according to language rules, with English translations/quoted English words]
            Visual Description: [Detailed description of any informative visual elements present. State 'None' if no such visual elements are found.]
        """

        text_part = {"type": "text", "text": prompt}
        image_part = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{state['frame_data']['base64_image']}"}}
        request = [HumanMessage(content=[text_part, image_part])]

        # Single model call - NO RETRY
        reply = model.invoke(request)
        output_text = reply.content.strip()

        def pick(patterns, fallback):
            # English label is tried before the Arabic one; the first hit wins.
            for pattern in patterns:
                hit = re.search(pattern, output_text, re.DOTALL)
                if hit:
                    return hit.group(1).strip()
            return fallback

        # Store results
        state["description"] = {
            "image_name": pick(
                (r'Image Name:\s*(.*?)\s*Extracted Text:', r'اسم الصورة:\s*(.*?)\s*النص المستخرج:'),
                os.path.basename(frame_path),
            ),
            "extracted_text": pick(
                (r'Extracted Text:\s*(.*?)\s*Visual Description:', r'النص المستخرج:\s*(.*?)\s*الوصف المرئي:'),
                "No text found",
            ),
            "visual_description": pick(
                (r'Visual Description:\s*(.*)', r'الوصف المرئي:\s*(.*)'),
                "No visual description",
            ),
            "raw_output": output_text
        }
    except Exception as e:
        print(f"Error describing frame: {str(e)}")
        state["description"] = {
            "image_name": os.path.basename(frame_path),
            "extracted_text": "Error processing text",
            "visual_description": "Error generating description",
            "error": str(e)
        }

    state["next_step"] = END
    return state

# Function to decide next step
def decide_next_step(state: GraphState) -> str:
    """Return the routing key stored in ``state["next_step"]``."""
    upcoming = state["next_step"]
    return upcoming

# Create the graph
workflow = StateGraph(GraphState)

# Register the three processing nodes
workflow.add_node("extract_features", extract_frame_features)
workflow.add_node("evaluate_importance", evaluate_importance)
workflow.add_node("describe_frame", describe_frame)

# Every run starts with feature extraction
workflow.set_entry_point("extract_features")

# Fixed transition: features are always followed by the importance check
workflow.add_edge("extract_features", "evaluate_importance")

# After the importance check, route on the key returned by decide_next_step:
# either describe the frame or finish.
_importance_routes = {"describe_frame": "describe_frame", END: END}
workflow.add_conditional_edges("evaluate_importance", decide_next_step, _importance_routes)

# Describing a frame is the last step
workflow.add_edge("describe_frame", END)

# Compile the graph into the runnable used by process_single_frame
frame_processor = workflow.compile()

# Function to save frame description to CSV
def save_description_to_csv(result: Dict[str, Any]):
    """Append an important frame's description as one row of the CSV file.

    Args:
        result: Frame result dict. Its ``"description"`` entry supplies the
            row; ``"frame"`` and ``"path"`` are used for messages/fallbacks.

    Returns:
        ``True`` if the row was written, ``False`` if ``result`` has no
        description or the file could not be written (the error is printed).
    """
    # Check for description information
    if "description" not in result or not result["description"]:
        print(f"  No description available for frame: {result.get('frame', '')}")
        return False

    description = result["description"]

    # Prepare row data
    row = {
        "Image Name": description.get("image_name", os.path.basename(result.get("path", ""))),
        "Extracted Text": description.get("extracted_text", "No extracted text"),
        "Visual Description": description.get("visual_description", "No visual description")
    }

    # Print information for verification
    print(f"  Extracted text ({len(row['Extracted Text'])} chars): {row['Extracted Text'][:50]}...")
    print(f"  Visual description ({len(row['Visual Description'])} chars): {row['Visual Description'][:50]}...")

    try:
        # The header is needed when the file is missing OR present but empty;
        # checking existence alone would leave an empty file without a header.
        needs_header = (
            not os.path.exists(output_csv_file)
            or os.path.getsize(output_csv_file) == 0
        )

        with open(output_csv_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["Image Name", "Extracted Text", "Visual Description"])

            if needs_header:
                writer.writeheader()

            writer.writerow(row)

        print(f"  Frame description saved to {output_csv_file}")
        return True
    except Exception as e:
        print(f" Error saving frame description: {str(e)}")
        return False

# Function to process a single frame
def process_single_frame(frame_path: str) -> Dict[str, Any]:
    """Run one frame through the graph and summarise the outcome.

    Args:
        frame_path: Path of the frame image.

    Returns:
        A dict with ``frame``, ``path``, ``importance`` and ``reason``.
        Important frames also get ``description`` (from the graph, or from a
        direct description call when the graph produced none). If anything
        raises, ``importance`` is ``"error"`` and ``error`` holds the message.
    """
    starting_state = {
        "frame_path": frame_path,
        "frame_data": {},
        "frame_features": {},
        "importance": "not_important",
        "reason": "",
        "description": {},
        "next_step": "extract_features"
    }

    try:
        final_state = frame_processor.invoke(starting_state)

        # Basic result - frame features are deliberately left out
        summary = {
            "frame": os.path.basename(frame_path),
            "path": frame_path,
            "importance": final_state["importance"],
            "reason": final_state["reason"],
        }

        # Only important frames carry a description
        if final_state["importance"] != "important":
            return summary

        graph_described = (
            "description" in final_state
            and isinstance(final_state["description"], dict)
            and bool(final_state["description"])
        )
        if graph_described:
            summary["description"] = final_state["description"]
            return summary

        # The graph marked the frame important but left no description:
        # describe it directly instead (single attempt).
        print(f"  Warning: No description extracted for important frame: {os.path.basename(frame_path)}")
        try:
            summary["description"] = describe_frame_directly(frame_path)
            print(f"   Description extracted successfully on second attempt")
        except Exception as fallback_error:
            print(f"   Failed to extract description: {str(fallback_error)}")
            summary["description"] = {
                "image_name": os.path.basename(frame_path),
                "extracted_text": "Failed to extract text",
                "visual_description": "Failed to extract visual description",
                "error": str(fallback_error)
            }

        return summary
    except Exception as e:
        print(f"  Error processing frame: {str(e)}")
        return {
            "frame": os.path.basename(frame_path),
            "path": frame_path,
            "importance": "error",
            "reason": f"Processing error: {str(e)}",
            "error": str(e)
        }

# Function to process a set of frames
def process_frames(frame_paths: List[str]) -> List[Dict[str, Any]]:
    """Classify every frame, describe the important ones and save them to CSV.

    Args:
        frame_paths: Paths of the frames, processed in the given order.

    Returns:
        One result dict per frame, in input order. A frame whose processing
        raises is recorded with ``importance == "error"``.
    """
    results = []
    important_frames_count = 0

    for i, frame_path in enumerate(frame_paths):
        print(f"Processing frame {i+1}/{len(frame_paths)}: {os.path.basename(frame_path)}")

        try:
            result = process_single_frame(frame_path)
        except Exception as e:
            print(f"  Error processing frame: {str(e)}")
            results.append({
                "frame": os.path.basename(frame_path),
                "path": frame_path,
                "importance": "error",
                "reason": str(e)
            })
            continue

        # Record the result exactly once. Reporting and saving are guarded
        # separately below, so a failure there cannot add a second
        # ("error") record for the same frame.
        results.append(result)

        try:
            # The reason may not be a string (e.g. a JSON null); coerce
            # before slicing for display.
            reason_preview = str(result.get("reason", ""))[:50]

            if result.get("importance") != "important":
                print(f"   Not important: {reason_preview}...")
                continue

            important_frames_count += 1
            print(f"   Important: {reason_preview}...")

            # Ensure description exists
            if not result.get("description"):
                print(f"  Retrying description for important frame...")
                # Call frame description again directly - WITHOUT retries
                try:
                    result["description"] = describe_frame_directly(frame_path)
                    print(f"  Description extracted successfully on retry")
                except Exception as e:
                    print(f"  Retry failed: {str(e)}")

            # Save important frame description to CSV
            if result.get("description"):
                save_description_to_csv(result)
        except Exception as e:
            print(f"  Error reporting frame result: {str(e)}")

    print(f"\nFound {important_frames_count} important frames out of {len(frame_paths)}")
    return results

# Function to read frames from a folder
def get_frames_from_folder(folder_path: str, extensions=('.jpg', '.jpeg', '.png', '.bmp', '.tiff')) -> List[str]:
    """Return the paths of the image files directly inside ``folder_path``.

    Extension matching is case-insensitive (``.JPG`` is accepted on every
    platform). Hidden entries (names starting with ``.``) and directories are
    skipped. Paths are returned in natural order, so ``frame_2`` comes before
    ``frame_10``.

    Args:
        folder_path: Folder to scan (not recursive).
        extensions: File-name suffixes to accept.

    Returns:
        Sorted list of matching paths; empty if the folder does not exist.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: Folder {folder_path} does not exist")
        return []

    wanted_suffixes = tuple(ext.lower() for ext in extensions)

    # Listing the directory (instead of globbing) avoids case-sensitive
    # pattern matching and problems with glob metacharacters in folder_path.
    frame_paths = []
    for entry in os.listdir(folder_path):
        if entry.startswith('.') or not entry.lower().endswith(wanted_suffixes):
            continue
        candidate = os.path.join(folder_path, entry)
        if os.path.isfile(candidate):
            frame_paths.append(candidate)

    def natural_key(path):
        # Split the name into text/number runs so digits compare numerically.
        # re.split with a capture group always alternates text, digits, text,
        # so ints are only ever compared with ints.
        name = os.path.basename(path)
        tokens = [
            int(token) if token.isdigit() else token.lower()
            for token in re.split(r'(\d+)', name)
        ]
        # The path itself breaks ties deterministically.
        return tokens, path

    # Sort files to ensure they're processed in a logical order
    frame_paths.sort(key=natural_key)

    print(f"Found {len(frame_paths)} frames in folder {folder_path}")
    return frame_paths

# Main function
def main(frames_folder: str):
    """Process every frame in ``frames_folder`` and report the results.

    Writes the per-frame classification list to the JSON output file and
    prints a summary. Returns ``None``; exits early when the folder yields
    no frames.
    """
    # Ensure output folder exists
    os.makedirs("output", exist_ok=True)

    frame_paths = get_frames_from_folder(frames_folder)
    if not frame_paths:
        print("No frames found for processing!")
        return

    results = process_frames(frame_paths)

    # Persist the classification results (frame features are not part of
    # the result dicts, so nothing needs stripping here).
    with open(output_json_file, "w", encoding="utf-8") as json_out:
        json.dump(results, json_out, ensure_ascii=False, indent=2)

    print(f"Classification results saved to {output_json_file}")

    important_frames = list(filter(lambda r: r["importance"] == "important", results))
    total_count = len(results)
    important_count = len(important_frames)

    # Display summary
    summary_lines = (
        "\n--- Results Summary ---",
        f"Total frames: {total_count}",
        f"Important frames: {important_count}",
        f"Unimportant frames: {total_count - important_count}",
        f"Important frame descriptions saved to: {output_csv_file}",
        f"Classification results saved to: {output_json_file}",
    )
    for line in summary_lines:
        print(line)

    # List the important frames with the start of each reason
    if important_frames:
        print("\n--- Important Frames ---")
        for position, frame in enumerate(important_frames, start=1):
            print(f"{position}. {frame['frame']}: {frame['reason'][:100]}...")

# Google Colab Helper Functions
def setup_for_colab():
    """Report whether the code is running inside Google Colab.

    Detection is done by attempting to import ``google.colab``; nothing is
    configured.

    Returns:
        ``True`` if the import succeeds, ``False`` otherwise.
    """
    try:
        from google.colab import files  # noqa: F401 - imported only as a probe
    except ImportError:
        # Only a failed import means "not Colab"; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        print("Not running in Google Colab environment")
        return False

    print("Running in Google Colab environment")
    return True

def upload_images_to_colab():
    """Prompt for an upload in Colab and store the files in a fixed folder.

    Returns:
        The folder path (``/content/uploaded_images``) holding the uploads.
    """
    from google.colab import files

    # Destination folder for everything that gets uploaded
    upload_folder = "/content/uploaded_images"
    os.makedirs(upload_folder, exist_ok=True)

    print("Please upload your video frames (image files)")
    uploaded = files.upload()

    # files.upload() yields a name -> bytes mapping; write each entry out
    for uploaded_name, payload in uploaded.items():
        destination = os.path.join(upload_folder, uploaded_name)
        with open(destination, "wb") as target:
            target.write(payload)

    upload_count = len(uploaded)
    print(f"Uploaded {upload_count} files to {upload_folder}")
    return upload_folder

def download_results_from_colab():
    """Trigger a Colab download for each result file that exists."""
    from google.colab import files

    # CSV first, then JSON; files that were never produced are skipped
    for result_path in (output_csv_file, output_json_file):
        if not os.path.exists(result_path):
            continue
        files.download(result_path)
        print(f"Downloaded {result_path}")



# Run the pipeline

In [None]:
# Call main function when file is executed directly
if __name__ == "__main__":
    if not setup_for_colab():
        # Local run: read frames from a fixed folder
        FRAMES_FOLDER = "frames"  # Change this to your actual folder path

        # Create the folder if it is missing so the run does not fail on it
        os.makedirs(FRAMES_FOLDER, exist_ok=True)

        main(FRAMES_FOLDER)
    else:
        # Colab run: upload frames, process them, then download the results
        FRAMES_FOLDER = upload_images_to_colab()
        main(FRAMES_FOLDER)
        download_results_from_colab()