# $Audio Transcript$

## `01` Import Libs:

In [None]:
import os
import google.generativeai as genai
from PIL import Image
from dotenv import load_dotenv

import requests
from io import BytesIO

import re

## `02` API setup:

In [36]:
# Load environment variables from the '../.env' file into the process environment.
load_dotenv('../.env') 

True

In [37]:
# Read the API key from the GOOGLE_API_KEY environment variable
# (os.getenv returns None when the variable is not set) and hand it to the client library.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") # supply your own key via the environment / .env file
genai.configure(api_key=GOOGLE_API_KEY)


## Model setup

In [38]:
# Model handle used by the generate_content call in the prompt cell.
model = genai.GenerativeModel('gemini-1.5-flash')

## Load images

In [46]:
# Path of the keyframe image that is loaded and sent to the model.
image_path = '../outputs/keyframes/keyframe_0002.jpg'

## Functions

In [75]:
def get_dummy_image(original_path):
    """
    Load an image from disk and return an independent in-memory copy of it.

    The source image is re-encoded into a ``BytesIO`` buffer using its own
    format and then re-opened from that buffer, so nothing is written to disk
    and the returned image no longer depends on the file at ``original_path``.

    Parameters:
        original_path (str): The path to the original image file.
    Returns:
        PIL.Image: An image backed by the in-memory buffer.
    """
    buffer = BytesIO()
    # Image.open() is lazy and keeps the underlying file handle open; the
    # context manager closes it as soon as the data has been copied.
    with Image.open(original_path) as original:
        original.save(buffer, format=original.format)  # Save to memory buffer
    buffer.seek(0)  # Rewind so the buffer can be read from the start
    return Image.open(buffer)

In [76]:
# Build the in-memory copy of the keyframe and display it as a quick visual check.
img = get_dummy_image(image_path)
img.show()

In [60]:
# One three-cell Markdown table row: | cell | cell | cell |
# ``[^|]`` also matches newlines, so a cell may span several lines.
_TABLE_ROW_RE = re.compile(r'\|\s*([^|]+?)\s*\|\s*([^|]+?)\s*\|\s*([^|]+?)\s*\|')
# A cell of the Markdown header separator row, e.g. ``---`` or ``:---:``.
_SEPARATOR_CELL_RE = re.compile(r':?-+:?')
# The labelled layout: "Image Name: ... Extracted Text: ... Visual Description: ...",
# possibly repeated once per image.
_LABELLED_RE = re.compile(
    r'Image Name:\s*(.*?)\s*Extracted Text:\s*(.*?)\s*Visual Description:\s*(.*?)\s*'
    r'(?=Image Name:|\Z)',
    re.DOTALL,
)
# HTML line breaks in any common spelling: <br>, <br/>, <br />, <BR>.
_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
_HEADER_CELLS = ("image name", "extracted text", "visual description")


def _is_separator_row(cells):
    """Return True if every cell looks like a Markdown header separator (``---``)."""
    return all(_SEPARATOR_CELL_RE.fullmatch(cell.strip()) for cell in cells)


def _is_header_row(cells):
    """Return True if the cells are the column titles rather than data."""
    normalised = tuple(cell.strip('*_` \t\r\n').casefold() for cell in cells)
    return normalised == _HEADER_CELLS


def _build_row(image_name, extracted_text, visual_description):
    """Turn three raw cell strings into one CSV row dictionary."""
    return {
        "Image Name": image_name.strip(),
        # HTML break tags become real newlines for better CSV readability.
        "Extracted Text": _BR_TAG_RE.sub("\n", extracted_text.strip()),
        "Visual Description": visual_description.strip(),
    }


def parse_model_output_to_rows(model_output_text):
    """
    Parse the model's output string into a list of row dictionaries for the CSV.

    Two layouts are recognised, tried in this order:

    1. A three-column Markdown table. Every data row is returned; the column
       title row and the ``|---|---|---|`` separator row are skipped.
    2. The labelled layout ``Image Name: ... Extracted Text: ...
       Visual Description: ...`` (one group per image).

    Parameters:
        model_output_text (str): Raw text returned by the model.
    Returns:
        list[dict]: One dictionary per parsed row with the keys "Image Name",
        "Extracted Text" and "Visual Description". If nothing can be parsed,
        a warning is printed and a single "Parsing Error" row carrying the raw
        text is returned instead.

    This function needs to be adapted if the model's output format changes.
    """
    table_rows = [match.groups() for match in _TABLE_ROW_RE.finditer(model_output_text)]

    data_rows = []
    for index, cells in enumerate(table_rows):
        if _is_separator_row(cells) or _is_header_row(cells):
            continue
        # A row directly followed by a separator row is the table header,
        # whatever its titles are.
        if index + 1 < len(table_rows) and _is_separator_row(table_rows[index + 1]):
            continue
        data_rows.append(cells)

    if not data_rows:
        # No table data found: fall back to the labelled layout.
        data_rows = [match.groups() for match in _LABELLED_RE.finditer(model_output_text)]

    if not data_rows:
        # Neither layout matched; report it and keep the raw text so it is not lost.
        print("Warning: Could not parse model output into expected format. Check 'simulated_model_response_text' or actual model output format.")
        print("Raw model output:\n", model_output_text)
        return [{"Image Name": "Parsing Error", "Extracted Text": model_output_text, "Visual Description": "Error during parsing"}]

    return [_build_row(*cells) for cells in data_rows]

## Prompt

In [None]:
# Send the instructions and the image to the model in a single request.
# The list mixes text parts (the instructions) with the image object as the final part.
response = model.generate_content([
    # Text extraction and language-handling rules.
    "Analyze the provided images to extract all textual content. ",
    "If the text is in Arabic, transcribe it in Arabic and provide an English translation in quotation marks immediately following the Arabic text. ",
    "If the text is entirely in English, transcribe it as is. ",
    "If the text is predominantly Arabic with some English words, transcribe the Arabic and enclose the English words in quotation marks within the Arabic transcription. ",
    # Which visuals to describe and which to ignore.
    "Additionally, identify and describe any *embedded, informative visuals* within the images that convey data or information. ",
    "This specifically includes elements such as graphs, charts, tables of text, histograms, flowcharts, diagrams, or other visual representations of data. ",
    "Do NOT describe the overall image design, background, or purely decorative elements. ",
    # Requested output layout: three labelled fields per image.
    "Structure the output as follows, with each image's information presented in a clear, column-like format: ",
    "Image Name: [Name of Image File]",
    "Extracted Text: [Transcribed text as per language rules, with English translations/quoted English words]",
    "Visual Description: [Detailed description of any embedded, informative visuals present. State 'None' if no such visuals are found.]",
    # NOTE(review): only the image object is passed, not its file name — confirm how the model is expected to fill in "Image Name".
    img, 
])
# Print the text of the model's reply.
print(response.text)

Image Name: Machine Learning Diagram
Extracted Text: التعليم باشراف و بدون اشراف "Machine Learning"  "SUPERVISED LEARNING" "Develop predictive model based on both input and output data" "CLASSIFICATION" "REGRESSION" "UNSUPERVISED LEARNING" "Group and interpret data based only on input data" "CLUSTERING"  "MACHINE LEARNING"
Visual Description: The image contains a flowchart.  The flowchart depicts a hierarchical structure of machine learning, starting with a central box labeled "MACHINE LEARNING". This box branches into two main categories: "SUPERVISED LEARNING" and "UNSUPERVISED LEARNING".  "SUPERVISED LEARNING" further branches into "CLASSIFICATION" and "REGRESSION", while "UNSUPERVISED LEARNING" branches into "CLUSTERING".  Each branch is represented by an arrow, showing the relationship between the concepts.  The descriptions within each box provide a short definition of the respective machine learning type.

