# $Audio Transcript$

## `01` Import Libraries:

In [35]:
import os
import google.generativeai as genai
from PIL import Image
from dotenv import load_dotenv

import requests
from io import BytesIO

## `02` API setup:

In [36]:
# Load variables defined in the .env file one directory up into the process
# environment so they can be read with os.getenv afterwards.
load_dotenv(dotenv_path='../.env')

True

In [37]:
# The key is read from the GOOGLE_API_KEY environment variable (set it to your
# own key); it is None when the variable is not defined.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Hand the key to the google.generativeai client.
genai.configure(api_key=GOOGLE_API_KEY)


## Model setup

In [38]:
# Model handle used for the generate_content call further down.
model = genai.GenerativeModel(model_name='gemini-1.5-flash')

## Load images

In [44]:
# NOTE: despite the plural name, this holds a single PIL image (one keyframe).
images = Image.open(fp='../outputs/keyframes/keyframe_0002.jpg')

## Functions

In [None]:
def get_dummy_images():
    """Return placeholder image entries for demonstration purposes.

    The strings only stand in for real image data; swap this out for the
    actual image loading logic.
    """
    placeholder_images = [
        "image_data_for_Image1.jpg",
        "image_data_for_Image2.png",
    ]
    return placeholder_images

In [None]:
def parse_model_output_to_rows(model_output_text):
    """
    Parse a Markdown table in the model's output into CSV-ready row dicts.

    Every three-column table row found in ``model_output_text`` becomes one
    dictionary with the keys "Image Name", "Extracted Text" and
    "Visual Description". The table's header row and its ``|---|---|---|``
    separator row are skipped, so only data rows are returned.

    Args:
        model_output_text: Raw text returned by the model.

    Returns:
        A list of row dictionaries. If no data row can be found, a warning is
        printed and the list holds a single "Parsing Error" entry that carries
        the raw text in its "Extracted Text" field.

    This function needs to be adapted if the model's output format changes.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    # One table row: | cell | cell | cell |. ``[^|]`` also matches newlines,
    # so a cell may span several lines.
    row_pattern = re.compile(r'\|\s*([^|]+?)\s*\|\s*([^|]+?)\s*\|\s*([^|]+?)\s*\|')
    # A cell of the header/body separator row, e.g. "---", ":---" or ":---:".
    separator_cell = re.compile(r':?-+:?')
    # HTML line breaks in any common spelling: <br>, <br/>, <br />, <BR>.
    br_tag = re.compile(r'<br\s*/?>', re.IGNORECASE)
    header_cells = ("image name", "extracted text", "visual description")

    parsed_rows = []
    for match in row_pattern.finditer(model_output_text):
        cells = tuple(cell.strip() for cell in match.groups())

        # Skip the Markdown separator row and the column-title row; neither
        # carries data.
        if all(separator_cell.fullmatch(cell) for cell in cells):
            continue
        if tuple(cell.casefold() for cell in cells) == header_cells:
            continue

        image_name, extracted_text, visual_description = cells

        # Turn HTML break tags in the extracted text into real newlines for
        # better CSV readability.
        extracted_text = br_tag.sub("\n", extracted_text)

        parsed_rows.append({
            "Image Name": image_name,
            "Extracted Text": extracted_text,
            "Visual Description": visual_description,
        })

    if not parsed_rows:
        # No data row was found: the output was not in the expected Markdown
        # table format. Report it and return a row that flags the failure.
        print("Warning: Could not parse model output into expected format. Check 'simulated_model_response_text' or actual model output format.")
        print("Raw model output:\n", model_output_text)
        parsed_rows.append({"Image Name": "Parsing Error", "Extracted Text": model_output_text, "Visual Description": "Error during parsing"})

    return parsed_rows

## Prompt

In [None]:
# Parts sent to the model: each string is one text part, followed by the image.
prompt_parts = [
    # Text extraction and language-handling rules.
    "Analyze the provided images to extract all textual content. ",
    "If the text is in Arabic, transcribe it in Arabic and provide an English translation in quotation marks immediately following the Arabic text. ",
    "If the text is entirely in English, transcribe it as is. ",
    "If the text is predominantly Arabic with some English words, transcribe the Arabic and enclose the English words in quotation marks within the Arabic transcription. ",
    # Which visuals to describe and which to ignore.
    "Additionally, identify and describe any *embedded, informative visuals* within the images that convey data or information. ",
    "This specifically includes elements such as graphs, charts, tables of text, histograms, flowcharts, diagrams, or other visual representations of data. ",
    "Do NOT describe the overall image design, background, or purely decorative elements. ",
    # Requested output layout.
    "Structure the output as follows, with each image's information presented in a clear, column-like format: ",
    "Image Name: [Name of Image File]",
    "Extracted Text: [Transcribed text as per language rules, with English translations/quoted English words]",
    "Visual Description: [Detailed description of any embedded, informative visuals present. State 'None' if no such visuals are found.]",
    "",  # An empty string for separation if needed, or remove if not desired
    # The image loaded earlier in the notebook.
    images,
]

response = model.generate_content(prompt_parts)
print(response.text)

Image Name: Machine Learning Diagram
Extracted Text: التعلم باشراف و بدون اشراف "Machine Learning"
Visual Description: The image contains a flowchart. The central node is labeled "MACHINE LEARNING".  Two branches stem from this node: "SUPERVISED LEARNING" (with the description "Develop predictive model based on both input and output data") and "UNSUPERVISED LEARNING" (with the description "Group and interpret data based only on input data").  Each of these branches further splits into three sub-branches:  "SUPERVISED LEARNING" branches into "CLASSIFICATION", "REGRESSION", and an unnamed node; and "UNSUPERVISED LEARNING" branches into "CLUSTERING" and two unnamed nodes.  The flowchart visually represents the hierarchical relationship between machine learning and its subcategories.

