# $Audio Transcript$

## `01` Import Libs:

In [34]:
import os
import google.generativeai as genai
from PIL import Image
from dotenv import load_dotenv

import requests
from io import BytesIO

import re
import csv

import pandas as pd

## `02` API setup:

In [19]:
load_dotenv('../.env') 

True

In [20]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") # replace by yours
genai.configure(api_key=GOOGLE_API_KEY)


## Model setup

In [21]:
model = genai.GenerativeModel('gemini-1.5-flash')

## Load images

In [29]:
image_path = '../outputs/keyframes/keyframe_0002.jpg'
output_csv_file = "../outputs/image_analysis_results_combined.csv"
csv_headers = ["Image Name", "Extracted Text", "Visual Description"]

## Functions

In [23]:
def get_dummy_image(original_path):
    """
    This function loads an image and returns an in-memory copy of it.
    It does not save anything to disk. 
    Parameters:
        original_path (str): The path to the original image file.
    Returns:
        PIL.Image: An in-memory copy of the original image.
    """
    original = Image.open(original_path)
    buffer = BytesIO()
    original.save(buffer, format=original.format)  # Save to memory buffer
    buffer.seek(0)
    return Image.open(buffer)

In [None]:
def parse_model_output_to_rows(model_output_text, output_csv_file= output_csv_file, csv_headers= csv_headers):
    """
    This function parses the model's output string from a labeled text format (e.g., "Image Name: ... Extracted Text: ...")
    into a dictionary that can be written to a CSV file. It extracts the image name,
    extracted text, and visual description from the model's output.
    Parameters:
        model_output_text (str): The output text from the model containing labeled information.
        output_csv_file (str): The path to the CSV file where the data will be saved
        csv_headers (list): The headers for the CSV file.
    Returns:
        bool: True if the data was successfully saved to the CSV file, False otherwise.
    """
    image_name_match = re.search(r'Image Name:\s*(.*?)\s*Extracted Text:', model_output_text, re.DOTALL)
    extracted_text_match = re.search(r'Extracted Text:\s*(.*?)\s*Visual Description:', model_output_text, re.DOTALL)
    visual_description_match = re.search(r'Visual Description:\s*(.*)', model_output_text, re.DOTALL)

    image_name = "N/A"
    extracted_text = "N/A"
    visual_description = "N/A"

    if image_name_match:
        image_name = image_name_match.group(1).strip()
    if extracted_text_match:
        extracted_text = extracted_text_match.group(1).strip()
        extracted_text = extracted_text.replace("<br>", "\n").replace("<br/>", "\n")
    if visual_description_match:
        visual_description = visual_description_match.group(1).strip()

    row_to_save = {}
    if image_name != "N/A" or extracted_text != "N/A" or visual_description != "N/A":
        row_to_save = {
            "Image Name": image_name,
            "Extracted Text": extracted_text,
            "Visual Description": visual_description
        }
    else:
        print("Warning: Could not parse model output into expected labeled format.")
        print("Raw model output:\n", model_output_text)
        row_to_save = {"Image Name": "Parsing Error", "Extracted Text": model_output_text, "Visual Description": "Error during parsing"}

    try:
        file_exists = os.path.exists(output_csv_file)
        write_header = not file_exists or os.path.getsize(output_csv_file) == 0

        with open(output_csv_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_headers)

            if write_header:
                writer.writeheader()

            writer.writerow(row_to_save)

        print(f"Data for '{image_name}' appended to {output_csv_file}")
        return True

    except Exception as e:
        print(f"Error saving data for '{image_name}' to CSV: {e}")
        return False

## Prompt

In [None]:
img = get_dummy_image(image_path)

In [32]:
response = model.generate_content([
    "Analyze the provided images to extract all textual content. ",
    "If the text is in Arabic, transcribe it in Arabic and provide an English translation in quotation marks immediately following the Arabic text. ",
    "If the text is entirely in English, transcribe it as is. ",
    "If the text is predominantly Arabic with some English words, transcribe the Arabic and enclose the English words in quotation marks within the Arabic transcription. ",
    "Additionally, identify and describe any *embedded, informative visuals* within the images that convey data or information. ",
    "This specifically includes elements such as graphs, charts, tables of text, histograms, flowcharts, diagrams, or other visual representations of data. ",
    "Do NOT describe the overall image design, background, or purely decorative elements. ",
    "Structure the output as follows, with each image's information presented in a clear, column-like format: ",
    "Image Name: [Name of Image File]",
    "Extracted Text: [Transcribed text as per language rules, with English translations/quoted English words]",
    "Visual Description: [Detailed description of any embedded, informative visuals present. State 'None' if no such visuals are found.]",
    img, 
])
print(response.text)

Image Name: Machine Learning Diagram
Extracted Text: التعليم باشراف و بدون اشراف "Machine Learning"  "SUPERVISED LEARNING" "Develop predictive model based on both input and output data" "CLASSIFICATION" "REGRESSION" "UNSUPERVISED LEARNING" "Group and interpret data based only on input data" "CLUSTERING"  "MACHINE LEARNING"
Visual Description: The image contains a flowchart.  The central box shows "MACHINE LEARNING".  Arrows branch out to two boxes representing "SUPERVISED LEARNING" and "UNSUPERVISED LEARNING". Each of these boxes contains a short description of the learning type.  Further arrows from "SUPERVISED LEARNING" point to "CLASSIFICATION" and "REGRESSION". An arrow from "UNSUPERVISED LEARNING" points to "CLUSTERING".  All boxes are rectangular.



In [None]:
parsed_data = parse_model_output_to_rows(response.text)
print(parsed_data)

Data for 'Machine Learning Diagram' appended to ../outputs/image_analysis_results_combined.csv
True


In [36]:
pd.read_csv(output_csv_file)

Unnamed: 0,Image Name,Extracted Text,Visual Description
0,Machine Learning Diagram,"التعليم باشراف و بدون اشراف ""Machine Learning""...",The image contains a flowchart. The central b...
