# $Audio Transcript$

## `01` Import Libs:

In [None]:
pip install langchain pillow pytesseract transformers torch torchvision pandas

In [None]:
pip install langchain-google-genai

In [3]:
import os
import google.generativeai as genai
from PIL import Image
import requests
from io import BytesIO
import re
import csv
import pandas as pd
import glob
from langchain.agents import initialize_agent, AgentType, Tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import LLMChain
from langchain.chains.router import MultiPromptChain
from langchain.prompts import PromptTemplate

In [4]:
from PIL import Image
import base64
from io import BytesIO
from langchain_core.messages import HumanMessage


## `02` API setup:

In [5]:
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
genai.configure(api_key= GOOGLE_API_KEY)

## Model setup

In [6]:
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', temperature=0.2)

## Load images

In [16]:
os.makedirs("output", exist_ok=True)
output_csv_file = "output/results.csv"
image_dir = '/content/images'
image_pattern = '*.jpg'
image_paths = glob.glob(os.path.join(image_dir, image_pattern))
csv_headers = ["Image Name", "Extracted Text", "Visual Description"]

## helper Functions

In [17]:
def get_dummy_image(original_path):
    """
    This function loads an image and returns an in-memory copy of it.
    It does not save anything to disk.
    Parameters:
        original_path (str): The path to the original image file.
    Returns:
        PIL.Image: An in-memory copy of the original image.
    """
    original = Image.open(original_path)
    buffer = BytesIO()
    original.save(buffer, format=original.format)  # Save to memory buffer
    buffer.seek(0)
    return Image.open(buffer)

## Agent tools

###Extract text and discription

In [25]:
def extract_text_with_vision(image_path):
    # Open and process the image
    image = Image.open(image_path)

    # Convert image to base64 string
    buffered = BytesIO()
    image.save(buffered, format=image.format)
    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
    mime_type = f"image/{image.format.lower()}"

    prompt = f"""
        You are an expert in multilingual document understanding.

        Your job is to extract and analyze the text and informative visuals from the given image.

        Rules:
          -Analyze the provided images to extract all textual content.
          -If the text is in Arabic, transcribe it in Arabic and provide an English translation in quotation marks immediately following the Arabic text.
          -If the text is entirely in English, transcribe it as is.
          -If the text is predominantly Arabic with some English words, transcribe the Arabic and enclose the English words in quotation marks within the Arabic transcription.
          -Additionally, identify and describe any *embedded, informative visuals* within the images that convey data or information.
          -This specifically includes elements such as graphs, charts, tables of text, histograms, flowcharts, diagrams, or other visual representations of data.
          -Do NOT describe the overall image design, background, or purely decorative elements.
          -Structure the output as follows, with each image's information presented in a clear, column-like format:

        Image Name: {os.path.basename(image_path)}
        Extracted Text: [Transcribed text as per language rules, with English translations/quoted English words]
        Visual Description: [Detailed description of any embedded, informative visuals present. State 'None' if no such visuals are found.]
        """

    # Create properly formatted message for Gemini
    message = HumanMessage(
        content=[
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{img_str}"
                }
            }
        ]
    )

    result = llm.invoke([message])
    return result.content.strip()

"********************************************************************************************"

# Initialize the tool correctly
discription_tool = Tool.from_function(
    name="OCRwithVision",
    func=extract_text_with_vision,
    description="Performs OCR and visual analysis using Gemini Vision model"
)

###convert to rows

In [34]:
def parse_model_output_to_rows(model_output_text, output_csv_file=output_csv_file, csv_headers=csv_headers):
    try:
        # Improved parsing with more robust pattern matching
        image_name_match = re.search(r'Image Name:\s*(.*?)\s*(?=Extracted Text:|Visual Description:|$)',
                                   model_output_text, re.DOTALL | re.IGNORECASE)
        extracted_text_match = re.search(r'Extracted Text:\s*(.*?)\s*(?=Visual Description:|$)',
                                       model_output_text, re.DOTALL | re.IGNORECASE)
        visual_description_match = re.search(r'Visual Description:\s*(.*)',
                                           model_output_text, re.DOTALL | re.IGNORECASE)

        image_name = image_name_match.group(1).strip() if image_name_match else "N/A"
        extracted_text = extracted_text_match.group(1).strip() if extracted_text_match else "N/A"
        visual_description = visual_description_match.group(1).strip() if visual_description_match else "N/A"

        # Clean up text
        extracted_text = extracted_text.replace("<br>", "\n").replace("<br/>", "\n")

        # Create the row dictionary
        row_to_save = {
            "Image Name": image_name,
            "Extracted Text": extracted_text,
            "Visual Description": visual_description
        }

        # Write to CSV
        file_exists = os.path.exists(output_csv_file)
        write_header = not file_exists or os.path.getsize(output_csv_file) == 0

        with open(output_csv_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_headers)

            if write_header:
                writer.writeheader()

            writer.writerow(row_to_save)

        print(f"Data for '{image_name}' saved to {output_csv_file}")
        return f"Successfully saved data for {image_name} to CSV"

    except Exception as e:
        error_msg = f"Error parsing model output: {str(e)}"
        print(error_msg)
        return error_msg

# Initialize the tool correctly
export_csv_tool = Tool.from_function(
    name="ExportCSV",
    func=parse_model_output_to_rows,
    description="Parses model output and saves image data to CSV file"
)

# Create a new tool to handle the CSV saving
def save_to_csv_wrapper(model_output):
    """Wrapper function to save model output to CSV"""
    result = parse_model_output_to_rows(model_output)
    return result

'**************************************************************************************************************************'

csv_save_tool = Tool.from_function(
    name="SaveToCSV",
    func=save_to_csv_wrapper,
    description="Saves the extracted information to a CSV file"
)



In [35]:
def summarize_descriptions(descriptions):
    """Summarize multiple visual descriptions using LLM"""
    prompt = """
    You are an expert in translation and summarizing visual content descriptions.
    Please provide a concise summary of the following visual descriptions:

    {descriptions}

    Summary should:
    - Highlight common themes
    - Identify key information patterns
    - the summurization should be in Arabic
    - Be 4-6 sentences maximum
    """

    combined = "\n\n".join(descriptions)

    # Generate summary
    result = llm.invoke(prompt.format(descriptions=combined))
    return result.content

In [36]:
tools = [discription_tool, export_csv_tool]
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

In [37]:
agent_prompt = """
You are a helpful assistant that can process images, extract information, and save it to a CSV file using provided tools.

Your task is to process the image located at {image_path}.
Follow these steps:
1. **Think**: I need to process the image.
2. **Action**: Use the `OCRwithVision` tool to extract text and visual descriptions from the image.
3. **Think**: I need to extract data to extract text and visual descriptions .
5. **Action Input**: Provide the image path: {image_path}
6. **Observation**: [The output from the OCRwithVision tool will appear here]
7. **Think**: I have the extracted data. Now I need to format it and save it to a CSV file using the `ExportCSV` tool.
8. **Action**: Use the `ExportCSV` tool.
9. **Action Input**: Provide the output from nthe `OCRwithVision` tool which needs to be parsed and saved.
10. **Observation**: [The output from the ExportCSV tool will appear here]
11. **Think**: The data has been saved to the CSV file.
12. **Final Answer**: The image has been processed, and the extracted information has been saved to the CSV file.

Begin!

"""

In [40]:
if not os.path.exists(output_csv_file):
    with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()

for image_path in image_paths:
    try:
        print(f"Processing image: {image_path}")

        result = agent.invoke({
         "input": agent_prompt.format(image_path=image_path)
         })
        print(f"Result for {image_path}: {result}")

    except Exception as e:
        print(f"Error processing {image_path}: {e}")

print("All images processed and results saved to CSV.")

Processing image: /content/images/keyframe_0003.jpg


[1m> Entering new AgentExecutor chain...[0m


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 30
}
].


[32;1m[1;3mThought:I need to perform OCR on the image using the OCRwithVision tool.

Action:
```json
{
  "action": "OCRwithVision",
  "action_input": "/content/images/keyframe_0003.jpg"
}
```[0m

  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 28
}
].


Error processing /content/images/keyframe_0003.jpg: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 28
}
]
Processing image: /content/images/keyframe_0002.jpg


[1m> Entering new AgentExecutor chain...[0m


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 26
}
].


Error processing /content/images/keyframe_0002.jpg: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 26
}
]
Processing image: /content/images/keyframe_0004.jpg


[1m> Entering new AgentExecutor chain...[0m
Error processing /content/images/keyframe_0004.jpg: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.go

In [39]:
discription = pd.read_csv(output_csv_file)
discription = discription['Extracted Text']
final_result = summarize_descriptions(discription)
print(final_result)

TypeError: sequence item 0: expected str instance, float found