In [1]:
!pip install -q -U "google-generativeai>=0.7.2"

In [2]:
import google.generativeai as genai

In [3]:
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key = GOOGLE_API_KEY)

In [4]:
# Function to automatically select an available Gemini model
def get_available_gemini_model():
    models_to_try = [
        "gemini-1.5-flash" ,
        "gemini-1.5-flash-8b",  # Try 8B model first
        "gemini-1.5-flash-002"   # Fall back to standard flash model
    ]

    for model_name in models_to_try:
        try:
            model = genai.GenerativeModel(model_name=model_name)
            # If model initializes without error, return it
            print(f"Using model: {model_name}")
            return model
        except Exception as e:
            print(f"Model {model_name} is unavailable. Trying next...")

    raise RuntimeError("No available Gemini models found. Please check API access.")

# Initialize the model
model = get_available_gemini_model()

Using model: gemini-1.5-flash


In [6]:
# Extra code to visualize the img
import PIL.Image
image1 = "amazon-home.png"
image2 = "amazon-search.png"
image3= "product-detail.png"
image4 = "add-to-cart.png"
image5= "confirmation.png"

img1 = PIL.Image.open(image1)
img2 = PIL.Image.open(image2)
img3 = PIL.Image.open(image3)
img4 = PIL.Image.open(image4)
img5 = PIL.Image.open(image5)

In [7]:
# Fucntion to generate descriptions for above images based on the prompt
import google.generativeai as genai
from PIL import Image

def describe_img(image_path):
     """ Generate description of the image using gemini"""
     try:
        # Open the image
        img = Image.open(image_path)
        # Write the prompt for that image
        prompt =""" This image contains home screen of an app. Given the screenshot describe the site in as much detail as possible
            making sure you note all the items on the screen. Return output in json format : {description:description , items:[item1 ,
            item2 , item3 , etc]}"""
        # Write the response
        response = model.generate_content([prompt, img])
        description = response.text
        return description if description else "No description generated"
     except Exception as e:
        return f"Error:{e}"

image_paths =[
    "amazon-home.png",
    "amazon-search.png",
    "product-detail.png",
    "add-to-cart.png",
    "confirmation.png"]

image_descriptions = [describe_img(img) for img in image_paths]

for i, desc in enumerate(image_descriptions,1):
    print(f"Image {i}: {desc}")

Image 1: ```json
{
  "description": "This is a screenshot of the Amazon.com homepage. The top navigation bar displays options for searching, account management, returns, and the shopping cart. The main section showcases a banner for \"Shop Books\", followed by sections featuring \"Pick up where you left off\" (showing scarves), \"Shop for your home essentials\" (showing cleaning tools and storage), \"Get your game on\" (showing a computer), and \"Top categories in Kitchen appliances\" (showing a pressure cooker).  The location is set to Pune, India. A message indicates that the user is currently on the .com site, with an option to switch to the .in site for local delivery.",
  "items": [
    {"type": "navigation", "item": "All"},
    {"type": "navigation", "item": "Rufus"},
    {"type": "navigation", "item": "Today's Deals"},
    {"type": "navigation", "item": "Buy Again"},
    {"type": "navigation", "item": "Customer Service"},
    {"type": "navigation", "item": "Registry"},
    {"typ

In [8]:
# Use the above function to generate test-cases from the above decriptions:
import json
def generate_prompt_for_test_cases(descriptions):
    """ Generate detailed prompt for the Gemini api to create structured test cases for the given screenshots"""
    prompt =""" You are an AI test case generator. Based on the screenshots and their descriptions
            generate 3 meaningful test cases for each screenshot. The test cases should be structured
            to verify user interactions, UI elements and expected system behaviour.\

            ### **Guidelines:**
            - The test cases should be a bit generic and applicable to any application.
            - Each test case should comtain **Title, Objective and Test Steps.**
            - The steps should include :
            - **Action** - The user interaction(e.g. "Click the 'Search' button" , "Open the Amazon.in website in a browser")
            - **Data(if applicable)**- Any input required(e.g. "Search term :'Laptop Stand'")
            - **Expected Result** - What should happen after the action(e.g. "The Amazon homepage is displayed with a search bar visible at the top.")


            ---
            ##**Example Format:**
            [{
              Test Case 1
              "Title" : "Search for a Product on Amazon.in" ,

              "Objective" : "To ensure the search functionality on Amazon.in returns relevant results when a user
              searches for a product.",

              "Test Steps":[
              {"Step 1":
              "Action": "Open the Amazon.in website in a browser.",
              "Data": "https://amazon.in",
              "Expected Result": "The Amazon homepage is displayed with a search bar visible at the top."},

              { "Step 2":
              "Action": "Enter the product name (e.g., 'laptop stand') in the search bar.",
              "Data" : "laptop stand",
              "Expected Result" : "The search bar displays the entered text ('laptop stand')." },

              {"Step 3":
              "Action": "Click on the “Search” icon or press “Enter” on the keyboard.",
              "Data": "None",
              "Expected Result": "The page refreshes, and the product search results related to 'laptop stand' are
              displayed."},

              {"Step 4":
              "Action": "Verify that the search results are relevant to the product searched.",
              "Data": "None",
              "Expected Result": "The page shows a list of laptops, including product names, images, prices,
              and other relevant information."}
              ]}]}]

            """
            # Add image descriptions to the prompt:
    for i,description in enumerate(image_descriptions):
        prompt += f"Image {i+1}: {description}\n"

    prompt += "Ensure each image has exactly 3 structured test cases.Ensure the test cases cover functional correctness, UI validation, and user interaction validation. Output must be strictly in JSON format."

    return prompt

model = genai.GenerativeModel('gemini-1.5-flash')
test_case_prompt = generate_prompt_for_test_cases(image_descriptions)
response = model.generate_content(test_case_prompt)
modified_response = response.text.replace("```json", "").replace("```", "")
print(modified_response)


[
  {
    "image": 1,
    "testCases": [
      {
        "Title": "Verify Homepage Navigation and Elements",
        "Objective": "To ensure all navigational elements on the Amazon.com homepage are functional and displayed correctly.",
        "TestSteps": [
          {
            "Step": 1,
            "Action": "Open the Amazon.com website in a browser.",
            "Data": "https://www.amazon.com",
            "ExpectedResult": "The Amazon homepage loads successfully, displaying all expected elements (navigation bar, search bar, featured sections, etc.)."
          },
          {
            "Step": 2,
            "Action": "Click on each navigation item in the top navigation bar (All, Rufus, Today's Deals, etc.).",
            "Data": "None",
            "ExpectedResult": "Each navigation link should correctly redirect to the respective section or page.  Error messages should be absent."
          },
          {
            "Step": 3,
            "Action": "Verify the location a

In [None]:
# Convert the test-cases into a structured json response
# GENERATE STRUCTURED JSON
def generate_test_cases(image_paths):
    test_case_prompt = generate_prompt_for_test_cases(image_descriptions)
    model = genai.GenerativeModel('gemini-1.5-flash')
    response = model.generate_content(test_case_prompt)
    modified_response = response.text.replace("```json", "").replace("```", "")
  # Try parsing JSON
    try:
        test_cases = json.loads(modified_response)
    except json.JSONDecodeError:
        test_cases = {"error": "Failed to parse response as JSON."}

    return test_cases
test_cases_output = generate_test_cases(image_paths)
print(json.dumps(test_cases_output, indent=4))

In [10]:
def save_output(test_cases, filename="test_cases.json"):
    """Save the generated test cases to a JSON file."""
    try:
      with open(filename, "w" , encoding ="utf-8") as f:
          json.dump(test_cases, f, indent=4, ensure_ascii=False)
    except Exception as e:
        print(f"Error saving output to {filename}: {e}")

save_output(test_cases_output)

In [None]:
import json
from PIL import Image
import google.generativeai as genai

def get_ui_elements_coordinates(image_path):
    """Extract UI elements with their coordinates from a screenshot."""

    # Open the image
    img = Image.open(image_path)

    # Define prompt for Gemini model
    prompt = """Analyze the image screenshot and detect interactive UI elements like input fields,
                buttons, checkboxes, etc.
                For each detected element, return:
                - The name/label of the element if available.
                - The type of the element (button, text field, dropdown, etc.).
                - The bounding box coordinates in pixels as {"x": X, "y": Y, "width": W, "height": H}.
                Return the response in JSON format.
             """

    # Generate response using Gemini model
    response = model.generate_content([prompt, img])

    # Process response text
    modified_response = response.text.replace("```json", "").replace("```", "").strip()

    try:
        coordinates = json.loads(modified_response)  # Convert string to JSON
    except json.JSONDecodeError:
        coordinates = {"error": "Failed to parse response as JSON."}
        coordinates =[]

    return coordinates

# List of images
image_paths = [
    "amazon-home.png",
    "amazon-search.png",
    "product-detail.png",
    "add-to-cart.png",
    "confirmation.png"
]

# Get UI element coordinates for each image
ui_coordinates = {image: get_ui_elements_coordinates(image) for image in image_paths}

# Print results
for image, coordinates in ui_coordinates.items():
    print(f"Image: {image} -> UI Elements:{coordinates}")



In [27]:
for image , coordinates in ui_coordinates.items():
  save_output(coordinates , f"{image}_coordinates.json")


In [28]:
def save_ui_coordinates(ui_coordinates, filename="ui_coordinates.json"):
    """Save the generated test cases to a JSON file."""
    try:
      with open(filename, "w" , encoding ="utf-8") as f:
          json.dump(ui_coordinates, f, indent=4, ensure_ascii=False)
    except Exception as e:
        print(f"Error saving output to {filename}: {e}")

save_output(ui_coordinates)

In [29]:
with open("ui_coordinates.json", "w") as f:
    json.dump(ui_coordinates, f, indent=4)

# Print output
print(json.dumps(ui_coordinates, indent=4))

{
    "amazon-home.png": [
        {
            "name": "Search Amazon",
            "type": "text field",
            "bounding_box": {
                "x": 267,
                "y": 78,
                "width": 180,
                "height": 28
            }
        },
        {
            "name": "All",
            "type": "dropdown",
            "bounding_box": {
                "x": 187,
                "y": 78,
                "width": 50,
                "height": 28
            }
        },
        {
            "name": "Go",
            "type": "button",
            "bounding_box": {
                "x": 456,
                "y": 78,
                "width": 28,
                "height": 28
            }
        },
        {
            "name": "Account & Lists",
            "type": "dropdown",
            "bounding_box": {
                "x": 746,
                "y": 78,
                "width": 130,
                "height": 28
            }
        },
        {
        

In [37]:

import os
import json
from PIL import Image

def crop_ui_elements(image_path, ui_coordinates):
    """Crop UI elements from a screenshot based on bounding box coordinates.

    Args:
        image_path (str): Path to the image to crop.
        ui_coordinates (list): List of UI elements with bounding box coordinates.

    Returns:
        dict: Dictionary of cropped images with UI element names as keys.
    """
    img = Image.open(image_path).convert("RGB")  # Ensure RGB mode for quality
    width, height = img.size  # Get actual image size
    cropped_images = {}

    for i, element in enumerate(ui_coordinates):
        # Handle different bounding box key formats
        bbox_key = "bounding_box" if "bounding_box" in element else "bbox" if "bbox" in element else None
        if not bbox_key:
            print(f"Skipping element {i} in {image_path}: No bounding box found")
            continue  # Skip if no bounding box exists

        # Extract bounding box coordinates safely
        try:
            x = element[bbox_key]['x']
            y = element[bbox_key]['y']
            w = element[bbox_key]['width']
            h = element[bbox_key]['height']

            # 🔹 Normalize coordinates if needed
            if 0 < x < 1 and 0 < y < 1 and w <= 1 and h <= 1:
                x = int(x * width)
                y = int(y * height)
                w = int(w * width)
                h = int(h * height)

            # Ensure coordinates are within image bounds
            x, y = max(0, x), max(0, y)
            w, h = min(width - x, w), min(height - y, h)

            # Define the cropping box (left, upper, right, lower)
            crop_box = (x, y, x + w, y + h)
            cropped_img = img.crop(crop_box)  # Crop the image

            # Get element name, fallback to generic name if missing
            # element_name = element.get("name", f"Element_{i}").replace(" ", "_")

            # Store cropped image
            cropped_images[element_name] = cropped_img

        except KeyError as e:
            print(f"Error: Missing key {e} in element {i} of {image_path}")

    return cropped_images

# Load UI coordinates from JSON file
with open("ui_coordinates.json", "r") as file:
    ui_coordinates = json.load(file)

# List of images to process
image_paths = [
    "amazon-home.png",
    "amazon-search.png",
    "product-detail.png",
    "add-to-cart.png",
    "confirmation.png"
]

# Output folder for cropped images
output_dir = "cropped_images"
os.makedirs(output_dir, exist_ok=True)

# Process and save cropped UI elements
cropped_elements_per_image = {}

for image in image_paths:
    if image in ui_coordinates:  # Ensure UI coordinates exist for the image
        cropped_elements = crop_ui_elements(image, ui_coordinates[image])
        cropped_elements_per_image[image] = cropped_elements

        # Save cropped images
        for element_name, cropped_img in cropped_elements.items():
            output_path = os.path.join(output_dir, f"{image}_{element_name}.png")
            cropped_img.save(output_path, quality=100)  # Ensure max quality
            print(f"Saved: {output_path}")

print("\nBatch processing complete. Cropped UI elements saved.")


Saved: cropped_images/amazon-home.png_See_more.png
Saved: cropped_images/amazon-search.png_See_more.png
Saved: cropped_images/product-detail.png_See_more.png
Saved: cropped_images/add-to-cart.png_See_more.png
Saved: cropped_images/confirmation.png_See_more.png

✅ Batch processing complete. Cropped UI elements saved.


In [35]:
import shutil
from google.colab import files

folder_path = "cropped_images"  # Change this to your folder name
zip_filename = folder_path + ".zip"

# Zip the folder
shutil.make_archive(folder_path, 'zip', folder_path)

# Download the zip file
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
from PIL import Image, ImageDraw, ImageFont

def highlight_ui_elements(image_path, ui_coordinates, output_path=None):
    """
    Highlight UI elements in the screenshot by drawing bounding boxes.

    Args:
        - image_path (str): Path to the original image.
        - ui_coordinates (list): List of UI elements with bounding box coordinates.
        - output_path (str, optional): Path to save the output image with highlighted elements.

    Returns:
        - Image object with highlighted elements.
    """
    # Load the image
    img = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(img)
    width, height  = img.size

    # Loop through each UI element and draw bounding boxes
    for i, element in enumerate(ui_coordinates):
        # Handle different bounding box key formats
        bbox_key = "bounding_box" if "bounding_box" in element else "bbox" if "bbox" in element else None
        if not bbox_key:
            print(f"Skipping element {i} in {image_path}: No bounding box found")
            continue  # Skip if no bounding box exists

    try:
            x = element[bbox_key]['x']
            y = element[bbox_key]['y']
            w = element[bbox_key]['width']
            h = element[bbox_key]['height']

            # Normalize coordinates if needed
            if 0 < x < 1 and 0 < y < 1 and w <= 1 and h <= 1:
                x = int(x * width)
                y = int(y * height)
                w = int(w * width)
                h = int(h * height)

            # Ensure coordinates are within image bounds
            x, y = max(0, x), max(0, y)
            w, h = min(width - x, w), min(height - y, h)

            # Draw the bounding box (left, upper, right, lower)
            draw.rectangle([x, y, x + w, y + h], outline="red", width=3)  # Red color for highligh
    except KeyError as e:
            print(f"Error: Missing key {e} in element {i} of {image_path}")
    img.save(output_path, quality=100)  # Save with maximum quality
    print(f"Highlighted image saved: {output_path}")

# Load UI coordinates from JSON file
with open("ui_coordinates.json", "r") as file:
    ui_coordinates = json.load(file)

# List of images to process
image_paths = [
    "amazon-home.png",
    "amazon-search.png",
    "product-detail.png",
    "add-to-cart.png",
    "confirmation.png"
]
# Output folder for highlighted images
output_dir = "highlighted_images"
os.makedirs(output_dir, exist_ok=True)

# Process and save highlighted UI elements
for image in image_paths:
    if image in ui_coordinates:  # Ensure UI coordinates exist for the image
        # Set output path for the highlighted image
        output_path = os.path.join(output_dir, f"highlighted_{image}")

        # Highlight UI elements and save the image
        highlight_ui_elements(image, ui_coordinates[image], output_path)

print("\nHighlighting complete. Images with highlighted UI elements saved.")