# selenium


In [None]:
# Install necessary packages
!apt install -y chromium-chromedriver # Install chromium and chromedriver for automated browsing.
!pip install selenium # Install the selenium Python package for controlling the browser programmatically.

In [None]:
# Import required modules
from selenium import webdriver # Controls the Chrome browser
from selenium.webdriver.common.by import By
# The By class is a tool that helps you find elements on a webpage.
# It’s like giving Selenium instructions on where to look for something on the page.
from selenium.webdriver.common.keys import Keys
# The Keys class lets you simulate keyboard actions in the browser.
# You can use it to press keys like Enter, Tab, or even navigate with the arrow keys.
import time # Adds delays to ensure images are loaded during dynamic scrolling
import requests # Downloads the images from the URLs
import os # Handles folder creation and file saving

When you use Selenium to automate a browser, Selenium acts as a "remote control" for the browser. However, the browser doesn’t understand Selenium directly—it needs a translator to communicate.

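The WebDriver is like this translator. It’s a small program specifically designed to "speak the language" of a particular browser (like Chrome, Firefox, or Edge). For Chrome, this program is ChromeDriver, which is what the `chromium-chromedriver` package installed in the first cell provides.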

In [None]:
# Configure Chrome options for Colab
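# --no-sandbox works around Colab's restricted environment, where Chrome's sandbox may prevent it from starting;
# --disable-dev-shm-usage works around Colab's small shared-memory area (/dev/shm), which can make Chrome crash.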
def configure_driver():
    """Build and return a Chrome WebDriver suitable for a Colab runtime.

    The browser is started with three command-line switches:

    * ``--headless``: Colab has no graphical display, so Chrome must run
      without a visible window.
    * ``--no-sandbox``: turns off Chrome's sandbox isolation, which may
      prevent Chrome from starting in environments such as Colab.
    * ``--disable-dev-shm-usage``: stops Chrome from relying on the shared
      memory location (/dev/shm), which may be too small in Colab and
      cause crashes.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it when finished.
    """
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

In [None]:
# Demo: time.sleep() blocks the notebook cell for the given number of seconds (here, 1 second).
time.sleep(1)

In [None]:
# Demo: the same call with a longer pause; this cell takes about 10 seconds to finish.
time.sleep(10)

In [None]:
# Demo of enumerate(): it yields (index, item) pairs, with the index starting at 0,
# so the loop gets both the position and the value without a manual counter.
fruits = ["apple", "banana", "cherry"]
for index, fruit in enumerate(fruits):
    print(f"Index: {index}, Fruit: {fruit}")

In [None]:
# Function to download and save images
def _collect_image_urls(driver, num_images, scroll_pause_time, max_idle_scrolls):
    """Scroll the page loaded in *driver* and gather distinct image URLs.

    Args:
        driver: A Selenium WebDriver that has already navigated to the
            results page.
        num_images: Stop as soon as this many distinct URLs are known.
        scroll_pause_time: Seconds to wait after each scroll so lazily
            loaded images have time to appear.
        max_idle_scrolls: Give up after this many consecutive scrolls that
            reveal no new URL, so the loop cannot run forever when the page
            has fewer images than requested.

    Returns:
        A list of at most ``num_images`` URLs in the order they were first
        seen on the page.
    """
    # A dict is used as an insertion-ordered set so that the URLs kept are
    # the first ones discovered rather than an arbitrary subset.
    found = {}
    idle_scrolls = 0

    while len(found) < num_images and idle_scrolls < max_idle_scrolls:
        # Pressing End on <body> scrolls to the bottom, which triggers the
        # page to load more results.
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(scroll_pause_time)

        count_before = len(found)
        for img in driver.find_elements(By.CSS_SELECTOR, "img"):
            src = img.get_attribute("src")
            # Only real web URLs can be fetched later; this skips missing
            # src attributes and inline "data:" images.
            if src and src.startswith(("http://", "https://")):
                found[src] = None

        if len(found) == count_before:
            idle_scrolls += 1
        else:
            idle_scrolls = 0

    return list(found)[:num_images]


def _download_image(url, file_path, timeout):
    """Stream *url* into *file_path*; return True if the file was written.

    Failures (network errors, non-200 responses, file-system errors) are
    reported with ``print`` and signalled by returning False, so one bad
    URL does not abort the whole run.
    """
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: HTTP {response.status_code}")
                return False
            # "wb": image data is binary and must not be text-decoded.
            with open(file_path, "wb") as f:
                # Write 1 KB at a time so the whole image never has to be
                # held in memory.
                for chunk in response.iter_content(1024):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(f"Failed to download {url}: {e}")
        return False
    return True


def download_and_save_images(search_query, num_images, output_folder, *,
                             scroll_pause_time=2, max_idle_scrolls=3,
                             request_timeout=10):
    """Search Google Images and save the results to a local folder.

    Opens a headless Chrome session via ``configure_driver()``, loads the
    image-search page for *search_query*, scrolls until enough image URLs
    have been collected (or the page stops producing new ones), then
    downloads each URL to ``<output_folder>/image_<n>.jpg``.

    Args:
        search_query: Text to search for. It is URL-encoded before being
            placed in the query string, so spaces and characters such as
            ``&`` are safe.
        num_images: Maximum number of images to download.
        output_folder: Destination directory; created if missing.
        scroll_pause_time: Seconds to wait after each scroll for images to
            load.
        max_idle_scrolls: Number of consecutive scrolls without any new
            image URL after which collection stops.
        request_timeout: Timeout in seconds passed to each image request.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # Imported here so the function does not depend on a new top-level import.
    from urllib.parse import quote_plus

    os.makedirs(output_folder, exist_ok=True)

    search_url = (
        f"https://www.google.com/search?q={quote_plus(search_query)}"
        "&source=lnms&tbm=isch"
    )

    driver = configure_driver()
    try:
        driver.get(search_url)
        image_urls = _collect_image_urls(
            driver, num_images, scroll_pause_time, max_idle_scrolls
        )
    finally:
        # Always shut the browser down, even if loading or scrolling failed;
        # it is not needed for the downloads that follow.
        driver.quit()

    saved_count = 0
    for idx, image_url in enumerate(image_urls, start=1):
        file_path = os.path.join(output_folder, f"image_{idx}.jpg")
        if _download_image(image_url, file_path, request_timeout):
            saved_count += 1

    # Report what was actually written, not how many URLs were found.
    print(f"Saved {saved_count} images to the folder: {output_folder}")

In [None]:
# Run the scraper: search Google Images for "cars" and save up to 50 results
# as image_1.jpg, image_2.jpg, ... inside a folder named "cars".
download_and_save_images(search_query="cars", num_images=50, output_folder="cars")