In [14]:
import os
import csv
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def setup_driver():
    """Set up a Selenium WebDriver with a manually specified driver."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    # Manually specify ChromeDriver path (update if needed)
    driver_path = "C:/Users/Sushmitha/Downloads/multimodal/task1/chromedriver.exe"
    service = Service(driver_path)
    
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def fetch_and_download_images(driver, query, num_images=50, dataset_path="RandomVision-50"):
    """Fetch image URLs from Bing Search and download them."""
    search_url = f"https://www.bing.com/images/search?q={query}&form=HDRSC2"
    driver.get(search_url)
    time.sleep(2)  # Allow images to load

    images = driver.find_elements(By.CSS_SELECTOR, "img.mimg")
    category_folder = os.path.join(dataset_path, query.replace(" ", "_"))
    os.makedirs(category_folder, exist_ok=True)
    
    metadata = []
    for i, img in enumerate(images[:num_images]):
        src = img.get_attribute("src")
        if src and "http" in src:
            try:
                response = requests.get(src, stream=True, timeout=5)
                if response.status_code == 200:
                    filename = f"{query.replace(' ', '')}{i}.jpg"
                    filepath = os.path.join(category_folder, filename)
                    with open(filepath, 'wb') as file:
                        file.write(response.content)
                    metadata.append([src, filename])
            except Exception as e:
                print(f"Error downloading {src}: {e}")
    return metadata

def save_metadata(metadata, dataset_path="RandomVision-50"):
    """Save metadata (image URLs and filenames) to a CSV file."""
    csv_file = os.path.join(dataset_path, "metadata.csv")
    with open(csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Image URL", "Filename"])
        writer.writerows(metadata)

def main():
    categories = [
        "Mountain Landscapes", "Space Nebulas", "City Skylines", "Wildlife Photography", "Colorful Birds",
        "Underwater Creatures", "Ancient Ruins", "Futuristic Cars", "Robotic Technology", "Abstract Art",
        "Street Graffiti", "Luxury Yachts", "Traditional Dances", "Fireworks Displays", "Hot Air Balloons",
        "Historical Castles", "Deep Sea Diving", "Golden Sunsets", "Extreme Sports", "Forest Waterfalls"
    ]
    
    dataset_path = "RandomVision-50"
    os.makedirs(dataset_path, exist_ok=True)
    
    driver = setup_driver()
    all_metadata = []

    for category in categories:
        print(f"Downloading images for {category}...")
        metadata = fetch_and_download_images(driver, category, dataset_path=dataset_path)
        all_metadata.extend(metadata)

    save_metadata(all_metadata, dataset_path)
    driver.quit()
    print("Dataset collection completed!")

if __name__ == "__main__":
    main()



NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [1]:
pip install requests

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Using cached certifi-2025.1.31-py3-none-any.whl (166 kB)
Downloading charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl (102 kB)
Using cached idna-3.10-py3-none-any.whl (70 kB)
Using cached urllib3-2.3.0-py3-none-any.whl (128 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2025.1.31 charset-normalizer-3.4.1 idna-3.10 requests-2.32

In [3]:
pip install selenium


Collecting selenium
  Using cached selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Using cached trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.9 (from selenium)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio~=0.17->selenium)
  Using cached sniffio-1.3.

In [6]:
pip install webdriver_manager

Collecting webdriver_manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.1 webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [19]:
import os
import csv
import time
import requests
from bs4 import BeautifulSoup

# Define search categories (tourist places)
categories = [
    "Eiffel Tower France", "Great Wall of China", "Machu Picchu Peru", "Statue of Liberty USA", "Colosseum Italy",
    "Taj Mahal India", "Christ the Redeemer Brazil", "Sydney Opera House Australia", "Santorini Greece", "Grand Canyon USA",
    "Mount Fuji Japan", "Niagara Falls Canada", "Pyramids of Giza Egypt", "Petra Jordan", "Big Ben London",
    "Burj Khalifa Dubai", "Times Square New York", "Great Barrier Reef Australia", "Northern Lights Norway", "Banff National Park Canada"
]

# Create dataset folder
dataset_path = "TouristPlaces-50"
os.makedirs(dataset_path, exist_ok=True)

def fetch_image_urls(query, num_images=50):
    """Scrape Bing Image Search for image URLs."""
    search_url = f"https://www.bing.com/images/search?q={query.replace(' ', '+')}&form=HDRSC2"
    headers = {"User-Agent": "Mozilla/5.0"}
    
    response = requests.get(search_url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    image_elements = soup.find_all("img")

    image_urls = []
    for img in image_elements:
        src = img.get("src")
        if src and src.startswith("http"):
            image_urls.append(src)
        if len(image_urls) >= num_images:
            break

    return image_urls

def download_images(query, image_urls):
    """Download images from URLs and save them in categorized folders."""
    category_folder = os.path.join(dataset_path, query.replace(" ", "_"))
    os.makedirs(category_folder, exist_ok=True)
    
    metadata = []
    for i, img_url in enumerate(image_urls):
        try:
            response = requests.get(img_url, stream=True, timeout=5)
            if response.status_code == 200:
                filename = f"{query.replace(' ', '')}{i}.jpg"
                filepath = os.path.join(category_folder, filename)
                
                with open(filepath, "wb") as file:
                    file.write(response.content)
                
                metadata.append([query, img_url, filename])
                print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")
    
    return metadata

# Start the scraping and downloading process
all_metadata = []
for category in categories:
    print(f"Fetching images for: {category}...")
    image_urls = fetch_image_urls(category)
    metadata = download_images(category, image_urls)
    all_metadata.extend(metadata)

# Save metadata to CSV
csv_file = os.path.join(dataset_path, "metadata.csv")
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Category", "Image URL", "Filename"])
    writer.writerows(all_metadata)

print("✅ Image scraping and downloading completed!")


Fetching images for: Eiffel Tower France...
Error downloading https://tse2.mm.bing.net/th?q=Tour+Eiffel+Paris+France&w=42&h=42&c=7&rs=1&p=0&o=5&pid=1.7&mkt=en-IN&cc=IN&setlang=en&adlt=moderate&t=1: HTTPSConnectionPool(host='tse2.mm.bing.net', port=443): Read timed out. (read timeout=5)
Error downloading https://tse4.mm.bing.net/th?q=Eiffel+Tower+in+Paris+France&w=42&h=42&c=7&rs=1&p=0&o=5&pid=1.7&mkt=en-IN&cc=IN&setlang=en&adlt=moderate&t=1: HTTPSConnectionPool(host='tse4.mm.bing.net', port=443): Read timed out. (read timeout=5)
Error downloading https://tse4.mm.bing.net/th?q=Eiffel+Tower+in+Night&w=42&h=42&c=7&rs=1&p=0&o=5&pid=1.7&mkt=en-IN&cc=IN&setlang=en&adlt=moderate&t=1: HTTPSConnectionPool(host='tse4.mm.bing.net', port=443): Read timed out. (read timeout=5)
Downloaded: EiffelTowerFrance3.jpg
Error downloading https://tse4.mm.bing.net/th?q=Climbing+Eiffel+Tower&w=42&h=42&c=7&rs=1&p=0&o=5&pid=1.7&mkt=en-IN&cc=IN&setlang=en&adlt=moderate&t=1: HTTPSConnectionPool(host='tse4.mm.bing.n

In [18]:
pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.13.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.13.3-py3-none-any.whl (186 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.13.3 bs4-0.0.2 soupsieve-2.6
Note: you may need to restart the kernel to use updated packages.
