In [None]:
import requests
import os
from PIL import Image
from io import BytesIO
from datetime import datetime
from dotenv import load_dotenv
import time
import json
import hashlib

In [None]:
# Function to download and save images from Pexels
def auto_pexels_images(query, max_batches=5, batch_size=80, wait_time=3600):
    """
    Automatically fetch and save images from Pexels in batches, respecting API quotas.

    Runs forever: each pass downloads up to ``max_batches * batch_size`` new
    images, persists the set of downloaded URLs, sleeps ``wait_time`` seconds,
    and starts over from page 1 (already-downloaded URLs are skipped).

    Relies on names defined elsewhere in the notebook: ``PEXELS_API_KEY``,
    ``PEXELS_API_URL``, ``SAVE_DIR``, ``load_downloaded_urls`` and
    ``save_downloaded_urls``.

    Args:
        query (str): Search query for images (e.g., "people's faces").
        max_batches (int): Maximum number of batches to fetch per run.
        batch_size (int): Number of images per batch (sent as ``per_page``).
        wait_time (int): Time to wait between full runs, in seconds.

    Returns:
        Never returns normally; stop it by interrupting the kernel. The
        downloaded-URL set is saved even when a run is interrupted.
    """
    MAX_RETRIES = 3        # Number of attempts for a single image download
    SEARCH_TIMEOUT = 30    # Seconds before a search request is abandoned
    DOWNLOAD_TIMEOUT = 120 # Seconds before an image download is abandoned
    api_call_count = 0     # Running count of search requests across all runs

    target = max_batches * batch_size
    headers = {"Authorization": PEXELS_API_KEY}

    while True:
        # os.listdir below raises if the directory does not exist yet.
        os.makedirs(SAVE_DIR, exist_ok=True)

        # Load previously downloaded URLs
        downloaded_urls = load_downloaded_urls()

        # Continue numbering after the highest existing image_<n>.jpg
        existing_numbers = []
        for name in os.listdir(SAVE_DIR):
            if name.startswith("image_") and name.endswith(".jpg"):
                stem = name.split("_")[1].split(".")[0]
                if stem.isdigit():
                    existing_numbers.append(int(stem))
        start_index = max(existing_numbers) + 1 if existing_numbers else 1

        total_downloaded = 0
        page = 1

        try:
            while total_downloaded < target:
                params = {
                    "query": query,
                    "per_page": batch_size,
                    "page": page
                }

                try:
                    response = requests.get(
                        PEXELS_API_URL, headers=headers, params=params, timeout=SEARCH_TIMEOUT
                    )
                except requests.RequestException as e:
                    # Treat a network failure like an API error: end this run
                    # and try again after the regular wait.
                    print(f"Error: search request failed: {e}")
                    break

                api_call_count += 1  # Increment API call counter
                print(f"API Calls Made: {api_call_count}")

                if response.status_code == 429:  # Handle rate limiting (Too Many Requests)
                    print("Rate limit reached. Waiting for 60 seconds before retrying...")
                    time.sleep(60)  # Wait for 60 seconds before retrying
                    continue  # Retry the same request

                if response.status_code != 200:
                    print(f"Error: {response.status_code} - {response.text}")
                    break

                photos = response.json().get("photos", [])
                if not photos:
                    print("No more photos available.")
                    break

                for photo in photos:
                    # Stop mid-page once the target is met so a run never overshoots.
                    if total_downloaded >= target:
                        break

                    image_url = photo["src"]["original"]

                    if image_url in downloaded_urls:
                        print(f"Skipped duplicate URL: {image_url}")
                        continue

                    for attempt in range(MAX_RETRIES):
                        try:
                            image_response = requests.get(image_url, timeout=DOWNLOAD_TIMEOUT)
                        except requests.RequestException as e:
                            print(f"Attempt {attempt + 1} failed for {image_url}: {e}")
                            continue

                        if image_response.status_code == 200:
                            file_name = os.path.join(SAVE_DIR, f"image_{start_index + total_downloaded}.jpg")
                            with open(file_name, "wb") as f:
                                f.write(image_response.content)
                            print(f"Downloaded: {file_name}")

                            downloaded_urls.add(image_url)
                            total_downloaded += 1
                            break

                        print(f"Attempt {attempt + 1} failed for {image_url}: HTTP {image_response.status_code}")
                    else:
                        # Every attempt failed (exception or non-200 status): record the URL.
                        print(f"Failed to download after {MAX_RETRIES} attempts: {image_url}")
                        with open("failed_urls.log", "a") as log_file:
                            log_file.write(image_url + "\n")

                page += 1  # Move to the next page
        finally:
            # Persist progress even if the run is interrupted or crashes,
            # so finished downloads are not fetched again next time.
            save_downloaded_urls(downloaded_urls)

        print(f"Total images downloaded: {total_downloaded}")
        print(f"Process completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")

        # Wait before the next full run
        print(f"Waiting for {wait_time / 60} minutes before the next run...")
        time.sleep(wait_time)

# Start the Pexels downloader for the desired search query.
# NOTE(review): auto_pexels_images loops forever, so this cell blocks until the
# kernel is interrupted. It also reads PEXELS_API_KEY, PEXELS_API_URL and
# load_downloaded_urls, which are defined in later cells — run those first.
auto_pexels_images(
    query="people's faces",
    max_batches=5,
    batch_size=80,
    wait_time=3600,
)


In [None]:
# Configuration: credentials and search settings shared by the cells in this notebook
load_dotenv()  # read variables from a local .env file into the process environment

PEXELS_API_KEY = os.environ.get("PEXELS_API_KEY")
if not PEXELS_API_KEY:
    raise ValueError("PEXELS_API_KEY not found in .env file")

# Pexels search endpoint
PEXELS_API_URL = "https://api.pexels.com/v1/search"

# Google Custom Search settings
# NOTE(review): API_KEY (passed to google_search_images) and SAVE_DIR (read by
# auto_pexels_images) are not defined in this cell — confirm they are set elsewhere.
CSE_ID = "a22d39ffb0f5145d9"
QUERY = "people's faces"
OUTPUT_DIR = "google_images"
NUM_IMAGES = 100  # Google API allows up to 100/day for free tier access
BATCH_SIZE = 20   # Number of images per batch (google_search_images caps each request at 10)

In [27]:
# Persistent Tracking: Load previously downloaded URLs
def load_downloaded_urls(file_path="downloaded_google_urls.json"):
    """
    Read the set of already-downloaded URLs from a JSON file.

    Args:
        file_path (str): Path of the JSON file holding a list of URLs.

    Returns:
        set: The stored URLs, or an empty set when the file is missing,
        empty, or not valid JSON.
    """
    # Nothing persisted yet: start from scratch.
    if not os.path.exists(file_path):
        return set()

    try:
        with open(file_path, "r") as handle:
            known_urls = set(json.load(handle))
        print(f"Loaded {len(known_urls)} previously downloaded URLs from {file_path}.")
        return known_urls
    except (json.JSONDecodeError, ValueError):
        # An unreadable tracking file is treated the same as a missing one.
        print(f"File {file_path} is empty or invalid. Starting with an empty URL set.")
        return set()

In [28]:
# Persistent Tracking: Save downloaded URLs
def save_downloaded_urls(downloaded_urls, file_path="downloaded_google_urls.json"):
    """
    Write the downloaded-URL collection to a JSON file as a list.

    Args:
        downloaded_urls (iterable): URLs to persist (typically a set).
        file_path (str): Destination JSON file; overwritten if it exists.
    """
    with open(file_path, "w") as out_file:
        # JSON has no set type, so the collection is stored as a list.
        json.dump(list(downloaded_urls), out_file)

    url_count = len(downloaded_urls)
    print(f"Saved {url_count} downloaded URLs to {file_path}.")

In [29]:
# Persistent Tracking: Load previously computed image hashes
def load_image_hashes(file_path="image_hashes.json"):
    """
    Read the set of previously computed image hashes from a JSON file.

    Args:
        file_path (str): Path of the JSON file holding a list of hash strings.

    Returns:
        set: The stored hashes, or an empty set when the file is missing,
        empty, or does not contain a valid JSON list.
    """
    if not os.path.exists(file_path):
        return set()

    try:
        with open(file_path, "r") as f:
            hashes = set(json.load(f))
    except (json.JSONDecodeError, ValueError, TypeError):
        # Tolerate an empty or corrupt tracking file, as load_downloaded_urls
        # does, instead of crashing the whole download run on startup.
        print(f"File {file_path} is empty or invalid. Starting with an empty hash set.")
        return set()

    print(f"Loaded {len(hashes)} previously computed image hashes from {file_path}.")
    return hashes

In [30]:
# Persistent Tracking: Save image hashes
def save_image_hashes(image_hashes, file_path="image_hashes.json"):
    """
    Write the image-hash collection to a JSON file as a list.

    Args:
        image_hashes (iterable): Hash strings to persist (typically a set).
        file_path (str): Destination JSON file; overwritten if it exists.
    """
    with open(file_path, "w") as out_file:
        # JSON has no set type, so the collection is stored as a list.
        json.dump(list(image_hashes), out_file)

    hash_count = len(image_hashes)
    print(f"Saved {hash_count} image hashes to {file_path}.")

In [31]:
def compute_image_hash(file_path):
    """
    Return the MD5 hex digest of a file's raw bytes.

    Args:
        file_path (str): Path of the file to hash.

    Returns:
        str: 32-character hexadecimal MD5 digest of the file contents.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # Feed the file to the hasher piece by piece; the digest is the same
        # as hashing the whole content at once.
        for chunk in iter(lambda: stream.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()

In [32]:
# Function to download and save images from Google Custom Search API
def google_search_images(api_key, cse_id, query, output_dir, num_images=900, batch_size=10):
    """
    Search and download images from Google Custom Search API.

    Images are saved as ``<n>.jpg`` in ``output_dir``, numbered after the
    highest existing numeric file name. URLs and content hashes that were
    already seen (tracked via ``load_downloaded_urls`` / ``load_image_hashes``)
    are skipped. Tracking state is written back even when the run is
    interrupted.

    Args:
        api_key (str): Google Custom Search API key.
        cse_id (str): Custom Search Engine ID.
        query (str): Search query (e.g., "people's faces").
        output_dir (str): Directory to save images.
        num_images (int): Number of images to fetch.
        batch_size (int): Number of images per batch (capped at 10 per request).
    """
    # Base URL for Google Custom Search
    url = "https://www.googleapis.com/customsearch/v1"

    MAX_PER_REQUEST = 10     # Max 10 results per request
    # The API only serves the first 100 results of a query; asking beyond
    # that returns an error, so stop paging there.
    MAX_RESULT_INDEX = 100
    REQUEST_TIMEOUT = 30     # Seconds before any HTTP request is abandoned

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load previously downloaded URLs
    downloaded_urls = load_downloaded_urls()

    # Load previously computed image hashes
    image_hashes = load_image_hashes()

    # Check existing images to determine the starting index
    existing_files = [f for f in os.listdir(output_dir) if f.endswith(".jpg")]
    existing_numbers = [int(f.split(".")[0]) for f in existing_files if f.split(".")[0].isdigit()]
    start_index = max(existing_numbers) + 1 if existing_numbers else 1

    per_request = max(1, min(batch_size, MAX_PER_REQUEST))
    downloaded = 0
    start = 1

    try:
        while downloaded < num_images and start <= MAX_RESULT_INDEX:
            page_size = min(per_request, MAX_RESULT_INDEX - start + 1)
            params = {
                "q": query,                     # Search query
                "cx": cse_id,                   # Custom Search Engine ID
                "key": api_key,                 # API key
                "searchType": "image",          # Specify image search
                "num": page_size,               # Results requested for this page
                "start": start                  # Pagination
            }

            try:
                response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(f"Error: search request failed: {e}")
                break

            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                break

            items = response.json().get("items")
            if not items:
                # No results on this page means the result set is exhausted.
                break

            # Download images
            for item in items:
                if downloaded >= num_images:
                    break

                image_url = item["link"]

                if image_url in downloaded_urls:
                    print(f"Skipped duplicate URL: {image_url}")
                    continue

                try:
                    image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
                    # Without this check an HTML error page would be saved as a .jpg.
                    image_response.raise_for_status()
                    img_data = image_response.content

                    # Hash the bytes in memory (same MD5 digest compute_image_hash
                    # yields for the saved file) so duplicates never touch the disk.
                    image_hash = hashlib.md5(img_data).hexdigest()
                    if image_hash in image_hashes:
                        print(f"Skipped duplicate content: {image_url}")
                        # Remember the URL so it is not fetched again on later runs.
                        downloaded_urls.add(image_url)
                        continue

                    file_name = os.path.join(output_dir, f"{start_index + downloaded}.jpg")
                    with open(file_name, "wb") as f:
                        f.write(img_data)
                    print(f"Downloaded: {file_name}")

                    image_hashes.add(image_hash)
                    downloaded_urls.add(image_url)
                    downloaded += 1
                except Exception as e:
                    # Best effort: one bad image must not abort the whole run.
                    print(f"Error downloading {image_url}: {e}")

            # Advance by the number of results actually requested, so no
            # results are skipped when fewer than 10 are asked for per page.
            start += page_size
    finally:
        # Persist progress even on KeyboardInterrupt or an unexpected error.
        # Save updated downloaded URLs
        save_downloaded_urls(downloaded_urls)

        # Save updated image hashes
        save_image_hashes(image_hashes)

    print(f"Total images downloaded: {downloaded}")
    print(f"Process completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")

In [33]:
# Run the Google image download with the settings from the configuration cell.
# NOTE(review): API_KEY is not defined in any cell visible here — confirm it is
# set (e.g. loaded from the environment) before running this cell.
google_search_images(
    api_key=API_KEY,
    cse_id=CSE_ID,
    query=QUERY,
    output_dir=OUTPUT_DIR,
    num_images=NUM_IMAGES,
    batch_size=BATCH_SIZE,
)

File downloaded_google_urls.json is empty or invalid. Starting with an empty URL set.
Downloaded: google_images/101.jpg
Downloaded: google_images/102.jpg
Downloaded: google_images/103.jpg
Downloaded: google_images/104.jpg
Downloaded: google_images/105.jpg
Downloaded: google_images/106.jpg
Downloaded: google_images/107.jpg
Downloaded: google_images/108.jpg
Downloaded: google_images/109.jpg
Downloaded: google_images/110.jpg
Downloaded: google_images/111.jpg
Downloaded: google_images/112.jpg
Downloaded: google_images/113.jpg
Downloaded: google_images/114.jpg
Downloaded: google_images/115.jpg
Downloaded: google_images/116.jpg
Downloaded: google_images/117.jpg
Downloaded: google_images/118.jpg
Downloaded: google_images/119.jpg
Downloaded: google_images/120.jpg
Downloaded: google_images/121.jpg
Downloaded: google_images/122.jpg
Downloaded: google_images/123.jpg
Downloaded: google_images/124.jpg
Downloaded: google_images/125.jpg
Downloaded: google_images/126.jpg
Downloaded: google_images/127.

KeyboardInterrupt: 