In [None]:
import requests
import os
from PIL import Image
from io import BytesIO
from datetime import datetime
from dotenv import load_dotenv
import time
import json

In [2]:
# Folder that receives the downloaded Pexels images. It is created here
# (together with any missing parent folders) when it does not exist yet;
# exist_ok=True makes re-running this cell harmless.
SAVE_DIR = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust"
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# Pexels photo-search endpoint queried by the downloader
PEXELS_API_URL = "https://api.pexels.com/v1/search"

# Bring the variables declared in .env into the process environment,
# then read the API key from there
load_dotenv()
PEXELS_API_KEY = os.environ.get("PEXELS_API_KEY")

# Stop right away when the key is missing or empty
if PEXELS_API_KEY is None or PEXELS_API_KEY == "":
    raise ValueError("PEXELS_API_KEY not found in .env file")

In [4]:
# Persistent Tracking: Load previously downloaded URLs
def load_downloaded_urls(file_path="/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/downloaded_urls_pexels.json"):
    """
    Read the set of already-downloaded image URLs from a JSON tracking file.

    Args:
        file_path (str): Location of the JSON file holding the URLs.

    Returns:
        set: The entries stored in the file, or an empty set when the file
        does not exist.
    """
    # Nothing tracked yet: start from an empty set
    if not os.path.exists(file_path):
        return set()

    with open(file_path, "r") as tracking_file:
        known_urls = set(json.load(tracking_file))

    print(f"Loaded {len(known_urls)} previously downloaded URLs from {file_path}.")
    return known_urls

In [5]:
# Persistent Tracking: Save downloaded URLs
def save_downloaded_urls(downloaded_urls, file_path="/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/downloaded_urls_pexels.json"):
    """
    Persist the downloaded image URLs to a JSON tracking file.

    The data is first written to a sibling "<file_path>.tmp" file and then
    moved over the target with os.replace, so an interruption or a
    serialisation error while writing never leaves a truncated tracking file
    behind: the previous file stays in place until the new one is complete.

    Args:
        downloaded_urls: Sized iterable of URLs (normally a set of str).
        file_path (str): Location of the JSON file to write.

    Raises:
        OSError: If the temp file cannot be written or moved into place.
        TypeError: If an entry is not JSON serialisable.
    """
    temp_path = f"{file_path}.tmp"
    try:
        with open(temp_path, "w") as f:
            json.dump(list(downloaded_urls), f)
        # Swap the finished file in; the old content is only replaced here
        os.replace(temp_path, file_path)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial temp file and re-raise
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    print(f"Saved {len(downloaded_urls)} downloaded URLs to {file_path}.")

In [6]:
def _next_image_index(directory):
    """Return the next free N for files named image_N.jpg in directory (1 when there are none)."""
    numbers = []
    for name in os.listdir(directory):
        if name.startswith("image_") and name.endswith(".jpg"):
            stem = name.split("_")[1].split(".")[0]
            if stem.isdigit():
                numbers.append(int(stem))
    return max(numbers) + 1 if numbers else 1


def _download_image(image_url, file_name, max_retries):
    """Fetch image_url into file_name, trying up to max_retries times; return True on success."""
    for attempt in range(max_retries):
        try:
            image_response = requests.get(image_url, timeout=120)
        except (requests.ConnectionError, requests.Timeout) as e:
            print(f"Attempt {attempt + 1} failed for {image_url}: {e}")
            continue

        if image_response.status_code == 200:
            with open(file_name, "wb") as f:
                f.write(image_response.content)
            print(f"Downloaded: {file_name}")
            return True

        # A non-200 answer is a failed attempt too; report it instead of retrying silently
        print(f"Attempt {attempt + 1} failed for {image_url}: HTTP {image_response.status_code}")

    print(f"Failed to download after {max_retries} attempts: {image_url}")
    with open("failed_urls.log", "a") as log_file:
        log_file.write(image_url + "\n")
    return False


# Function to download and save images from Pexels
# max_batches=200,  == set based on a 200/hr api call max policy by pexels
# batch_size=80 == set on a max 80 imgs / api call policy
# wait_time=3600 == set to automate hourly calls
def auto_pexels_images(query, max_batches=200, batch_size=80, wait_time=3600):
    """
    Repeatedly fetch and save images from Pexels, one "run" per wait_time.

    Each run pages through the search results for `query`, skips URLs that
    are already in the tracking set, and stores new images in SAVE_DIR as
    image_<n>.jpg, continuing the numbering of the files already present.
    A run ends when max_batches * batch_size new images have been saved,
    when the search returns no photos, when it answers with a status other
    than 200/429, or when the search request itself fails. A 429 answer is
    retried after 60 seconds.

    The tracking set is saved when a run ends for any reason, including
    KeyboardInterrupt or an unexpected exception, so images already written
    to disk are not downloaded again next time.

    This function loops forever; interrupt the kernel to stop it.

    Args:
        query (str): Search query for images (e.g., "people's faces").
        max_batches (int): Together with batch_size, caps the number of new
            images saved per run (max_batches * batch_size).
        batch_size (int): Number of results requested per search call.
        wait_time (int): Time to wait between full runs, in seconds.
    """
    MAX_RETRIES = 3  # Attempts per image before it is logged as failed
    SEARCH_TIMEOUT = 30  # Seconds to wait for the search endpoint before giving up
    api_call_count = 0  # Search calls answered so far, across all runs
    headers = {"Authorization": PEXELS_API_KEY}

    while True:
        # Load previously downloaded URLs
        downloaded_urls = load_downloaded_urls()

        # Continue numbering after the highest existing image_<n>.jpg
        start_index = _next_image_index(SAVE_DIR)

        image_quota = max_batches * batch_size
        total_downloaded = 0
        page = 1

        try:
            while total_downloaded < image_quota:
                params = {
                    "query": query,
                    "per_page": batch_size,
                    "page": page
                }

                try:
                    response = requests.get(
                        PEXELS_API_URL, headers=headers, params=params, timeout=SEARCH_TIMEOUT
                    )
                except (requests.ConnectionError, requests.Timeout) as e:
                    # Treat like any other search error: end this run, try again next run
                    print(f"Search request failed: {e}")
                    break

                api_call_count += 1
                print(f"API Calls Made: {api_call_count}")

                if response.status_code == 429:  # Handle rate limiting (Too Many Requests)
                    print("Rate limit reached. Waiting for 60 seconds before retrying...")
                    time.sleep(60)
                    continue  # Retry the same page

                if response.status_code != 200:
                    print(f"Error: {response.status_code} - {response.text}")
                    break

                photos = response.json().get("photos", [])
                if not photos:
                    print("No more photos available.")
                    break

                for photo in photos:
                    image_url = photo["src"]["original"]

                    if image_url in downloaded_urls:
                        print(f"Skipped duplicate URL: {image_url}")
                        continue

                    file_name = os.path.join(SAVE_DIR, f"image_{start_index + total_downloaded}.jpg")
                    if _download_image(image_url, file_name, MAX_RETRIES):
                        downloaded_urls.add(image_url)
                        total_downloaded += 1

                page += 1  # Move to the next page
        finally:
            # Persist tracking even when the run is interrupted or fails,
            # otherwise files already on disk would be fetched again
            save_downloaded_urls(downloaded_urls)

        print(f"Total images downloaded: {total_downloaded}")
        print(f"Process completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")

        # Wait before the next full run
        print(f"Waiting for {wait_time / 60} minutes before the next run...")
        time.sleep(wait_time)

In [7]:
# Call the function with desired search query.
# The query is written as plain text with spaces: requests URL-encodes the
# `params` values itself, so a literal "+" would be sent as "%2B" instead of
# acting as a word separator.
# Note: this call loops until the kernel is interrupted.
auto_pexels_images(query="Disgust Emotion Face", max_batches=200, batch_size=80, wait_time=3600)

Loaded 1 previously downloaded URLs from /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/downloaded_urls_pexels.json.
API Calls Made: 1
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_1.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_2.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_3.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_4.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_5.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_6.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_7.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_disgust/image_8.jpg
Downloaded: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_d

KeyboardInterrupt: 