In [None]:
import requests
import os
from PIL import Image
from io import BytesIO
from datetime import datetime
from dotenv import load_dotenv
import time
import json
from pathlib import Path

In [2]:
# Directory to save downloaded pexels images
SAVE_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped"
os.makedirs(SAVE_DIR, exist_ok=True)
file_path="/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_scraped/downloaded_urls_pexels_2025.json"

In [None]:
# Pexels URL
PEXELS_API_URL = "https://api.pexels.com/v1/search"

In [4]:
# Persistent Tracking: Load previously downloaded URLs
def load_downloaded_urls(file_path):
    if os.path.exists(file_path):
        try:
            with open(file_path, "r") as f:
                urls = set(json.load(f))
            print(f"Loaded {len(urls)} previously downloaded URLs from {file_path}.")
            return urls
        except json.JSONDecodeError:
            print(f"Warning: {file_path} is empty or corrupted. Starting fresh for this query.")
            return set()
    return set()

In [5]:
# Persistent Tracking: Saves downloaded URLs to specific JSON file
def save_downloaded_urls(downloaded_urls, file_path):
    with open(file_path, "w") as f:
        json.dump(list(downloaded_urls), f, indent=4)
    print(f"Saved {len(downloaded_urls)} downloaded URLs to {file_path}.")

In [6]:
# Recursively scans all subdirectories in the root_dir to find the
    # highest number in filenames like 'image_xxxx.jpg'
def find_highest_image_number(root_dir):
    highest_num = 0
    root_path = Path(root_dir)
    
    # Use rglob to find all matching files recursively
    all_image_files = list(root_path.rglob("image_*.jpg")) + list(root_path.rglob("image_*.jpeg"))
    
    for file_path in all_image_files:
        try:
            # Extracts the number part from "image_1234.jpg"
            num_str = file_path.stem.split('_')[1]
            num = int(num_str)
            if num > highest_num:
                highest_num = num
        except (IndexError, ValueError):
            # Ignores files that don't match the expected "image_xxxx" format
            continue
            
    print(f"Found highest existing image number: {highest_num}")
    return highest_num

In [7]:
# Returns a list of diverse, URL-formatted search queries to find natural human expressions
def generate_pexels_queries():

    # Using '+' for URL encoding
    queries = [
        "spectator+reaction","fan+celebration", "commuter", "person+laughing", 
        "heated+discussion","intense+concentration", "joyful", "pensive+person", 
        "annoyed+face","worried+person", "thoughtful+expression", "people+talking", 
        "faces", "smiling", "groups", "friends", "family", "couples", "individuals", 
        "emotions", "sad", "angry", "happy", "joyful","suspicious", "disgust","fear", 
        "rage","human faces","profession", "human", "people", "portraits", "contempt", 
        "candid+portrait", "street+photography"
    ]
    return queries

In [8]:
# Exhaustively fetches and saves images for a SINGLE query, saving all files
    # and tracking information into the specified 'save_dir'
    # Returns the number of images it successfully downloaded
def auto_pexels_images(api_key, query, save_dir, start_index, max_api_calls=200, batch_size=80):
    
    PEXELS_API_URL = "https://api.pexels.com/v1/search"
    MAX_RETRIES = 3
    api_call_count = 0
    page = 1
    
    tracking_file_path = os.path.join(save_dir, "downloaded_urls.json")
    downloaded_urls = load_downloaded_urls(tracking_file_path)

    total_downloaded_for_query = 0

    while api_call_count < max_api_calls:
        headers = {"Authorization": api_key}
        params = {"query": query, "per_page": batch_size, "page": page}

        try:
            response = requests.get(PEXELS_API_URL, headers=headers, params=params)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}. Stopping this query.")
            break
            
        api_call_count += 1
        print(f"API Call {api_call_count}/{max_api_calls} for query '{query}', page {page}.")

        data = response.json()
        photos = data.get("photos", [])

        if not photos:
            print(f"No more photos found for query '{query}'. This query is complete.")
            break

        new_downloads_this_page = 0
        for photo in photos:
            image_url = photo["src"]["original"]
            if image_url in downloaded_urls:
                continue

            metadata = {
                "photographer": photo.get("photographer"),
                "photographer_url": photo.get("photographer_url"),
                "pexels_url": photo.get("url"),
                "source_url": image_url
            }

            for attempt in range(MAX_RETRIES):
                try:
                    dl_response = requests.get(image_url, timeout=120)
                    dl_response.raise_for_status()
                    
                    # MODIFIED: Use the start_index passed to the function
                    filename_base = f"image_{start_index + total_downloaded_for_query:04d}" # Added 4-digit padding
                    image_filename = os.path.join(save_dir, f"{filename_base}.jpg")
                    metadata_filename = os.path.join(save_dir, f"{filename_base}.json")

                    with open(image_filename, "wb") as f:
                        f.write(dl_response.content)
                    
                    with open(metadata_filename, "w") as f:
                        json.dump(metadata, f, indent=4)
                    
                    print(f"  Success: Downloaded {image_filename} and its metadata.")
                    downloaded_urls.add(image_url)
                    save_downloaded_urls(downloaded_urls, tracking_file_path)
                    
                    total_downloaded_for_query += 1
                    new_downloads_this_page += 1
                    break
                except requests.exceptions.RequestException as e:
                    print(f"  Attempt {attempt + 1} failed for {image_url}: {e}")
                    if attempt == MAX_RETRIES - 1:
                        print(f"  Failed to download after {MAX_RETRIES} attempts.")
        
        if new_downloads_this_page == 0 and len(photos) > 0:
            print(f"No new images found on page {page}. Assuming query '{query}' is complete.")
            break
        
        page += 1
        time.sleep(1)

    print(f"\nFinished query '{query}'. Downloaded a total of {total_downloaded_for_query} new images for this query.")
    return total_downloaded_for_query

In [None]:
# --- MAIN EXECUTION BLOCK ---

if __name__ == "__main__":
    
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Load variables from .env
    load_dotenv()

    PEXELS_API_KEY = os.getenv("PEXELS_API_KEY")

    if not PEXELS_API_KEY:
        raise ValueError("PEXELS_API_KEY not found in .env file")
    
    # 1. Find the highest existing number across all subfolders first.
    highest_number = find_highest_image_number(SAVE_DIR)
    current_image_counter = highest_number + 1
    print(f"Starting new session. Numbering will begin at: {current_image_counter}")
    
    all_queries = generate_pexels_queries()
    print(f"Found {len(all_queries)} queries to process for Pexels.\n{'='*50}")

    for query in all_queries:
        print(f"\nProcessing query: '{query}'")
        
        output_folder_for_query = os.path.join(SAVE_DIR, query.replace('+', '_'))
        os.makedirs(output_folder_for_query, exist_ok=True)
        
        # 2. Pass the current counter and get back the number of new files.
        newly_downloaded_count = auto_pexels_images(
            api_key=PEXELS_API_KEY,
            query=query,
            save_dir=output_folder_for_query,
            start_index=current_image_counter,
            max_api_calls=200,
            batch_size=80
        )
        # 3. Increment the main counter.
        current_image_counter += newly_downloaded_count

        print(f"{'='*50}")

    print(f"\nAll queries have been processed. Next session will start after image_{current_image_counter - 1}.")

Found highest existing image number: 52253
Starting new session. Numbering will begin at: 52254
Found 36 queries to process for Pexels.

Processing query: 'spectator+reaction'
Loaded 295 previously downloaded URLs from /Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped/spectator_reaction/downloaded_urls.json.
API Call 1/200 for query 'spectator+reaction', page 1.
  Success: Downloaded /Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped/spectator_reaction/image_52254.jpg and its metadata.
Saved 296 downloaded URLs to /Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped/spectator_reaction/downloaded_urls.json.
  Success: Downloaded /Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped/spectator_reaction/image_52255.jpg and its metadata.
Saved 297 downloaded URLs to /Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels