In [None]:
import requests
import os
import time
import json
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path

In [None]:
# global configurations
base_url = "https://www.flickr.com/services/rest/"
method = "flickr.photos.search"
commercial_licenses = "4,7,9,10" # (CC BY, No known restrictions, US Gov, CC0)

OUTPUT_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/flickr_scraped"
file_path="/Users/natalyagrokh/AI/ml_expressions/img_datasets/flickr_scraped/downloaded_urls_flickr_2025.json"

In [3]:
# Loads a set of previously downloaded URLs from a specific JSON file
def load_downloaded_urls(file_path):
    if os.path.exists(file_path):
        try:
            with open(file_path, "r") as f:
                urls = set(json.load(f))
            print(f"Loaded {len(urls)} previously downloaded URLs from {file_path}.")
            return urls
        except json.JSONDecodeError:
            print(f"Warning: {file_path} is empty or corrupted. Starting fresh for this query.")
            return set()
    return set()

In [4]:
# Persistent Tracking: saves the set of downloaded URLs to a specific JSON file.
def save_downloaded_urls(downloaded_urls, file_path):
    with open(file_path, "w") as f:
        json.dump(list(downloaded_urls), f, indent=4)
    print(f"Saved {len(downloaded_urls)} downloaded URLs to {file_path}.")

In [5]:
# Recursively scans all subdirectories in the root_dir to find the
    # highest number in filenames like 'image_xxxx.jpg'. 
def find_highest_image_number(root_dir):
    highest_num = 0
    root_path = Path(root_dir)
    all_image_files = list(root_path.rglob("image_*.jpg")) + list(root_path.rglob("image_*.jpeg"))
    
    for file_path in all_image_files:
        try:
            num_str = file_path.stem.split('_')[1]
            num = int(num_str)
            if num > highest_num:
                highest_num = num
        except (IndexError, ValueError):
            continue
            
    print(f"Found highest existing image number: {highest_num}")
    return highest_num

In [6]:
# Generate queries related to people
def generate_people_related_queries():
    queries = ["candid+portrait", "street+photography", "spectator+reaction",
        "fan+celebration", "commuter", "person+laughing", "heated+discussion",
        "intense+concentration", "joyful", "pensive+person", "annoyed+face",
        "worried+person", "thoughtful+expression", "people+talking", "faces", 
        "smiling", "groups", "friends", "family", "couples", "individuals", 
        "emotions", "sad", "angry", "happy", "joyful","suspicious", "disgust","fear", 
        "rage","human faces","profession", "human", "people", "portraits", "contempt"
    ] 
    return queries

In [7]:
# Downloads a single image and saves its corresponding metadata.
def download_image(url, output_folder, image_filename, metadata, metadata_filename):
    try:
        response = requests.get(url, stream=True, timeout=120)
        response.raise_for_status()
        
        file_path = os.path.join(output_folder, image_filename)
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {image_filename}")

        # Save metadata
        metadata_path = os.path.join(output_folder, metadata_filename)
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=4)
        print(f"Saved Metadata: {metadata_filename}")
        
        return True
    except requests.exceptions.HTTPError as e:
        print(f"  HTTP error for {url}: {e}")
        return False
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False

In [8]:
# Exhaustively fetches and saves images for a SINGLE query, saving all files
    # and tracking information into the specified 'save_dir'.
    # Returns the number of images it successfully downloaded
def auto_flickr_images(api_key, query, save_dir, start_index, batch_size=100, max_requests=3600):
    
    api_call_count = 0
    page = 1
    
    tracking_file_path = os.path.join(save_dir, "downloaded_urls.json")
    downloaded_urls = load_downloaded_urls(tracking_file_path)

    total_downloaded_for_query = 0

    while api_call_count < max_requests:
        params = {
            "method": method, "api_key": api_key, "text": query,
            "license": commercial_licenses, "format": "json", "nojsoncallback": 1,
            "per_page": batch_size, "page": page,
            "extras": "url_o,url_m,license,owner_name,tags"
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}. Stopping this query.")
            break
        
        api_call_count += 1
        print(f"API Call {api_call_count}/{max_requests} for query '{query}', page {page}.")

        data = response.json()
        photos = data.get("photos", {}).get("photo", [])

        if not photos:
            print(f"No more photos found for query '{query}'. This query is complete.")
            break

        new_downloads_this_page = 0
        for photo in photos:
            photo_url = photo.get("url_o") or photo.get("url_m")
            if not photo_url or photo_url in downloaded_urls:
                continue

            metadata = {
                "photo_id": photo.get("id"), 
                "owner_name": photo.get("ownername"),
                "license_id": photo.get("license"), 
                "tags": photo.get("tags"),
                "source_url": photo_url
            }
            
            # MODIFIED: Use the start_index passed to the function
            filename_base = f"image_{start_index + total_downloaded_for_query:04d}" # Added 4-digit padding
            image_filename = f"{filename_base}.jpg"
            metadata_filename = f"{filename_base}.json"
        
            if download_image(photo_url, save_dir, image_filename, metadata, metadata_filename):
                downloaded_urls.add(photo_url)
                save_downloaded_urls(downloaded_urls, tracking_file_path)
                total_downloaded_for_query += 1
                new_downloads_this_page += 1

        if new_downloads_this_page == 0 and len(photos) > 0:
            print(f"No new images found on page {page}. Assuming query '{query}' is complete.")
            break
        
        page += 1
        time.sleep(1.1)

    print(f"\nFinished query '{query}'. Downloaded a total of {total_downloaded_for_query} new images for this query.")
    return total_downloaded_for_query

In [None]:
# --- MAIN EXECUTION BLOCK ---
if __name__ == "__main__":
    
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Load variables from .env
    load_dotenv()

    API_KEY = os.getenv("FLICKR_API_KEY")

    if not API_KEY:
        raise ValueError("FLICKR_API_KEY not found in .env file")
    
    # 1. Find the highest existing number across all subfolders first.
    highest_number = find_highest_image_number(OUTPUT_FOLDER)
    current_image_counter = highest_number + 1
    print(f"Starting new session. Numbering will begin at: {current_image_counter}")
    
    all_queries = generate_people_related_queries()
    print(f"Found {len(all_queries)} queries to process for Flickr.\n{'='*50}")

    for query in all_queries:
        print(f"\nProcessing query: '{query}'")
        
        output_folder_for_query = os.path.join(OUTPUT_FOLDER, query.replace('+', '_'))
        os.makedirs(output_folder_for_query, exist_ok=True)
        
        # 2. Pass the current counter and get back the number of new files.
        newly_downloaded_count = auto_flickr_images(
            api_key=API_KEY,
            query=query,
            save_dir=output_folder_for_query,
            start_index=current_image_counter,
            batch_size=100,
            max_requests=3600
        )
        # 3. Increment the main counter.
        current_image_counter += newly_downloaded_count

        print(f"{'='*50}")

    print(f"\nAll queries have been processed. Next session will start after image_{current_image_counter - 1}.")

Found highest existing image number: 2006
Starting new session. Numbering will begin at: 2007
Found 36 queries to process for Flickr.

Processing query: 'candid+portrait'
Loaded 419 previously downloaded URLs from /Users/natalyagrokh/AI/ml_expressions/img_datasets/flickr_scraped/candid_portrait/downloaded_urls.json.
API Call 1/3600 for query 'candid+portrait', page 1.
Downloaded: image_2007.jpg
Saved Metadata: image_2007.json
Saved 420 downloaded URLs to /Users/natalyagrokh/AI/ml_expressions/img_datasets/flickr_scraped/candid_portrait/downloaded_urls.json.
Downloaded: image_2008.jpg
Saved Metadata: image_2008.json
Saved 421 downloaded URLs to /Users/natalyagrokh/AI/ml_expressions/img_datasets/flickr_scraped/candid_portrait/downloaded_urls.json.
Downloaded: image_2009.jpg
Saved Metadata: image_2009.json
Saved 422 downloaded URLs to /Users/natalyagrokh/AI/ml_expressions/img_datasets/flickr_scraped/candid_portrait/downloaded_urls.json.
Downloaded: image_2010.jpg
Saved Metadata: image_2010