In [None]:
import requests
import os
import time
import json
from datetime import datetime
from dotenv import load_dotenv

In [2]:
# Persistent Tracking: Load previously downloaded URLs
def load_downloaded_urls(file_path="downloaded_urls_flickr.json"):
    """Load the set of image URLs recorded by a previous run.

    Args:
        file_path: Path of the JSON tracking file. The file is expected to
            hold a JSON array of URLs.

    Returns:
        set: The URLs stored in the file. An empty set is returned when the
        file does not exist, cannot be parsed as JSON, or holds anything
        other than a JSON array of hashable entries.
    """
    if not os.path.exists(file_path):
        return set()

    try:
        with open(file_path, "r") as f:
            data = json.load(f)
        # Only a JSON array is a valid tracking file. Without this check a
        # JSON string would silently become a set of characters, an object a
        # set of its keys, and `null`/numbers would raise TypeError.
        if not isinstance(data, list):
            raise TypeError(f"expected a JSON array, got {type(data).__name__}")
        # set() raises TypeError on unhashable entries (e.g. nested lists).
        urls = set(data)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (empty/truncated file) and
        # UnicodeDecodeError (binary garbage).
        print(f"Warning: {file_path} is empty or corrupted. Initializing as empty.")
        return set()

    print(f"Loaded {len(urls)} previously downloaded URLs from {file_path}.")
    return urls

In [3]:
# Persistent Tracking: Save downloaded URLs
def save_downloaded_urls(downloaded_urls, file_path="/home/natalyagrokh/img_datasets/downloaded_urls_flickr.json"):
    """Persist the downloaded URLs as a JSON array.

    The data is written to a sibling ``<file_path>.tmp`` file first and then
    moved over ``file_path`` with ``os.replace``. An interruption or a
    serialisation error part-way through therefore leaves the previous
    tracking file intact instead of truncated.

    Args:
        downloaded_urls: Iterable of URLs to store.
        file_path: Destination of the JSON tracking file.

    Raises:
        Whatever ``open``/``json.dump``/``os.replace`` raise; the temporary
        file is removed before the exception propagates.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(list(downloaded_urls), f)
        os.replace(tmp_path, file_path)
    except BaseException:
        # BaseException so that Ctrl-C (KeyboardInterrupt) also cleans up.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Saved {len(downloaded_urls)} downloaded URLs to {file_path}.")

In [4]:
# Generate queries related to people
def generate_people_related_queries():
    """Return the list of Flickr search terms to run.

    Only "contempt" is active at the moment.
    """
    # Other candidate terms, currently disabled:
    #   "faces", "smiling", "groups", "friends", "family",
    #   "couples", "individuals", "emotions", "sad", "angry", "happy", "joyful",
    #   "suspicious", "disgust", "fear", "rage", "human faces", "profession", "human",
    #   "people", "portraits"
    active_queries = ["contempt"]
    return active_queries

In [5]:
# Download an image from a URL
def download_image(url, output_folder, filename, max_retries=3, retry_wait=60):
    """Download ``url`` into ``output_folder/filename``.

    The body is streamed into ``<filename>.part`` and only moved to its final
    name once the transfer has completed, so a failed or interrupted download
    never leaves a truncated image (or destroys an existing file of the same
    name). The HTTP connection is always released.

    Args:
        url: Image URL to fetch.
        output_folder: Existing directory to write into.
        filename: Name of the file to create inside ``output_folder``.
        max_retries: How many times to retry after an HTTP 429 response.
        retry_wait: Seconds to sleep before each retry.

    Returns:
        bool: True when the image was saved, False on any failure (the error
        is printed rather than raised).
    """
    file_path = os.path.join(output_folder, filename)
    partial_path = f"{file_path}.part"
    try:
        for attempt in range(max_retries + 1):
            # Using the response as a context manager closes the streamed
            # connection even when we bail out early.
            with requests.get(url, stream=True, timeout=120) as response:
                rate_limited = response.status_code == 429
                if not (rate_limited and attempt < max_retries):
                    # On the final attempt a 429 falls through to here and is
                    # reported as an ordinary HTTP error.
                    response.raise_for_status()
                    with open(partial_path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    os.replace(partial_path, file_path)
                    print(f"Downloaded: {file_path}")
                    return True
            # Sleep only after the connection has been released.
            print(f"Rate limit exceeded for {url}. Retrying in {retry_wait} seconds...")
            time.sleep(retry_wait)
        # Only reachable when max_retries is negative (no attempt was made).
        return False
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error for {url}: {e}")
        return False
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False
    finally:
        # Best-effort removal of a partial download; also runs on Ctrl-C.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass

In [6]:
# Fetch and save images from Flickr
def _next_image_index(output_folder):
    """Return one more than the highest N among ``image_N.jpg`` files in the folder (1 if none)."""
    prefix = "image_"
    highest = 0
    for name in os.listdir(output_folder):
        stem, ext = os.path.splitext(name)
        number = stem[len(prefix):]
        # Ignore anything that is not exactly image_<digits>.jpg so unrelated
        # files (e.g. "my_photo.jpg") cannot crash the int() conversion.
        if ext == ".jpg" and stem.startswith(prefix) and number.isdecimal():
            highest = max(highest, int(number))
    return highest + 1


def auto_flickr_images(api_key, queries, output_folder, batch_size=100, max_requests=3600, wait_time=3600,
                       tracking_file="/home/natalyagrokh/img_datasets/downloaded_urls_flickr.json",
                       max_consecutive_errors=5):
    """Search Flickr for each query and download the matching images.

    Images are stored as ``image_N.jpg`` in ``output_folder``; numbering
    continues from the highest existing index and keeps increasing across
    queries. URLs of saved images are recorded in ``tracking_file`` so that
    already-downloaded photos are skipped, within a run and across runs.

    Args:
        api_key: Flickr API key.
        queries: Iterable of search strings.
        output_folder: Directory for the images (created if missing).
        batch_size: Results requested per page (``per_page``).
        max_requests: API calls allowed before pausing for ``wait_time``.
        wait_time: Seconds to pause once ``max_requests`` has been reached.
        tracking_file: JSON file used to load and save the downloaded URLs.
            The default matches the default of ``save_downloaded_urls``.
        max_consecutive_errors: Abandon the current query after this many
            request failures in a row.
    """
    base_url = "https://www.flickr.com/services/rest/"
    method = "flickr.photos.search"

    os.makedirs(output_folder, exist_ok=True)
    api_call_count = 0
    # Load from the same file we save to; otherwise history is never reused.
    downloaded_urls = load_downloaded_urls(file_path=tracking_file)

    # Ensure JSON file is created even if no downloads occur
    if not downloaded_urls:
        save_downloaded_urls(downloaded_urls, file_path=tracking_file)

    # Single running counter so a later query never reuses a filename.
    next_index = _next_image_index(output_folder)

    for query in queries:
        print(f"Starting query: {query}")
        page = 1
        consecutive_errors = 0

        while api_call_count < max_requests:
            params = {
                "method": method,
                "api_key": api_key,
                "text": query,
                "format": "json",
                "nojsoncallback": 1,
                "per_page": batch_size,
                "page": page,
                "extras": "url_o,url_m"
            }

            try:
                response = requests.get(base_url, params=params, timeout=120)
                if response.status_code == 429:  # Handle Too Many Requests
                    print("Too many requests. Waiting for 60 seconds...")
                    time.sleep(60)
                    continue

                api_call_count += 1
                print(f"API Calls Made: {api_call_count}")

                if response.status_code != 200:
                    print(f"Error: {response.status_code} - {response.text}")
                    break

                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError: body was not JSON (older requests versions).
                consecutive_errors += 1
                print(f"Error fetching page {page}: {e}")
                if consecutive_errors >= max_consecutive_errors:
                    # Without a limit a persistent failure would spin forever.
                    print(f"Giving up on query '{query}' after {consecutive_errors} consecutive errors.")
                    break
                time.sleep(1)
                continue
            consecutive_errors = 0

            # Flickr reports API-level errors (e.g. invalid key) with HTTP 200.
            if data.get("stat") == "fail":
                print(f"Flickr API error {data.get('code')}: {data.get('message')}")
                break

            photos_info = data.get("photos", {})
            photos = photos_info.get("photo", [])

            if not photos:
                print(f"No more photos available for query: {query}")
                break

            for photo in photos:
                photo_url = photo.get("url_o") or photo.get("url_m")
                if not photo_url:
                    print(f"Skipping photo due to missing URL: {photo}")
                    continue
                if photo_url in downloaded_urls:
                    # Already fetched in this or a previous run.
                    continue

                filename = f"image_{next_index}.jpg"
                if download_image(photo_url, output_folder, filename):
                    downloaded_urls.add(photo_url)
                    # Save immediately after each download
                    save_downloaded_urls(downloaded_urls, file_path=tracking_file)
                    next_index += 1

            # Stop at the last page instead of spending API calls on pages
            # that cannot contain new results.
            try:
                total_pages = int(photos_info.get("pages"))
            except (TypeError, ValueError):
                total_pages = None
            if total_pages is not None and page >= total_pages:
                print(f"No more photos available for query: {query}")
                break

            page += 1

            if api_call_count >= max_requests:
                print("Max API requests reached for this hour.")
                break

            time.sleep(1)  # Enforce 1 request per second

        print(f"Finished query: {query}")

        if api_call_count >= max_requests:
            print(f"Waiting {wait_time / 60} minutes before starting the next cycle...")
            time.sleep(wait_time)
            api_call_count = 0

    save_downloaded_urls(downloaded_urls, file_path=tracking_file)
    print(f"Process completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")

In [None]:
# Example Usage
if __name__ == "__main__":
    # Read FLICKR_API_KEY from the local .env file into the environment.
    load_dotenv()
    API_KEY = os.getenv("FLICKR_API_KEY")

    # Fail fast: nothing below can run without a key.
    if not API_KEY:
        raise ValueError("FLICKR_API_KEY not found in .env file")

    OUTPUT_FOLDER = "/home/natalyagrokh/img_datasets/flickr_contempt"
    QUERIES = generate_people_related_queries()

    auto_flickr_images(
        api_key=API_KEY,
        queries=QUERIES,
        output_folder=OUTPUT_FOLDER,
        batch_size=100,
        max_requests=3600,
        wait_time=3600,
    )

Saved 0 downloaded URLs to /home/natalyagrokh/img_datasets/downloaded_urls_flickr.json.
Starting query: contempt
API Calls Made: 1
Downloaded: /home/natalyagrokh/img_datasets/flickr_images/image_1.jpg
Saved 1 downloaded URLs to /home/natalyagrokh/img_datasets/downloaded_urls_flickr.json.
Downloaded: /home/natalyagrokh/img_datasets/flickr_images/image_2.jpg
Saved 2 downloaded URLs to /home/natalyagrokh/img_datasets/downloaded_urls_flickr.json.
Downloaded: /home/natalyagrokh/img_datasets/flickr_images/image_3.jpg
Saved 3 downloaded URLs to /home/natalyagrokh/img_datasets/downloaded_urls_flickr.json.
Downloaded: /home/natalyagrokh/img_datasets/flickr_images/image_4.jpg
Saved 4 downloaded URLs to /home/natalyagrokh/img_datasets/downloaded_urls_flickr.json.
Downloaded: /home/natalyagrokh/img_datasets/flickr_images/image_5.jpg
Saved 5 downloaded URLs to /home/natalyagrokh/img_datasets/downloaded_urls_flickr.json.
Downloaded: /home/natalyagrokh/img_datasets/flickr_images/image_6.jpg
Saved 6 d

KeyboardInterrupt: 