In [2]:
# `os` is imported here as well because this cell comes before the imports
# cell in the notebook; without it a top-to-bottom "Run All" on a fresh kernel
# fails with NameError on the first `os.` call below.
import os

# Flickr API key. It is read from the FLICKR_API_KEY environment variable so
# the secret is never stored in the notebook; set the variable before starting
# the kernel. (A key that was previously committed should be revoked.)
API_KEY = os.environ.get("FLICKR_API_KEY", "")
if not API_KEY:
    print("Warning: FLICKR_API_KEY is not set; Flickr API requests will fail.")

# Define the breeds you want to fetch images for
breeds = ["husky", "shiba", "chihuahua"]

# Directory to store the downloaded images (created if it does not exist yet)
output_dir = "dog_images"
os.makedirs(output_dir, exist_ok=True)

In [1]:
import os
import requests
from PIL import Image
from io import BytesIO

In [5]:
def fetch_flickr_images(query, num_images=1000, output_dir="dog_images"):
    """Download Flickr search results for ``query`` into ``<output_dir>/<query>/``.

    The function pages through ``flickr.photos.search`` results and saves each
    photo as ``<query>_<flickr photo id>.jpg``. Naming files by photo id makes
    the download resumable: every run scans the results from page 1 and skips
    photos whose file already exists, so an interrupted run neither downloads
    the same photo twice nor depends on how many photos each page returned.

    Args:
        query: Search text; also used as the sub-directory name and file prefix.
        num_images: Target number of ``.jpg`` files in the breed directory.
            Files already present (whatever their name) count towards it.
        output_dir: Root directory under which the per-query folder is created.

    Returns:
        None. Progress and errors are reported with ``print``.

    Raises:
        Errors from the search request itself (for example ``requests``
        connection errors or timeouts, or a malformed JSON response) are not
        caught here and propagate to the caller. Failures for an individual
        image are printed and that image is skipped.
    """
    flickr_api_url = "https://www.flickr.com/services/rest/"
    per_page = 100
    # Seconds to wait on any single HTTP request; without a timeout a stalled
    # connection would block the notebook indefinitely.
    request_timeout = 30
    params = {
        'method': 'flickr.photos.search',
        'api_key': API_KEY,
        'text': query,
        'per_page': per_page,
        'format': 'json',
        'nojsoncallback': 1,
        'sort': 'relevance',
        'license': 1
    }

    breed_dir = os.path.join(output_dir, query)
    os.makedirs(breed_dir, exist_ok=True)

    # Images already on disk count towards the target.
    images_fetched = sum(1 for f in os.listdir(breed_dir) if f.endswith('.jpg'))
    # Always start from the first page: already-downloaded photos are detected
    # by file name below, so no page offset has to be guessed.
    current_page = 1

    while images_fetched < num_images:
        params['page'] = current_page
        response = requests.get(flickr_api_url, params=params, timeout=request_timeout)

        if response.status_code != 200:
            print(f"Failed to fetch page {current_page} for {query}: {response.status_code}")
            break

        data = response.json()
        if data['stat'] != 'ok':
            print(f"API error for {query}: {data['message']}")
            break

        photos = data['photos']['photo']
        # Last page reported by the API; fall back to the current page so a
        # response without the field ends the loop instead of paging forever.
        total_pages = int(data['photos'].get('pages', current_page))
        print(f"Page {current_page} returned {len(photos)} photos.")

        if not photos:
            print(f"No more photos available for {query}.")
            break

        for photo in photos:
            photo_url = f"https://farm{photo['farm']}.staticflickr.com/{photo['server']}/{photo['id']}_{photo['secret']}.jpg"
            # The photo id is unique, so it identifies the file across runs.
            img_name = f"{query}_{photo['id']}.jpg"
            img_path = os.path.join(breed_dir, img_name)

            # Skip photos that are already on disk. They were counted when the
            # directory was scanned (or when they were downloaded earlier in
            # this run), so the counter is not incremented again.
            if os.path.exists(img_path):
                print(f"Image {img_name} already exists, skipping.")
                continue

            try:
                img_response = requests.get(photo_url, timeout=request_timeout)
                # Fail on HTTP errors instead of trying to decode an error page.
                img_response.raise_for_status()
                with Image.open(BytesIO(img_response.content)) as img:
                    # JPEG cannot store alpha/palette modes; normalise to RGB.
                    img.convert("RGB").save(img_path, "JPEG")
            except Exception as e:
                print(f"Error downloading image {photo_url}: {e}")
                continue

            images_fetched += 1
            print(f"Downloaded image {images_fetched}/{num_images} for {query}")

            if images_fetched >= num_images:
                print(f"Finished downloading {num_images} images for {query}")
                return

        # Stop once the last reported page has been processed.
        if current_page >= total_pages:
            print(f"No more photos available for {query}.")
            break

        # Increment the page for the next iteration
        current_page += 1

    print(f"Downloaded {images_fetched} images for {query}")


In [7]:
# Guard clause first: without a non-empty `breeds` list there is nothing to do.
if 'breeds' not in locals() or not breeds:
    print("Error: 'breeds' is not defined or is empty. Please define 'breeds' first.")
else:
    # A total budget of 10,000 images is shared evenly between the breeds.
    images_per_breed = 10000 // len(breeds)

    # Download each breed in turn; a failure for one breed is reported and
    # the loop carries on with the next one.
    for breed in breeds:
        try:
            print(f"Starting download for breed: {breed}")
            fetch_flickr_images(
                breed,
                num_images=images_per_breed,
                output_dir=output_dir,
            )
        except Exception as err:
            print(f"Error while processing breed '{breed}': {err}")


Starting download for breed: husky
Page 1 returned 99 photos.
Downloaded image 1/3333 for husky
Downloaded image 2/3333 for husky
Downloaded image 3/3333 for husky
Downloaded image 4/3333 for husky
Downloaded image 5/3333 for husky
Downloaded image 6/3333 for husky
Downloaded image 7/3333 for husky
Downloaded image 8/3333 for husky
Downloaded image 9/3333 for husky
Downloaded image 10/3333 for husky
Downloaded image 11/3333 for husky
Downloaded image 12/3333 for husky
Downloaded image 13/3333 for husky
Downloaded image 14/3333 for husky
Downloaded image 15/3333 for husky
Downloaded image 16/3333 for husky
Downloaded image 17/3333 for husky
Downloaded image 18/3333 for husky
Downloaded image 19/3333 for husky
Downloaded image 20/3333 for husky
Downloaded image 21/3333 for husky
Downloaded image 22/3333 for husky
Downloaded image 23/3333 for husky
Downloaded image 24/3333 for husky
Downloaded image 25/3333 for husky
Downloaded image 26/3333 for husky
Downloaded image 27/3333 for husky
Do

KeyboardInterrupt: 

In [None]:
# Check the number of images downloaded for each breed
for breed in breeds:
    breed_dir = os.path.join(output_dir, breed)
    # A breed directory only exists once its download has started; report 0
    # for a missing directory instead of crashing in os.listdir.
    if os.path.isdir(breed_dir):
        breed_images = sum(1 for f in os.listdir(breed_dir) if f.endswith('.jpg'))
    else:
        breed_images = 0
    print(f"{breed} has {breed_images} images.")


In [None]:
from sklearn.model_selection import train_test_split
import shutil

# Split images into train, validation, and test sets
SPLIT_SUBSETS = ['train', 'val', 'test']
# With the 70/15/15 split below, fewer than 4 images leaves one of the subsets
# empty, which makes train_test_split raise ValueError.
MIN_IMAGES_TO_SPLIT = 4

for breed in breeds:
    breed_dir = os.path.join(output_dir, breed)

    # Breeds whose download never started have no directory yet.
    if not os.path.isdir(breed_dir):
        print(f"Skipping split for {breed}: {breed_dir} does not exist.")
        continue

    # os.listdir returns names in arbitrary order; sorting makes the split
    # reproducible for a given random_state.
    images = sorted(f for f in os.listdir(breed_dir) if f.endswith('.jpg'))

    # Also covers a re-run of this cell: once the images have been moved into
    # the subset folders, none are left directly inside breed_dir.
    if len(images) < MIN_IMAGES_TO_SPLIT:
        print(f"Skipping split for {breed}: only {len(images)} unsplit images found.")
        continue

    # Split the images into 70% train, 15% validation, and 15% test
    train_images, temp_images = train_test_split(images, test_size=0.3, random_state=42)
    val_images, test_images = train_test_split(temp_images, test_size=0.5, random_state=42)

    # Create each subset directory and move its images into it
    for subset, img_list in zip(SPLIT_SUBSETS, [train_images, val_images, test_images]):
        subset_dir = os.path.join(breed_dir, subset)
        os.makedirs(subset_dir, exist_ok=True)
        for img_name in img_list:
            src = os.path.join(breed_dir, img_name)
            dst = os.path.join(subset_dir, img_name)
            shutil.move(src, dst)
