In [None]:
# This script uses SerpAPI to scrape butterfly images, saves them, and manages request responses

In [None]:
# imports
import os
import requests
from serpapi import GoogleSearch
from google.colab import userdata

In [None]:
# list of butterflies
# Butterfly names to search for. Each entry results in one SerpAPI search and
# one download folder, so the list must not contain repeats (including names
# that differ only by letter case, e.g. "Red-Spotted Purple" vs
# "Red-spotted Purple").
south_fl_butterflies = [
    "Giant Swallowtail",
    "Palamedes Swallowtail",
    "Black Swallowtail",
    "Zebra Swallowtail",
    "Eastern Tiger Swallowtail",
    "Monarch",
    "Queen",
    "Gulf Fritillary",
    "Common Buckeye",
    "Florida Juno",
    "Hackberry Emperor",
    "Pearl Crescent",
    "Painted Lady",
    "Little Yellow",
    "Clouded Skipper",
    "Sachem",
    "Southern Broken-Dash",
    "Caribbean Swallowtail",
    "Mourning Cloak",
    "Red-Spotted Purple",
    "Viceroy",
    "Eastern Comma",
    "American Lady",
    "Variegated Fritillary",
    "Checkered Skipper",
    "Fiery Skipper",
    "Delaware Skipper",
    "Dion Skipper",
    "Mallow Skipper",
    "Dorantes Longtail",
    "Long-tailed Skipper",
    "Southern Skipperling",
    "Common Checkered Skipper",
    "Cobweb Skipper",
    "Tropical Checkered Skipper",
    "Gray Hairstreak",
    "Common Hairstreak",
    "Red-banded Hairstreak",
    "Little Metalmark",
    "Texas Crescent",
    "Southeastern Fritillary",
    "Phaon Crescent",
    "Common Wood-Nymph",
    "Great Southern White",
    "Cloudless Sulphur",
    "Orange-barred Sulphur",
    "Yellow Sulphur",
    "Mango Skipper",
    "Tawny-edged Skipper",
    "Ocola Skipper",
    "Hesperides Skipper",
    "Orange Skipperling",
    "Two-tailed Swallowtail",
    "Tiger Swallowtail",
    "Black-spotted Yellow",
    "Banded Orange Heliconian",
    "Juno Longwing",
    "Passionflower Butterfly",
    "Mexican Yellow",
    "Orange Sulphur",
    "Anise Swallowtail",
    "Lyside Sulphur",
    "Leopard Lacewing",
    "Claudia's Longwing",
    "Black Witch",
    "Polydamas Swallowtail",
    "Silvery Checkerspot",
    "Ceraunus Blue",
    "Common White",
    "Horace's Duskywing",
    "Broad-winged Skipper",
    "Barred Yellow",
    "Pearly-Eye",
    "Southern White",
    "Wood Satyr",
    "Gray Comma",
    "Two-striped Skipper",
    "Cabbage White",
    "Alfalfa Butterfly",
    "Buckeye Butterfly",
    "Skipper",
    "Skipperling",
    "Northern Crescent",
    "Little Wood Satyr",
    "Julia Longwing",
    "Zebra Longwing",
    "Pipevine Swallowtail",
    "Snout Butterfly",
    "Dainty Sulphur",
    "Tropical Marble",
    "Giant White",
    "Eufala Skipper",
    "Southern Skipper",
    "Tropical White",
    "Falcate Orangetip",
    "Dorsal Redbanded Swallowtail",
]

# Guard against a repeat sneaking back in when the list is edited.
assert len({name.casefold() for name in south_fl_butterflies}) == len(south_fl_butterflies), \
    "south_fl_butterflies contains a repeated name"

In [None]:
# use an LLM to normalize the butterfly labels into one consistent format
from groq import Groq
import json
# Groq client authenticated with the key stored in Colab's secret store.
client = Groq(api_key=userdata.get('GROQ_API_KEY'))

# The prompt interpolates the module-level list `south_fl_butterflies`; the
# previous placeholder referred to an undefined name and raised NameError on a
# fresh kernel. Doubled braces ({{ }}) are literal braces inside the f-string.
# The JSON example uses double quotes so that it is itself valid JSON.
system_prompt = f"""
Here is a list of butterflies: {south_fl_butterflies}
They are not normalized under the same format, for instance some are capitlized like : ZEBRA LONG WING and some have hyphens like Zebra-Longwing. What is required is a lower case non hyphen format
Like this:  tailed jay

Your response must be json under this format:
{{
  "butterflies": []
}}
"""

# Ask for a JSON object so the reply can be parsed mechanically.
completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[{"role": "system", "content": system_prompt}],
    response_format={"type": "json_object"},
)

# Extract the normalized names from the reply. Falls back to an empty list if
# the model omitted the expected "butterflies" key.
# NOTE(review): assumes the OpenAI-style response shape
# (choices[0].message.content holding the JSON text) - confirm against the
# installed groq SDK version.
normalized_butterflies = json.loads(completion.choices[0].message.content).get("butterflies", [])

In [None]:
# function to scrape images
def scrape_images(butterflies):
    """Collect Google Images thumbnail URLs for each butterfly via SerpAPI.

    Each name is searched as "<name> Butterfly" with the "google_images"
    engine, first results page only ("ijn": "0").

    Args:
        butterflies: Iterable of butterfly names (strings).

    Returns:
        dict mapping each name whose response contained "images_results" to
        the list of thumbnail URLs found there. Names without results are
        omitted and reported on stdout.
    """
    # The key is the same for every request, so look it up once.
    api_key = userdata.get('SERP_API_KEY')
    final_images = {}

    for butterfly in butterflies:
        # A name that already produced results does not need a second search.
        if butterfly in final_images:
            continue

        params = {
            "q": f"{butterfly} Butterfly",
            "engine": "google_images",
            "ijn": "0",
            "api_key": api_key,
        }
        results = GoogleSearch(params).get_dict()

        if "images_results" in results:
            # Skip entries without a thumbnail instead of raising KeyError and
            # losing everything gathered so far.
            final_images[butterfly] = [
                img["thumbnail"]
                for img in results["images_results"]
                if img.get("thumbnail")
            ]
        else:
            # Include the API's own error text, when present, to tell an empty
            # result apart from e.g. a bad key or exhausted quota.
            reason = results.get("error")
            suffix = f" ({reason})" if reason else ""
            print(f"No images found for {butterfly}{suffix}")

    return final_images

In [None]:
# download images
def download_images(butterflies, final_images, output_dir="/content", timeout=10):
    """Download every image URL into a per-butterfly folder.

    Args:
        butterflies: Iterable of butterfly names; one folder is created per name.
        final_images: dict mapping a butterfly name to a list of image URLs.
            Names missing from the dict get an empty folder.
        output_dir: Directory under which the per-butterfly folders are
            created. Defaults to "/content", the location files were written
            to before this parameter existed.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the whole run.

    Returns:
        None. Files are written to disk; failures are reported on stdout and
        do not stop the remaining downloads.
    """
    for butterfly in butterflies:
        # Create exactly the directory the files are written to. Previously
        # the folder was created relative to the working directory while the
        # files went under /content, which only matched when cwd was /content.
        target_dir = os.path.join(output_dir, butterfly)
        os.makedirs(target_dir, exist_ok=True)

        for index, image_url in enumerate(final_images.get(butterfly, [])):
            # Last URL segment is the file name; fall back to a positional
            # name when the URL ends with "/" and the segment is empty.
            image_name = image_url.split('/')[-1] or f"image_{index}"
            image_path = os.path.join(target_dir, image_name)

            try:
                response = requests.get(image_url, timeout=timeout)
            except requests.RequestException as exc:
                # Network errors on one URL should not abort the batch.
                print(f"Failed to download {image_url}: {exc}")
                continue

            if response.status_code == 200:
                with open(image_path, "wb") as file:
                    file.write(response.content)
            else:
                print(f"Failed to download {image_url}")

In [None]:
# Run the pipeline end to end.
# Step 1: look up the image URLs for every butterfly in the list.
final_images = scrape_images(butterflies=south_fl_butterflies)
# Step 2: fetch those URLs into one folder per butterfly.
download_images(butterflies=south_fl_butterflies, final_images=final_images)