In [5]:
import requests
from bs4 import BeautifulSoup
import re
import os
import time
from urllib.parse import urljoin

# Scraper configuration: site root, first category listing to crawl, local
# download folder, and the browser-like headers sent with every request.
base_url = "https://archives.bulbagarden.net"
start_url = f"{base_url}/wiki/Category:HOME_menu_sprites"
output_dir = "sprites"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
}

# Make sure the download folder is there before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

def get_soup(url, retries=3, delay=2):
    """Fetch ``url`` and return the page parsed as a BeautifulSoup object.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout. Any ``requests.exceptions.RequestException`` (including the one
    ``raise_for_status()`` raises for an HTTP error response) is reported on
    stdout and the request is retried.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The response text parsed with ``html.parser``.

    Raises:
        Exception: If no attempt succeeded. The last request error, if there
            was one, is attached as the exception's ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, "html.parser")
        except requests.exceptions.RequestException as e:
            last_error = e
            print(f"Error fetching {url} (attempt {attempt+1}/{retries}): {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would merely postpone the error below.
            if attempt + 1 < retries:
                time.sleep(delay)
    raise Exception(f"Failed to fetch {url} after {retries} attempts.") from last_error

def parse_images(soup):
    """Collect the HOME menu sprite entries listed on a category page.

    Each anchor inside the generated category listing is inspected. Its text,
    with spaces turned into underscores, is searched for a name of the form
    ``Menu_HOME_<4 digits>[-<form>].png``.

    Returns:
        A list of ``(file_page_url, target_filename)`` tuples, in listing
        order, where ``file_page_url`` is the anchor's href resolved against
        ``base_url`` and ``target_filename`` is ``Sprite_<id>.png``.
    """
    sprite_name = re.compile(r"Menu_HOME_(\d{4}(?:-[\w]+)?)\.png")
    entries = []

    for anchor in soup.select("div.mw-category-generated li a"):
        label = anchor.get_text().replace(" ", "_")
        found = sprite_name.search(label)
        if found is None:
            # Not a HOME menu sprite (or not a file link at all).
            continue
        file_page = urljoin(base_url, anchor.get("href"))
        entries.append((file_page, f"Sprite_{found.group(1)}.png"))

    return entries

def get_direct_image_url(image_page_url):
    """Return the direct URL of the image shown on a file description page.

    The page is fetched with ``get_soup`` and the first anchor inside
    ``div.fullImageLink`` is used; its href is resolved against ``base_url``.

    Args:
        image_page_url: URL of the file description page.

    Returns:
        The absolute image URL, or ``None`` when the page has no such anchor
        or the anchor carries no href.

    Raises:
        Exception: Propagated from ``get_soup`` when the page cannot be
            fetched.
    """
    soup = get_soup(image_page_url)
    full_image = soup.select_one("div.fullImageLink a")
    href = full_image.get("href") if full_image is not None else None
    if not href:
        # urljoin(base_url, None) would quietly return the site root, which
        # the caller would then download as if it were the image.
        return None
    return urljoin(base_url, href)

def download_image(img_url, dest_path):
    """Download ``img_url`` to ``dest_path`` unless that file already exists.

    The image is requested with the module-level ``headers`` and a 10 second
    timeout. The data is first written to ``<dest_path>.part`` and then moved
    into place, so ``dest_path`` only ever appears once the write finished.
    This matters because an existing ``dest_path`` is skipped on later runs.

    Failures are reported on stdout rather than raised. After a successful
    download the function sleeps for one second to go easy on the server.

    Args:
        img_url: Direct URL of the image file.
        dest_path: Local path the image is stored under.
    """
    if os.path.exists(dest_path):
        print(f"Already exists: {dest_path}")
        return

    tmp_path = f"{dest_path}.part"
    try:
        response = requests.get(img_url, headers=headers, timeout=10)
        response.raise_for_status()
        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        os.replace(tmp_path, dest_path)
    except Exception as e:
        print(f"Failed to download {img_url}: {e}")
        return
    finally:
        # After a successful os.replace the temp file is gone; anything still
        # here is the residue of a failed or interrupted write.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best-effort cleanup; the real error was already reported

    print(f"Downloaded: {dest_path}")
    time.sleep(1)  # be polite

def find_next_page(soup):
    """Return the absolute URL behind the page's "next page" link, or None.

    Anchors are scanned in document order; the first one whose stripped,
    lower-cased text equals "next page" is used and its href is resolved
    against ``base_url``.
    """
    candidates = (
        anchor
        for anchor in soup.select("a")
        if anchor.get_text(strip=True).lower() == "next page"
    )
    first_hit = next(candidates, None)
    if first_hit is None:
        return None
    return urljoin(base_url, first_hit.get("href"))

def scrape_all(max_pages=10):
    """Crawl the sprite category and download every matching sprite.

    Starting at ``start_url``, each listing page is fetched, its sprite
    entries are resolved to direct image URLs and saved into ``output_dir``,
    and the "next page" link is followed until there is none or
    ``max_pages`` listing pages have been processed.

    A sprite whose file page cannot be fetched is reported and skipped so a
    single bad entry does not abort the whole crawl. A listing page that
    cannot be fetched still raises, because there is nothing to continue from.

    Args:
        max_pages: Upper bound on the number of listing pages to process.

    Raises:
        Exception: Propagated from ``get_soup`` when a listing page cannot be
            fetched.
    """
    current_url = start_url
    pages_scraped = 0
    while current_url and pages_scraped < max_pages:
        print(f"\nProcessing page: {current_url}")
        soup = get_soup(current_url)
        for image_page_url, filename in parse_images(soup):
            try:
                img_url = get_direct_image_url(image_page_url)
            except Exception as e:
                # get_soup raises a plain Exception once its retries are
                # exhausted, so that is what has to be caught here.
                print(f"Skipping {image_page_url}: {e}")
                continue
            if img_url:
                download_image(img_url, os.path.join(output_dir, filename))
        current_url = find_next_page(soup)
        pages_scraped += 1

# Entry point: start the crawl with the default page limit (max_pages=10).
if __name__ == "__main__":
    scrape_all()



Processing page: https://archives.bulbagarden.net/wiki/Category:HOME_menu_sprites
Downloaded: sprites\Sprite_0001.png
Downloaded: sprites\Sprite_0002.png
Downloaded: sprites\Sprite_0003-Mega.png
Downloaded: sprites\Sprite_0003.png
Downloaded: sprites\Sprite_0004.png
Downloaded: sprites\Sprite_0005.png
Downloaded: sprites\Sprite_0006-Mega_X.png
Downloaded: sprites\Sprite_0006-Mega_Y.png
Downloaded: sprites\Sprite_0006.png
Downloaded: sprites\Sprite_0007.png
Downloaded: sprites\Sprite_0008.png
Downloaded: sprites\Sprite_0009-Mega.png
Downloaded: sprites\Sprite_0009.png
Downloaded: sprites\Sprite_0010.png
Downloaded: sprites\Sprite_0011.png
Downloaded: sprites\Sprite_0012.png
Downloaded: sprites\Sprite_0013.png
Downloaded: sprites\Sprite_0014.png
Downloaded: sprites\Sprite_0015-Mega.png
Downloaded: sprites\Sprite_0015.png
Downloaded: sprites\Sprite_0016.png
Downloaded: sprites\Sprite_0017.png
Downloaded: sprites\Sprite_0018-Mega.png
Downloaded: sprites\Sprite_0018.png
Downloaded: sprites

In [6]:
import os
import shutil
import re

# Sort the downloaded sprites into sub-folders of `sprites_dir`: plain
# "Sprite_<4 digits>.png" files go to "national", while files with a
# "-<form>" suffix go to a folder named after that form.
sprites_dir = "sprites"
national_dir = os.path.join(sprites_dir, "national")
os.makedirs(national_dir, exist_ok=True)

sprite_name_re = re.compile(r"Sprite_(\d{4})(?:-([\w]+))?\.png")

for entry in os.listdir(sprites_dir):
    if not (entry.endswith(".png") and entry.startswith("Sprite_")):
        continue

    parsed = sprite_name_re.match(entry)
    if parsed is None:
        continue

    # Group 2 is the optional form suffix; it is None for the base sprite.
    form_name = parsed.group(2)
    destination_dir = os.path.join(sprites_dir, form_name) if form_name else national_dir
    os.makedirs(destination_dir, exist_ok=True)

    shutil.move(os.path.join(sprites_dir, entry), os.path.join(destination_dir, entry))
    print(f"Moved: {entry} → {destination_dir}")

Moved: Sprite_0001.png → sprites\national
Moved: Sprite_0002.png → sprites\national
Moved: Sprite_0003-Mega.png → sprites\Mega
Moved: Sprite_0003.png → sprites\national
Moved: Sprite_0004.png → sprites\national
Moved: Sprite_0005.png → sprites\national
Moved: Sprite_0006-Mega_X.png → sprites\Mega_X
Moved: Sprite_0006-Mega_Y.png → sprites\Mega_Y
Moved: Sprite_0006.png → sprites\national
Moved: Sprite_0007.png → sprites\national
Moved: Sprite_0008.png → sprites\national
Moved: Sprite_0009-Mega.png → sprites\Mega
Moved: Sprite_0009.png → sprites\national
Moved: Sprite_0010.png → sprites\national
Moved: Sprite_0011.png → sprites\national
Moved: Sprite_0012.png → sprites\national
Moved: Sprite_0013.png → sprites\national
Moved: Sprite_0014.png → sprites\national
Moved: Sprite_0015-Mega.png → sprites\Mega
Moved: Sprite_0015.png → sprites\national
Moved: Sprite_0016.png → sprites\national
Moved: Sprite_0017.png → sprites\national
Moved: Sprite_0018-Mega.png → sprites\Mega
Moved: Sprite_0018.p

In [None]:
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root and the category listing that holds the type icons.
base = 'https://archives.bulbagarden.net'
category_url = f'{base}/w/index.php?title=Category:Type_icons'

# File names of interest: "<Type> icon SwSh.png" and "<Type> icon LA.png".
# Group 1 captures the type name.
pattern_sw, pattern_la = map(
    re.compile,
    (r'^(\w+) icon SwSh\.png$', r'^(\w+) icon LA\.png$'),
)

# Function to download images for each page
def download_icons(page_url, timeout=10):
    """Download every SwSh / LA type icon linked from one category page.

    Each ``li a`` link whose text matches ``pattern_sw`` or ``pattern_la`` is
    followed to its file page, the ``a.internal`` link there is resolved to
    the image URL, and the image is saved as:

    * ``Small_Sprites/<Type>_Small_Icon.png`` for SwSh icons
    * ``Full_Icons/<Type>_Full_Icon.png`` for LA icons

    Args:
        page_url: URL of the category listing page to process.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        requests.exceptions.RequestException: If the listing page itself
            cannot be fetched. Failures for an individual icon are reported
            on stdout and that icon is skipped.
    """
    resp = requests.get(page_url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    for img_item in soup.select('li a'):
        fname = img_item.text.strip()
        m_sw = pattern_sw.match(fname)
        m_la = pattern_la.match(fname)
        if not (m_sw or m_la):
            continue
        href = img_item.get('href')
        if not href:
            continue
        type_name = (m_sw or m_la).group(1)
        filepage_url = urljoin(base, href)

        try:
            # Fetch the file description page.
            fp = requests.get(filepage_url, timeout=timeout)
            fp.raise_for_status()
            fsoup = BeautifulSoup(fp.text, 'html.parser')

            # The original-file link carries the "internal" class.
            img = fsoup.find('a', class_='internal')
            if img is None or not img.get('href'):
                print(f'No image link found on {filepage_url}; skipping')
                continue

            # The href may be relative or protocol-relative; resolve it so
            # requests always receives an absolute URL.
            img_url = urljoin(filepage_url, img['href'])
            img_resp = requests.get(img_url, timeout=timeout)
            # Without this check an HTML error page would be saved as a PNG.
            img_resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f'Failed to fetch {fname}: {e}')
            continue

        # Determine folder and filename
        folder = 'Small_Sprites' if m_sw else 'Full_Icons'
        outname = f"{type_name}_Small_Icon.png" if m_sw else f"{type_name}_Full_Icon.png"

        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, outname), 'wb') as f:
            f.write(img_resp.content)
        print(f'Saved: {outname}')

# Start scraping from the first page
download_icons(category_url)

# Follow the listing's "next page" link until there is none. The href of the
# last <li> on a page is not a pagination cursor, so deriving the next URL
# from it can re-scrape the same listing, crash on an <li> without a link,
# or stop before later pages have been visited.
previous_page_url = category_url  # most recently scraped listing page
visited_urls = {category_url}     # guards against a pagination loop

while True:
    resp = requests.get(previous_page_url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # First anchor whose visible text is "next page" (case-insensitive).
    next_link = next(
        (
            a for a in soup.find_all('a', href=True)
            if a.get_text(strip=True).lower() == 'next page'
        ),
        None,
    )
    next_page_url = urljoin(base, next_link['href']) if next_link is not None else None

    # No link, or a link back to a page already handled: we are done.
    if next_page_url is None or next_page_url in visited_urls:
        print("No more pages to scrape.")
        break

    # Download images from the next page
    print(f"Scraping next page: {next_page_url}")
    download_icons(next_page_url)

    visited_urls.add(next_page_url)
    previous_page_url = next_page_url


Saved: Bug_Full_Icon.png
Saved: Bug_Small_Icon.png
Saved: Dark_Full_Icon.png
Saved: Dark_Small_Icon.png
Saved: Dragon_Full_Icon.png
Saved: Dragon_Small_Icon.png
Saved: Electric_Full_Icon.png
Saved: Electric_Small_Icon.png
Saved: Fairy_Full_Icon.png
Saved: Fairy_Small_Icon.png
Saved: Fighting_Full_Icon.png
Saved: Fighting_Small_Icon.png
Saved: Fire_Full_Icon.png
Saved: Fire_Small_Icon.png
No more pages to scrape.


  filefrom_link = soup.find('a', text='next')  # Adjust this to the correct selector
