## Section 9: Project 2 - Image Scraper

### Using API Scraping

In [8]:
import requests
import os
import jmespath as jp
from tqdm.auto import tqdm
import time

Sample API response (explorable in JSON Hero): https://jsonhero.io/j/S6YVDIDz5X7Z

JSON Hero shows the path to any given node of the response, so we can extract values with a JMESPath query (via the `jmespath` library) instead of indexing nested Python dictionaries by hand.

In [9]:
# Uncomment and run if `jmespath` is not installed (%pip targets the kernel's environment):
# %pip install jmespath

In [10]:
def hit_unsplash_api(keyword, page=1):
    """Fetch one page of Unsplash search results and return free download links.

    Args:
        keyword: Search term sent as the ``query`` parameter.
        page: Page number to request; defaults to 1. Each page asks for
            10 results.

    Returns:
        list: The ``links.download`` value of every result whose ``premium``
        flag is false. Empty when the payload has no ``results`` key or no
        result matches.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # Headers mimicking a desktop Chrome request (presumably required for the
    # site's internal `napi` endpoint to accept the call -- TODO confirm).
    headers = {
        "authority": "unsplash.com",
        "accept": "*/*",
        "accept-language": "en-US",
        "referer": "https://unsplash.com/s/photos/pandas",
        "sec-ch-ua": '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    }
    params = {
        "query": keyword,
        "per_page": "10",
        "page": str(page),
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(
        "https://unsplash.com/napi/search/photos",
        params=params,
        headers=headers,
        timeout=30,
    )
    # Pause between consecutive calls so repeated paging does not hammer the site.
    time.sleep(1)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()
    jmes_path = "results[?premium==`false`].links.download"
    # jmespath returns None when `results` is absent; normalise to a list so
    # callers can always extend() with the return value.
    image_links = jp.search(jmes_path, data) or []
    return image_links

In [11]:
def get_image_links(keyword, image_count):
    """Collect download links for ``keyword`` across one or more result pages.

    The number of pages requested is ``image_count // 10 + 1``, i.e. always
    one page beyond the whole-page quotient (presumably headroom for the
    premium results that ``hit_unsplash_api`` filters out).

    Args:
        keyword: Search term forwarded to ``hit_unsplash_api``.
        image_count: Number of images the caller ultimately wants.

    Returns:
        list: All links gathered from the crawled pages; may hold more or
        fewer entries than ``image_count``.
    """
    images_per_page = 10

    print(f"[x] {keyword=}")
    pages_to_crawl = image_count // images_per_page + 1
    print(f"[x] {pages_to_crawl=}")

    collected = []
    if image_count >= images_per_page:
        # Multi-page crawl, with a progress bar over the page numbers.
        page_numbers = range(1, pages_to_crawl + 1)
        for page_number in tqdm(page_numbers, desc="Getting links"):
            collected.extend(hit_unsplash_api(keyword, page=page_number))
    else:
        # Fewer than one page requested: a single call, no progress bar.
        collected.extend(hit_unsplash_api(keyword))

    print(f"[x] Found {len(collected)} '{keyword}' free images")
    return collected

In [12]:
def download_images(keyword, image_links, image_count=None):
    """Download images into ``images/<keyword>/`` as ``<keyword>_<n>.jpg``.

    Args:
        keyword: Search term; lower-cased, with spaces replaced by
            underscores, to build the folder and file names.
        image_links: Sequence of image URLs to download.
        image_count: Maximum number of links to download. When omitted, a
            module-level ``image_count`` is used if one exists (this is what
            the earlier two-argument version always did); otherwise every
            link in ``image_links`` is downloaded.

    Links that fail to download (connection error, timeout, 4xx/5xx status)
    are reported and skipped, so no error page is saved as a ``.jpg`` file.
    """
    BASE_FOLDER = "images"

    if image_count is None:
        # Backward compatibility: the previous version read `image_count` as
        # an undeclared global set by the driver loop and raised NameError if
        # it was missing. Prefer passing the count explicitly.
        image_count = globals().get("image_count", len(image_links))
    # Clamp so a negative count cannot slice from the end of the list.
    image_links = image_links[: max(image_count, 0)]

    keyword = keyword.lower().replace(" ", "_")
    target_folder = os.path.join(BASE_FOLDER, keyword)
    os.makedirs(target_folder, exist_ok=True)
    # Report what will actually be attempted, which can be fewer than requested.
    print(f"[x] Downloading {len(image_links)} images")

    for index, image_link in tqdm(
        enumerate(image_links, start=1),
        desc="Downloading images",
        total=len(image_links),
    ):
        try:
            response = requests.get(image_link, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            # One bad link should not abort the remaining downloads.
            print(f"[!] Skipping {image_link}: {error}")
            continue
        image_path = os.path.join(target_folder, f"{keyword}_{index}.jpg")
        with open(image_path, "wb") as f:
            f.write(response.content)

In [13]:
# Search keyword -> number of images to download for that keyword.
keywords = {
    "Outer Space": 5,
    "redpanda": 10,
    "Mountain Range": 15,
    "fractals": 20,
    "Animals and birds": 25,
}

In [14]:
# Scrape and download every configured keyword in turn.
# NOTE: the count must stay bound to the module-level name `image_count`;
# download_images() resolves that name at module level to decide how many
# links to keep.
for keyword in keywords:
    image_count = keywords[keyword]
    image_links = get_image_links(keyword, image_count)
    download_images(keyword, image_links)
    print()  # blank line between keywords in the log

[x] keyword='Outer Space'
[x] pages_to_crawl=1
[x] Found 8 'Outer Space' free images
[x] Downloading 5 images


Downloading images:   0%|          | 0/5 [00:00<?, ?it/s]


[x] keyword='redpanda'
[x] pages_to_crawl=2


Getting links:   0%|          | 0/2 [00:00<?, ?it/s]

[x] Found 10 'redpanda' free images
[x] Downloading 10 images


Downloading images:   0%|          | 0/10 [00:00<?, ?it/s]


[x] keyword='Mountain Range'
[x] pages_to_crawl=2


Getting links:   0%|          | 0/2 [00:00<?, ?it/s]

[x] Found 16 'Mountain Range' free images
[x] Downloading 15 images


Downloading images:   0%|          | 0/15 [00:00<?, ?it/s]


[x] keyword='fractals'
[x] pages_to_crawl=3


Getting links:   0%|          | 0/3 [00:00<?, ?it/s]

[x] Found 25 'fractals' free images
[x] Downloading 20 images


Downloading images:   0%|          | 0/20 [00:00<?, ?it/s]


[x] keyword='Animals and birds'
[x] pages_to_crawl=3


Getting links:   0%|          | 0/3 [00:00<?, ?it/s]

[x] Found 25 'Animals and birds' free images
[x] Downloading 25 images


Downloading images:   0%|          | 0/25 [00:00<?, ?it/s]


