In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import json
import yaml
import os
from os import path
from tqdm import tqdm
from pyvips import Image
from requests import get

In [2]:
# Load the config and extract credentials from it.
# The file is opened in a `with` block so the handle is closed as soon as it has been parsed.
with open('../config.yml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.SafeLoader)
auth = (config['username'], config['api_key'])
debug_key = config['debug_key']

# Define the default headers that will be sent with each request
default_headers = {
    'User-Agent': 'fluffle-notebook (by NoppesTheFolf on Twitter)'
}

# Create the cache directory if it doesn't exist yet.
# exist_ok=True makes this idempotent on re-runs and avoids the check-then-create race.
cache_dir = config['cache_dir']
os.makedirs(cache_dir, exist_ok=True)

# Define functions to download files into the cache directory and to locate files in it
def get_location(id: str) -> str:
    """Return the path inside the cache directory for the image with the given ID.

    The file name is the ID followed by the `.png` extension.
    """
    filename = '{}.png'.format(id)
    return path.join(cache_dir, filename)

def predownload_image(id: str, download_function):
    """Make sure the image with the given ID is available in the cache as a PNG.

    Parameters:
        id: Identifier used to derive the cache file name (see `get_location`).
        download_function: Zero-argument callable that is only invoked on a cache
            miss. It must return an object whose `content` attribute holds the
            raw image bytes.

    Returns:
        The path of the cached PNG file, both when the file was already cached
        and when it was just downloaded.
    """
    dest = get_location(id)
    if path.exists(dest):
        # Cache hit: skip the download but still report where the file lives,
        # so the return value is consistent with the freshly-downloaded case.
        return dest

    response = download_function()
    # Decode the downloaded bytes and re-encode them as PNG at the cache location.
    image = Image.new_from_buffer(response.content, '')
    image.pngsave(dest)
    return dest

In [3]:
# Download the images from e621
def download_e621_post(id):
    """Download the file of the e621 post with the given ID into the cache.

    The post metadata is requested first to find the file URL, after which the
    file itself is downloaded. Nothing is requested when the image is already
    cached (see `predownload_image`).

    Returns:
        Whatever `predownload_image` returns for this ID.

    Raises:
        Exception: If the post or its file returns a 404, or if the post has no
            file URL.
        requests.HTTPError: If either request fails with another 4xx/5xx status.
    """
    def download():
        post_response = get('https://e621.net/posts/{}.json'.format(id), headers=default_headers, auth=auth)
        if post_response.status_code == 404:
            raise Exception('Post with ID {} could not be found.'.format(id))
        # Any other error status (auth failure, rate limit, server error) would
        # otherwise show up later as a confusing JSON or KeyError failure.
        post_response.raise_for_status()

        post_file = post_response.json()['post']['file']
        file_url = post_file['url']
        if not file_url:
            # NOTE(review): presumably the API can return a null URL (e.g. for
            # deleted posts) - fail with a clear message instead of passing it to `get`.
            raise Exception('Post with ID {} has no downloadable file URL.'.format(id))

        download_response = get(file_url, headers=default_headers, auth=auth)
        if download_response.status_code == 404:
            raise Exception('File for post with ID {} could not be found.'.format(id))
        # Never hand an error page to the image decoder.
        download_response.raise_for_status()

        return download_response

    return predownload_image(id, download)

# Load the post IDs; the JSON file holds a list of groups, each group being a list of IDs
with open('../data/posts.json', 'r') as file:
    post_groups = json.load(file)

# Flatten the groups once and make sure no post appears in more than one place
post_ids = pd.Series([item for sublist in post_groups for item in sublist])
if not post_ids.is_unique:
    raise Exception('Not all post IDs are unique!')

# Iterate over the validated, flattened IDs instead of flattening a second time
# with np.concatenate, which also raises when there are no groups at all.
for post_id in tqdm(post_ids, desc='Downloading images from e621'):
    download_e621_post(post_id)

Downloading images from e621: 100%|██████████| 115/115 [00:00<00:00, 32839.39it/s]


In [4]:
# Download the images from unsplash
def download_unsplash(id):
    """Download the Unsplash image with the given ID into the cache.

    Nothing is requested when the image is already cached (see
    `predownload_image`).

    Parameters:
        id: Unsplash image ID; it is appended to the base URL, so it must be a string.

    Returns:
        Whatever `predownload_image` returns for this ID.

    Raises:
        Exception: If the request returns a 404.
        requests.HTTPError: If the request fails with another 4xx/5xx status.
    """
    def download():
        response = get('https://source.unsplash.com/' + id, headers=default_headers)
        if response.status_code == 404:
            raise Exception('No Unsplash image could be found with ID {}.'.format(id))
        # Never hand an error page (rate limit, server error, ...) to the image decoder.
        response.raise_for_status()

        return response

    return predownload_image(id, download)

# Read the Unsplash definitions; every value in the mapping carries a list under 'ids'
with open('../data/unsplash.yml', 'r') as unsplash_file:
    unsplash = yaml.load(unsplash_file, Loader=yaml.SafeLoader)

# Collect the ID lists of all entries and join them into a single array
id_lists = [entry['ids'] for entry in unsplash.values()]
unsplash_ids = np.concatenate(id_lists)
for unsplash_id in tqdm(unsplash_ids, desc='Downloading images from Unsplash'):
    download_unsplash(unsplash_id)

Downloading images from Unsplash: 100%|██████████| 12/12 [00:00<00:00, 23944.65it/s]
