# Download images

In [None]:
import logging
import requests
import json
import os
import pathlib
import time
import csv

from typing import List, Type, Optional, Union, Tuple

from dotenv import load_dotenv
load_dotenv() 

## Configure the downloader

In [None]:
# Name of the dataset; used for the output directory and the bookkeeping file names.
datasetName = 'dragons'

# Query string (appended verbatim to the posts URL) and API base URL,
# both read from the environment.
query = os.getenv('QUERY')
base_url = os.getenv('BASE_URL')
if query is None:
    # Fail here with a clear message; otherwise building queryJsonFile below
    # dies with an opaque "NoneType + str" TypeError.
    raise RuntimeError('Environment variable QUERY is not set')

# HTTP headers sent with the definition (JSON) requests.
headers = {
    'Accept': 'application/json',
    'User-Agent': 'Dragon dataset downloader 0.0.1',
}

# Pauses, in seconds (both values are passed to time.sleep).
image_download_interval = 1.1
json_download_interval = 5

# Page range to fetch. The fetch loop uses range(pages_start, pages_end),
# so pages_end itself is NOT fetched.
pages_start = 1
pages_end = 2

# Root directory of this dataset; created on demand.
datasetsRootPath = pathlib.Path('datasets') / datasetName
datasetsRootPath.mkdir(exist_ok=True, parents=True)

# CSV file recording the (id, url) pairs of images that were already downloaded.
loadedImagesListFile = datasetsRootPath / f'loaded_images_{datasetName}.csv'

# JSON file holding the post definitions fetched for this query and page range.
queryJsonFile = datasetsRootPath / f'{query}_pages_{pages_start}-{pages_end}.json'

# Directory the image files are written to; created on demand.
downloadFolder = datasetsRootPath / 'data' / 'obj'
downloadFolder.mkdir(exist_ok=True, parents=True)

# Log messages (e.g. skipped images) go to a file inside the dataset directory.
logging.basicConfig(filename=datasetsRootPath / 'skipped.log', level=logging.INFO)

## Get definitions by score. For mass scraping, fetching by post ID range would be better.

In [None]:
# Fetch the post definitions page by page and collect them in `lst`.
# NOTE(review): range() excludes pages_end, so pages_start=1 / pages_end=2
# fetches only page 1 - confirm this is intended.
lst = []
for page in range(pages_start, pages_end):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    req = requests.get(base_url + f'/posts?page={page}' + query, headers=headers, timeout=30)
    # Surface HTTP errors directly instead of failing later on the JSON body.
    req.raise_for_status()
    # Parse the response body once and reuse it for both the report and the result.
    posts = req.json()['posts']
    print('Page:', str(page), 'Status code:', str(req.status_code), 'Number of fetched definitions:', str(len(posts)))
    # Extend in place rather than rebuilding the whole list on every page.
    lst.extend(posts)
    # Pause between definition requests.
    time.sleep(json_download_interval)

## Save definitions into a JSON-file

In [None]:
# Serialize the fetched definitions first, then write the text to the JSON file.
serialized_definitions = json.dumps(lst)
with queryJsonFile.open("w") as definitions_file:
    definitions_file.write(serialized_definitions)

## Load definitions from a JSON-file

In [None]:
# Read the definitions file as text and parse it back into Python objects.
loaded = json.loads(queryJsonFile.read_text())

In [None]:
def _total_score(post):
    """Return the value stored under ``post['score']['total']``."""
    return post['score']['total']


def _post_id(post):
    """Return the value stored under ``post['id']``."""
    return post['id']


# Posts at the extremes of the loaded definitions, by total score and by ID.
maxScorePost = max(loaded, key=_total_score)
minScorePost = min(loaded, key=_total_score)
maxIDPost = max(loaded, key=_post_id)
minIDPost = min(loaded, key=_post_id)

## Define the helper methods

In [None]:
def saveImageFromPost(post: dict, savePath) -> None:
    """Download the image referenced by a single post definition.

    Args:
        post: One post definition. The image URL is read from
            ``post['file']['url']`` and the ID (for logging) from ``post['id']``.
        savePath: Directory the image is stored in; passed on unchanged to
            ``saveImageFromUrl``.

    Raises:
        ValueError: If the post carries no image URL. The skip is logged
            before raising so that callers can simply move on to the next post.
    """
    # `or {}` makes a missing or null 'file' entry follow the same
    # "no URL" path instead of raising AttributeError on None.
    url = (post.get('file') or {}).get('url')

    if url is None:
        logging.error(f'Skipping image ID {post["id"]} due to missing image URL')
        raise ValueError(f'Post {post["id"]} has no image URL')

    # The file name is the last path segment of the URL.
    # Original note: "Use MD5, ID is also a possibility."
    filename = url.rsplit('/', 1)[-1]

    saveImageFromUrl(url, filename, savePath)

def saveImageFromUrl(url: str, filename: str, savePath) -> None:
    """Download ``url`` and store the response body as ``savePath / filename``.

    Args:
        url: Address of the image to download (redirects are followed).
        filename: Name of the file to create inside ``savePath``.
        savePath: Target directory; must provide ``joinpath`` (e.g. a
            ``pathlib.Path``).

    Raises:
        ValueError: If the server answers with an error status. Nothing is
            written in that case, so an error page is never stored as an image.
    """
    # A timeout keeps a stalled connection from blocking the download forever.
    r = requests.get(url, allow_redirects=True, timeout=30)
    if not r.ok:
        print(r.status_code)
        logging.error(f'Skipping {url} due to HTTP status {r.status_code}')
        raise ValueError(f'Download of {url} failed with HTTP status {r.status_code}')

    # The context manager guarantees the file is flushed and closed.
    with open(savePath.joinpath(filename), 'wb') as image_file:
        image_file.write(r.content)

def addIdsAndUrlsToCsv(new_ids: List[int], new_urls: List[str], filepath):
    """Append ``(id, url)`` rows to the CSV file at ``filepath``.

    The file is created if it does not exist yet. IDs and URLs are paired
    positionally; surplus entries of the longer list are ignored.

    Args:
        new_ids: IDs to record.
        new_urls: URLs belonging to the IDs, in the same order.
        filepath: Path of the CSV file to append to.
    """
    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on platforms that translate '\n' to '\r\n'.
    with open(filepath, mode='a', newline='') as outfile:
        csv.writer(outfile).writerows(zip(new_ids, new_urls))

def readAlreadyLoadedSet(filepath) -> set:
    """Return the set of IDs recorded in the CSV file at ``filepath``.

    Assumes:
        - A CSV with two columns interpreted as 'id' and 'url'

    Only the first column is read. Blank rows are skipped, so a file that
    contains empty lines between records is still readable.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The IDs of the first column, converted to ``int``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        ValueError: If a first-column value is not an integer.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filepath, mode='r', newline='') as infile:
        # The csv reader yields an empty list for a blank line; skip those.
        return {int(row[0]) for row in csv.reader(infile) if row}

def saveDatasetFromListOfPosts(lst, savePath, loadedImagesListFile, already_loaded: Optional[set] = None):
    """Download the images of all posts that have not been downloaded yet.

    Posts whose ID is already recorded in ``loadedImagesListFile`` (or in
    ``already_loaded``) are skipped. Every newly downloaded image is recorded
    in ``loadedImagesListFile``, both after a complete run and when the run
    is interrupted by an exception (including ``KeyboardInterrupt``), so a
    later call continues where this one stopped.

    Args:
        lst: Post definitions; each needs ``'id'`` and ``'file' -> 'url'``.
        savePath: Directory the images are written to.
        loadedImagesListFile: CSV file with the ``(id, url)`` pairs of the
            images downloaded so far; it is read first and appended to at the end.
        already_loaded: Optional additional IDs to skip. The set is not modified.
    """
    # Copy so that neither a shared default nor the caller's set is mutated.
    known_ids = set(already_loaded) if already_loaded else set()
    try:
        known_ids |= readAlreadyLoadedSet(loadedImagesListFile)
    except FileNotFoundError:
        # Expected on the first run: nothing has been recorded yet.
        pass
    except (OSError, ValueError, IndexError, csv.Error):
        # Keep the best-effort behaviour, but do not hide the problem.
        logging.exception(f'Could not read {loadedImagesListFile}; treating it as empty')

    new_urls = []
    new_ids = []

    try:
        for post in lst:
            post_id = post['id']
            if post_id in known_ids:
                continue
            try:
                saveImageFromPost(post, savePath)
            except ValueError:
                # The post was skipped (and logged) by saveImageFromPost.
                continue
            new_urls.append(post['file']['url'])
            new_ids.append(post_id)
            # Avoid downloading the same post twice within one run.
            known_ids.add(post_id)
            # Pause between image downloads.
            time.sleep(image_download_interval)
    finally:
        # Record the progress on success as well as on interruption.
        addIdsAndUrlsToCsv(new_ids, new_urls, loadedImagesListFile)

## Save the dataset defined in the definitions JSON. The download can be interrupted and resumed later.

In [None]:
# Download every image described by the loaded definitions into the download folder.
saveDatasetFromListOfPosts(
    lst=loaded,
    savePath=downloadFolder,
    loadedImagesListFile=loadedImagesListFile,
)