In [None]:
import json
import logging
import os
import time
from urllib.parse import quote

import numpy as np
import requests

# Route root-logger records (INFO and above) to run.log, each line carrying
# a timestamp and the severity level ahead of the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='run.log',
)

def pause_report(length, random_delay, file_count):
    """Log the download progress and sleep for a randomly jittered duration.

    The pause is ``length`` multiplied by a factor drawn from a normal
    distribution with mean 1 and standard deviation ``random_delay``.

    Args:
        length: Nominal pause length in seconds.
        random_delay: Standard deviation of the multiplicative jitter.
        file_count: Number of files downloaded so far; only used in the
            log message.
    """
    # A normal distribution is unbounded, so the jittered pause can come out
    # negative. time.sleep() raises ValueError on negative input, which would
    # abort a long scraping run, so the pause is clamped at zero.
    pause_time = max(0.0, float(length * np.random.normal(1, random_delay)))
    logging.info(f"Downloaded {file_count} files. Pausing scraper for {round(pause_time,2)} seconds.")
    time.sleep(pause_time)

def get_data(query, page_start, page_end, short_pause=3, long_pause=30, random_delay=0.25, timeout=30):
    """Download ArtStation project-search result pages for ``query`` as JSON files.

    Pages ``page_start`` .. ``page_end - 1`` are requested one at a time and
    each parsed response is written to
    ``raw_data/query_{query}/data_{page_number}.json``. Pages whose file
    already exists are skipped, so an interrupted run can be resumed.

    Args:
        query: Search term sent to the API; also used in the output
            directory name.
        page_start: First page to fetch; must be at least 1.
        page_end: Page number to stop before (exclusive, as in ``range``).
        short_pause: Nominal pause in seconds after every downloaded page.
        long_pause: Nominal extra pause in seconds after every 10th
            downloaded page.
        random_delay: Jitter argument forwarded to ``pause_report``.
        timeout: Seconds to wait for the server before a request is
            abandoned.

    Returns:
        1 when every page in the range was processed, 0 when the run stopped
        early (invalid ``page_start``, request failure, rate limiting,
        unexpected HTTP status, invalid JSON, or an empty result page).
    """
    if page_start < 1:
        logging.error("Starting page number has to be greater than 0.")
        logging.error("Process exiting with error.")
        return 0

    out_dir = f'raw_data/query_{query}'
    if not os.path.exists(out_dir):
        logging.debug(f'Creating directory: {out_dir}')
        os.makedirs(out_dir, exist_ok=True)

    saved_files = []

    # Percent-encode the query once so characters such as '&', '#' or '+'
    # cannot alter the structure of the request URL.
    encoded_query = quote(str(query), safe='')

    # One session is shared by all requests (connection reuse) and is closed
    # on every exit path by the context manager.
    with requests.Session() as client:
        for page_number in range(page_start, page_end):

            filename = f'{out_dir}/data_{page_number}.json'

            # only scrape url if json file of the url doesn't already exist
            if os.path.exists(filename):
                logging.info(f'{filename} already exists. Skipping the current url.')
                continue

            url = f"https://www.artstation.com/api/v2/search/projects.json?page={page_number}&per_page=75&query={encoded_query}&sorting=relevance"

            # Without a timeout a stalled connection would block forever.
            try:
                r = client.get(url, timeout=timeout)
            except requests.RequestException as exc:
                logging.error(f"Request for {url} failed: {exc}")
                logging.error("Process exiting with error.")
                return 0
            logging.info(f"Scraping {url} {r.status_code}: {r.reason}")

            # check if rate-limit was exceeded.
            if r.status_code == 429:
                logging.error("Rate-limit exceeded. Wait for a while and try running scraper again.")
                logging.error("Process exiting with error.")
                return 0

            # Any other error status carries no usable payload.
            if not r.ok:
                logging.error(f"Unexpected response status {r.status_code} for {url}.")
                logging.error("Process exiting with error.")
                return 0

            try:
                data = json.loads(r.text)
            except ValueError:
                logging.error(f"Response for {url} is not valid JSON.")
                logging.error("Process exiting with error.")
                return 0

            # if "data" in response text is empty, the max page has probably been reached.
            if len(data['data']) == 0:
                logging.warning(f"Data list is empty. Query {query} has no data on page {page_number}.")
                logging.warning(f"Max page for this query has probably been reached. Ending scraping process.")
                return 0

            # Save the parsed object. Dumping the raw response text instead
            # would store the whole payload as a single quoted JSON string.
            logging.info(f'Saving file as {filename}')
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            saved_files.append(filename)

            # take a short pause per url scraped
            pause_report(short_pause, random_delay, len(saved_files))

            # Take a long pause after every 10th saved file. The check runs
            # only right after a save, so skipped pages cannot re-trigger it.
            if len(saved_files) % 10 == 0:
                pause_report(long_pause, random_delay, len(saved_files))

    # process end summary
    logging.info(f"Files saved: {saved_files}")
    logging.info(f"Process finished running.")
    return 1


In [None]:
# Read the search term from query.txt, dropping surrounding whitespace.
with open('query.txt', mode='r') as f:
    raw_query = f.read()
query = raw_query.strip()

# Fetch pages 1 through 749,999 for that term, pausing longer than the
# defaults between requests; files already on disk are skipped by get_data.
get_data(query, 1, 750_000, short_pause=10, long_pause=60)