In [7]:
import json
import pandas as pd
import numpy as np
import os

import time
import requests
import cloudscraper
import logging

In [8]:
def pause_report(length, random_delay, count):
    """Log download progress, then sleep for a randomly jittered duration.

    Parameters
    ----------
    length : float
        Base pause length in seconds.
    random_delay : float
        Standard deviation of the multiplicative jitter; the base length is
        scaled by a draw from a normal distribution with mean 1.
    count : int
        Number of users downloaded so far (only used in the log message).
    """
    # A normal draw is unbounded below, so the scaled pause can come out
    # negative; time.sleep() raises ValueError for negative durations, which
    # would abort the whole scrape. Clamp at zero instead.
    pause_time = max(0.0, length*np.random.normal(1,random_delay))
    logging.info(f"Downloaded {count} users. Pausing scraper for {round(pause_time,2)} seconds.")
    time.sleep(pause_time)

In [13]:
def _fetch_user(scraper, uurl):
    """Request ``uurl`` through ``scraper`` and return the decoded JSON body.

    Any exception raised by the request, by ``raise_for_status`` or by the
    JSON decoding propagates to the caller.
    """
    response = scraper.get(uurl)
    response.raise_for_status()
    return response.json()


def download_users(df_art, short_pause = 2, long_pause = 60, random_delay = 0.25):
    """Download the ArtStation profile JSON of every user referenced in ``df_art``.

    Each distinct ``row['user']['username']`` is requested once from
    ``https://www.artstation.com/users/<username>.json``. Users whose request
    fails are remembered and retried once, with a fresh scraper session, after
    the main pass has finished.

    Parameters
    ----------
    df_art : pandas.DataFrame
        Frame whose ``'user'`` column holds a mapping with a ``'username'`` key.
    short_pause : float
        Base pause (seconds) after every successful download.
    long_pause : float
        Base pause (seconds) taken each time the number of downloaded users
        reaches a multiple of 100; a further pause of twice this length is
        taken at every multiple of 500.
    random_delay : float
        Jitter parameter forwarded to ``pause_report``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully downloaded username (the username is the
        index), built by transposing the ``{username: user_json}`` mapping.
    """
    # The exception classes must be a real tuple: ``except (A or B or C)``
    # evaluates the ``or`` chain first and therefore only ever catches ``A``.
    # ValueError covers a response body that is not valid JSON.
    fetch_errors = (
        requests.exceptions.RequestException,
        AttributeError,
        TypeError,
        ValueError,
    )

    scraper       = cloudscraper.create_scraper()
    users         = dict()
    skipped_users = dict()
    for _, row in df_art.iterrows():
        # Get the user
        uname = row['user']['username']
        if uname in users:
            # Already downloaded via an earlier row; a second request would
            # only overwrite the same key.
            continue
        uurl = f"https://www.artstation.com/users/{uname}.json"
        try:
            users[uname] = _fetch_user(scraper, uurl)
        except fetch_errors as e:
            logging.debug("Cannot establish connection with the following row in df:")
            logging.debug(f"{uname}:{uurl}")
            logging.debug("Skipping to next row")
            logging.debug(e)
            newly_skipped = uname not in skipped_users
            skipped_users[uname] = uurl #add key value pair to skipped_urls dict

            # Back off only when the failure count has just reached a new
            # multiple of 10, not every time an already-skipped user fails again.
            if newly_skipped and len(skipped_users) % 10 == 0:
                logging.warning("More than 10 urls have had connection error. Refreshing session and pausing for 5 minutes.")
                logging.warning(len(skipped_users))
                pause_report(300, random_delay, len(users)) # pause for 5 minutes if more than 10 urls with connection error
                scraper = cloudscraper.create_scraper()
            continue

        # Everything below runs only after a *new* user was stored, so each
        # milestone pause fires exactly once instead of on every later row
        # while len(users) sits on a multiple of 100.
        pause_report(short_pause, random_delay, len(users))

        # take a long pause if 100 users have been downloaded
        if len(users) % 100 == 0:
            pause_report(long_pause, random_delay, len(users))

        # take a long pause x 2 if 500 users have been downloaded
        if len(users) % 500 == 0:
            pause_report(long_pause*2, random_delay, len(users))

    # Re-try all skipped users
    for skipped_uname, uurl in skipped_users.items():
        if skipped_uname in users:
            continue

        scraper = cloudscraper.create_scraper() # refresh session
        try:
            users[skipped_uname] = _fetch_user(scraper, uurl)
        except fetch_errors as e:
            logging.debug("Cannot establish connection with the following row in df:")
            logging.debug(f"{skipped_uname}:{uurl}")
            logging.debug("Skipping to next row")
            logging.debug(e)
            continue
        pause_report(short_pause, random_delay, len(users))

    return pd.DataFrame(users).T

In [14]:
# Read the search query from disk; surrounding whitespace is trimmed so the
# value can be embedded directly in the data file names.
with open('query.txt', 'r') as f:
    raw_query = f.read()
query = raw_query.strip()

In [15]:
# Load the records stored for the current query
art_json_path = f"full_data/query_{query}.json"
df_art = pd.read_json(art_json_path)

In [16]:
# Scrape the profile of every user referenced in df_art, then write the
# resulting table (without its index) next to the query data.
df_users = download_users(
    df_art,
    short_pause = 2,
    long_pause = 60,
    random_delay = 0.25,
)
users_csv_path = f"full_data/users_{query}.csv"
df_users.to_csv(users_csv_path, index=False)