In [24]:
import datetime as dt
import json
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs4
from IPython.display import display, HTML

In [25]:
# DeviantArt OAuth2 application credentials, read from the environment so they
# never appear in the notebook. Each value is None when its variable is unset.
client_id = os.getenv("DEVIANTART_CLIENT_ID")
client_secret = os.getenv("DEVIANTART_CLIENT_SECRET")

In [26]:
# Request an application-level token with the OAuth2 client-credentials grant.
# The credentials travel in the form-encoded POST body (`data=`) instead of the
# query string (`params=`), so the client secret is not embedded in the URL.
token_request = requests.post(
    "https://www.deviantart.com/oauth2/token",
    data={
        "client_id": client_id,
        "client_secret": client_secret,
        "grant_type": "client_credentials",
    },
    timeout=30,  # do not let a stalled connection hang the notebook forever
)
# Surface a rejected request here, with its HTTP status, rather than as a
# KeyError when "access_token" is later read from the response JSON.
token_request.raise_for_status()

In [27]:
# Bearer token used in the Authorization header of every later API call.
# Raises KeyError if the token response JSON has no "access_token" field.
token = token_request.json()["access_token"]

In [28]:
# Fetch the first page of the "esk-masterlist" gallery. `offset` and `limit`
# stay at module level so the pagination cell can continue from this page.
offset, limit = 0, 24
esk_request = requests.get(
    "https://www.deviantart.com/api/v1/oauth2/gallery/all",
    params={"username": "esk-masterlist", "offset": offset, "limit": limit},
    headers={"Authorization": "Bearer {}".format(token)},
).json()

In [29]:
# Walk the remaining gallery pages for as long as the API reports "has_more".
# Each page's "results" list is kept as its own entry, so `esk_requests` is a
# list of pages rather than one flat list of deviations.
esk_requests = [esk_request["results"]]
while esk_request["has_more"]:
    offset = offset + limit
    esk_request = requests.get(
        "https://www.deviantart.com/api/v1/oauth2/gallery/all",
        params={"username": "esk-masterlist", "offset": offset, "limit": limit},
        headers={"Authorization": "Bearer {}".format(token)},
    ).json()
    esk_requests += [esk_request["results"]]

In [None]:
# Flatten the pages into one record per deviation: its id plus its publication
# time. `fromtimestamp` without a tz argument yields a naive local-time datetime.
deviation_ids = [
    {
        "deviationid": item["deviationid"],
        "publication date": dt.datetime.fromtimestamp(int(item["published_time"])),
    }
    for page in esk_requests
    for item in page
]

In [None]:
# Request metadata for the first batch of (up to) 50 deviation ids.
# `start_index` / `end_index` remain module-level so the batching loop in the
# next cell can advance the same window.
start_index, end_index = 0, 50
esk_metadata_request = requests.post(
    "https://www.deviantart.com/api/v1/oauth2/deviation/metadata",
    headers={"Authorization": "Bearer {}".format(token)},
    data={
        "deviationids[]": [
            entry["deviationid"] for entry in deviation_ids[start_index:end_index]
        ]
    },
).json()["metadata"]


In [None]:
# Collect metadata for all remaining deviations, 50 ids per request. Each
# response's "metadata" list is stored as one batch inside `esk_metadata`.
esk_metadata = [esk_metadata_request]
while end_index < len(deviation_ids):
    start_index += 50
    # Slide the window forward, clamping its end to the end of the id list.
    end_index = min(end_index + 50, len(deviation_ids))
    esk_metadata_request = requests.post(
        "https://www.deviantart.com/api/v1/oauth2/deviation/metadata",
        headers={"Authorization": "Bearer {}".format(token)},
        data={
            "deviationids[]": [
                entry["deviationid"] for entry in deviation_ids[start_index:end_index]
            ]
        },
    ).json()["metadata"]
    esk_metadata.append(esk_metadata_request)


In [None]:

def _is_blank_node(node):
    """Return True when a parsed node holds no text other than whitespace."""
    return node.get_text(" ", strip=True) == ""


def esk_record_from_web_response(raw_esk_record):
    """Turn the HTML description of one masterlist entry into flat fields.

    For every known category label, the text that follows the matching ``<b>``
    label (a direct child of a ``<div>``) is cleaned and stored under the
    category name. An ``"owner count"`` is derived from the "owner history"
    block, and the raw ``"description"`` entry is removed.

    Args:
        raw_esk_record: dict containing a ``"description"`` key with the HTML
            to parse. The dict is modified in place.

    Returns:
        The same dict object that was passed in (never None), with one key per
        category found, plus ``"owner count"``, and without ``"description"``.
    """
    # NOTE(review): no parser is named, so Beautiful Soup picks whichever is
    # installed; pin one explicitly once the intended parser is confirmed.
    parsed_html = bs4(raw_esk_record["description"])
    # Some parsers add no <body> wrapper, and an empty description has none at
    # all; search the whole document in that case instead of failing on None.
    root = parsed_html.body if parsed_html.body is not None else parsed_html

    categories = (
        "origin", "nature", "boundary", "species", "collection",
        "uncommon traits", "rare traits", "unique traits", "morphs",
        "nature features", "accessories", "enchantments", "elementals",
        "familiars",
    )
    for category in categories:
        # NOTE(review): `:-soup-contains` is a substring match and select_one
        # returns the first hit, so "nature" can also match a "nature features"
        # label if that one appears first - confirm against real descriptions.
        # The `class_="legacy-journal"` keyword previously passed here is not
        # part of the CSS selector and appears to have had no filtering effect.
        label = root.select_one(f'div > b:-soup-contains("{category}")')
        if label is None:
            continue
        value_node = label.next_sibling
        if value_node is None:
            continue
        # When the node right after the label is only whitespace, the value
        # sits in a following <span>; look ahead at most two spans. (The old
        # check compared the text to the literal string r"^\s+$", so this
        # look-ahead never ran.)
        if _is_blank_node(value_node):
            value_node = value_node.find_next("span")
            if value_node is not None and _is_blank_node(value_node):
                value_node = value_node.find_next("span")
        if value_node is None:
            continue
        text = value_node.get_text(" ", strip=True)
        # Drop a single trailing "." or a dangling "(" left by the markup.
        if text.endswith((".", "(")):
            text = text[:-1].strip()
        # Remove parenthesised remarks (greedy: first "(" through last ")").
        text = re.sub(r"\(.*\)", "", text)
        raw_esk_record[category] = text

    # Owners are separated by <br> tags inside the <sub> that follows the
    # "owner history" label, so the count is the number of <br> tags plus one.
    # It stays at 1 when the label or the <sub> is missing.
    owner_count = 1
    history_label = root.select_one('div > b:-soup-contains("owner history")')
    if history_label is not None:
        history_block = history_label.find_next_sibling("sub")
        if history_block is not None:
            owner_count = len(history_block.find_all("br")) + 1
    raw_esk_record["owner count"] = owner_count
    raw_esk_record.pop("description", None)
    return raw_esk_record
   

In [None]:
# Keep only entries whose title is exactly a 3- or 4-digit number, and carry
# forward just the three fields the description parser needs.
esk_descriptions = [
    {key: entry[key] for key in ("deviationid", "title", "description")}
    for batch in esk_metadata
    for entry in batch
    if re.fullmatch(r"\d{3,4}", entry["title"]) is not None
]

In [None]:
# Parse every retained description into a flat record, skipping any entry for
# which the parser returns None.
esk_list = []

for esk in esk_descriptions:
    cleaned = esk_record_from_web_response(esk)
    if cleaned is None:
        continue
    esk_list.append(cleaned)

In [None]:
# Build the working table: parsed records joined to their publication dates on
# "deviationid" (inner join), then drop the entry titled "198" and every row
# whose origin is the placeholder "-".
deviation_id_frame = pd.DataFrame(deviation_ids)
df = (
    pd.DataFrame(esk_list)
    .merge(deviation_id_frame, on="deviationid", how="inner")
    .loc[lambda frame: frame["title"] != "198"]
    .loc[lambda frame: frame["origin"] != "-"]
)

In [None]:
# Serialises writes to the shared `df`. This function is submitted to a
# ThreadPoolExecutor by `runner`, and mutating one DataFrame from several
# threads at once (including creating the "biome" column) is not safe.
_BIOME_WRITE_LOCK = threading.Lock()


def get_biome_from_comments(deviationid):
    """Look up a deviation's biome from its comments and record it in `df`.

    Two requests are made to the deviation's comments endpoint. The first reads
    the "commentid" of the first entry in "thread"; the second passes that id
    as the `commentid` parameter and reads the "body" of the first entry in
    that response's "thread". The biome is the `alt` text of the first <img>
    in that body, with the " by Esk-Masterlist" suffix removed.

    Side effect: on success, writes the biome into the "biome" column of the
    module-level `df` for rows whose "deviationid" matches.

    Args:
        deviationid: id of the deviation whose comments are inspected.

    Returns:
        The biome string on success. On any failure (for example an empty
        comment thread, which raises IndexError) the caught exception object
        is returned instead of being raised, and `df` is left untouched.
    """
    try:
        comments_url = f"https://www.deviantart.com/api/v1/oauth2/comments/deviation/{deviationid}"
        auth_headers = {'Authorization': 'Bearer {}'.format(token)}

        esk_comment_id = requests.get(
            comments_url,
            headers=auth_headers,
        ).json()["thread"][0]["commentid"]

        comment_body = requests.get(
            comments_url,
            params={"commentid": esk_comment_id},
            headers=auth_headers,
        ).json()["thread"][0]["body"]

        comment_soup = bs4(comment_body)
        comment = comment_soup.body.find("img")["alt"]
        comment = re.sub(" by Esk-Masterlist", "", comment)

        # Only one worker at a time may touch the shared DataFrame.
        with _BIOME_WRITE_LOCK:
            df.loc[df["deviationid"] == deviationid, "biome"] = comment
        return comment
    except Exception as e:
        # Deliberate best-effort: hand the error back so the caller can report
        # it without one bad deviation aborting the whole batch.
        return e

In [None]:
def runner():
    """Fetch the biome for every deviation in `df` using a pool of 24 threads.

    One `get_biome_from_comments` task is submitted per value in
    ``df["deviationid"]``; each task's result is printed as soon as that task
    finishes, so the output order follows completion, not submission.
    """
    global df
    with ThreadPoolExecutor(max_workers=24) as executor:
        pending = [
            executor.submit(get_biome_from_comments, deviationid)
            for deviationid in df["deviationid"]
        ]

        for finished in as_completed(pending):
            print(finished.result())


In [None]:
# Fill df["biome"] for every row. Each printed line is one task's result:
# either the biome name or the exception that task caught.
runner()

Arid Biome
Developed Biome
Plains Biome
Mountain Biome
list index out of range
list index out of range
Arid Biome
Forest Biome
Marine Biome
Mountain Biome
Forest Biome
Forest Biome
list index out of range
Developed Biome
Plains Biome
Forest Biome
Developed Biome
Forest Biome
Arid Biome
Developed Biome
Developed Biome
Forest Biome
Developed Biome
Developed Biome
list index out of range
Developed Biome
Forest Biome
Arid Biome
Developed Biome
list index out of range
list index out of range
list index out of range
Plains Biome
Developed Biome
Arid Biome
Fresh Waters Biome
Arid Biome
Developed Biome
Forest Biome
Arid Biome
Plains Biome
Developed Biome
Fresh Waters Biome
Mountain Biome
Forest Biome
Arid Biome
Forest Biome
Developed Biome
Mountain Biome
Forest Biome
Developed Biome
Marine Biome
list index out of range
list index out of range
Marine Biome
list index out of range
Marine Biome
Marine Biome
list index out of range
list index out of range
Marine Biome
Developed Biome
list index ou

In [None]:
# Write the final table as tab-separated UTF-8. The path is given to pandas
# directly: with an already-open text handle, `to_csv` ignores `encoding=`, so
# the handle's platform-default encoding and newline translation would apply.
df.to_csv("esks_final.csv", encoding="utf-8", sep="\t")