In [2]:
import datetime as dt
import json
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs4
from IPython.display import display, HTML

In [3]:
# DeviantArt API credentials are read from the environment so that no secret is
# stored in the notebook. A variable that is not set yields None here.
client_id, client_secret = (
    os.environ.get(variable_name)
    for variable_name in ("DEVIANTART_CLIENT_ID", "DEVIANTART_CLIENT_SECRET")
)

In [4]:
# Exchange the client credentials for an application access token.
# The credentials are sent in the form-encoded request body (data=) instead of the
# query string (params=), so the client secret is not part of the request URL.
token_request = requests.post(
    "https://www.deviantart.com/oauth2/token",
    data={
        "client_id": client_id,
        "client_secret": client_secret,
        "grant_type": "client_credentials",
    },
)
# Stop here with the HTTP status on a rejected request, rather than failing later
# with a KeyError when "access_token" is looked up in the response.
token_request.raise_for_status()

In [5]:
# Decode the token response once and keep the bearer token used by every later request.
token_payload = token_request.json()
token = token_payload["access_token"]

In [6]:
# First page of the "esk-masterlist" gallery. `offset` and `limit` stay in the
# notebook namespace because the paging loop in the next cell advances them.
offset, limit = 0, 24
esk_request = requests.get(
    "https://www.deviantart.com/api/v1/oauth2/gallery/all",
    params={"username": "esk-masterlist", "offset": offset, "limit": limit},
    headers={"Authorization": f"Bearer {token}"},
).json()

In [7]:
# Collect the "results" list of every gallery page, starting with the page already
# held in `esk_request` and continuing while the response reports "has_more".
# `offset` and `esk_request` are overwritten on every pass, so re-running only this
# cell resumes from the last page fetched rather than from the first.
esk_requests = [esk_request["results"]]
while esk_request["has_more"]:
    offset = offset + limit
    esk_request = requests.get(
        "https://www.deviantart.com/api/v1/oauth2/gallery/all",
        params={"username": "esk-masterlist", "offset": offset, "limit": limit},
        headers={"Authorization": f"Bearer {token}"},
    ).json()
    esk_requests += [esk_request["results"]]

In [8]:
# Flatten the pages into one list holding each deviation's id and publication time.
# datetime.fromtimestamp() without a tz argument gives a naive datetime in the
# machine's local timezone.
deviation_ids = [
    {
        "deviationid": deviation["deviationid"],
        "publication date": dt.datetime.fromtimestamp(int(deviation["published_time"])),
    }
    for page in esk_requests
    for deviation in page
]


In [9]:
# Request metadata for the first slice of up to 50 deviation ids. `start_index` and
# `end_index` stay in the notebook namespace because the next cell advances them.
start_index, end_index = 0, 50
esk_metadata_request = requests.post(
    "https://www.deviantart.com/api/v1/oauth2/deviation/metadata",
    headers={"Authorization": f"Bearer {token}"},
    data={
        "deviationids[]": [
            item["deviationid"] for item in deviation_ids[start_index:end_index]
        ]
    },
).json()["metadata"]

In [10]:
# Fetch metadata for the remaining ids in slices of 50, starting after the batch
# already held in `esk_metadata_request`. Each response's "metadata" list is kept
# as one element of `esk_metadata`.
esk_metadata = [esk_metadata_request]
while end_index < len(deviation_ids):
    start_index += 50
    # Clamp the window to the end of the list so the loop condition terminates.
    end_index = min(end_index + 50, len(deviation_ids))
    esk_metadata_request = requests.post(
        "https://www.deviantart.com/api/v1/oauth2/deviation/metadata",
        headers={"Authorization": f"Bearer {token}"},
        data={
            "deviationids[]": [
                item["deviationid"] for item in deviation_ids[start_index:end_index]
            ]
        },
    ).json()["metadata"]
    esk_metadata.append(esk_metadata_request)


In [11]:
# First text fragments of an "owner history" entry that count as an owner of their
# own. Several variants end in a non-breaking space (U+00A0): the parsed HTML text
# contains the real character, so it is written here as the escape '\xa0' and not
# as the literal backslash sequence, which could never match.
_FIRST_OWNER_PREFIXES = frozenset([
    'staff reward for ', 'claimed from ', 'created via ', 'created via',
    'staff reward', 'purchased', 'won via ', 'created via transformation by ',
    'MYO by ', 'won via\xa0', 'created via\xa0', 'staff reward for\xa0',
    'claimed from\xa0', 'semi-custom', 'purchased\xa0', 'purchased MYO',
    ' claimed from ', 'rare MYO purchased by ', 'ThoseWhoWentMissing', '\xa0',
    'purchased MYO slot by', 'purchased by ', 'purchased MYO slot',
])


def owner_count_from_web_response(raw_esk_record):
    """Count the owners listed in the "owner history" part of a description.

    Args:
        raw_esk_record: Mapping whose "description" value is the HTML of the
            deviation description.

    Returns:
        int: 1 if the first fragment of the history is a known prefix, plus one
        per "... to ..." fragment found in the history text; never less than 1.
        Also 1 when there is no "owner history" heading or no ``<sub>`` element
        following it.
    """
    # No parser is named, so Beautiful Soup picks the best one installed.
    parsed_html = bs4(raw_esk_record["description"])
    # Some parsers do not wrap a fragment in <body>; search the whole document then.
    root = parsed_html.body if parsed_html.body is not None else parsed_html
    # NOTE(review): `class_` is not part of the CSS selector and it looks like
    # select_one() does not filter on it - confirm, then drop it or fold the class
    # into the selector.
    heading = root.select_one('div > b:-soup-contains("owner history")', class_="legacy-journal")
    if heading is None:
        return 1
    history_tag = heading.find_next_sibling("sub")
    if history_tag is None:
        return 1

    # Text fragments of the <sub> element, joined with "|" and left unstripped so
    # that the trailing spaces in the prefixes above can match.
    owner_history = history_tag.get_text("|", strip=False)

    owner_count = 0
    if owner_history.split("|")[0] in _FIRST_OWNER_PREFIXES:
        owner_count += 1
    # Each fragment containing a standalone "to" between two separators adds one owner.
    owner_count += len(re.findall(r"\|[^|]*\s+to\s+[^|]*\||\|\s*to\s*\|", owner_history))
    return max(owner_count, 1)

In [12]:
def get_category(description, category, category_list):
    """Return the text listed under ``category`` in a tokenised description.

    The value of a category is every token after the category heading up to, but
    not including, the next token that is itself a category heading.

    Args:
        description: List of text tokens (headings and values, in document order).
        category: The heading whose value is wanted.
        category_list: All known headings; used to find where the value ends.

    Returns:
        str: The value tokens joined with single spaces, with one trailing "." or
        "(" removed and surrounding whitespace stripped, or 'NA' when ``category``
        does not occur in ``description``.
    """
    try:
        category_index = description.index(category)
    except ValueError:
        return 'NA'

    known_categories = set(category_list)
    value_start = category_index + 1
    # Look for the next heading only *after* the current one. Searching the whole
    # list for it would find an earlier occurrence of the same token and produce an
    # empty or wrong slice.
    value_end = next(
        (
            position
            for position in range(value_start, len(description))
            if description[position] in known_categories
        ),
        len(description),
    )

    category_text = " ".join(description[value_start:value_end])
    category_text = re.sub(r'[\.\(]$', "", category_text)
    return category_text.strip()

In [37]:
# Clean-up substitutions applied to the "|"-joined description text, in this order.
# Each entry is (regular expression, replacement).
_DESCRIPTION_REWRITES = (
    (r'ownership.*', ""),
    (r'\* this pre-arpg mutation is no longer allowed for new esk', ""),
    (r'\([^\)]*\)', ""),
    ('\\xa0', " "),
    (r'nature feature\s+', "nature features"),
    (r'mutation(?!s)', "mutations"),
    (r'accessory', "accessories"),
    (r'elemental(?!s)', "elementals"),
    (r'familiar(?!s)', "familiars"),
    (r'enchantment(?!s)', "enchantments"),
    (r'morphs', "morph"),
    ('designers', "designer"),
)

# Headings looked up in the description; the full list also tells get_category()
# where each value ends.
_CATEGORY_NAMES = ["origin", "nature", "boundary", "size", "species", "collection", "designer", "uncommon traits", "rare traits", "unique traits", "nature features", "mutations", "morph", "original form", "accessories", "familiars", "enchantments", "elementals", "TF rewards", "blessings", "curses"]

# Keys removed from the record before it is returned.
_DROPPED_FIELDS = ("size", "designer", "description", "TF rewards", "blessings", "curses")


def esk_record_from_web_response(raw_esk_record):
    """Turn a raw metadata record into a flat record with one key per category.

    The record is modified in place: a key is added for every category, plus
    "owner count", and the keys in ``_DROPPED_FIELDS`` (including "description")
    are removed.

    Args:
        raw_esk_record: Dict with at least a "description" key holding the HTML of
            the deviation description.

    Returns:
        dict | None: The same dict after modification, or None (record untouched)
        when the description contains no ``div > b`` element to start from.
    """
    # No parser is named, so Beautiful Soup picks the best one installed.
    parsed_html = bs4(raw_esk_record["description"])
    # Some parsers do not wrap a fragment in <body>; search the whole document then.
    root = parsed_html.body if parsed_html.body is not None else parsed_html
    # NOTE(review): `class_` is not part of the CSS selector and it looks like
    # select_one() does not filter on it - confirm.
    first_bold = root.select_one('div > b', class_="legacy-journal")
    if first_bold is None:
        # The caller skips None results, so report "unusable" instead of raising.
        return None

    # All text of the element two levels above the first bold heading, one "|"
    # between text fragments.
    description = first_bold.parent.parent.get_text("|", strip=True)
    for pattern, replacement in _DESCRIPTION_REWRITES:
        description = re.sub(pattern, replacement, description)
    description = description.split("|")

    for category in _CATEGORY_NAMES:
        raw_esk_record[category] = get_category(description, category, _CATEGORY_NAMES)
    # Must run before "description" is dropped below: it re-parses that HTML.
    raw_esk_record["owner count"] = owner_count_from_web_response(raw_esk_record)

    for field in _DROPPED_FIELDS:
        raw_esk_record.pop(field, None)
    return raw_esk_record

In [38]:
# Keep only metadata records whose title consists of exactly 3 or 4 digits, reduced
# to the three fields needed downstream.
esk_descriptions = [
    {field: record[field] for field in ("deviationid", "title", "description")}
    for batch in esk_metadata
    for record in batch
    if re.fullmatch(r"\d{3,4}", record["title"]) is not None
]

In [39]:
# Parse every description into a flat record, skipping any that come back as None.
esk_list = [
    record
    for record in map(esk_record_from_web_response, esk_descriptions)
    if record is not None
]

In [40]:
# Join the parsed records with their publication dates on "deviationid", then drop
# the rows whose origin is the placeholder "-".
deviation_id_frame = pd.DataFrame(deviation_ids)
df = pd.DataFrame(esk_list).merge(deviation_id_frame, on="deviationid", how="inner")
has_origin = df["origin"] != "-"
df = df[has_origin]

In [41]:
# This function is run from a thread pool and writes to the shared frame `df`.
# pandas objects are not safe to modify from several threads at once, so every
# write goes through this lock.
_BIOME_WRITE_LOCK = threading.Lock()


def _store_biome(deviationid, biome):
    """Set the "biome" column to ``biome`` on the rows of ``df`` matching ``deviationid``."""
    with _BIOME_WRITE_LOCK:
        df.loc[df["deviationid"] == deviationid, "biome"] = biome


def get_biome_from_comments(deviationid):
    """Look up a deviation's biome in its comments and record it in ``df``.

    The biome is taken from the first top-level comment by "Esk-Masterlist": the
    first reply to that comment by the same user is parsed as HTML and the ``alt``
    text of its first ``<img>`` is used, with " by Esk-Masterlist" removed.

    Side effect: sets ``df["biome"]`` for the matching rows, to the biome on
    success or to "NA" on any failure.

    Args:
        deviationid: Id of the deviation whose comments are fetched.

    Returns:
        The biome string on success; on failure, the exception that was caught
        (it is returned, not raised).
    """
    try:
        comment_thread = requests.get(
            f"https://www.deviantart.com/api/v1/oauth2/comments/deviation/{deviationid}",
            headers={
                'Authorization': 'Bearer {}'.format(token)
            },
            params={
                "maxdepth": 2
            }).json()["thread"]
        if len(comment_thread) == 0:
            raise ValueError("No comments found")

        # First top-level comment written by the masterlist account.
        parent_comment_id = next(
            (
                comment["commentid"]
                for comment in comment_thread
                if comment["parentid"] is None and comment["user"]["username"] == "Esk-Masterlist"
            ),
            '',
        )
        if len(parent_comment_id) == 0:
            raise ValueError("No Esk-Masterlist comment found")

        # First reply to that comment, again by the masterlist account.
        biome_comment = next(
            (
                comment["body"]
                for comment in comment_thread
                if comment["parentid"] == parent_comment_id and comment["user"]["username"] == "Esk-Masterlist"
            ),
            '',
        )
        if len(biome_comment) == 0:
            raise ValueError("No reply to biome comment found")

        comment_soup = bs4(biome_comment)
        # Some parsers do not wrap a fragment in <body>; search the whole document then.
        root = comment_soup.body if comment_soup.body is not None else comment_soup
        biome = root.find("img")["alt"]
        biome = re.sub(" by Esk-Masterlist", "", biome)
        _store_biome(deviationid, biome)
        return biome
    except Exception as error:
        # Best effort: any failure (network, unexpected JSON, missing comment or
        # image) marks the row "NA" and hands the error back to the caller.
        _store_biome(deviationid, "NA")
        return error


In [42]:
def runner():
    """Run get_biome_from_comments for every deviation id in ``df`` on 24 threads.

    Returns nothing. The call blocks until every submitted task has finished,
    because leaving the ``with`` block shuts the executor down and waits for its
    workers.
    """
    # Snapshot the ids before submitting anything: the workers write to `df` while
    # later tasks are still being queued, so the live column is not iterated.
    pending_ids = list(df["deviationid"])
    with ThreadPoolExecutor(max_workers=24) as executor:
        for deviationid in pending_ids:
            executor.submit(get_biome_from_comments, deviationid)


In [43]:
# Fill in the "biome" column of df; runner() returns once its thread pool has shut down.
runner()

In [44]:
# Give to_csv the path rather than an already-open text handle: pandas does not
# apply `encoding` to a text-mode file object, so the file would be written with the
# handle's platform-default encoding and newline translation instead of UTF-8.
df.to_csv("esks_complete.csv", encoding='utf-8', sep=';')