# Notes
##### All Items Have
- id
- level _(can be 0)_
- name
- icon
- quality
- class
- subclass
##### Items can have
- __Equipment Info__
  - Equip Slot
  - __Weapon Stats__
    - dmgmin
    - dmgmax
    - speed
    - dps _(can be meta data)_
  - __Attributes__
    - __Primary Attributes__
      - Strength
      - Agility
      - Intellect
      - Stamina
      - Spirit
      - Armor
    - __Secondary Attributes__
      - Melee/Ranged/Spell/All Critical Strike Rating
      - Melee/Ranged/Spell/All Hit Rating
      - Melee/Ranged/All Attack Power
      - Healing/Damage/All Spell Power 
- Vendor Sell Price

## Links
- [Extentions in Webdriver](https://www.reddit.com/r/learnpython/comments/4zzn69/how_do_i_get_adblockplus_to_work_with_selenium/)
- [requests docs](https://requests.readthedocs.io/en/latest/)
- [tqdm docs](https://tqdm.github.io/)
- [concurrent.futures docs](https://docs.python.org/dev/library/concurrent.futures.html)
- [seaborn docs](https://seaborn.pydata.org/api.html)
- [chrome switches](https://stackoverflow.com/questions/38335671/where-can-i-find-a-list-of-all-available-chromeoption-arguments)
- [mathplotlib docs](https://matplotlib.org/stable/api/index.html)

In [1]:
# Imports
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import numpy as np
import pandas as pd
from pathlib import Path
import shutil
from tqdm import tqdm
import time
import requests
import xml.etree.ElementTree as ET
import seaborn as sns
import json
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
import re

In [2]:
# Settings
# root_url = r'https://www.wowhead.com/wotlk'
root_url = r'https://www.wowhead.com/cata'
output_dir = r'output'

min_itemlvl = 0
max_itemlvl = 416

In [3]:
# Setup
Path(output_dir).mkdir(parents=True, exist_ok=True)

driver_options = webdriver.chrome.options.Options()
driver_options.page_load_strategy = 'normal'
driver_options.add_argument(r'--headless')
driver = webdriver.Chrome(options=driver_options)

In [4]:
# Scrape item quality and build itemlist links
def process_item_quality_elem(elem):
    _id = int(elem.get_attribute("value"))
    _name = elem.text
    _color = elem.value_of_css_property("color")
    _color = _color[_color.index('(') + 1:]
    _color = _color[:_color.index(')')]
    _color_split = _color.split(', ')
    _color = '#' + "{:02x}".format(int(_color_split[0])) + "{:02x}".format(
        int(_color_split[1])) + "{:02x}".format(int(_color_split[2]))
    return {"id": _id, "name": _name, "color": _color}

item_qualities_csv_path = Path(output_dir + "/item_qualities.csv")
item_qualities_csv_exists = item_qualities_csv_path.exists()

if(item_qualities_csv_exists):
    item_qualities = pd.read_csv(item_qualities_csv_path, sep=';')
else:
    items_url = root_url + "/items"
    driver.get(items_url)
    item_quality_elems = driver.find_elements(
        By.CSS_SELECTOR, '#filter-facet-quality > option')
    item_qualities = pd.DataFrame.from_records(
        [process_item_quality_elem(e) for e in item_quality_elems],
        index='id'
    )
    item_qualities.to_csv(item_qualities_csv_path, sep=';')
item_qualities

Unnamed: 0,id,name,color
0,0,Poor,#9d9d9d
1,1,Common,#ffffff
2,2,Uncommon,#1eff00
3,3,Rare,#0070dd
4,4,Epic,#9345ff
5,5,Legendary,#ff8000
6,6,Artifact,#e5cc80
7,7,Heirloom,#00ccff


In [5]:
# close selenium web driver since we're done using it
driver.close()

In [6]:
item_qualities_palette = sns.color_palette(item_qualities['color'])

In [7]:
def get_itemlist_url(item_level, quality_id=None):
    result = f"{root_url}/items/min-level:{item_level}/max-level:{item_level}"
    if quality_id != None:
        result += f"/quality:{quality_id}"
    return result

itemlist_ilvl_urls = {i:get_itemlist_url(i) for i in range(min_itemlvl, max_itemlvl)}

In [8]:
# Get Item list data
def process_itemlist(url):
    rsp = requests.get(url)
    rsp.raise_for_status()
    match_iter = re.findall(r"WH\.Gatherer\.addData\(.*?\);", rsp.text, re.MULTILINE)
    results = []
    for m in match_iter:
        json_str = m[m.index(r'WH.Gatherer.addData('):]
        json_str = json_str[json_str.index(r'{'):]
        json_str = json_str.rstrip(r');')
        json_obj = json.loads(json_str)
        results.append(json_obj)
    result = []
    for row in results:
        for k in row.keys():
            record = record = {'item_id':int(k)} | row[k]
            result.append(record)
    return result

itemlist_datas = []
with concurrent.futures.ThreadPoolExecutor(max_workers=(os.cpu_count() - 1)) as executor:
    unprocessed = {u:None for u in itemlist_ilvl_urls.values()}
    fail_count = {u:0 for u in itemlist_ilvl_urls.values()}
    while len(unprocessed) > 0:
        urls = [u for u in unprocessed.keys()]
        futures = {executor.submit(process_itemlist, u):u for u in urls}
        for f in tqdm(iterable=concurrent.futures.as_completed(futures),desc="Handling itemlist data futures", leave=False, total=len(futures)):
            u = futures[f]
            u_is_qualitylink = u.find(r'/quality:') >= 0
            try:
                data = f.result()
            except Exception as e:
                fail_count[u] += 1
                if(fail_count[u] > 2):
                    unprocessed.pop(u, None)
                    print(f"Could not process itemlist {u} due to exception: {e}")
                continue
            else:
                unprocessed.pop(u, None)
                if len(data) < 1000 or u_is_qualitylink:
                    itemlist_datas += data
                elif u_is_qualitylink:
                    print(f"Even filtering on quality did not bring item count below 1000 for url {u}")
                    itemlist_datas += data
                else:
                    for qid in item_qualities['id']:
                        qu = f"{u}/quality:{qid}"
                        unprocessed[qu] = None
                        fail_count[qu] = 0
                
itemlist_datas = pd.DataFrame.from_records(itemlist_datas)
itemlist_datas

                                                                                  

Unnamed: 0,item_id,name_enus,quality,icon,screenshot,jsonequip,attainable,flags2,displayName,qualityTier
0,52889,Blooded Darkspear Dagger,2,inv_weapon_shortblade_10,{},"{'agi': 1, 'appearances': {'0': [20388, 'inv_w...",0,24576,,0
1,21013,Scraggy Leather Pants,0,inv_pants_06,{},"{'appearances': {'0': [8409, '']}, 'armor': 19...",0,24576,,0
2,57531,Winter Jacket,1,inv_chest_leather_26v2,{},"{'appearances': {'0': [67029, '']}, 'armor': 2...",0,24576,,0
3,21017,Shoddy Chain Pants,0,inv_pants_03,{},"{'appearances': {'0': [3441, '']}, 'armor': 31...",0,24576,,0
4,2654,Flimsy Chain Pants,0,inv_pants_03,{},"{'appearances': {'0': [687, '']}, 'armor': 31,...",0,24576,,0
...,...,...,...,...,...,...,...,...,...,...
47828,31650,High Warlord's Ringmail Shoulderpads,3,inv_shoulder_29,{},"{'appearances': {'0': [32128, '']}, 'armor': 5...",0,24577,,0
47829,34670,Seeker's Gavel,3,inv_mace_35,{},"{'appearances': {'0': [37031, 'inv_mace_08']},...",0,25088,,0
47830,35362,Kodohide Helm,3,inv_helmet_30,{},"{'appearances': {'0': [48746, '']}, 'armor': 4...",0,24576,,0
47831,35381,Seer's Linked Armor,3,inv_chest_chain_11,{},"{'agi': 22, 'appearances': {'0': [48756, '']},...",0,24576,,0


In [None]:
# Scrape item XML download
item_xml_dir = Path(output_dir + "/itemxml")
item_xml_dir.mkdir(parents=True, exist_ok=True)
def download_item_xml(item_id, overwrite=False):
    url = f"{root_url}/item={item_id}&xml"
    filepath = item_xml_dir.joinpath(f"{item_id}.xml").absolute()
    if (not filepath.exists() or overwrite):
        rsp = requests.get(url)
        rsp.raise_for_status()
        filepath.write_bytes(rsp.content)
    return filepath

item_xml_paths = []
with concurrent.futures.ThreadPoolExecutor(max_workers=(os.cpu_count() - 1)) as executor:
    fail_count = -1
    while fail_count != 0:
        fail_count = 0
        futures = [executor.submit(download_item_xml, item_id, False) for item_id in itemlist_datas['item_id']]
        for f in tqdm(iterable=concurrent.futures.as_completed(futures), desc="Downloading Item XML", total=len(futures), leave=False):
            try:
                fres = f.result()
            except:
                fail_count += 1
            else:
                item_xml_paths.append(fres)


Downloading Item XML:  45%|████▌     | 21589/47833 [07:51<14:58, 29.22it/s]  

In [None]:
# parsing item XML
json_dir = Path(output_dir + "/itemjson")
json_dir.mkdir(parents=True, exist_ok=True)
jsonequip_dir = Path(output_dir + "/jsonequip")
jsonequip_dir.mkdir(parents=True, exist_ok=True)

def parse_item_xml(path, overwrite_json=False):
    root = ET.parse(path).find(".//item")
    class_elem = root.find("class")
    subclass_elem = root.find("subclass")
    quality_elem = root.find("quality")
    icon_elem = root.find("icon")
    inventorySlot_elem = root.find("inventorySlot")
    parsedItem = {
        "id": int(root.attrib['id']),
        "name": root.findtext("name"),
        "level": int(root.findtext("level")),
        "quality_id": int(quality_elem.attrib['id']),
        "quality_name": quality_elem.text,
        "class_id": int(class_elem.attrib['id']),
        "class_name": class_elem.text,
        "subclass_id": int(subclass_elem.attrib['id']),
        "subclass_name": subclass_elem.text,
        "icon_displayId": int(icon_elem.attrib['displayId']),
        "icon_name": icon_elem.text,
        "inventorySlot_id": int(inventorySlot_elem.attrib['id']),
        "inventorySlot_name": inventorySlot_elem.text,
        "htmlTooltip": root.findtext("htmlTooltip"),
        "link": root.findtext("link")
    }
    json_str = root.findtext("json")
    if (json_str):
        json_filepath = json_dir.joinpath("{}.json".format(parsedItem['id']))
        if(not json_filepath.exists() or overwrite_json):
            json_filepath.write_text('{' + json_str + '}')

    jsonequip_str = root.findtext("jsonEquip")
    if (jsonequip_str):         
            jsonequip_filepath = jsonequip_dir.joinpath("{}.equip.json".format(parsedItem['id']))
            if(not jsonequip_filepath.exists() or overwrite_json):
                jsonequip_filepath.write_text('{' + jsonequip_str + '}')
    return parsedItem

fail_count = 0
parsed_item_xml = []
with concurrent.futures.ThreadPoolExecutor(max_workers=(os.cpu_count() - 1)) as executor:
    futures = [executor.submit(parse_item_xml, p) for p in item_xml_paths]
    for f in tqdm(iterable=concurrent.futures.as_completed(futures), desc="Parsing Item XML", total=len(futures), leave=False):
        try:
            fres = f.result()
        except:
            fail_count += 1
        else:
            parsed_item_xml.append(fres)
items = pd.DataFrame.from_records(parsed_item_xml, index='id')

In [None]:
def parse_jsonequip(fp):
        with fp.open() as f:
            data = json.load(f)
            data["item_id"] = int(Path(fp.stem).stem)
            return data

jsonequips = []

with concurrent.futures.ThreadPoolExecutor(max_workers=(os.cpu_count() - 1)) as executor:
    futures = []
    for fp in tqdm(iterable=jsonequip_dir.iterdir(), desc="Starting jsonequip parse futures", leave=False):
        futures.append(executor.submit(parse_jsonequip, fp))

    for f in tqdm(iterable=concurrent.futures.as_completed(futures), desc="Parsing item jsonequips", total=len(futures), leave=False):
        df = f.result()
        jsonequips.append(df)

jsonequips = pd.DataFrame.from_records(jsonequips).drop(columns={'appearances','displayid'})
jsonequips.set_index('item_id', inplace=True)
jsonequips

In [None]:
# TODO Convert jsonequips to long format
temp3 = "abc"
print(f'{temp3=}')

In [None]:
def get_weapon_dps(item_id):
        try:
            row = jsonequips.loc[item_id]
        except:
            return pd.NA
        else:
            if pd.notna(row['dps']):
                return row['dps']
            if pd.notna(row['mledps']):
                print("used mledps")
                return row['mledps']
            if pd.notna(row['rgddps']):
                print("used rgddps")
                return row['rgddps']
            # TODO Calculate from damage and speed numbers if all else fails
            return pd.NA

weapons = items[items['class_name'] == 'Weapons'][['subclass_name','name', 'level','quality_id','quality_name']].reset_index()
weapons.rename(columns={'subclass_name':'subclass', 'quality_name':'quality', 'level':'item level', 'id':'item_id'}, inplace=True)

weapons['quality_color'] = pd.Series([(lambda qid: item_qualities_palette[qid])(qid) for qid in weapons['quality_id']])
weapons['dps'] = pd.Series([get_weapon_dps(item_id) for item_id in weapons['item_id']])
weapons = weapons[weapons['dps'].notna()].sort_values('item_id', ascending=True).reset_index().drop(columns='index')
weapons

In [None]:
weapons_by_subclass = {}
for subclass in weapons['subclass'].drop_duplicates().sort_values():
    weapons_by_subclass[subclass] = weapons[weapons['subclass'] == subclass]

In [None]:
sns.set_theme(rc={'figure.figsize':(16,9)})
sns.lineplot(data=weapons_by_subclass['Daggers'], x='item level',y='dps', hue='quality_color', legend=None, sort=True).set_title('Daggers')