# Notes

## Links

- [Extensions in WebDriver](https://www.reddit.com/r/learnpython/comments/4zzn69/how_do_i_get_adblockplus_to_work_with_selenium/)
- [requests docs](https://requests.readthedocs.io/en/latest/)
- [tqdm docs](https://tqdm.github.io/)
- [concurrent.futures docs](https://docs.python.org/dev/library/concurrent.futures.html)

In [None]:
# Imports
import concurrent.futures
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import time
import requests


In [None]:
# Settings
# Base URL of the Wowhead section that is scraped, and the directory that
# receives every generated file.
root_url = "https://www.wowhead.com/wotlk"
output_dir = "output"

# Item-level bounds used when the item-list URLs are built.
min_itemlvl, max_itemlvl = 0, 284

In [None]:
# Setup
# Make sure the output directory exists before anything is written into it.
Path(output_dir).mkdir(exist_ok=True, parents=True)

# Headless Chrome session using the 'normal' page load strategy.
driver_options = webdriver.chrome.options.Options()
driver_options.add_argument("--headless")
driver_options.page_load_strategy = "normal"
driver = webdriver.Chrome(options=driver_options)

# Setup item scraping: open the item overview page in the browser.
items_url = f"{root_url}/items"
driver.get(items_url)

In [None]:
# Scrape item quality and build itemlist links
def process_item_quality_elem(elem):
    """Convert one quality ``<option>`` element into a plain record.

    Args:
        elem: Object exposing ``get_attribute("value")``, ``text`` and
            ``value_of_css_property("color")`` (a Selenium ``WebElement``
            in this notebook).

    Returns:
        dict: ``{"id": int, "name": str, "color": "#rrggbb"}``.

    Raises:
        ValueError: If the ``value`` attribute is not an integer, or the
            computed color is not a functional ``rgb(...)``/``rgba(...)``
            string with at least three integer channels.
    """
    quality_id = int(elem.get_attribute("value"))
    quality_name = elem.text
    css_color = elem.value_of_css_property("color")

    # Computed colors look like "rgba(255, 128, 0, 1)"; only the text between
    # the parentheses is of interest.
    _, opening, remainder = css_color.partition("(")
    channels_text, closing, _ = remainder.partition(")")
    if not opening or not closing:
        raise ValueError(f"unsupported CSS color: {css_color!r}")

    # Accept ", ", "," and whitespace as channel separators so the parser does
    # not depend on one exact serialisation; any alpha channel is ignored.
    channels = channels_text.replace(",", " ").split()[:3]
    if len(channels) < 3:
        raise ValueError(f"unsupported CSS color: {css_color!r}")

    hex_color = "#" + "".join(f"{int(channel):02x}" for channel in channels)
    return {"id": quality_id, "name": quality_name, "color": hex_color}

# Load the item-quality table from its CSV cache, or scrape it from the
# quality filter on the page currently open in the driver and cache it.
item_qualities_csv_path = Path(output_dir) / "item_qualities.csv"
item_qualities_csv_exists = item_qualities_csv_path.exists()

if item_qualities_csv_exists:
    # The cache is written with the quality id as index, so restore it as the
    # index here; otherwise `quality.Index` below would be a row number
    # instead of the quality id.
    item_qualities = pd.read_csv(
        item_qualities_csv_path, sep=';', index_col='id')
else:
    item_quality_elems = driver.find_elements(
        By.CSS_SELECTOR, '#filter-facet-quality > option')
    item_qualities = pd.DataFrame.from_records(
        [process_item_quality_elem(e) for e in item_quality_elems],
        index='id'
    )
    item_qualities.to_csv(item_qualities_csv_path, sep=';')


# One item-list URL per (quality, item level) combination.
itemlist_urls = []
for quality in item_qualities.itertuples():
    # `+ 1` makes max_itemlvl inclusive; a bare range() would never query the
    # highest configured item level.
    for itemlvl in range(min_itemlvl, max_itemlvl + 1):
        itemlist_url = "{u}/min-level:{l:n}/max-level:{l:n}/quality:{q:n}".format(
            u=items_url,
            l=itemlvl,
            q=quality.Index
        )
        itemlist_urls.append(itemlist_url)

In [None]:
# Scrape item lists for item URLs
def process_itemlist(url):
    """Collect the item links from every page of one item-list URL.

    Opens *url* in the module-level ``driver``, reads the item links of the
    page being shown, triggers the list's "next" link and repeats until the
    browser URL is the same as it was on the previous iteration.

    Args:
        url: Item-list URL to open.

    Returns:
        pandas.DataFrame: The per-page frames of link strings, concatenated
        in the order the pages were visited.
    """
    # JavaScript executed in the page: gather the links / press "next".
    js_collect_links = r'return Array.from(document.querySelectorAll("#tab-items > div.listview-scroller-horizontal > div > table > tbody > tr > td:nth-child(2) > div > a")).map(x => x.href)'
    js_click_next = r'next_btn = Array.from(document.querySelectorAll("#tab-items > div.listview-band-top > div.listview-nav > a")).find(x => x.textContent.toLowerCase().startsWith("next")); if(next_btn !== undefined) next_btn.click()'

    driver.get(url)

    pages = []
    last_seen_url = ""
    while True:
        # An unchanged URL means the previous "next" click went nowhere.
        if driver.current_url == last_seen_url:
            break
        links = driver.execute_script(js_collect_links)
        pages.append(pd.DataFrame(links))
        last_seen_url = driver.current_url
        driver.execute_script(js_click_next)

    return pd.concat(pages)


# Load the de-duplicated item URLs from the CSV cache, or scrape every item
# list and write the cache.
item_urls_csv_path = Path(output_dir) / "item_urls.csv"
item_urls_csv_exists = item_urls_csv_path.exists()
if item_urls_csv_exists:
    # Read with the same ';' separator the cache is written with, so a URL
    # containing a comma is not split into several columns.
    item_urls = pd.read_csv(item_urls_csv_path, sep=';', header=None).rename(
        columns={0: "url"})
else:
    item_url_frames = [
        process_itemlist(u)
        for u in tqdm(itemlist_urls,
                      desc="Iterating item lists for item urls", leave=False)
    ]

    item_urls = pd.concat(item_url_frames).drop_duplicates().rename(
        columns={0: "url"})
    item_urls.sort_values(by=['url'], inplace=True, ignore_index=True)
    # Header-less, index-less single column; matches the read branch above.
    item_urls.to_csv(item_urls_csv_path,
                     sep=';', index=False, header=False)

In [None]:
# Scrape itemXML
# Directory that holds the downloaded item XML files; created on demand.
item_xml_dir = Path(f"{output_dir}/itemxml")
item_xml_dir.mkdir(exist_ok=True, parents=True)


def get_item_xml_info(item_url):
    """Derive the XML URL and the local file path for an item page URL.

    Args:
        item_url: Item page URL that contains ``item=<id>`` after the
            ``root_url`` prefix, optionally followed by ``/<slug>``.

    Returns:
        dict: ``'url'`` is *item_url* cut off right after the id with
        ``&xml`` appended; ``'filepath'`` is the absolute ``Path`` of
        ``<id>.xml`` inside ``item_xml_dir``.

    Raises:
        ValueError: If ``item=`` does not occur in *item_url* at or after
            position ``len(root_url)``.
    """
    marker = 'item='
    id_start = item_url.index(marker, len(root_url)) + len(marker)

    # The id normally ends at the '/' that introduces the slug; a URL without
    # a slug simply ends with the id, so fall back to the end of the string
    # instead of raising.
    id_end = item_url.find('/', id_start)
    if id_end == -1:
        id_end = len(item_url)

    item_id_str = item_url[id_start:id_end]
    item_xml_url = item_url[:id_end] + "&xml"
    item_xml_filepath = (item_xml_dir / (item_id_str + '.xml')).absolute()
    return {'url': item_xml_url, 'filepath': item_xml_filepath}


def download_item_xml(item_url, overwrite=False, timeout=30):
    """Download the XML of one item unless its file already exists.

    Args:
        item_url: Item page URL, as accepted by ``get_item_xml_info``.
        overwrite: Download again even when the target file already exists.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled connection would block the calling worker
            thread indefinitely.

    Returns:
        dict: The ``get_item_xml_info`` result (keys ``'url'`` and
        ``'filepath'``).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status reported by ``raise_for_status``.
    """
    item_xml_info = get_item_xml_info(item_url)
    if overwrite or not item_xml_info['filepath'].exists():
        rsp = requests.get(item_xml_info['url'], timeout=timeout)
        rsp.raise_for_status()
        # Written only after the status check, so error pages are never cached.
        item_xml_info['filepath'].write_bytes(rsp.content)
    return item_xml_info

# Download all item XML files concurrently and collect the info dicts of the
# downloads that succeeded.
download_item_xml_results = []

# os.cpu_count() may return None and is 1 on single-core hosts; the executor
# rejects max_workers <= 0, so always keep at least one worker.
max_download_workers = max(1, (os.cpu_count() or 2) - 1)

with concurrent.futures.ThreadPoolExecutor(max_workers=max_download_workers) as executor:
    # Remember which URL each future belongs to so failures can be reported.
    future_to_url = {executor.submit(download_item_xml, u, False): u
                     for u in item_urls['url']}
    futures = list(future_to_url)
    for future in tqdm(iterable=concurrent.futures.as_completed(futures), desc="Downloading Item XML", total=len(futures), leave=False):
        try:
            fres = future.result()
        except Exception as exc:
            # Best effort: report the failing URL and carry on with the rest.
            # `Exception` (not a bare except) lets Ctrl+C still interrupt.
            print(f"download_item_xml failed! {future_to_url[future]}: {exc!r}")
        else:
            download_item_xml_results.append(fres)


In [None]:
# shutdown
# quit() ends the whole WebDriver session and its driver process; close()
# would only close the current browser window and leave the rest running.
driver.quit()