In [38]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
import os.path
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt

In [44]:
def get_all_recipes(data_dir='./data/TimeSeries'):
    """
    Collect the recipes of every item that has a file in ``data_dir``.

    The item id is read from each file name: the last five characters are
    dropped and the text after the final underscore is kept. Each id is then
    loaded with ``load_item`` and parsed with ``get_reagents_from_xml``.

    Parameters
    ----------
    data_dir : str, optional
        Directory whose regular files name the items to process.
        Defaults to ``'./data/TimeSeries'``.

    Returns
    -------
    dict
        Maps each item id (str) to the ``(name, reagent_lists)`` tuple
        produced by ``get_reagents_from_xml``.
    """
    item_files = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]
    # NOTE(review): [:-5] assumes a five-character suffix (presumably ".json");
    # confirm against the actual file names in data_dir.
    ids = [item_file[:-5].split('_')[-1] for item_file in item_files]
    all_recipes = {}
    for item_id in tqdm(ids):
        raw_xml = load_item(item_id)
        # Merge in place: rebuilding the whole dict on every iteration made
        # the loop quadratic in the number of items.
        all_recipes.update(get_reagents_from_xml(raw_xml, item_id))
    return all_recipes

def get_reagents_from_xml(raw_xml, item_id):
    """
    Get all reagents for all recipes of one item.

    Parameters
    ----------
    raw_xml : str
        Item document, as returned by ``load_item``.
    item_id : hashable
        Key under which the result is stored in the returned dict.

    Returns
    -------
    dict
        ``{item_id: (name, reagent_lists)}``.

        ``name`` is the text of the first ``<b>`` element found with one of
        the ``q*`` classes tried below, or ``None`` when the document has no
        such element (previously this case raised ``AttributeError``).

        ``reagent_lists`` has one entry per ``<spell>`` inside ``<createdby>``,
        each entry being the list of that spell's ``<reagent>`` tags. It is an
        empty list when the document has no ``<createdby>`` element.
    """
    soup = BeautifulSoup(raw_xml, "lxml")
    # Get item name. q1, q2, q3 are tried first, in the original order, so
    # documents that already parsed keep the same name. The remaining classes
    # are presumably the other item-quality markers used in the tooltip
    # markup -- TODO confirm the exact set.
    name_classes = ('q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q0')
    name_tag = None
    for name_class in name_classes:
        name_tag = soup.find('b', class_=name_class)
        if name_tag is not None:
            break
    name = name_tag.text if name_tag is not None else None
    # Get each recipe (spell)
    createdby = soup.find('createdby')
    if createdby is None:
        return {item_id: (name, [])}
    spells = createdby.find_all('spell')
    # Get each reagent in each recipe
    reagent_lists = [spell.find_all('reagent') for spell in spells]
    return {item_id: (name, reagent_lists)}

def load_item(item_id, delay=30, timeout=60):
    """
    Load an item from local or from WowHead API.

    If ``./data/item_<item_id>.xml`` exists, its contents are returned and no
    request is made. Otherwise the item is downloaded, written to that path
    and returned.

    Parameters
    ----------
    item_id : str or int
        Item identifier, interpolated into the cache file name and the URL.
    delay : float, optional
        Seconds to sleep before each download. Defaults to 30, the value that
        was previously hard-coded.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` so a stalled connection
        cannot hang the notebook indefinitely.

    Returns
    -------
    str
        The item document text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written to the
        cache in that case, so an error page is never stored as item XML.
    """
    file_name = './data/item_{}.xml'.format(item_id)
    url = 'https://www.wowhead.com/item={}&xml'.format(item_id)
    if os.path.isfile(file_name):
        # Context manager closes the file even if read() raises.
        with open(file_name, "r") as infile:
            return infile.read()

    print('Downloading item {}'.format(item_id))
    sleep(delay)
    response = requests.get(url, timeout=timeout)
    # Fail before caching: a stored error page would be served as this item's
    # XML on every later run.
    response.raise_for_status()
    raw_xml = response.text
    # Make sure the cache directory exists so the download is not lost.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'w') as outfile:
        outfile.write(raw_xml)
    return raw_xml

In [46]:
# Build the id -> (name, reagent lists) table for every item listed under
# ./data/TimeSeries. Items without a local ./data/item_<id>.xml copy are
# downloaded, with a pause before each request, so a first run can be slow.
all_recipes = get_all_recipes()

46%|████▌     | 66/145 [00:00<00:00, 655.39it/s]Downloading item 163223
 46%|████▌     | 67/145 [00:30<11:44,  9.03s/it]Downloading item 163224
 47%|████▋     | 68/145 [01:01<20:07, 15.69s/it]Downloading item 163225
 48%|████▊     | 69/145 [01:32<25:38, 20.24s/it]Downloading item 165721
 48%|████▊     | 70/145 [02:03<29:16, 23.42s/it]Downloading item 165744
 49%|████▉     | 71/145 [02:34<31:43, 25.73s/it]Downloading item 166270
 50%|████▉     | 72/145 [03:05<33:10, 27.27s/it]Downloading item 168487
 50%|█████     | 73/145 [03:35<33:59, 28.33s/it]Downloading item 168489
 51%|█████     | 74/145 [04:06<34:24, 29.08s/it]Downloading item 168498
 52%|█████▏    | 75/145 [04:37<34:37, 29.68s/it]Downloading item 168499
 52%|█████▏    | 76/145 [05:08<34:31, 30.02s/it]Downloading item 168500
 53%|█████▎    | 77/145 [05:39<34:20, 30.30s/it]Downloading item 168501
 54%|█████▍    | 78/145 [06:10<34:00, 30.46s/it]Downloading item 168506
 54%|█████▍    | 79/145 [06:41<33:36, 30.55s/it]Downloading item

In [26]:
import networkx as nx

def create_graph(reagent_dict):
    """
    Build an undirected graph linking each item to the reagents of its recipes.

    Parameters
    ----------
    reagent_dict : dict
        Maps an item id to a sequence whose first element is the item name
        and whose second element is a list of reagent lists (one per recipe).
        Each reagent must support ``reagent['id']``.

    Returns
    -------
    networkx.Graph
        One node per key of ``reagent_dict``, carrying a ``name`` attribute,
        plus an edge from each item to every reagent id in its recipes.
        Reagent ids that are not keys of ``reagent_dict`` become nodes
        without a ``name`` attribute.
    """
    graph = nx.Graph()

    # First pass: register every known item so it carries its name even when
    # it takes part in no recipe.
    for item_id, entry in reagent_dict.items():
        graph.add_node(item_id, name=entry[0])

    # Second pass: connect each item to every reagent of every recipe.
    item_reagent_pairs = (
        (item_id, reagent['id'])
        for item_id, entry in reagent_dict.items()
        for recipe in entry[1]
        for reagent in recipe
    )
    for item_id, component_id in item_reagent_pairs:
        graph.add_edge(item_id, component_id)

    return graph

In [35]:
# Item/reagent graph built from the recipe table.
G = create_graph(all_recipes)
# Lookup from item id to item name (first element of each recipe entry).
name_dict = {item_id: entry[0] for item_id, entry in all_recipes.items()}

In [47]:
# NOTE(review): looks like a leftover check that the kernel is responsive;
# it has no effect on the analysis -- confirm and consider removing.
print('ok')

ok
