In [2]:
import pandas as pd
import numpy as np
import requests
import bs4
import json
import time
from random import randint
from tqdm import tqdm
import os.path
from os import listdir
from os.path import isfile, join

In [3]:
# Base delay, in seconds, slept before each throttled request
# (a random 0-60 extra seconds is added on top by the fetch helpers)
scrap_time = 120
# Realm choice: 228 --> Outland EU
realm = 228

In [8]:
# Scrape cost trends from https://theunderminejournal.com/#eu/outland/category/alchemy
# First, get all potions / elixirs / flasks from the category list

def create_alchemy_url(house):
    """
    Creates the url of the alchemy category listing for a realm.
    - house: the realm
    """
    url_template = 'https://theunderminejournal.com/api/category.php?house={}&id=alchemy'
    return url_template.format(house)

def create_herbalism_url(house):
    """
    Creates the url of the herbalism category listing for a realm.
    - house: the realm
    """
    url_template = 'https://theunderminejournal.com/api/category.php?house={}&id=herbalism'
    return url_template.format(house)

def get_alchemy_raw_json(house, timeout=30):
    """
    Get the raw JSON text of the alchemy category listing of a realm.
    - house: the realm
    - timeout: value passed to requests.get as its timeout, in seconds

    Sleeps scrap_time plus a random 0-60 seconds before sending the request.
    Returns the response body as text; it is not parsed or validated here.
    """
    url = create_alchemy_url(house)
    # Throttle: fixed base delay plus random jitter before each request
    time.sleep(scrap_time + randint(0, 60))
    # Without a timeout a stalled connection would block the scrape forever
    return requests.get(url, timeout=timeout).text

def get_herbalism_raw_json(house, timeout=30):
    """
    Get the raw JSON text of the herbalism category listing of a realm.
    - house: the realm
    - timeout: value passed to requests.get as its timeout, in seconds

    Returns the response body as text; it is not parsed or validated here.
    """
    url = create_herbalism_url(house)
    # The throttling delay is currently disabled for this endpoint:
    # time.sleep(scrap_time + randint(0, 60))
    # Without a timeout a stalled connection would block the scrape forever
    return requests.get(url, timeout=timeout).text

def get_all_alchemy_recipes(house):
    """
    Gets all alchemy recipes.
    - house: the realm

    Downloads the alchemy category listing and returns one flat list with
    the 'items' of every entry of 'results' whose 'data' has an 'items' key.
    """
    listing = json.loads(get_alchemy_raw_json(house))
    sections = listing['results']
    # Flatten the per-section item lists, skipping sections without items
    return [
        entry
        for section in sections
        if 'items' in section['data'].keys()
        for entry in section['data']['items']
    ]

def get_all_herbalism_recipes(house):
    """
    Gets all herbs.
    - house: the realm

    Downloads the herbalism category listing and returns one flat list with
    the 'items' of every entry of 'results' whose 'data' has an 'items' key.
    """
    listing = json.loads(get_herbalism_raw_json(house))
    herbs = []
    for section in listing['results']:
        # Sections without an 'items' key contribute nothing
        if 'items' not in section['data'].keys():
            continue
        herbs += section['data']['items']
    return herbs

# Second, get the trends for one product
# In the Undermine Journal, an item response has several keys: 'stats', 'history', 'daily', 'monthly', 'auctions', 'globalnow', 'globalmonthly', 'region'
# - stats is the global statistics on the product
# - history contains the price and quantity for every hour of the last 300 hours
# - daily contains nothing
# - monthly contains the price and quantity for each day for the last ... days

def create_product_url(house, item):
    """
    Builds the item API url of one product on one realm.
    - house: the realm
    - item: the id of that product
    """
    endpoint = "https://theunderminejournal.com/api/item.php?house={}&item={}"
    return endpoint.format(house, item)

def get_product_raw_json(house, item, timeout=30):
    """
    Get the raw JSON data for a specific item of a specific realm.
    - house: the realm
    - item: the id of that product
    - timeout: value passed to requests.get as its timeout, in seconds

    Sleeps scrap_time plus a random 0-60 seconds before sending the request.
    Returns the response body as text; it is not parsed or validated here.
    """
    url = create_product_url(house, item)
    # Throttle: fixed base delay plus random jitter before each request
    time.sleep(scrap_time + randint(0, 60))
    # Without a timeout a stalled connection would block the scrape forever
    return requests.get(url, timeout=timeout).text

def get_product_monthly_df(house, item):
    """
    Downloads one item of one realm and converts its 'monthly' data.
    - house: the realm
    - item: the id of that product

    Returns whatever transform_json_to_df produces for the decoded response.
    """
    payload = json.loads(get_product_raw_json(house, item))
    return transform_json_to_df(payload)

def get_df_from_file(file):
    """
    Loads one stored item file and converts its 'monthly' data.
    - file: path of the JSON file to read

    Returns whatever transform_json_to_df produces for the decoded content.
    """
    # The context manager closes the file even if reading or decoding fails
    with open(file, "r") as f:
        decoded = json.load(f)
    # store_all_raw_json runs the response text through json.dump, so the file
    # holds a JSON string that itself contains JSON and needs a second decode.
    # Files that already hold a plain JSON object are accepted as they are.
    if isinstance(decoded, str):
        decoded = json.loads(decoded)
    return transform_json_to_df(decoded)

def transform_json_to_df(parsed_json):
    """
    Builds a DataFrame from the 'monthly' entry of a decoded item response.
    - parsed_json: dict decoded from the item JSON

    'monthly' is accepted in two shapes:
    - a list: its first element is used
    - a dict: the value of its first key is used

    Returns None when 'monthly' is missing, empty, or neither a list nor a dict.
    """
    monthly = parsed_json.get('monthly')
    # The emptiness checks avoid an IndexError on an empty list or dict
    if isinstance(monthly, list) and monthly:
        return pd.DataFrame(monthly[0])
    if isinstance(monthly, dict) and monthly:
        first_key = next(iter(monthly))
        return pd.DataFrame(monthly[first_key])
    return None

# Third, aggregate all trends into one DataFrame

def store_all_raw_json(house):
    """
    Stores the raw json of every herbalism item in its own file.
    - house: the realm

    Each item is written to ./data/item_<id>.json. Items whose file already
    exists are skipped, so a run that stopped part-way can be resumed.
    """
    # Make sure the output directory exists before the first write
    os.makedirs('./data', exist_ok=True)
    recipes = get_all_herbalism_recipes(house)
    for recipe in tqdm(recipes):
        item = recipe['id']
        item_path = './data/item_{}.json'.format(item)
        if os.path.isfile(item_path):
            continue
        # Get raw JSON file and store it
        raw_json = get_product_raw_json(house, item)
        with open(item_path, 'w') as outfile:
            # json.dump on the response text writes a JSON-encoded string;
            # get_df_from_file decodes twice to read it back
            json.dump(raw_json, outfile)


def get_all_products(house):
    """
    Get all products and aggregate them in a DataFrame
    - house: the realm

    Returns a tuple (dataframe, recipes):
    - dataframe: the per-item frames indexed by 'date' and joined side by
      side, each column suffixed with '_<item id>'; an empty DataFrame when
      no item produced any data
    - recipes: the list returned by get_all_alchemy_recipes
    """
    recipes = get_all_alchemy_recipes(house)
    dataframes = []
    for recipe in tqdm(recipes):
        item = recipe['id']
        sub_dataframe = get_product_monthly_df(house, item)
        # Items without usable 'monthly' data come back as None
        if sub_dataframe is None:
            continue
        sub_dataframe = sub_dataframe.set_index('date')
        # Suffix every column with the item id so columns stay unique after the join
        sub_dataframe.columns = ['{}_{}'.format(el, item) for el in sub_dataframe.columns]
        dataframes.append(sub_dataframe)
    # pd.concat raises on an empty list, so return an empty frame instead
    if not dataframes:
        return pd.DataFrame(), recipes
    return pd.concat(dataframes, axis=1), recipes

def merge_items_in_df():
    """
    Merges every stored item file of ./data/TimeSeries in one DataFrame.

    Only regular files ending in '.json' are read, in sorted name order so
    the column order is the same on every run. The item name is the part of
    the file name after the last '_' (extension removed). Each per-item frame
    is indexed by 'date' and its columns are suffixed with '_<item name>'.

    Returns the frames joined side by side, or an empty DataFrame when no
    file produced any data.
    """
    data_dir = './data/TimeSeries'
    extension = '.json'
    # Get all item files from data, ignoring anything that is not a JSON file
    item_files = sorted(
        f for f in listdir(data_dir)
        if f.endswith(extension) and isfile(join(data_dir, f))
    )
    dataframes = []
    for item_file in tqdm(item_files):
        item_name = item_file[:-len(extension)].split('_')[-1]
        sub_df = get_df_from_file(join(data_dir, item_file))
        # Files without usable 'monthly' data come back as None
        if sub_df is None:
            continue
        sub_df = sub_df.set_index('date')
        sub_df.columns = ['{}_{}'.format(el, item_name) for el in sub_df.columns]
        dataframes.append(sub_df)
    # pd.concat raises on an empty list, so return an empty frame instead
    if not dataframes:
        return pd.DataFrame()
    return pd.concat(dataframes, axis=1)


In [9]:
# Build the combined per-item table from the files in ./data/TimeSeries and display it
merge_items_in_df()

100%|██████████| 145/145 [00:01<00:00, 130.23it/s]


Unnamed: 0,silver_109124,quantity_109124,silver_109125,quantity_109125,silver_109126,quantity_109126,silver_109127,quantity_109127,silver_109128,quantity_109128,...,silver_8838,quantity_8838,silver_8839,quantity_8839,silver_8845,quantity_8845,silver_8846,quantity_8846,silver_89639,quantity_89639
2014-12-12,113,6315,158,5592,102,5580,58,7515,113,6056,...,177,3328,695,777,1604,581,385,377,198,454
2014-12-13,135,8463,151,5638,119,6379,97,7380,109,7158,...,175,4593,633,636,1603,597,385,577,185,485
2014-12-14,131,9175,104,7967,120,5697,89,11220,141,8047,...,162,4409,327,981,1548,159,454,757,144,513
2014-12-15,158,8666,86,7126,142,6882,124,9793,105,8880,...,159,2887,324,647,1167,101,371,748,133,70
2014-12-16,122,4770,143,6142,141,6672,103,10155,168,8165,...,152,2919,473,1086,1200,105,368,870,132,249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-08,305,1667,113,2007,267,5583,865,1799,318,2088,...,972,614,3073,259,3331,90,1996,480,5000,41
2020-07-09,299,1709,219,2658,262,5986,295,2399,313,2655,...,900,754,3072,268,2930,218,1896,423,5000,21
2020-07-10,250,2368,210,3180,252,6071,290,1573,300,3767,...,900,768,3071,268,23339,25,1889,427,5000,35
2020-07-11,217,1787,175,417,217,4793,298,2140,210,2416,...,750,577,3070,204,23339,35,1893,296,5000,35


In [5]:
# Scraping entry points, kept commented out (the run recorded below took about 1.5 hours):
# alchemy_df, recipes = get_all_products(realm)
# store_all_raw_json(realm)

100%|██████████| 92/92 [1:28:59<00:00, 58.03s/it]
