This notebook downloads EVE Online market and reference data (mainly from data.everef.net) and prepares it in a form that is convenient for analysis.

In [None]:
import re
import tarfile
import json
import yaml
import pandas as pd
from pathlib import Path
from urllib.parse import urljoin
import bz2
import requests
import pickle
import time
from datetime import datetime

In [None]:
# First year of market history to fetch; the download loop runs from this
# year through the current calendar year.
START_YEAR = 2026
# URL template of the per-year directory listing; formatted with `year`.
BASE_YEAR_URL = "https://data.everef.net/market-history/{year}/"
# Local directory that receives the downloaded files and pickled dataframes.
OUT_DIR = Path("data")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create it up front; no error if it already exists

In [None]:

# Request headers sent with every HTTP call made by this notebook.
UA = {"User-Agent": "everef-market-history-downloader (python requests)"}

# matches: market-history-YYYY-MM-DD.csv.bz2
# NOTE: only the end of the name is anchored (`$`) and the pattern is used
# with `.search`, so a name with extra characters before "market-history-"
# also matches.
FILE_RE = re.compile(r"market-history-\d{4}-\d{2}-\d{2}\.csv\.bz2$")



In [None]:
def list_files(year_url: str) -> list[str]:
    """Fetch a year's directory listing and return the daily archive URLs.

    Every ``href`` on the page whose last path segment matches ``FILE_RE`` is
    resolved against ``year_url``. The result is de-duplicated and sorted.
    Raises whatever ``requests`` raises for a failed request or an HTTP
    error status.
    """
    response = requests.get(year_url, headers=UA, timeout=60)
    response.raise_for_status()

    # Plain regex scrape of href attributes; enough for a simple directory index.
    links = re.findall(r'href="([^"]+)"', response.text)

    # A set drops duplicates; sorting afterwards gives a stable order.
    matching = {
        urljoin(year_url, link)
        for link in links
        if FILE_RE.search(link.split("/")[-1])
    }
    return sorted(matching)

In [None]:
def download(url: str, out_dir: Path, force=False) -> Path:
    """Stream ``url`` into ``out_dir`` and return the local file path.

    The local name is the last ``/``-separated segment of ``url``. An existing
    non-empty file is reused unless ``force`` is true. The body is written to
    a ``.part`` sibling and renamed onto the final path only after the whole
    response has been written. Raises on an HTTP error status.
    """
    filename = url.rsplit("/", 1)[-1]
    out_path = out_dir / filename

    have_copy = out_path.exists() and out_path.stat().st_size > 0
    if have_copy and not force:
        print(f"SKIP  {filename}")
        return out_path

    print(f"GET   {filename}")
    with requests.get(url, headers=UA, stream=True, timeout=120) as r:
        r.raise_for_status()
        tmp_path = out_path.with_suffix(out_path.suffix + ".part")
        with open(tmp_path, "wb") as f:
            # 1 MiB chunks; empty chunks are skipped.
            for chunk in filter(None, r.iter_content(chunk_size=1024 * 1024)):
                f.write(chunk)
        tmp_path.replace(out_path)
    time.sleep(0.5)  # be nice to the server
    return out_path

In [None]:

# Mirror every daily market-history archive from START_YEAR through the
# current year into OUT_DIR. Everything here is best-effort: a failure is
# reported and the loop moves on, so one bad year listing or one bad file
# cannot abort the whole run.
for year in range(START_YEAR, datetime.now().year + 1):
    year_url = BASE_YEAR_URL.format(year=year)
    try:
        urls = list_files(year_url)
    except Exception as e:
        # Presumably most often a listing that is not (yet) available for that year.
        print(f"FAIL  {year_url}  ({e})")
        continue
    for u in urls:
        try:
            download(u, OUT_DIR)
        except Exception as e:
            print(f"FAIL  {u}  ({e})")
    


In [None]:
# Read all bz2 files into a single dataframe, keeping only The Forge rows.
FORGE_REGION_ID = 10000002  # The Forge
HISTORY_COLUMNS = [
    "average",
    "date",
    "highest",
    "lowest",
    "order_count",
    "volume",
    "type_id",
]

dfs = []
# Path.glob yields files in arbitrary order; sorting makes the row order of
# the concatenated frame reproducible (file names embed the date).
for file_path in sorted(OUT_DIR.glob("market-history-*.csv.bz2")):
    with bz2.open(file_path, "rt") as f:
        # Parse only the columns that are used (region_id is needed for the
        # filter) instead of materialising every column of every file.
        temp_df = pd.read_csv(f, usecols=[*HISTORY_COLUMNS, "region_id"])
    # filter for The Forge and drop region_id in one step
    temp_df = temp_df.loc[temp_df["region_id"] == FORGE_REGION_ID, HISTORY_COLUMNS]
    dfs.append(temp_df)

if not dfs:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(
        f"no market-history-*.csv.bz2 files found in {OUT_DIR}; run the download cell first"
    )

df: pd.DataFrame = pd.concat(dfs, ignore_index=True)

df

In [None]:

# Location of the reference-data archive that the following cells read from.
tar_path = Path("data/reference-data-latest.tar.xz")
with tarfile.open(tar_path, "r:xz") as tar:
    # Iterating the TarFile yields its members in archive order; this only
    # prints their names, nothing is written to disk.
    for member in tar:
        print(f"Extracting {member.name}")

In [None]:

# Load types.json from the reference-data archive.
# (The variable names say "market_groups", but the member read is types.json.)
with tarfile.open(tar_path, "r:xz") as tar:
    market_groups = tar.extractfile("types.json")
    # The member is JSON, so use the JSON parser: it is much faster than
    # PyYAML on large documents and parses every JSON number spelling as a
    # number (PyYAML resolves some exponent forms as strings).
    market_groups_data = json.load(market_groups)
    


In [None]:
# One row per type: id, English name, and packaged volume (None when the
# entry has no "packaged_volume" key).
market_groups_df = pd.DataFrame([
    {
        "type_id": type_id,
        "type_name": info["name"]["en"],
        "packaged_volume": info.get("packaged_volume"),
    }
    for type_id, info in market_groups_data.items()
])
market_groups_df

In [None]:

# Persist the type table built above as a pickle in OUT_DIR.
with (OUT_DIR / "types_df.pkl").open("wb") as f:
    pickle.dump(market_groups_df, f)

In [None]:
# Downloaded from:
# https://data.everef.net/reference-data/
tar_path = Path("data/reference-data-latest.tar.xz")


def extract_labels():
    """Build the type and region label tables from the reference-data archive.

    Reads ``types.json`` and ``regions.json`` from ``tar_path``, flattens each
    into a dataframe and writes both as pickles into ``OUT_DIR``.

    Returns:
        ``(types_df, regions_df)``. ``types_df`` has the columns ``type_id``,
        ``type_name`` and ``packaged_volume`` (``None`` for types that do not
        define one); ``regions_df`` has ``region_id`` and ``region_name``.

    Raises:
        KeyError: if a member is missing from the archive, or an entry has no
            English name.
    """
    # unzip only once
    with tarfile.open(tar_path, "r:xz") as tar:
        types_file = tar.extractfile("types.json")
        regions_file = tar.extractfile("regions.json")
        # stations_file = tar.extractfile("universe/stations.json")

        # The members are JSON: json.load is far faster than PyYAML on large
        # documents and parses every JSON number spelling as a number.
        types_data = json.load(types_file)
        regions_data = json.load(regions_file)
        # stations_data = json.load(stations_file)
    types_df = pd.DataFrame([
        {
            "type_id": k,
            "type_name": i["name"]["en"],
            # Not every type has a packaged volume; .get avoids a KeyError
            # and matches the guarded lookup used when building market_groups_df.
            "packaged_volume": i.get("packaged_volume"),
        }
        for k, i in types_data.items()
    ])
    regions_df = pd.DataFrame([
        {"region_id": k, "region_name": i["name"]["en"]}
        for k, i in regions_data.items()
    ])
    # stations_df = pd.DataFrame([
    #     {"station_id": k, "station_name": i["name"]["en"], "region_id": i["region_id"]}
    #     for k, i in stations_data.items()
    # ])
    # save pickles
    with open(OUT_DIR / "types_df.pkl", "wb") as f:
        pickle.dump(types_df, f)
    with open(OUT_DIR / "regions_df.pkl", "wb") as f:
        pickle.dump(regions_df, f)
    # with open(OUT_DIR / "stations_df.pkl", "wb") as f:
    #     pickle.dump(stations_df, f)
    return types_df, regions_df

def get_labels():
    """Return ``(types_df, regions_df)``, preferring the pickle cache.

    When both ``types_df.pkl`` and ``regions_df.pkl`` exist in ``OUT_DIR`` they
    are loaded and returned as-is. Otherwise the tables are rebuilt with
    ``extract_labels()``, which also rewrites both pickles.
    """
    types_pkl = OUT_DIR / "types_df.pkl"
    regions_pkl = OUT_DIR / "regions_df.pkl"

    cache_complete = types_pkl.exists() and regions_pkl.exists()
    if not cache_complete:
        return extract_labels()

    # Only locally written cache files are unpickled here.
    with open(types_pkl, "rb") as f:
        types_df = pickle.load(f)
    with open(regions_pkl, "rb") as f:
        regions_df = pickle.load(f)
    return types_df, regions_df

In [None]:
# Load the label tables: from the pickle cache when both files exist,
# otherwise rebuilt from the reference-data archive.
types_df, regions_df = get_labels()
regions_df

In [None]:
# Display the type label table for inspection.
types_df

In [None]:
# Normalise the join key to int on both sides before merging on it.
df["type_id"] = df["type_id"].astype(int)
types_df["type_id"] = types_df["type_id"].astype(int)

In [None]:
# Attach type_name / packaged_volume to each history row. The left join keeps
# history rows whose type_id has no match in types_df.
df = df.merge(types_df, on="type_id", how="left")
df

In [None]:
# Persist the labelled Forge market history as a pickle in OUT_DIR.
with (OUT_DIR / "forge_market_history.pkl").open("wb") as f:
    pickle.dump(df, f)

In [None]:
# get market orders
# force=True re-downloads even when a non-empty local copy already exists.
url = "https://data.everef.net/market-orders/market-orders-latest.v3.csv.bz2"
out_path = download(url, OUT_DIR, force=True)

In [None]:
# Read market orders bz2 file into a dataframe
# (opened in text mode, so pandas parses the decompressed CSV stream).
with bz2.open(out_path, "rt") as f:
    orders_df = pd.read_csv(f)
orders_df

In [None]:
# Fetch the structures snapshot. Without force=True an existing non-empty
# local copy is reused, so this "latest" file is not refreshed once downloaded.
url = "https://data.everef.net/structures/structures-latest.json"
out_path = download(url, OUT_DIR)


In [None]:
# read structures json file (the first structures snapshot downloaded above)
structures1_df = pd.read_json(out_path)
structures1_df

In [None]:
# Fetch the industry-facilities snapshot; an existing non-empty local copy is
# reused because force is left at its default.
url = "https://data.everef.net/industry-facilities/industry-facilities-latest.json"
out_path = download(url, OUT_DIR)

In [None]:
# read industry facilities json file
indy_df = pd.read_json(out_path)
indy_df


In [None]:
# Fetch the v2 structures snapshot; an existing non-empty local copy is
# reused because force is left at its default.
url = "https://data.everef.net/structures/structures-latest.v2.json"
out_path = download(url, OUT_DIR)


In [None]:
# read structures json file
# orient="index": each top-level key of the JSON object becomes one row label.
structures_df = pd.read_json(out_path, orient="index")
structures_df

In [None]:
# Structures whose name contains "Caldari". Both conditions are combined into
# a single mask built from the full frame; chaining two [] filters would apply
# a full-length mask to an already-filtered frame, which pandas has to
# reindex (and warns about).
structures_df[
    structures_df["name"].notna()
    & structures_df["name"].astype(str).str.contains("Caldari")
]

In [None]:
# Structures flagged both as a market structure and as public. The comparison
# with True is kept explicit so that any value other than True (for example a
# missing value) is excluded.
structures_df[
    structures_df["is_market_structure"].eq(True)
    & structures_df["is_public_structure"].eq(True)
]

In [None]:
# Load station data from a local CSV.
# NOTE(review): nothing visible in this notebook downloads or creates
# data/stations.csv -- it presumably has to be provided beforehand; confirm.
df_stations = pd.read_csv("data/stations.csv")
df_stations

In [None]:
# Fetch the EVE Online static data export (zipped JSONL). The URL names one
# specific export (3171578), and an existing non-empty local copy is reused.
url = "https://developers.eveonline.com/static-data/tranquility/eve-online-static-data-3171578-jsonl.zip"
out_path = download(url, OUT_DIR)
out_path


In [None]:
import zipfile

# Unpack the static-data archive into OUT_DIR, then collect the .jsonl files
# that sit directly in OUT_DIR (the glob does not descend into subfolders).
with zipfile.ZipFile(out_path) as zip_ref:  # read mode is the default
    zip_ref.extractall(path=OUT_DIR)
extracted_files = [*OUT_DIR.glob("*.jsonl")]
extracted_files