In [25]:
import json
import os
import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output
from tqdm import tqdm

In [11]:
# Load the request credentials (headers and cookies) from a local JSON file.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.
with open("auth.json", encoding="utf-8") as f:
    AUTH = json.load(f)

In [12]:
# Request headers and cookies, both read from the loaded auth mapping.
HEADERS, COOKIES = AUTH["HEADERS"], AUTH["COOKIES"]

In [13]:
def get_id(url):
    """Extract the numeric schematic ID from a schematic URL.

    Args:
        url: A string containing ``schematic/<digits>``, for example
            ``https://www.minecraft-schematics.com/schematic/14774/``.

    Returns:
        The ID as an ``int``.

    Raises:
        ValueError: If ``url`` contains no ``schematic/<digits>`` segment.
    """
    match = re.search(r"schematic/(\d+)", url)
    if match is None:
        # re.search returns None on no match; report the offending URL
        # instead of failing with an AttributeError on None.
        raise ValueError(f"No schematic ID found in URL: {url!r}")
    return int(match.group(1))

In [14]:
# Listing page for the most-downloaded schematics.
# Note: `url` is reassigned further down in the notebook.
url = "https://www.minecraft-schematics.com/most-downloaded/"

## Download all schematics from a page

In [15]:
def get_links(url, timeout=30):
    """Collect the URLs behind the primary buttons of one listing page.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs, one for every ``<a class="btn btn-primary">``
        element on the page that carries an ``href`` attribute.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    r = requests.get(url, headers=HEADERS, cookies=COOKIES, timeout=timeout)
    # Fail loudly instead of parsing an error page and returning no links.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Get all the download buttons
    download_buttons = soup.find_all("a", class_="btn btn-primary")

    # Anchors without an href would otherwise end up as ".../None" links.
    hrefs = [button.get("href") for button in download_buttons]

    # urljoin resolves site-relative hrefs against the site root and leaves
    # already-absolute hrefs untouched.
    return [
        urljoin("https://www.minecraft-schematics.com", href)
        for href in hrefs
        if href
    ]

In [21]:
def download_schematic(url, path="schematics", timeout=30):
    """Download the schematic file for a schematic page and save it to disk.

    The file is written to ``<path>/<id>.schematic``, where ``<id>`` is the
    number that ``get_id`` extracts from ``url``.

    Args:
        url: Schematic page URL; a trailing slash is optional.
        path: Directory to store the file in; created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Resolve the ID first so a malformed URL fails before any request is sent.
    schematic_id = get_id(url)

    r = requests.get(
        f"{url.rstrip('/')}/download/action/",
        headers=HEADERS,
        cookies=COOKIES,
        params={"type": "schematic"},
        timeout=timeout,
    )
    # Without this check the body of an HTTP error response would be saved
    # under a .schematic name.
    r.raise_for_status()

    # Create the path if it doesn't exist
    target_dir = Path(path)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Save the file
    (target_dir / f"{schematic_id}.schematic").write_bytes(r.content)
    

## Get metadata for a schematic given the URL

In [None]:
# Example schematic page used to try out get_metadata.
url = 'https://www.minecraft-schematics.com/schematic/14774/'

In [23]:
def get_metadata(url, timeout=30):
    """Scrape the key/value metadata table of a schematic page.

    Only the first seven ``<tr>`` elements of the page are read. Within each
    row, the text of the first ``<td>`` becomes the key and the text of the
    second becomes the value.

    Args:
        url: Schematic page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping field names to their text values.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    r = requests.get(url, timeout=timeout)
    # Do not try to parse an error page as if it were a schematic page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Find all table rows on the page
    rows = soup.find_all("tr")

    # Iterate over them, extracting key-value pairs
    metadata = dict()
    for row in rows[:7]:
        cols = [ele.text.strip() for ele in row.find_all("td")]
        # Rows with fewer than two <td> cells carry no key/value pair;
        # skip them rather than raising IndexError.
        if len(cols) < 2:
            continue
        metadata[cols[0]] = cols[1]

    return metadata

In [None]:
# Show the metadata scraped from the page currently held in `url`.
get_metadata(url)

{'Rating': 'Rated 3.9 from 122 votes',
 'Category': 'Castles',
 'Theme': 'Medieval',
 'Size': 'Huge',
 'Submitted by': 'xKomi',
 'Posted on': 'July 2nd, 2020 09:02 AM EST',
 'Download(s)': 'Has been downloaded 9031 times'}

## Download multiple pages

In [17]:
# Page-count setting read by the download loop below.
NUM_PAGES = 10

In [26]:
# Walk listing pages 1..NUM_PAGES (inclusive) and download every schematic
# into a sub-folder named after its category.
failed = []  # (url, exception) pairs for requests that did not succeed
for page in range(1, NUM_PAGES + 1):
    page_url = f"https://www.minecraft-schematics.com/most-downloaded/{page}/"
    try:
        schematics = get_links(page_url)
    except requests.RequestException as exc:
        # A broken listing page should not abort the remaining pages.
        failed.append((page_url, exc))
        continue
    print(f"Page {page}: {len(schematics)} schematics")
    for schematic_url in tqdm(schematics):
        try:
            metadata = get_metadata(schematic_url)
            # Fall back to a fixed folder when the page has no category, and
            # strip characters that are unsafe in a directory name, since the
            # text comes straight from the scraped page.
            category = metadata.get("Category") or "Unknown"
            category = re.sub(r'[\\/:*?"<>|]+', "_", category)
            download_schematic(schematic_url, path=f"schematics/{category}")
        except requests.RequestException as exc:
            # Record the failure and carry on with the next schematic.
            failed.append((schematic_url, exc))

    clear_output()

# Printed after the final clear_output so the summary stays visible.
if failed:
    print(f"{len(failed)} request(s) failed:")
    for failed_url, exc in failed:
        print(f"  {failed_url}: {exc}")