In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import time
import urllib.robotparser
import os
import random
import pandas as pd
from io import StringIO
import csv
import json

In [2]:
BASE_WIKI_URL = "https://minecraft.wiki"
#SEED_URL = f"{BASE_URL}/Minecraft_Wiki"
SAVE_PATH = "downloaded_pages"

In [7]:
# Help Functions
def write_content(file, section_title, subsection_title, content):
    if section_title:
        file.write(f"{section_title}\n")
    if subsection_title:
        file.write(f"  {subsection_title}\n")
    if content:
        file.write(f"    {' '.join(content)}\n\n")

def write_brew_recipe(cell):
    inputs = []
    outputs = []
    input_span = cell.select_one('.mcui-input')
    if input_span:
        input_items = input_span.select('.invslot-item-image a')
        for item in input_items:
            item_title = item.get('title')
            if item_title:
                inputs.append(item_title)
        input_items = input_span.select('.invslot-item invslot-item-image animated-active a')
        for item in input_items:
            item_title = item.get('title')
            if item_title:
                inputs.append(item_title)
    output_container = cell.select_one('.mcui-output')
    if output_container:
        output_items = output_container.select('.invslot-item-image a')
        for item in output_items:
            item_title = item.get('title')
            if item_title:
                outputs.append(item_title)
    recipe_parts = []
    if inputs:
        recipe_parts.append("input:" + "; ".join(inputs))
    if outputs:
        recipe_parts.append("output:" + "; ".join(outputs))
    return ' | '.join(recipe_parts)

def write_with_sprite(cell):
    img_tags = cell.find_all('img', class_='pixel-image')
    sprite_names = []
    for img_tag in img_tags:
        alt_text = img_tag.get('alt', '')
        sprite_name = alt_text.split(' ')[1]
        sprite_name = sprite_name.replace('.png', '').replace('-', ' ').replace(':','')
        sprite_names.append(sprite_name)
        sprite_names_text = '; '.join(sprite_names)
        additional_text = ' '.join(cell.stripped_strings)
        combined_text = f"{sprite_names_text} {additional_text}".strip()
    return combined_text

def write_smithing_recipe(cell):
    inputs = []
    items = cell.select('.invslot-item-image a')
    if items:
        for item in items:
            item_title = item.get('title')
            if item_title:
                inputs.append(item_title)
    output = inputs[-1] 
    inputs = inputs[:-1]
    recipe_parts = []
    recipe_parts.append("input:" + "; ".join(inputs))
    recipe_parts.append("output:" + output)
    recipe_str = ' | '.join(recipe_parts)
    return recipe_str


def write_crafting_recipe(cell):
    mcui_input = cell.find('span', class_='mcui-input')
    if mcui_input:
        inv_slots = mcui_input.find_all('span', class_='invslot')
        recipe_pattern = []
        for slot in inv_slots:
            item = slot.find('span', class_='invslot-item')
            if item:
                title = item.a.get('title') if item.a else ''
                recipe_pattern.append(title)
            else:
                recipe_pattern.append(None)
        three_by_three_recipe = [recipe_pattern[i:i+3] for i in range(0, len(recipe_pattern), 3)]
        return str(three_by_three_recipe)
    return ""
    

def write_table(table, file):
    rows = table.find_all('tr')
    csv_writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for row in rows:
        row_text = []
        for cell in row.find_all(['td','th']):
            if cell.find('span', class_='mcui mcui-Brewing_Stand pixel-image'):
                cell_text = write_brew_recipe(cell)
            elif cell.find('span', class_='mcui mcui-Smithing_Table pixel-image'):
                cell_text = write_smithing_recipe(cell)
            elif cell.find('span', class_='mcui mcui-Crafting_Table pixel-image'):
                cell_text = write_crafting_recipe(cell)
            elif cell.get_text(separator=" ", strip=True) == '' and cell.find('span', class_='sprite-file'):
                cell_text = write_with_sprite(cell)
            else:
                cell_text = cell.get_text(separator=" ", strip=True)
            row_text.append(cell_text)
        file.write('    ')
        csv_writer.writerow(row_text)
    file.write('\n')


def crawl_page(url, save_path, pagename):
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    with open(save_path + pagename + ".txt", 'w', encoding='utf-8') as file:
        main_div = soup.find('div', {'class': 'mw-parser-output'})
        section_title = "Overview"
        subsection_title = ""
        content = []
        
        for element in main_div.find_all(['h2', 'h3', 'dl', 'p', 'table'], recursive=False):
            # print(f'Processing {element.name} tag')
            if element.name == 'h2':
                # Write previous section
                write_content(file, section_title, subsection_title, content)
                content = []
                section_title = element.text.strip().replace('[edit | edit source]', '')
                if section_title == "History" or section_title == "Sounds":
                    break
                subsection_title = ""
            elif element.name == 'h3':
                # Write previous subsection + content
                write_content(file, section_title, subsection_title, content)
                content = []
                section_title = ""
                subsection_title = element.text.strip().replace('[edit | edit source]', '')
            elif element.name == 'p' or element.name == 'dl':
                content.append(element.text.strip())
            elif element.name == 'table':
                write_content(file, section_title, subsection_title, content)
                write_table(element, file)
                section_title = ""
                subsection_title = ""
                content = []

        if content:
            write_content(file, section_title, subsection_title, content)

In [6]:
# Test revoming Duplicates
'''
test_url = "https://minecraft.wiki/w/Blocks"
response = requests.get(test_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Try to get the canonical URL
canonical_tag = soup.find("link", rel="canonical")
if canonical_tag and canonical_tag.has_attr("href"):
    canonical_url = canonical_tag["href"]
else:
    canonical_url = response.url

print(canonical_url)
'''
def getRealURL(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Try to get the canonical URL
    canonical_tag = soup.find("link", rel="canonical")
    if canonical_tag and canonical_tag.has_attr("href"):
        canonical_url = canonical_tag["href"]
    else:
        canonical_url = response.url
    return canonical_url
print(getRealURL("https://minecraft.wiki/w/Blocks"))

https://minecraft.wiki/w/Block


In [4]:
# not working
rp = urllib.robotparser.RobotFileParser()
rp.set_url(urljoin(BASE_WIKI_URL, "/robots.txt"))
# print(urljoin(BASE_URL, "robots.txt"))
rp.read()

In [7]:
# pass
disallowed_keywords = ['File:', 'Special:', 'Property', 'User:', 'Minecraft_Wiki', 'Help:', 'Minecraft_Legends', 'Minecraft_Dungeons', '=', 'Minecraft_Story_Mode', 'Story_Mode',':','edition']
def can_fetch(url):
    return not any(keyword in url for keyword in disallowed_keywords)

In [5]:
# pass
print(can_fetch("https://minecraft.wiki/w/Trading"))
print(can_fetch("https://minecraft.wiki/w/Special:UserLogin?returnto=Player"))
print(can_fetch("https://minecraft.wiki/*?title=Property%3A"))

False
False
False


In [41]:
# Crawing for Gameplay Trading
BASE_URL = "https://minecraft.wiki/w/Trading"
SAVE_PATH = "downloaded_pages/Gameplay/Trading/"
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)
to_visit = {BASE_URL}

response = requests.get(BASE_URL)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

with open(SAVE_PATH+"Trading.txt", 'w', encoding='utf-8') as file:
    main_div = soup.find('div', {'class': 'mw-parser-output'})
    section_title = "Overview"
    subsection_title = ""
    content = []

    def write_content():
        if section_title:
            file.write(f"{section_title}\n")
        if subsection_title:
            file.write(f"  {subsection_title}\n")
        if content:
            file.write(f"    {' '.join(content)}\n\n")

    def write_table(table):
        df = pd.read_html(str(table))[0]
        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False, header=True)
        table_str = csv_buffer.getvalue()
        indented_table_str = '    ' + table_str.replace('\n', '\n    ')
        file.write("{}\n\n".format(indented_table_str))
    
    for element in main_div.find_all(['h2', 'h3', 'p', 'table'], recursive=False):
        # print(f'Processing {element.name} tag')
        if element.name == 'h2':
            # Write previous section
            write_content()
            content = []
            section_title = element.text.strip().replace('[edit | edit source]', '')
            subsection_title = ""
        elif element.name == 'h3':
            # Write previous subsection content
            write_content()
            content = []
            section_title = ""
            subsection_title = element.text.strip().replace('[edit | edit source]', '')
        elif element.name == 'p':
            content.append(element.text.strip())
        elif element.name == 'table':
            write_content()
            write_table(element)
            section_title = ""
            subsection_title = ""
            content = []

    # Write last section
    if content:
        write_content()

In [59]:
# Crawing for Gameplay Brewing
BASE_URL = "https://minecraft.wiki/w/Brewing"
SAVE_PATH = "downloaded_pages/Gameplay/Brewing/"
PAGE_NAME = "Brewing"
crawl_page(BASE_URL, SAVE_PATH, PAGE_NAME)


In [62]:
# Crawing for Gameplay Enchanting
BASE_URL = "https://minecraft.wiki/w/Enchanting"
SAVE_PATH = "downloaded_pages/Gameplay/Enchanting/"
PAGE_NAME = "Enchanting"
crawl_page(BASE_URL, SAVE_PATH, PAGE_NAME)

response = requests.get(BASE_URL)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'data-description': 'Summary of enchantments'})
links_to_crawl = []
file_names = []
for tr in table.find_all('tr')[1:]:
    first_link = tr.find('a')
    if first_link and first_link.has_attr('href'):
        full_url = requests.compat.urljoin(BASE_WIKI_URL, first_link['href'])
        links_to_crawl.append(full_url)
        file_names.append(first_link['title'])
for i in range(len(links_to_crawl)):
    crawl_page(links_to_crawl[i], SAVE_PATH, file_names[i])

In [4]:
def crawl_recipe(url, save_path, pagename):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    table_body = soup.find('tbody')
    recipes = []
    rows = table_body.find_all('tr')[1:]
    for tr in rows:
        columns = tr.find_all(['th', 'td'])
        names = [a.get_text(strip=True) for a in columns[0].find_all('a')]
        ingredients = [a.get_text(strip=True) for a in columns[1].find_all('a')]
        mcui_input = columns[2].find('span', class_='mcui-input')
        if mcui_input:
            inv_slots = mcui_input.find_all('span', class_='invslot')
            recipe_pattern = []
            for slot in inv_slots:
                item = slot.find('span', class_='invslot-item')
                if item:
                    title = item.a.get('title') if item.a else ''
                    recipe_pattern.append(title)
                else:
                    recipe_pattern.append(None)
            three_by_three_recipe = [recipe_pattern[i:i+3] for i in range(0, len(recipe_pattern), 3)]
        # print(three_by_three_recipe)
        description = columns[3].get_text(strip=True) if len(columns) > 3 else ""

        recipe = {
                'names': names,
                'ingredients': ingredients,
                'sample pattern': three_by_three_recipe,
                'description': description
            }
        recipes.append(recipe)
    with open(os.path.join(save_path, f"{pagename}.json"), 'w', encoding='utf-8') as f:
        json.dump(recipes, f, ensure_ascii=False, indent=4)
        

In [8]:
# Crawing for Crafting
BASE_URL = "https://minecraft.wiki/w/Crafting"
SAVE_PATH = "downloaded_pages/Gameplay/Crafting/"
PAGE_NAME = "Crafating"

# Crawing the main page
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

response = requests.get(BASE_URL)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

with open(SAVE_PATH + PAGE_NAME + ".txt", 'w', encoding='utf-8') as file:
    main_div = soup.find('div', {'class': 'mw-parser-output'})
    section_title = "Overview"
    subsection_title = ""
    content = []
    
    for element in main_div.find_all(['h2', 'h3', 'p', 'table'], recursive=False):
        # print(f'Processing {element.name} tag')
        if element.name == 'h2':
            # Write previous section
            write_content(file, section_title, subsection_title, content)
            content = []
            section_title = element.text.strip().replace('[edit | edit source]', '')
            if section_title == "Complete recipe list":
                break
            subsection_title = ""
        elif element.name == 'h3':
            # Write previous subsection + content
            write_content(file, section_title, subsection_title, content)
            content = []
            section_title = ""
            subsection_title = element.text.strip().replace('[edit | edit source]', '')
        elif element.name == 'p':
            content.append(element.text.strip())
        elif element.name == 'table':
            write_content(file, section_title, subsection_title, content)
            write_table(element, file)
            section_title = ""
            subsection_title = ""
            content = []

    if content:
        write_content(file, section_title, subsection_title, content)



# Crawing the recipes to json files
response = requests.get(BASE_URL)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

sections = soup.select('div.load-page')
titles = []
urls = []
for section in sections:
    headline = section.find(class_='mw-headline')
    if headline:
        curr_section = headline.text.strip().replace(' ', "_")
        titles.append(curr_section)
        # print(curr_section)
        temp = section.find(class_='hatnote searchaux')
        first_link = temp.find('a')
        if first_link and first_link.has_attr('href'):
            full_url = requests.compat.urljoin(BASE_WIKI_URL, first_link['href'])
            urls.append(full_url)
            # print(full_url)
print(urls)

for i in range(len(titles)):
    crawl_recipe(urls[i], SAVE_PATH, titles[i])

['https://minecraft.wiki/w/Crafting/Building_blocks', 'https://minecraft.wiki/w/Crafting/Decoration_blocks', 'https://minecraft.wiki/w/Crafting/Redstone', 'https://minecraft.wiki/w/Crafting/Transportation', 'https://minecraft.wiki/w/Crafting/Foodstuffs', 'https://minecraft.wiki/w/Crafting/Tools', 'https://minecraft.wiki/w/Crafting/Combat', 'https://minecraft.wiki/w/Crafting/Brewing', 'https://minecraft.wiki/w/Crafting/Materials', 'https://minecraft.wiki/w/Crafting/Miscellaneous']


In [6]:
# Crawing for Smelting
# Commenting out since Smelting needed manual cleaning
'''
BASE_URL = "https://minecraft.wiki/w/Smelting"
SAVE_PATH = "downloaded_pages/Gameplay/Smelting/"
PAGE_NAME = "Smelting"
crawl_page(BASE_URL, SAVE_PATH, PAGE_NAME)
'''

In [19]:
# Crawing for Smithing
BASE_URL = "https://minecraft.wiki/w/Smithing"
SAVE_PATH = "downloaded_pages/Gameplay/Smithing/"
PAGE_NAME = "Smithing"
crawl_page(BASE_URL, SAVE_PATH, PAGE_NAME)

In [20]:
# Crawing for Archaeology
BASE_URL = "https://minecraft.wiki/w/Archaeology"
SAVE_PATH = "downloaded_pages/Gameplay/Archaeology/"
PAGE_NAME = "Archaeology"

def crawl_page(url, save_path, pagename):
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    with open(save_path + pagename + ".txt", 'w', encoding='utf-8') as file:
        main_div = soup.find('div', {'class': 'mw-parser-output'})
        section_title = "Overview"
        subsection_title = ""
        content = []
        
        for element in main_div.find_all(['h2', 'h3', 'ul', 'dl','p', 'table'], recursive=False):
            # print(f'Processing {element.name} tag')
            if element.name == 'h2':
                # Write previous section
                write_content(file, section_title, subsection_title, content)
                content = []
                section_title = element.text.strip().replace('[edit | edit source]', '')
                if section_title == "History" or section_title == "Sounds":
                    break
                subsection_title = ""
            elif element.name == 'h3':
                # Write previous subsection + content
                write_content(file, section_title, subsection_title, content)
                content = []
                section_title = ""
                subsection_title = element.text.strip().replace('[edit | edit source]', '')
            elif element.name == 'p' or element.name == 'ul' or element.name == 'dl':
                content.append(element.text.strip())
            elif element.name == 'table':
                write_content(file, section_title, subsection_title, content)
                write_table(element, file)
                section_title = ""
                subsection_title = ""
                content = []

        if content:
            write_content(file, section_title, subsection_title, content)
crawl_page(BASE_URL, SAVE_PATH, PAGE_NAME)

In [24]:
# Crawing for Redstone circuits
BASE_URL = "https://minecraft.wiki/w/Redstone_circuits"
SAVE_PATH = "downloaded_pages/Gameplay/Redstone_circuits/"
PAGE_NAME = "Redstone_circuits"
crawl_page(BASE_URL, SAVE_PATH, PAGE_NAME)

In [4]:
# Crawing for Effect
BASE_URL = "https://minecraft.wiki/w/Effect"
SAVE_PATH = "downloaded_pages/Effect/"
PAGE_NAME = "Effect"
crawl_page(BASE_URL, SAVE_PATH, PAGE_NAME)

response = requests.get(BASE_URL)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'data-description': 'Effects'})
links_to_crawl = []
file_names = []
for tr in table.find_all('tr')[1:]:
    first_link = tr.find('a')
    if first_link and first_link.has_attr('href'):
        full_url = requests.compat.urljoin(BASE_WIKI_URL, first_link['href'])
        links_to_crawl.append(full_url)
        file_names.append(first_link['title'])
for i in range(len(links_to_crawl)):
    crawl_page(links_to_crawl[i], SAVE_PATH, file_names[i])

In [8]:
# Crawing for Blocks
BASE_URL = "https://minecraft.wiki/w/Block"
SAVE_PATH = "downloaded_pages/Block/"
PAGE_NAME = "Block"

if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

response = requests.get(BASE_URL)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

with open(SAVE_PATH + PAGE_NAME + ".txt", 'w', encoding='utf-8') as file:
    main_div = soup.find('div', {'class': 'mw-parser-output'})
    section_title = "Overview"
    subsection_title = ""
    content = []
    
    for element in main_div.find_all(['h2', 'h3', 'p', 'table'], recursive=False):
        # print(f'Processing {element.name} tag')
        if element.name == 'h2':
            # Write previous section
            write_content(file, section_title, subsection_title, content)
            content = []
            section_title = element.text.strip().replace('[edit | edit source]', '')
            if section_title == "List of blocks":
                break
            subsection_title = ""
        elif element.name == 'h3':
            # Write previous subsection + content
            write_content(file, section_title, subsection_title, content)
            content = []
            section_title = ""
            subsection_title = element.text.strip().replace('[edit | edit source]', '')
        elif element.name == 'p':
            content.append(element.text.strip())
        elif element.name == 'table':
            write_content(file, section_title, subsection_title, content)
            write_table(element, file)
            section_title = ""
            subsection_title = ""
            content = []

    if content:
        write_content(file, section_title, subsection_title, content)

divs = soup.find_all('div', class_='div-col columns column-width')[:2]
names = []
to_crawl = set()
for li in divs[0].find_all('li'):
    a_tags = li.find_all('a')
    if len(a_tags) > 1:
        name = a_tags[1].get_text(strip=True)
        names.append(name)
        full_url = requests.compat.urljoin(BASE_WIKI_URL, a_tags[1]['href'])
        if "mw-redirect" in a_tags[1].get('class', []):
            full_url = getRealURL(full_url)
        to_crawl.add(realURL)
with open('Normal_Blocks.json', 'w', encoding='utf-8') as file:
    json.dump(names, file, indent=4, ensure_ascii=False)
SAVE_PATH = "downloaded_pages/Block/Normal_Blocks/"
for link in to_crawl:
    file_name = link.split('/')[-1]
    crawl_page(link, SAVE_PATH, file_name)
    time.sleep(random(5))

names = []
to_crawl = set()
for li in divs[1].find_all('li'):
    a_tags = li.find_all('a')
    if len(a_tags) > 1:
        name = a_tags[1].get_text(strip=True)
        names.append(name)
        # realURL = a_tags[1]['href']
        full_url = requests.compat.urljoin(BASE_WIKI_URL, a_tags[1]['href'])
        if "mw-redirect" in a_tags[1].get('class', []):
            full_url = getRealURL(full_url)
        to_crawl.add(realURL)
with open('Technical_Blocks.json', 'w', encoding='utf-8') as file:
    json.dump(names, file, indent=4, ensure_ascii=False)
SAVE_PATH = "downloaded_pages/Block/Technical_Blocks/"
for link in to_crawl:
    file_name = link.split('/')[-1]
    crawl_page(link, SAVE_PATH, file_name)
    time.sleep(random(5))

MissingSchema: Invalid URL '/w/Acacia_Button': No scheme supplied. Perhaps you meant http:///w/Acacia_Button?

In [281]:
'''
while to_visit:
    current_url = to_visit.pop()

    if current_url in visited_urls or not can_fetch(current_url):
        continue

    response = requests.get(current_url)

    soup = BeautifulSoup(response.content, 'html.parser')
    canonical_tag = soup.find("link", rel="canonical")

    if canonical_tag["href"]:
        # Check Redirect
        if not canonical_tag["href"] == current_url:
            visited_urls.add(current_url)
            current_url = canonical_tag["href"]
            if current_url in visited_urls:
                continue

    
    page_name = urlparse(current_url).path.split('/')[-1] or 'index'
    page_name = page_name.replace(':', '_').replace('%', '_').replace('?', '_') + ".html"
    with open(os.path.join(SAVE_PATH, page_name), 'w', encoding="utf-8") as f:
        f.write(response.text)

    # Extract links
    for link in soup.find_all('a', href=True):
            full_url = link['href'] if 'http' in link['href'] else BASE_URL + link['href']
            full_url = sanitize_url(full_url)
            if not can_fetch(full_url):
                continue
            if full_url not in visited_urls and BASE_WIKI_URL in full_url:
                to_visit.add(full_url)

    visited_urls.add(current_url)

    time.sleep(random.uniform(1, 4))

print("Crawling and saving finished!")
'''

Crawling and saving finished!


In [282]:
print('https://minecraft.wiki/w/Acacia_Button' in to_visit)
print(len(to_visit))
print(len(visited_urls))

False
0
3466
