# Try out notebook for TOME vault web scraper

## Observations
- base url of page list: https://te4.org/characters-vault
- url of page n: https://te4.org/characters-vault?page=n
- links of character pages can be extracted from page list
- talents need to be extracted
- somehow extract talents that need to be unlocked?
- stats too
- gear too?
- stats too?
- inscriptions too?
- Only selected permadeth = 'roguelike': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_permadeath%5B%5D=66#
- only selected difficulty = 'insane': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_difficulty%5B%5D=36#
- only selected race = 'cornac': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_race%5B%5D=8#
- only selected class = 'archmage': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_class%5B%5D=7#
- only selected campagin = 'majeyal': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_campaign%5B%5D=2#
- only selected version = '1.7.6': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_game%5B%5D=1191241#
- Can't immediately see maximum number of pages
- should probably build in a way to cap the number of characters extracted
- should somehow solve the issue of chinese characters




In [8]:
# Load packages

from bs4 import BeautifulSoup
import requests
import pandas as pd

### Extract all character links from a page

In [42]:
## Set up
base_url = "https://te4.org/characters-vault"

req = requests.get(base_url)
print(req)

soup = BeautifulSoup(req.text, 'html.parser')

## One character urls
char_url_html = soup.find("tr", {"class": "even"})
char_url = "https://te4.org/" + char_url_html.find_all("a")[1].get("href")
print(char_url)

## All character urls of page
char_url_html_list = soup.find_all("tr", {"class": "even"}) + soup.find_all("tr", {"class": "odd"})

char_url_list = set()

# Loop over all html
for url_html in char_url_html_list:
    char_url_list.add("https://te4.org/" + url_html.find_all("a")[1].get("href"))
    
print(char_url_list)

# Method that gets the character urls from a page
def get_char_urls_from_page(page_url=None, soup=None):
    
    # Set up BeautifulSoup if isn't given
    if not soup:
        req = requests.get(page_url)
        soup = BeautifulSoup(req.text, 'html.parser')
    
    # Extract the html elements that contain the urls
    char_url_html_list = soup.find_all("tr", {"class": "even"}) + soup.find_all("tr", {"class": "odd"})
    
    # Loop over those elements to get the character page urls
    char_url_list = set()
    for url_html in char_url_html_list:
        char_url_list.add("https://te4.org/" + url_html.find_all("a")[1].get("href"))
        
    # Return set
    return char_url_list

<Response [200]>
https://te4.org//characters/269780/tome/5d9f9f25-c041-4060-bdcc-a0eb502d9881
{'https://te4.org//characters/317906/tome/3f2cc355-746f-45da-84dd-67b08ffc0c4d', 'https://te4.org//characters/222525/tome/e5f79120-f676-42ab-bcfd-811f326342a7', 'https://te4.org//characters/216966/tome/000f60f8-7072-458e-8095-1c9326ca297b', 'https://te4.org//characters/322266/tome/9d75577b-2f56-4cae-aeaa-470c27b09105', 'https://te4.org//characters/280165/tome/27f84315-e0dc-41a0-8aaf-f22411b074cf', 'https://te4.org//characters/223039/tome/4c79df4c-39c0-4988-8f67-63f24f330d1b', 'https://te4.org//characters/269780/tome/5d9f9f25-c041-4060-bdcc-a0eb502d9881', 'https://te4.org//characters/19399/tome/6d611740-7856-4a3c-a13e-220895120ce2', 'https://te4.org//characters/328784/tome/47d4412e-a94c-4a19-b23e-3a7558f6e0fb', 'https://te4.org//characters/259705/tome/5f3a3b6a-280c-44c8-8374-25262b973fa9', 'https://te4.org//characters/339687/tome/c689d228-59e9-4642-9947-3384f844a563', 'https://te4.org//characte

### Extract character urls from multiple pages, pages start at 0

In [47]:
base_url = "https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_game%5B%5D=1191241&page=1"
empty_url = "https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_winner=winner&tag_permadeath%5B%5D=66&tag_difficulty%5B%5D=36&tag_race%5B%5D=47&tag_class%5B%5D=104071&tag_game%5B%5D=1191241#"

# Skip if 'no characters available' shows up or url_limit is reached
req = requests.get(empty_url)

soup = BeautifulSoup(req.text, "html.parser")

check = soup.find("tr", {"class":"odd"})
if check.text == 'No characters available. ':
    print('no')
    
def empty_page(page_url=None, soup=None):
    if not soup:
        req = requests.get(page_url)
        soup = BeautifulSoup(req.text, 'html.parser')
        
    check = soup.find("tr", {"class":"odd"})
    if check.text == 'No characters available. ':
        return True
    else:
        return False
    
# Method that returns all the character urls, up to a maximum
def get_all_character_urls(base_url, max_urls = 100):
    print('Extracting character urls...')
    
    # Set up
    character_urls = set()
    page_number = 0
    
    while len(character_urls) <= max_urls:
        
        print(f'Extracting characters from page {page_number}...')
        
        # Make current page url, get soup
        page_url = f"{base_url}&page={page_number}"
        
        req = requests.get(page_url)
        soup = BeautifulSoup(req.text, "html.parser")
        
        # Break if the page is empty
        if empty_page(soup=soup):
            print(f"Page {page_number} is empty. Ending...")
            break
        
        # Get the character urls from the current page
        character_urls = character_urls | get_char_urls_from_page(soup=soup) # Take union of the two sets
        
        # Update the page number
        page_number += 1
        
    return character_urls
    
urls = get_all_character_urls(base_url)


no
Extracting character urls...
Extracting characters from page 0...
Extracting characters from page 1...
Extracting characters from page 2...
Extracting characters from page 3...
Extracting characters from page 4...


### Extracting info from character page

In [None]:
url = "https://te4.org/characters/259208/tome/bddd99b8-9594-44d2-a0ba-ec743bafabfa"

In [48]:
urls

{'https://te4.org//characters/106853/tome/ca50b1b5-1b82-4c57-8d61-d64ebd5307a5',
 'https://te4.org//characters/121782/tome/7a1c49ff-3f36-4607-b655-fd527f93e16e',
 'https://te4.org//characters/127985/tome/d73ff991-4ea1-4306-99d1-8a17a6956161',
 'https://te4.org//characters/12852/tome/e517a1a4-b16f-408a-b65f-69155fea87cc',
 'https://te4.org//characters/129580/tome/1a708543-7b5b-4903-9f90-7480768c4e4c',
 'https://te4.org//characters/137257/tome/4352333c-2e8f-4106-b65b-d5baa88b2a73',
 'https://te4.org//characters/137257/tome/7a97fbde-413b-41ae-8539-8fc6dbc91439',
 'https://te4.org//characters/14351/tome/041a6946-772b-47c7-8109-a21c8f868eea',
 'https://te4.org//characters/144076/tome/49de2e89-d4c8-4fe2-a399-0e59babfa679',
 'https://te4.org//characters/146199/tome/f5100481-d424-4728-a98c-e153f1ca1094',
 'https://te4.org//characters/168287/tome/74add967-def2-45cf-8301-cf47e6244c18',
 'https://te4.org//characters/168922/tome/6108e97f-0794-499a-ae17-0ed14c1bbfad',
 'https://te4.org//characters/