# Try out notebook for TOME vault web scraper

## Observations
- base url of page list: https://te4.org/characters-vault
- url of page n: https://te4.org/characters-vault?page=n
- links of character pages can be extracted from page list
- talents need to be extracted
- somehow extract talents that need to be unlocked?
- stats too
- gear too?
- stats too?
- inscriptions too?
- Only selected permadeth = 'roguelike': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_permadeath%5B%5D=66#
- only selected difficulty = 'insane': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_difficulty%5B%5D=36#
- only selected race = 'cornac': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_race%5B%5D=8#
- only selected class = 'archmage': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_class%5B%5D=7#
- only selected campagin = 'majeyal': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_campaign%5B%5D=2#
- only selected version = '1.7.6': https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_game%5B%5D=1191241#
- Can't immediately see maximum number of pages
- should probably build in a way to cap the number of characters extracted
- should somehow solve the issue of chinese characters




In [9]:
# Load packages

from bs4 import BeautifulSoup
import requests
import pandas as pd
from collections import OrderedDict

### Extract all character links from a page

In [5]:
## Set up
base_url = "https://te4.org/characters-vault"

req = requests.get(base_url)
print(req)

soup = BeautifulSoup(req.text, 'html.parser')

## One character urls
char_url_html = soup.find("tr", {"class": "even"})
char_url = "https://te4.org/" + char_url_html.find_all("a")[1].get("href")
print(char_url)

## All character urls of page
char_url_html_list = soup.find_all("tr", {"class": "even"}) + soup.find_all("tr", {"class": "odd"})

char_url_list = set()

# Loop over all html
for url_html in char_url_html_list:
    char_url_list.add("https://te4.org/" + url_html.find_all("a")[1].get("href"))
    
print(char_url_list)

# Method that gets the character urls from a page
def get_char_urls_from_page(page_url=None, soup=None):
    
    # Set up BeautifulSoup if isn't given
    if not soup:
        req = requests.get(page_url)
        soup = BeautifulSoup(req.text, 'html.parser')
    
    # Extract the html elements that contain the urls
    char_url_html_list = soup.find_all("tr", {"class": "even"}) + soup.find_all("tr", {"class": "odd"})
    
    # Loop over those elements to get the character page urls
    char_url_list = set()
    for url_html in char_url_html_list:
        char_url_list.add("https://te4.org/" + url_html.find_all("a")[1].get("href"))
        
    # Return set
    return char_url_list

<Response [200]>
https://te4.org//characters/203120/tome/e78922d7-ac95-4724-b1b4-ca7861c0389d
{'https://te4.org//characters/339871/tome/1e566172-ce77-44ef-bdc0-f09e91c0680b', 'https://te4.org//characters/339828/tome/d65a76a0-2277-4af1-b74d-c63c10e92101', 'https://te4.org//characters/339094/tome/b9652958-ce75-4ed9-b504-c9491e70ffd2', 'https://te4.org//characters/307264/tome/e9b59d80-48da-4cb3-9add-34b6726ce1e4', 'https://te4.org//characters/295023/tome/61ec896d-a610-4f2b-95b8-221a2d593b09', 'https://te4.org//characters/260512/tome/76d5854d-a916-4c59-a676-d78027137430', 'https://te4.org//characters/268531/tome/c37b7f81-8cc7-4f02-8920-bc59b10b90c1', 'https://te4.org//characters/324511/tome/9d37b393-1d3c-468c-b46b-0bf2704ec54d', 'https://te4.org//characters/313231/tome/6926a71e-52f5-44a5-9763-b6a1edb9ec99', 'https://te4.org//characters/260673/tome/09b64b7c-22e8-4378-acfe-aa359ce6dd2a', 'https://te4.org//characters/203120/tome/e78922d7-ac95-4724-b1b4-ca7861c0389d', 'https://te4.org//charact

### Extract character urls from multiple pages, pages start at 0

In [6]:
base_url = "https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_game%5B%5D=1191241&page=1"
empty_url = "https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max=&tag_winner=winner&tag_permadeath%5B%5D=66&tag_difficulty%5B%5D=36&tag_race%5B%5D=47&tag_class%5B%5D=104071&tag_game%5B%5D=1191241#"

# Skip if 'no characters available' shows up or url_limit is reached
req = requests.get(empty_url)

soup = BeautifulSoup(req.text, "html.parser")

check = soup.find("tr", {"class":"odd"})
if check.text == 'No characters available. ':
    print('no')
    
def empty_page(page_url=None, soup=None):
    if not soup:
        req = requests.get(page_url)
        soup = BeautifulSoup(req.text, 'html.parser')
        
    check = soup.find("tr", {"class":"odd"})
    if check.text == 'No characters available. ':
        return True
    else:
        return False
    
# Method that returns all the character urls, up to a maximum
def get_all_character_urls(base_url, max_urls = 100):
    print('Extracting character urls...')
    
    # Set up
    character_urls = set()
    page_number = 0
    
    while len(character_urls) <= max_urls:
        
        print(f'Extracting characters from page {page_number}...')
        
        # Make current page url, get soup
        page_url = f"{base_url}&page={page_number}"
        
        req = requests.get(page_url)
        soup = BeautifulSoup(req.text, "html.parser")
        
        # Break if the page is empty
        if empty_page(soup=soup):
            print(f"Page {page_number} is empty. Ending...")
            break
        
        # Get the character urls from the current page
        character_urls = character_urls | get_char_urls_from_page(soup=soup) # Take union of the two sets
        
        # Update the page number
        page_number += 1
        
    return character_urls
    
urls = get_all_character_urls(base_url)


no
Extracting character urls...
Extracting characters from page 0...
Extracting characters from page 1...
Extracting characters from page 2...
Extracting characters from page 3...
Extracting characters from page 4...


### Extracting info from character page

#### Try out

In [254]:
char_url = "https://te4.org/characters/259208/tome/bddd99b8-9594-44d2-a0ba-ec743bafabfa"

req = requests.get(char_url)

soup = BeautifulSoup(req.text, 'html.parser')

### Name of the character (and the creator)
full_name = soup.find("div", {"id": "title-container"}).text

### Info from tables at the top
char_tables = soup.find_all("div", {"class": "charsheet"})

## Character table (Maybe change to dictionary later)
character_table = char_tables[0]
character_table_entries = character_table.find_all("tr")

# Game & Version
game_line = character_table_entries[0]
game_text = game_line.find_all("td")[1].text
game_text_split = game_text.split(' ')

version = list.pop(game_text_split)
game = ' '.join(game_text_split)

# Difficulty and permadeatch
mode_line = character_table_entries[1]
mode_text = mode_line.find_all("td")[1].text
mode_text_split = mode_text.split(' ')
difficulty = mode_text_split[0]
permadeath = mode_text_split[1]
print(difficulty, permadeath)
# Sex
sex_line = character_table_entries[2]
sex = sex_line.find_all("td")[1].text

# Race
race_line = character_table_entries[3]
race = race_line.find_all("td")[1].text

# Class
class_line = character_table_entries[4]
class_ = class_line.find_all("td")[1].text

# Level
level_line = character_table_entries[5]
level = level_line.find_all("td")[1].text

## Stats
stats_table = char_tables[1]
stats_table_entries = stats_table.find_all("tr")

stats = {}
for row in stats_table_entries:
    stat = row.find_all("td")[0].text
    value = row.find_all("td")[1].text
    stats[stat] = value
    
print(stats)
    
## Infusions
infusions_table = char_tables[14]
infusions_html = infusions_table.find_all("td", {"class": "qtip-link"})

infusions = list()
for infusion in infusions_html:
    infusion.find('div').decompose()
    infusions.append(infusion.text)
    
print(infusions)

## Class and Generic Talents
from collections import OrderedDict


        
class_talents_table = char_tables[15]
generic_talents_table = char_tables[16]

class_talents = get_trees(class_talents_table)
generic_talents = get_trees(generic_talents_table)

print(class_talents)
print(generic_talents)

# Prodigies
prodigy_table = char_tables[17]
print(prodigy_table.find("h4"))
entries = prodigy_table.find_all('tr')

prodigies = list()
for line_html in entries:
    line_html.find('div').decompose()
    prodigies.append(line_html.find('li').text)
    
print(prodigies)





Items Vault
{'Strength': '51 (base 9)', 'Dexterity': '46 (base 30)', 'Constitution': '138 (base 60)', 'Magic': '136 (base 62)', 'Willpower': '23 (base 9)', 'Cunning': '82 (base 60)'}
['Infusion: Movement', 'Rune: Mirror Image', 'Infusion: Wild Growth', 'Rune: Dissipation', 'Rune: Shatter Afflictions']
OrderedDict([('Celestial / Circles', OrderedDict([('Circle of Shifting Shadows', 5), ('Circle of Sanctity', 5), ('Circle of Warding', 3), ('Celestial Surge', 4)])), ('Celestial / Star fury', OrderedDict([('Moonlight Ray', 5), ('Shadow Blast', 5), ('Twilight Surge', 5), ('Starfall', 1)])), ('Celestial / Eclipse', OrderedDict([('Blood Red Moon', 5), ('Totality', 4), ('Corona', 1), ('Darkest Light', 5)])), ('Celestial / Sunlight', OrderedDict([('Searing Light', 5), ('Sun Flare', 5), ('Firebeam', 0), ('Sunburst', 0)])), ('Celestial / Twilight', OrderedDict([('Twilight', 1), ('Jumpgate', 5), ('Mind Blast', 1), ('Shadow Simulacrum', 5)]))])
OrderedDict([('Race / Dwarf', OrderedDict([('Resilienc

#### Methods

In [7]:

# Method that gets a dictionary containing the character table titles and their indices
def get_table_dict(tables):
    
    dict = {}
    
    dict['Prodigies'] = 17
    
    for index, table in enumerate(tables):
        try:
            name = table.find('h4').text
        
            if 'Inscriptions' in name.split(' '):
                name = 'Inscriptions'
        
            dict[name] = index
        except:
            pass
        
    return dict


# Method that extracts the information from the generic and class talent tables
def get_trees(table):
    
    try:
        lines_html = table.find_all("tr")
        
        talents = OrderedDict()
        line_num = 0
        extra = 0

        # Loop over all lines
        for line in lines_html:
            
            # Every fifth line stands for a tree
            if line_num % (5 + extra) == 0:
                line_num = 0
                elements = line.find_all('td')
                tree = elements[0].text

                tree_dict = OrderedDict()
                
                if tree == 'Technique / Combat training':
                    extra = 2 
                else: 
                    extra = 0
            
            # The other lines stand for skills in a tree
            else:
                line.find('div').decompose()
                talent = line.find('li').text
                level = line.find_all('td')[-1].text
                level_int = int(level[0])
                tree_dict[talent] = level_int
                talents[tree] = tree_dict

            line_num += 1
            
        return talents
    
    except Exception as e:
        print('Something went wrong extracting the skill tree')
        print(e)

# Method that puts the relevant data of a character in a dictionary
def get_character_dictionary(char_url):
    
    print(f'Beginning to extract {char_url}...')
    
    # Set up BeautifulSoup
    req = requests.get(char_url)
    soup = BeautifulSoup(req.text, 'html.parser')

    ### Name of the character (and the creator)
    full_name = soup.find("div", {"id": "title-container"}).text

    ### Info from tables at the top
    char_tables = soup.find_all("div", {"class": "charsheet"})
    
    tables_dict = get_table_dict(char_tables)

    ## Character table (Maybe change to dictionary later)
    character_index = tables_dict['Character']
    character_table = char_tables[character_index]
    character_table_entries = character_table.find_all("tr")

    # Game & Version
    game_line = character_table_entries[0]
    game_text = game_line.find_all("td")[1].text
    game_text_split = game_text.split(' ')

    version = list.pop(game_text_split)
    game = ' '.join(game_text_split)

    # Difficulty and permadeatch
    mode_line = character_table_entries[3]
    mode_text = mode_line.find_all("td")[1].text
    mode_text_split = mode_text.split(' ')
    difficulty = mode_text_split[0]
    permadeath = mode_text_split[1]

    # Sex
    sex_line = character_table_entries[4]
    sex = sex_line.find_all("td")[1].text

    # Race
    race_line = character_table_entries[5]
    race = race_line.find_all("td")[1].text

    # Class
    class_line = character_table_entries[6]
    class_ = class_line.find_all("td")[1].text

    # Level
    level_line = character_table_entries[7]
    level = level_line.find_all("td")[1].text

    ## Stats
    stats_index = tables_dict['Primary Stats']
    stats_table = char_tables[stats_index]
    stats_table_entries = stats_table.find_all("tr")

    stats = {}
    for row in stats_table_entries:
        stat = row.find_all("td")[0].text
        value = row.find_all("td")[1].text
        stats[stat] = value
        
    ## Inscriptions
    inscriptions_index = tables_dict['Inscriptions']
    inscriptions_table = char_tables[inscriptions_index]
    inscriptions_html = inscriptions_table.find_all("td", {"class": "qtip-link"})

    inscriptions = list()
    for inscription in inscriptions_html:
        inscription.find('div').decompose()
        inscriptions.append(inscription.text)

    ## Class and Generic Talents
    
    class_talents_index = tables_dict["Class Talents"]
    generic_talents_index = tables_dict["Generic Talents"]
    
    class_talents_table = char_tables[class_talents_index]
    generic_talents_table = char_tables[generic_talents_index]
    
    class_talents = get_trees(class_talents_table)
    generic_talents = get_trees(generic_talents_table)

    ## Prodigies
    prodigies = list()
    
    prodigy_index = tables_dict['Prodigies']
    prodigy_table = char_tables[prodigy_index]
    
    if prodigy_table.find("h4").text == "Prodigies":
        
        entries = prodigy_table.find_all('tr')
        
        for line_html in entries:
            line_html.find('div').decompose()
            prodigies.append(line_html.find('li').text)
        
    char_dictionary = {'name': full_name,
                       'race': race,
                       'class': class_,
                       'sex': sex,
                       'level': level,
                       'stats': stats,
                       'inscriptions': inscriptions,
                       'class talents': class_talents,
                       'generic talents': generic_talents,
                       'prodigies': prodigies,
                       'game': game,
                       'version': version,
                       'difficulty': difficulty,
                       'permadeath': permadeath}
    
    return char_dictionary

### Filter using search link

#### Automatically get the race codes

In [None]:
race_codes = {}

base_url = "https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max="

for i in range(1, 100): 
    tag = f'&tag_race%5B%5D={i}#'
    url = base_url + tag
    
    char_link = get_all_character_urls(url, max_urls=1)
    
    if char_link:
        
        try:
            char = get_character_dictionary(list(char_link)[0])
            race_codes[char['race']] = str(i)
        except: 
            print('something went wrong with this character, probably a very janky character') 

#### Automatically get class codes

In [257]:
base_url = "https://te4.org/characters-vault?tag_name=&tag_level_min=&tag_level_max="

class_codes = {}

for i in range(1, 200): 
    tag = f'&tag_class%5B%5D={i}#'
    url = base_url + tag
    
    char_link = get_all_character_urls(url, max_urls=1)
    
    if char_link:
        
        try:
            char = get_character_dictionary(list(char_link)[0])
            class_codes[char['class']] = str(i)
        except: 
            print('something went wrong with this character, probably a very janky character') 

Extracting character urls...
Extracting characters from page 0...
Page 0 is empty. Ending...
Extracting character urls...
Extracting characters from page 0...
Page 0 is empty. Ending...
Extracting character urls...
Extracting characters from page 0...
Page 0 is empty. Ending...
Extracting character urls...
Extracting characters from page 0...
Beginning to extract https://te4.org//characters/33258/tome/41a0f4bc-7165-4f0e-8f78-7a2a284e5b02...
{'Prodigies': 17, 'Character': 0, 'Primary Stats': 1, 'Resources': 2, 'Speed': 3, 'Vision': 4, 'Offense: Mainhand': 5, 'Offense: Spell': 6, 'Offense: Mind': 7, 'Defense: Base': 8, 'Defense: Immunities': 9, 'Inscriptions (3/3)': 10, 'Class Talents': 11, 'Generic Talents': 12, 'Effects': 13, 'Quests': 14, 'Equipment': 15, 'Inventory': 16, 'Achievements': 17, 'Log': 18}
Extracting character urls...
Extracting characters from page 0...
Page 0 is empty. Ending...
Extracting character urls...
Extracting characters from page 0...
Page 0 is empty. Ending...

KeyboardInterrupt: 

#### Filtering

In [10]:
import filter_codes

# Tag order: name, minlevel, maxlevel, dead, winner, permadeath, difficulty, race, class, campagin, version



Beginning to extract https://te4.org/characters/259208/tome/bddd99b8-9594-44d2-a0ba-ec743bafabfa...


In [11]:
test = get_character_dictionary("https://te4.org/characters/259208/tome/bddd99b8-9594-44d2-a0ba-ec743bafabfa")
print(test)

{'name': 'kankan the level 50 Dwarf Anorithil by birdest', 'race': 'Dwarf', 'class': 'Anorithil', 'sex': 'Female', 'level': '50 / 2143%', 'stats': {'Strength': '51 (base 9)', 'Dexterity': '46 (base 30)', 'Constitution': '138 (base 60)', 'Magic': '136 (base 62)', 'Willpower': '23 (base 9)', 'Cunning': '82 (base 60)'}, 'inscriptions': ['Infusion: Movement', 'Rune: Mirror Image', 'Infusion: Wild Growth', 'Rune: Dissipation', 'Rune: Shatter Afflictions'], 'class talents': OrderedDict([('Celestial / Circles', OrderedDict([('Circle of Shifting Shadows', 5), ('Circle of Sanctity', 5), ('Circle of Warding', 3), ('Celestial Surge', 4)])), ('Celestial / Star fury', OrderedDict([('Moonlight Ray', 5), ('Shadow Blast', 5), ('Twilight Surge', 5), ('Starfall', 1)])), ('Celestial / Eclipse', OrderedDict([('Blood Red Moon', 5), ('Totality', 4), ('Corona', 1), ('Darkest Light', 5)])), ('Celestial / Sunlight', OrderedDict([('Searing Light', 5), ('Sun Flare', 5), ('Firebeam', 0), ('Sunburst', 0)])), ('Cel