# A Pokedex Entry Webscraper

### Step 1 - Retrieve the HTML

In [428]:
import json
from urllib.request import urlopen, Request

def get_html(url):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the page cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Build the request with an explicit User-Agent header
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    # Use the response as a context manager so it is always closed,
    # even if reading or decoding fails
    with urlopen(request) as page:
        return page.read().decode("utf-8")

### Step 2 - Retrieve the Pokedex Entries

In [429]:
import re

def _clean_entry(entry):
    """Normalise one raw pokedex entry into plain ASCII letters, digits and spaces."""
    # Replace POKéMON with pokemon, then any remaining fancy e's
    entry = entry.replace("POKéMON", "pokemon").replace("é", "e")

    # Turn fully capitalized words (two or more capitals in a row) into
    # Capitalized words instead of removing them
    entry = re.sub(r"[A-Z]{2,}", lambda word: word.group().capitalize(), entry)

    # Replace dashes (em dash and hyphen) with spaces
    entry = re.sub("[—-]", " ", entry)

    # Drop every remaining non-alphanumeric char; spaces are kept as-is
    return re.sub(r"[^a-zA-Z0-9 -]", "", entry)


def get_pokedex_entries(html):
    """Extract the cleaned pokedex entries from a page's HTML.

    Args:
        html: The page source as a string.

    Returns:
        A list with one cleaned string per ``cell-med-text`` table row, in
        page order. The list is empty when the page has no ``<tbody>``
        element or no matching rows.
    """
    # Find the table of pokedex entries. "." does not match newlines here,
    # so the opening and closing tbody tags must sit on the same line.
    table = re.search(r"<tbody>(.*)</tbody>", html, re.IGNORECASE)
    if table is None:
        return []

    # Remove the game name
    rows = re.sub(r"<th>.*?</th>", "", table.group(1))

    # Find all entries
    raw_entries = re.findall(
        r'<tr>\s*<td class="cell-med-text">(.*?)</td>\s*</tr>', rows
    )

    # Return the cleaned entries
    return [_clean_entry(entry) for entry in raw_entries]

### Step 3 - Record the Entries (Saved to a JSON File in Step 5)

In [430]:
def record_entries(pokemon, entries, dictionary=None):
    """Store a pokemon's entries in a dictionary keyed by pokemon name.

    Args:
        pokemon: Name used as the dictionary key.
        entries: Value stored under that key; any existing value for the
            same key is replaced.
        dictionary: Mapping to write into. When omitted, the notebook-level
            ``pokemon_dictionary`` is used, which must already be defined.

    Returns:
        None. The target dictionary is modified in place.
    """
    # Fall back to the notebook's global dictionary when none is given
    if dictionary is None:
        dictionary = pokemon_dictionary

    # Store the entries in the dictionary
    dictionary[pokemon] = entries

### Step 4 - Find the Next Pokemon

In [431]:
def get_next_pokemon(html):
    """Return the name of the next pokemon linked from a page.

    Args:
        html: The page source as a string.

    Returns:
        The part of the "next" link's href that follows ``/pokedex/``, or
        ``None`` when the page contains no such link.
    """
    # Find the next link and capture only the pokemon part of its URL
    link = re.search(
        r'<a rel="next" class="entity-nav-next" href="/pokedex/(.*?)">',
        html,
        re.IGNORECASE,
    )

    # No next link on this page
    if link is None:
        return None

    # Return the pokemon name
    return link.group(1)

### Step 5 - Scrape the Data of all 151 Pokemon

In [432]:
# Base URL shared by every pokedex page
base_url = "https://pokemondb.net/pokedex/"

# Number of pokemon pages to scrape, counted from the first pokemon
POKEMON_COUNT = 151

# Start from the first pokemon
pokemon = "bulbasaur"

# Create a dictionary of entries for each pokemon
pokemon_dictionary = dict()

# For every pokemon to scrape...
for i in range(POKEMON_COUNT):

    # Get the HTML
    html = get_html(base_url + pokemon)

    # Get the pokedex entries
    entries = get_pokedex_entries(html)

    # Store the pokemon's entries in the dictionary
    record_entries(pokemon, entries)

    # Find the next pokemon
    pokemon = get_next_pokemon(html)

    # Stop early if no next pokemon was found, rather than building a bad URL
    if pokemon is None:
        break

# Write the dictionary to the file
with open("pokedex_entries.json", "w", encoding='utf-8') as f:
    json.dump(pokemon_dictionary, f, indent=4, sort_keys=True)

# Report that the process is complete!
print("Done!")

Done!
