In [5]:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
URL = "https://pokemondb.net/pokedex/all"


def get_soup(url, timeout=10):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the kernel
        indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response content parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page.
    page.raise_for_status()
    return BS(page.content, 'html.parser')

soup = get_soup(URL)
# Peek at the first 1000 characters of the pretty-printed markup as a sanity check.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Pokémon Pokédex: list of Pokémon with stats | Pokémon Database
  </title>
  <link href="https://img.pokemondb.net" rel="preconnect"/>
  <link href="https://s.pokemondb.net" rel="preconnect"/>
  <style>
   @font-face{font-family:"Fira Sans";font-style:normal;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400.woff2") format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"Fira Sans";font-style:italic;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400i.woff2") format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"Fira Sans";font-style:normal;font-weight:700;font-display:swap;src:u

# Idea
The main idea is to copy the contents of the table by scraping it. The table is nicely formatted, so it shouldn't be difficult to extract the information.

## Table head

In [19]:
# Collect the text of every <th> in <thead> that carries the "sorting" class.
# NOTE(review): the output of this cell lists no "Type" header even though each
# row has a type cell -- presumably that <th> lacks the "sorting" class; confirm
# this is intended before using `stats` as column names.
table_head = soup.find("thead").find_all("th", class_="sorting")
stats = [header.text for header in table_head]

# Echo the headers one per line, then the list as a whole.
for stat in stats:
    print(stat)

print(stats)

#
Name
Total
HP
Attack
Defense
Sp. Atk
Sp. Def
Speed
['#', 'Name', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']


# Table rows
Now we want to extract all the information for each pokemon (row) in the table.

In [50]:

# Locate the first <tbody> in the page and collect every <tr> inside it.
table_body = soup.find("tbody")
rows = table_body.find_all("tr")
# Show the raw markup of the first row to see how a single entry is laid out.
print(rows[0])


<tr>
<td class="cell-num cell-fixed" data-sort-value="1"><span class="infocard-cell-img"><img alt="Bulbasaur" class="img-fixed icon-pkmn" height="42" loading="lazy" src="https://img.pokemondb.net/sprites/sword-shield/icon/bulbasaur.png" width="56"/></span><span class="infocard-cell-data">0001</span></td> <td class="cell-name"><a class="ent-name" href="/pokedex/bulbasaur" title="View Pokedex for #0001 Bulbasaur">Bulbasaur</a></td><td class="cell-icon"><a class="type-icon type-grass" href="/type/grass">Grass</a><br/> <a class="type-icon type-poison" href="/type/poison">Poison</a></td>
<td class="cell-num cell-total">318</td>
<td class="cell-num">45</td>
<td class="cell-num">49</td>
<td class="cell-num">49</td>
<td class="cell-num">65</td>
<td class="cell-num">65</td>
<td class="cell-num">45</td>
</tr>


## What does the above output tell us? 
Comparing the output with the screenshot below, the first three `<td>` entries correspond to the first three columns (i.e., number, name and type): the first holds the sprite and the Pokédex number, the second the name, and the third the type(s). The rest of the `<td>` entries contain the stat value of the corresponding column.
<br><br>
![title](images/pokedex_first_rows.png)

In [68]:
# Pokédex number of every row: the text of the row's first <td> cell.
numbers = [row.find_all("td")[0].text for row in rows]

# Preview the first 20 numbers.
for number in numbers[:20]:
    print(number)

0001
0002
0003
0003
0004
0005
0006
0006
0006
0007
0008
0009
0009
0010
0011
0012
0013
0014
0015
0015


In [62]:
# Name of every row: the text of the row's second <td> cell.
# NOTE(review): for alternate forms the cell text comes out doubled, e.g.
# "Venusaur Mega Venusaur" (see the output below) -- confirm whether the
# base name should be separated from the form name.
names = [row.find_all("td")[1].text for row in rows]

# Preview the first 20 names.
for name in names[:20]:
    print(name)

Bulbasaur
Ivysaur
Venusaur
Venusaur Mega Venusaur
Charmander
Charmeleon
Charizard
Charizard Mega Charizard X
Charizard Mega Charizard Y
Squirtle
Wartortle
Blastoise
Blastoise Mega Blastoise
Caterpie
Metapod
Butterfree
Weedle
Kakuna
Beedrill
Beedrill Mega Beedrill


In [58]:
# Sanity check: how many names were scraped (one entry is appended per table row).
print(f"Length of 'names': {len(names)}")

Length of 'names': 1194


We have extracted 1194 names, but the original table only seems to contain 1010 Pokémon. What is going on?
<br><br>
Looking closely at the original table, one can see that a Pokémon and its mega evolution still have the same Pokédex number. E.g. both **Venusaur** and **Mega Venusaur** have the number 3. The number of rows in the original table is likely 1194, but the number of unique Pokédex numbers is 1010.

In [67]:
# `names` and `numbers` were built in separate passes over `rows`; make sure
# they still line up before pairing them element by element.
assert len(names) == len(numbers), "names and numbers are out of sync"

# Show the first ten (name, number) pairs side by side. Slicing plus zip avoids
# the IndexError that fixed-range indexing raises when fewer than ten rows exist.
for pokemon_name, dex_number in zip(names[:10], numbers[:10]):
    print(pokemon_name, dex_number)

Bulbasaur 0001
Ivysaur 0002
Venusaur 0003
Venusaur Mega Venusaur 0003
Charmander 0004
Charmeleon 0005
Charizard 0006
Charizard Mega Charizard X 0006
Charizard Mega Charizard Y 0006
Squirtle 0007
