In [5]:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
# Page that lists every Pokémon together with its stats in a single HTML table.
URL = "https://pokemondb.net/pokedex/all"

# Get soup of the page from the URL

def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on an unresponsive host.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        For connection problems and timeouts.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing an error page.
    page.raise_for_status()
    return BS(page.content, 'html.parser')

# Fetch the page once and preview the first 1000 characters of the pretty-printed HTML.
soup = get_soup(URL)
html_preview = soup.prettify()[:1000]
print(html_preview)

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Pokémon Pokédex: list of Pokémon with stats | Pokémon Database
  </title>
  <link href="https://img.pokemondb.net" rel="preconnect"/>
  <link href="https://s.pokemondb.net" rel="preconnect"/>
  <style>
   @font-face{font-family:"Fira Sans";font-style:normal;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400.woff2") format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"Fira Sans";font-style:italic;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400i.woff2") format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"Fira Sans";font-style:normal;font-weight:700;font-display:swap;src:u

# Idea
The main idea is to copy the contents of the table by scraping the page. The table and its information are nicely formatted, so it shouldn't be difficult to extract the data.

## Table head

In [94]:
# The column titles live in the first <thead> element of the page.
table_head = soup.find("thead")
table_head

<thead>
<tr>
<th class="sorting" data-sort="int"><div class="sortwrap">#</div></th> <th class="sorting" data-sort="string"><div class="sortwrap">Name</div></th> <th><div class="sortwrap">Type</div></th> <th class="sorting" data-sort="int"><div class="sortwrap">Total</div></th> <th class="sorting" data-sort="int"><div class="sortwrap">HP</div></th> <th class="sorting" data-sort="int"><div class="sortwrap">Attack</div></th> <th class="sorting" data-sort="int"><div class="sortwrap">Defense</div></th> <th class="sorting" data-sort="int"><div class="sortwrap">Sp. Atk</div></th> <th class="sorting" data-sort="int"><div class="sortwrap">Sp. Def</div></th> <th class="sorting" data-sort="int"><div class="sortwrap">Speed</div></th> </tr>
</thead>

In [95]:
# List the individual <th> cells of the header row (one per table column).
table_head.find_all("th")

[<th class="sorting" data-sort="int"><div class="sortwrap">#</div></th>,
 <th class="sorting" data-sort="string"><div class="sortwrap">Name</div></th>,
 <th><div class="sortwrap">Type</div></th>,
 <th class="sorting" data-sort="int"><div class="sortwrap">Total</div></th>,
 <th class="sorting" data-sort="int"><div class="sortwrap">HP</div></th>,
 <th class="sorting" data-sort="int"><div class="sortwrap">Attack</div></th>,
 <th class="sorting" data-sort="int"><div class="sortwrap">Defense</div></th>,
 <th class="sorting" data-sort="int"><div class="sortwrap">Sp. Atk</div></th>,
 <th class="sorting" data-sort="int"><div class="sortwrap">Sp. Def</div></th>,
 <th class="sorting" data-sort="int"><div class="sortwrap">Speed</div></th>]

In [96]:
# Show only the visible text of every header cell.
header_cells = table_head.find_all("th")
for header_cell in header_cells:
    print(header_cell.text)

#
Name
Type
Total
HP
Attack
Defense
Sp. Atk
Sp. Def
Speed


In [97]:
# Turn each header cell into a plain column label, dropping any newlines in its text.
header_cells = table_head.find_all("th")
columns = [cell.text.replace("\n", "") for cell in header_cells]
columns

['#',
 'Name',
 'Type',
 'Total',
 'HP',
 'Attack',
 'Defense',
 'Sp. Atk',
 'Sp. Def',
 'Speed']

# Table rows
Now we want to extract all the information for each pokemon (row) in the table.

In [50]:

# Every <tr> inside the first <tbody> is one entry of the table.
table_body = soup.find("tbody")
rows = table_body.find_all("tr")
first_row = rows[0]
print(first_row)


<tr>
<td class="cell-num cell-fixed" data-sort-value="1"><span class="infocard-cell-img"><img alt="Bulbasaur" class="img-fixed icon-pkmn" height="42" loading="lazy" src="https://img.pokemondb.net/sprites/sword-shield/icon/bulbasaur.png" width="56"/></span><span class="infocard-cell-data">0001</span></td> <td class="cell-name"><a class="ent-name" href="/pokedex/bulbasaur" title="View Pokedex for #0001 Bulbasaur">Bulbasaur</a></td><td class="cell-icon"><a class="type-icon type-grass" href="/type/grass">Grass</a><br/> <a class="type-icon type-poison" href="/type/poison">Poison</a></td>
<td class="cell-num cell-total">318</td>
<td class="cell-num">45</td>
<td class="cell-num">49</td>
<td class="cell-num">49</td>
<td class="cell-num">65</td>
<td class="cell-num">65</td>
<td class="cell-num">45</td>
</tr>


## What does the above output tell us? 
By looking at the screenshot below, it seems like the first three `<td>` entries hold the first three columns (i.e., number, name and type) — the first of them also carries the sprite image — while the rest of the `<td>` entries contain the stat value of the corresponding column.
<br><br>
![title](images/pokedex_first_rows.png)

In [68]:
# The Pokédex number is the text of the first cell of every row.
numbers = [tr.find_all("td")[0].text for tr in rows]

# Preview the first 20 numbers.
for pokedex_number in numbers[:20]:
    print(pokedex_number)

0001
0002
0003
0003
0004
0005
0006
0006
0006
0007
0008
0009
0009
0010
0011
0012
0013
0014
0015
0015


But there is more in the first `<td>`, so we need to find a way to extract the information from it.

In [86]:
# Look at the first row: list every <td> cell it contains

rows[0].find_all("td")

[<td class="cell-num cell-fixed" data-sort-value="1"><span class="infocard-cell-img"><img alt="Bulbasaur" class="img-fixed icon-pkmn" height="42" loading="lazy" src="https://img.pokemondb.net/sprites/sword-shield/icon/bulbasaur.png" width="56"/></span><span class="infocard-cell-data">0001</span></td>,
 <td class="cell-name"><a class="ent-name" href="/pokedex/bulbasaur" title="View Pokedex for #0001 Bulbasaur">Bulbasaur</a></td>,
 <td class="cell-icon"><a class="type-icon type-grass" href="/type/grass">Grass</a><br/> <a class="type-icon type-poison" href="/type/poison">Poison</a></td>,
 <td class="cell-num cell-total">318</td>,
 <td class="cell-num">45</td>,
 <td class="cell-num">49</td>,
 <td class="cell-num">49</td>,
 <td class="cell-num">65</td>,
 <td class="cell-num">65</td>,
 <td class="cell-num">45</td>]

In [89]:
# Walk over the cells of the first row and show the text each one holds
first_row_cells = rows[0].find_all("td")
for idx, cell in enumerate(first_row_cells):
    print(f"td_index: {idx}, td.text: '{cell.text}'")

td_index: 0, td.text: '0001'
td_index: 1, td.text: 'Bulbasaur'
td_index: 2, td.text: 'Grass Poison'
td_index: 3, td.text: '318'
td_index: 4, td.text: '45'
td_index: 5, td.text: '49'
td_index: 6, td.text: '49'
td_index: 7, td.text: '65'
td_index: 8, td.text: '65'
td_index: 9, td.text: '45'


We should be able to extract the image links by looking inside the `<img ... >` tag and reading its `src` attribute.

In [76]:
# Print the sprite URL (the <img> src) found in the first cell of the first 10 rows
for tr in rows[:10]:
    first_cell = tr.find_all("td")[0]
    print(first_cell.find("img")["src"])

https://img.pokemondb.net/sprites/sword-shield/icon/bulbasaur.png
https://img.pokemondb.net/sprites/sword-shield/icon/ivysaur.png
https://img.pokemondb.net/sprites/sword-shield/icon/venusaur.png
https://img.pokemondb.net/sprites/sword-shield/icon/venusaur-mega.png
https://img.pokemondb.net/sprites/sword-shield/icon/charmander.png
https://img.pokemondb.net/sprites/sword-shield/icon/charmeleon.png
https://img.pokemondb.net/sprites/sword-shield/icon/charizard.png
https://img.pokemondb.net/sprites/sword-shield/icon/charizard-mega-x.png
https://img.pokemondb.net/sprites/sword-shield/icon/charizard-mega-y.png
https://img.pokemondb.net/sprites/sword-shield/icon/squirtle.png


In [62]:
# The name is the text of the second cell of every row.
names = [tr.find_all("td")[1].text for tr in rows]

# Preview the first 20 names.
for pokemon_name in names[:20]:
    print(pokemon_name)

Bulbasaur
Ivysaur
Venusaur
Venusaur Mega Venusaur
Charmander
Charmeleon
Charizard
Charizard Mega Charizard X
Charizard Mega Charizard Y
Squirtle
Wartortle
Blastoise
Blastoise Mega Blastoise
Caterpie
Metapod
Butterfree
Weedle
Kakuna
Beedrill
Beedrill Mega Beedrill


In [58]:
# How many names (i.e. table rows) were extracted in total?
print(f"Length of 'names': {len(names)}")

Length of 'names': 1194


We have extracted 1194 names, but the original table only seems to contain 1010 Pokémon. What is going on?
<br><br>
Looking closely at the original table, one can see that a Pokémon and its mega evolution share the same Pokédex number. E.g. both **Venusaur** and **Mega Venusaur** have the number 3. The number of rows in the original table is likely 1194, but the number of unique Pokédex numbers is 1010.

In [67]:
# Show the first ten name/number pairs side by side. zip pairs the two lists
# directly and simply stops early if fewer than ten entries are available,
# instead of raising IndexError from a hard-coded index range.
for name, number in zip(names[:10], numbers[:10]):
    print(name, number)

Bulbasaur 0001
Ivysaur 0002
Venusaur 0003
Venusaur Mega Venusaur 0003
Charmander 0004
Charmeleon 0005
Charizard 0006
Charizard Mega Charizard X 0006
Charizard Mega Charizard Y 0006
Squirtle 0007


# Putting it all together

In [156]:
# Select the rows again so this cell only depends on `soup` and `columns`.
table_body = soup.find("tbody")
rows = table_body.find_all("tr")

# One list of cell texts per <tr>. The frame is built once from the complete
# list; no placeholder DataFrame is created beforehand.
table = [[cell.text for cell in row.find_all("td")] for row in rows]

df = pd.DataFrame(table, columns=columns)
df.head()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,3,Venusaur Mega Venusaur,Grass Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,309,39,52,43,60,50,65


Let's rename the columns to make them more convenient to use

In [157]:
# Map the scraped header labels to lowercase snake_case names. Reassigning the
# result (instead of inplace=True) leaves the previously displayed frame
# untouched and keeps the cell a plain "new value from old value" step.
df = df.rename(columns={"#": "number",
                        "Name": "name",
                        "Type": "type",
                        "Total": "total",
                        "HP": "hp",
                        "Attack": "attack",
                        "Defense": "defense",
                        "Sp. Atk": "sp_atk",
                        "Sp. Def": "sp_def",
                        "Speed": "speed"})
df.head()

Unnamed: 0,number,name,type,total,hp,attack,defense,sp_atk,sp_def,speed
0,1,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,3,Venusaur Mega Venusaur,Grass Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,309,39,52,43,60,50,65


In [158]:
# Check which dtype pandas assigned to each scraped column.
df.dtypes

number     object
name       object
type       object
total      object
hp         object
attack     object
defense    object
sp_atk     object
sp_def     object
speed      object
dtype: object

Let's convert the datatypes of the number and stats columns to be integers instead of `objects`.

In [159]:
# Columns that should hold numbers rather than text.
numeric_columns = ['number', 'total', 'hp', 'attack', 'defense', 'sp_atk', 'sp_def', 'speed']

# Convert each of those columns with pd.to_numeric and write them back.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
print(df.dtypes)

df.head()

number      int64
name       object
type       object
total       int64
hp          int64
attack      int64
defense     int64
sp_atk      int64
sp_def      int64
speed       int64
dtype: object


Unnamed: 0,number,name,type,total,hp,attack,defense,sp_atk,sp_def,speed
0,1,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,3,Venusaur Mega Venusaur,Grass Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,309,39,52,43,60,50,65


It would be nice if we could split the `type` field into two separate columns: `type_1` and `type_2`. Not all Pokémon have two types though, so some will have an empty value in the `type_2` column.

In [160]:
# Count the rows whose type is missing altogether.
missing_type_count = df["type"].isna().sum()
missing_type_count

0

In [161]:
# Split the combined type text on spaces; the two resulting pieces become
# the type_1 and type_2 columns.
type_parts = df["type"].str.split(" ", expand=True)
df[["type_1", "type_2"]] = type_parts
df.head()

Unnamed: 0,number,name,type,total,hp,attack,defense,sp_atk,sp_def,speed,type_1,type_2
0,1,Bulbasaur,Grass Poison,318,45,49,49,65,65,45,Grass,Poison
1,2,Ivysaur,Grass Poison,405,60,62,63,80,80,60,Grass,Poison
2,3,Venusaur,Grass Poison,525,80,82,83,100,100,80,Grass,Poison
3,3,Venusaur Mega Venusaur,Grass Poison,625,80,100,123,122,120,80,Grass,Poison
4,4,Charmander,Fire,309,39,52,43,60,50,65,Fire,


Let's remove the `type` column and move the `type_1` and `type_2` columns to be after the name.

In [162]:
# Keep only the final columns, with the two type columns right after the name
# (this drops the combined `type` column).
final_columns = ["number", "name", "type_1", "type_2", "total", "hp", "attack", "defense", "sp_atk", "sp_def", "speed"]

# .copy() makes the selection an independent frame, so assigning to one of its
# columns later does not trigger pandas' SettingWithCopyWarning.
df = df[final_columns].copy()
df.head()

Unnamed: 0,number,name,type_1,type_2,total,hp,attack,defense,sp_atk,sp_def,speed
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80
3,3,Venusaur Mega Venusaur,Grass,Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,,309,39,52,43,60,50,65


In [163]:
# Check row count, non-null counts and dtypes after the reshaping steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   number   1194 non-null   int64 
 1   name     1194 non-null   object
 2   type_1   1194 non-null   object
 3   type_2   1194 non-null   object
 4   total    1194 non-null   int64 
 5   hp       1194 non-null   int64 
 6   attack   1194 non-null   int64 
 7   defense  1194 non-null   int64 
 8   sp_atk   1194 non-null   int64 
 9   sp_def   1194 non-null   int64 
 10  speed    1194 non-null   int64 
dtypes: int64(8), object(3)
memory usage: 102.7+ KB


In [164]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,number,total,hp,attack,defense,sp_atk,sp_def,speed
count,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0
mean,492.746231,441.206868,70.887772,80.948911,74.587102,72.876884,72.128141,69.778057
std,293.719541,121.015326,26.859651,32.12225,30.678626,32.696051,27.628468,30.195593
min,1.0,175.0,1.0,5.0,5.0,10.0,20.0,5.0
25%,235.25,330.0,52.0,56.0,51.25,50.0,50.0,45.0
50%,486.5,460.5,70.0,80.0,70.0,65.0,70.0,67.5
75%,741.75,520.0,85.0,100.0,90.0,95.0,90.0,90.75
max,1010.0,1125.0,255.0,190.0,250.0,194.0,250.0,200.0


In [166]:
# Which distinct values occur in the second type column?
second_types = df["type_2"].unique()
second_types

array(['Poison', nan, 'Flying', 'Dragon', 'Normal', 'Psychic', 'Steel',
       'Ground', 'Fairy', 'Grass', 'Rock', 'Fighting', 'Electric', 'Ice',
       'Dark', 'Ghost', 'Fire', 'Water', 'Bug'], dtype=object)

Let's change the empty strings for type_2 into `nan` values instead to make it easier to understand.

In [168]:
# Store NaN instead of an empty string in type_2 so pandas counts those rows
# as missing. assign() returns a new frame, which avoids setting a column
# through attribute access on a frame that was produced by column selection.
df = df.assign(type_2=df["type_2"].replace("", np.nan))

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   number   1194 non-null   int64 
 1   name     1194 non-null   object
 2   type_1   1194 non-null   object
 3   type_2   652 non-null    object
 4   total    1194 non-null   int64 
 5   hp       1194 non-null   int64 
 6   attack   1194 non-null   int64 
 7   defense  1194 non-null   int64 
 8   sp_atk   1194 non-null   int64 
 9   sp_def   1194 non-null   int64 
 10  speed    1194 non-null   int64 
dtypes: int64(8), object(3)
memory usage: 102.7+ KB


Now that we've extracted all the data that we want, we can save it as a .csv file to make it convenient to work with.

In [169]:
# Write the cleaned table to pokemon.csv in the working directory; index=False
# leaves the DataFrame's row index out of the file.
df.to_csv("pokemon.csv", index=False)