## Scraping the pokedex

The pokedex table containing basic data about each pokemon is available at https://pokemondb.net/pokedex/all

### Using BeautifulSoup to extract all the table rows as a list

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
poke_url = 'https://pokemondb.net/pokedex/all'
poke_page = requests.get(poke_url)
poke_page_soup = BeautifulSoup(poke_page.text, 'html.parser')

In [3]:
poke_table_body = poke_page_soup.find('tbody')
poke_row_list = poke_page_soup.find_all('tr')
print("Number of rows are: " + str(len(poke_row_list)))

Number of rows are: 927


### Saving the first row of the table (bulbasaur) as a variable.  Printing the contents from that row. 

In [4]:
bulbasaur = poke_row_list[1]

In [5]:
print("Name of the pokemon is: " + bulbasaur.find(name = "a").text)
print("Link to the pokemon is: "+ "https://pokemondb.net" + bulbasaur.find(name = "a")['href'])
print("Type: " + bulbasaur.find(class_="cell-icon").text)
print("The total points: " + bulbasaur.find(class_="cell-total").text)

poke_values = bulbasaur.findAll(class_ = "cell-num")
poke_value_list = []
for i in poke_values:
    poke_value_list.append(i.text)
    
print("ID Number, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed : " + str(poke_value_list))

Name of the pokemon is: Bulbasaur
Link to the pokemon is: https://pokemondb.net/pokedex/bulbasaur
Type: Grass Poison
The total points: 318
ID Number, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed : ['001', '45', '49', '49', '65', '65', '45']


### Creating a single DataFrame by appending these rows

In [6]:
import pandas as pd

In [7]:
def row_to_dataframe(row):
    i = 0
    name = row.find(name = "a").text
    link = "https://pokemondb.net" + row.find(name = "a")['href']
    typeofpoke =  row.find(class_="cell-icon").text
    total_points = row.find(class_="cell-total").text    
    
    poke_values = row.findAll(class_ = "cell-num")
    poke_value_list = []
    
    for i in poke_values:
        poke_value_list.append(i.text)
    
    row_list = [name, link, typeofpoke, total_points, poke_value_list[0], poke_value_list[1], poke_value_list[2],poke_value_list[3], poke_value_list[4], poke_value_list[5], poke_value_list[6]]
    df = pd.DataFrame(row_list).T
    return df

In [8]:
poke_df = pd.DataFrame()
print(poke_df)
for x in poke_row_list[1:]:
    poke_df = poke_df.append(row_to_dataframe(x))

Empty DataFrame
Columns: []
Index: []


In [9]:
poke_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Bulbasaur,https://pokemondb.net/pokedex/bulbasaur,Grass Poison,318,1,45,49,49,65,65,45
0,Ivysaur,https://pokemondb.net/pokedex/ivysaur,Grass Poison,405,2,60,62,63,80,80,60
0,Venusaur,https://pokemondb.net/pokedex/venusaur,Grass Poison,525,3,80,82,83,100,100,80
0,Venusaur,https://pokemondb.net/pokedex/venusaur,Grass Poison,625,3,80,100,123,122,120,80
0,Charmander,https://pokemondb.net/pokedex/charmander,Fire,309,4,39,52,43,60,50,65
0,Charmeleon,https://pokemondb.net/pokedex/charmeleon,Fire,405,5,58,64,58,80,65,80
0,Charizard,https://pokemondb.net/pokedex/charizard,Fire Flying,534,6,78,84,78,109,85,100
0,Charizard,https://pokemondb.net/pokedex/charizard,Fire Dragon,634,6,78,130,111,130,85,100
0,Charizard,https://pokemondb.net/pokedex/charizard,Fire Flying,634,6,78,104,78,159,115,100
0,Squirtle,https://pokemondb.net/pokedex/squirtle,Water,314,7,44,48,65,50,64,43


##  Cleaning the Pokedex Data

### Adding column names to the dataframe. Converting strings to numeric where appropriate. Makeing the ID number the first column in the DataFrame if it is not already.

In [10]:
#Adding column names
poke_df.columns = ["Name","URL","Type", "Total", "ID", "HP", "Attack", "Defense", "Sp.Atk", "Sp.Def", "Speed"]

#Converting string to numeric values
poke_df['Total'] = pd.to_numeric(poke_df['Total'])
poke_df['ID'] = pd.to_numeric(poke_df['ID'])
poke_df['HP'] = pd.to_numeric(poke_df['Total'])
poke_df['Attack'] = pd.to_numeric(poke_df['Attack'])
poke_df['Defense'] = pd.to_numeric(poke_df['Defense'])
poke_df['Sp.Atk'] = pd.to_numeric(poke_df['Sp.Atk'])
poke_df['Sp.Def'] = pd.to_numeric(poke_df['Sp.Def'])
poke_df['Speed'] = pd.to_numeric(poke_df['Speed'])

#Rearranging to make IDs the first column
poke_df = poke_df[["ID","Name","URL","Type", "Total", "HP", "Attack", "Defense", "Sp.Atk", "Sp.Def", "Speed"]]

### Creating 18 dummy variables for each type of pokemon

In [11]:
a = poke_df['Type'].unique() 

In [12]:
b = []
for i in range(0, len(a)):
        b.extend(a[i].split(' '))

In [13]:
import numpy as np

In [40]:
x = np.array(b)
unique_types = np.unique(x)
unique_types = np.delete(unique_types, 0)
unique_types

array(['Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire',
       'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',
       'Psychic', 'Rock', 'Steel', 'Water'], dtype='<U8')

In [37]:
unique_types

array(['Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire',
       'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',
       'Psychic', 'Rock', 'Steel', 'Water'], dtype='<U8')

In [15]:
for x in unique_types:
    poke_df.loc[:,x] = 0
    poke_df.loc[poke_df['Type'].str.contains(x), x] = 1

### Removing duplicate values

In [16]:
#Dropping duplicate rows based on URL
poke_df = poke_df.drop_duplicates("URL")
print("Number of rows in deduplicated dataset is:" + str(len(poke_df.drop_duplicates("URL"))))

Number of rows in deduplicated dataset is:809


In [17]:
poke_df

Unnamed: 0,ID,Name,URL,Type,Total,HP,Attack,Defense,Sp.Atk,Sp.Def,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,1,Bulbasaur,https://pokemondb.net/pokedex/bulbasaur,Grass Poison,318,318,49,49,65,65,...,0,1,0,0,0,1,0,0,0,0
0,2,Ivysaur,https://pokemondb.net/pokedex/ivysaur,Grass Poison,405,405,62,63,80,80,...,0,1,0,0,0,1,0,0,0,0
0,3,Venusaur,https://pokemondb.net/pokedex/venusaur,Grass Poison,525,525,82,83,100,100,...,0,1,0,0,0,1,0,0,0,0
0,4,Charmander,https://pokemondb.net/pokedex/charmander,Fire,309,309,52,43,60,50,...,0,0,0,0,0,0,0,0,0,0
0,5,Charmeleon,https://pokemondb.net/pokedex/charmeleon,Fire,405,405,64,58,80,65,...,0,0,0,0,0,0,0,0,0,0
0,6,Charizard,https://pokemondb.net/pokedex/charizard,Fire Flying,534,534,84,78,109,85,...,0,0,0,0,0,0,0,0,0,0
0,7,Squirtle,https://pokemondb.net/pokedex/squirtle,Water,314,314,48,65,50,64,...,0,0,0,0,0,0,0,0,0,1
0,8,Wartortle,https://pokemondb.net/pokedex/wartortle,Water,405,405,63,80,65,80,...,0,0,0,0,0,0,0,0,0,1
0,9,Blastoise,https://pokemondb.net/pokedex/blastoise,Water,530,530,83,100,85,105,...,0,0,0,0,0,0,0,0,0,1
0,10,Caterpie,https://pokemondb.net/pokedex/caterpie,Bug,195,195,30,35,20,20,...,0,0,0,0,0,0,0,0,0,0


### Step 2.4

In [18]:
poke_df.loc[poke_df["ID"]%4 == 0,'Sample'] = 1

## Scraping Individual Pages

Bulbasaur page as an example https://pokemondb.net/pokedex/bulbasaur

### Scraping the main image for Bulbasaur in a general way

In [19]:
%pylab inline
from IPython.display import Image
pd.set_option('colwidth',900)

Populating the interactive namespace from numpy and matplotlib


In [20]:
bulb_url = 'https://pokemondb.net/pokedex/bulbasaur'
bulb_page = requests.get(bulb_url)
bulb_page_soup = BeautifulSoup(bulb_page.text, 'html.parser')



In [21]:
image_url = bulb_page_soup.find(name = "img")['src']

In [22]:
Image(url = image_url)

### Extracting the Location Table

In [72]:
url="https://pokemondb.net/pokedex/bulbasaur"
all_tables = pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text)
bulbasaur_location_table = all_tables[-2]
bulbasaur_location_table

Unnamed: 0,0,1
0,RedBlue,Pallet Town
1,Yellow,Cerulean City
2,GoldSilverCrystal,Trade/migrate from another game
3,RubySapphire,Trade/migrate from another game
4,FireRedLeafGreen,Pallet Town
5,Emerald,Trade/migrate from another game
6,DiamondPearlPlatinum,Trade/migrate from another game
7,HeartGoldSoulSilver,Pallet Town
8,BlackWhiteBlack 2White 2,Trade/migrate from another game
9,XY,Lumiose City


### Creating a single DataFrame that contains the name or URL of the pokemon and the XY location

In [73]:
bulbasaur_location_new = bulbasaur_location_table.T
bulbasaur_location_new.columns = bulbasaur_location_new.iloc[0]
bulbasaur_location_new = bulbasaur_location_new.drop(bulbasaur_location_new.index[0])
bulbasaur_location_new

Unnamed: 0,RedBlue,Yellow,GoldSilverCrystal,RubySapphire,FireRedLeafGreen,Emerald,DiamondPearlPlatinum,HeartGoldSoulSilver,BlackWhiteBlack 2White 2,XY,Omega RubyAlpha Sapphire,SunMoon,Ultra SunUltra Moon,Let's Go PikachuLet's Go Eevee
1,Pallet Town,Cerulean City,Trade/migrate from another game,Trade/migrate from another game,Pallet Town,Trade/migrate from another game,Trade/migrate from another game,Pallet Town,Trade/migrate from another game,Lumiose City,Trade/migrate from another game,Trade/migrate from another game,Route 2,"Cerulean City, Viridian Forest"


In [74]:
import numpy as np

poke_df.index = np.arange(1, len(poke_df)+1) 
sample = poke_df[poke_df['Sample'] == 1]
sample = sample.iloc[:-1,]
sample.head(5)

Unnamed: 0,ID,Name,URL,Type,Total,HP,Attack,Defense,Sp.Atk,Sp.Def,...,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water,Sample
4,4,Charmander,https://pokemondb.net/pokedex/charmander,Fire,309,309,52,43,60,50,...,0,0,0,0,0,0,0,0,0,1.0
8,8,Wartortle,https://pokemondb.net/pokedex/wartortle,Water,405,405,63,80,65,80,...,0,0,0,0,0,0,0,0,1,1.0
12,12,Butterfree,https://pokemondb.net/pokedex/butterfree,Bug Flying,395,395,45,50,90,80,...,0,0,0,0,0,0,0,0,0,1.0
16,16,Pidgey,https://pokemondb.net/pokedex/pidgey,Normal Flying,251,251,45,40,35,35,...,0,0,0,1,0,0,0,0,0,1.0
20,20,Raticate,https://pokemondb.net/pokedex/raticate,Normal,413,413,81,60,50,70,...,0,0,0,1,0,0,0,0,0,1.0


In [84]:
locations = pd.DataFrame(columns = ['Name','XY Location'])
import time

for index, row in sample.iterrows():

    url = row['URL']
    all_tables = pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text)
    pokemon_location_table = all_tables[-2]
    
    pokemon_location_new = pokemon_location_table.T
    pokemon_location_new.columns = pokemon_location_new.iloc[0]
    pokemon_location_new = pokemon_location_new.drop(pokemon_location_new.index[0])
    
    if 'XY' in pokemon_location_new:
            temp=pd.DataFrame({"Name":row['Name'], "XY Location":pokemon_location_new['XY']})
            locations=locations.append(temp)

In [85]:
print(len(locations))

141


## Analysis

### Finding which type has the highest and lowest average attack? Average defense?

In [48]:
type_stats = pd.DataFrame(columns=['Average Attack', 'Average Defense'], index = unique_types)

for i in unique_types:
    type_stats['Average Attack'][i] = poke_df[poke_df[i]==1]['Attack'].mean() 
    type_stats['Average Defense'][i] = poke_df[poke_df[i]==1]['Defense'].mean()

In [49]:
type_stats.head()

Unnamed: 0,Average Attack,Average Defense
Bug,68.1688,69.8052
Dark,91.4783,67.3261
Dragon,94.6667,82.2444
Electric,71.7083,62.8333
Fairy,60.4043,69.4468


In [58]:
print("Type with lowest average attack is: " + type_stats[type_stats['Average Attack'] == type_stats['Average Attack'].min()].index[0])
print("Type with highest average attack is: " + type_stats[type_stats['Average Attack'] == type_stats['Average Attack'].max()].index[0])
print("Type with lowest average defense is: " + type_stats[type_stats['Average Defense'] == type_stats['Average Defense'].min()].index[0])
print("Type with highest average defense is: " + type_stats[type_stats['Average Defense'] == type_stats['Average Defense'].max()].index[0])

Type with lowest average attack is: Fairy
Type with highest average attack is: Fighting
Type with lowest average defense is: Normal
Type with highest average defense is: Steel


### For the locations in pokemon X/Y, calculating the average total points for each location and finding which location has the highest average total point score

In [86]:
new_poke_df = pd.merge(poke_df,locations, on='Name', how='inner')
new_poke_df

Unnamed: 0,ID,Name,URL,Type,Total,HP,Attack,Defense,Sp.Atk,Sp.Def,...,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water,Sample,XY Location
0,4,Charmander,https://pokemondb.net/pokedex/charmander,Fire,309,309,52,43,60,50,...,0,0,0,0,0,0,0,0,1.0,Lumiose City
1,8,Wartortle,https://pokemondb.net/pokedex/wartortle,Water,405,405,63,80,65,80,...,0,0,0,0,0,0,0,1,1.0,Evolve Squirtle
2,12,Butterfree,https://pokemondb.net/pokedex/butterfree,Bug Flying,395,395,45,50,90,80,...,0,0,0,0,0,0,0,0,1.0,Evolve Caterpie/Metapod
3,16,Pidgey,https://pokemondb.net/pokedex/pidgey,Normal Flying,251,251,45,40,35,35,...,0,0,1,0,0,0,0,0,1.0,"Route 2, 3"
4,20,Raticate,https://pokemondb.net/pokedex/raticate,Normal,413,413,81,60,50,70,...,0,0,1,0,0,0,0,0,1.0,Trade/migrate from another game
5,24,Arbok,https://pokemondb.net/pokedex/arbok,Poison,448,448,95,69,65,79,...,0,0,0,1,0,0,0,0,1.0,Route 19
6,28,Sandslash,https://pokemondb.net/pokedex/sandslash,Ground,450,450,100,110,45,55,...,1,0,0,0,0,0,0,0,1.0,"Route 18, Terminus Cave"
7,32,Nidoran♂,https://pokemondb.net/pokedex/nidoran-m,Poison,273,273,57,40,40,40,...,0,0,0,1,0,0,0,0,1.0,Route 11
8,36,Clefable,https://pokemondb.net/pokedex/clefable,Fairy,483,483,70,73,95,90,...,0,0,0,0,0,0,0,0,1.0,Trade/migrate from another game
9,44,Gloom,https://pokemondb.net/pokedex/gloom,Grass Poison,395,395,65,70,85,75,...,0,0,0,1,0,0,0,0,1.0,Evolve Oddish


In [88]:
location_avg = new_poke_df.groupby('XY Location').agg({'Total': ['mean']})

In [90]:
location_avg.head(5)

Unnamed: 0_level_0,Total
Unnamed: 0_level_1,mean
XY Location,Unnamed: 1_level_2
Ambrette Town,355.5
Breed Electrode,330.0
Breed Gurdurr/Conkeldurr,305.0
Breed Hariyama,237.0
Breed Haunter/Gengar,310.0


In [95]:
print("Location with highest average total point score is: " + location_avg[location_avg['Total']['mean'] == location_avg['Total']['mean'].max()].index[0])

Location with highest average total point score is: Sea Spirit's Den, Roaming Kalos
