In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd

import time
import random

In [2]:
def remove_duplicate_link(list_of_links):
    """Return the links with exact duplicates removed.

    The first occurrence of each link is kept and the original relative order
    is preserved. Comparison is exact (case-sensitive).

    Args:
        list_of_links: Iterable of links. Items must be hashable, which
            strings are.

    Returns:
        A new list containing each distinct link once.
    """
    # A set gives constant-time membership checks; testing against the growing
    # output list instead would make the whole pass quadratic.
    seen = set()
    output_list = []
    for my_link in list_of_links:
        if my_link not in seen:
            seen.add(my_link)
            output_list.append(my_link)
    return output_list

In [3]:
def remove_duplicate_flavor_text(flavor_text_list):
    """Remove texts that are duplicates of one another when case is ignored.

    The input is walked from its last element to its first, so the result is
    in reverse input order and, for each group of case-insensitive duplicates,
    the spelling that appears last in the input is the one kept.

    Args:
        flavor_text_list: Reversible sequence of strings.

    Returns:
        A new list of the distinct texts.
    """
    # Track the lowercase form of every kept text in a set so each membership
    # check is constant-time rather than rebuilding a lowercase list per item.
    seen_lowercase = set()
    output_list = []
    for my_text in reversed(flavor_text_list):
        lowered = my_text.lower()
        if lowered not in seen_lowercase:
            seen_lowercase.add(lowered)
            output_list.append(my_text)
    return output_list

In [4]:
def get_pokemon_info(pokemon_url, timeout=30):
    """Scrape one Pokemon page for its name, dex number, types and flavor text.

    Args:
        pokemon_url: Absolute URL of the Pokemon's page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A tuple ``(poke_name, national_dex_num, poke_types, poke_flavor_text)``:

        * ``poke_name`` - text of the first ``<h1>`` on the page.
        * ``national_dex_num`` - integer read from the ``<strong>`` element of
          a table row whose header contains "National"; ``-1`` if no such
          value could be read.
        * ``poke_types`` - link texts from every table row whose header is
          exactly "Type". If the page has several such rows, all of them
          contribute, so the list may contain repeats.
        * ``poke_flavor_text`` - texts of all ``td.cell-med-text`` cells after
          passing through ``remove_duplicate_flavor_text``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        IndexError: If the page has no ``<h1>`` element.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    response = requests.get(pokemon_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as Pokemon data.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract Pokemon name.
    heading = soup.find('h1')
    if heading is None:
        raise IndexError("no <h1> heading found at {}".format(pokemon_url))
    poke_name = heading.text

    # Extract Pokemon types and National Dex number.
    poke_types = []
    national_dex_num = -1
    for table_row in soup.find_all('tr'):
        for table_header in table_row.find_all('th'):
            header_text = table_header.text
            if header_text == 'Type':
                poke_types.extend(anchor.text for anchor in table_row.find_all('a'))
            elif 'National' in header_text:
                dex_cell = table_row.find('strong')
                if dex_cell is None:
                    # Row mentions "National" but carries no number; keep the
                    # current value instead of crashing on None.
                    continue
                try:
                    national_dex_num = int(dex_cell.text)
                except ValueError:
                    # Non-numeric cell: leave the current value (or the -1
                    # sentinel) in place.
                    pass

    # Extract flavor text.
    poke_flavor_text = [
        cell.text for cell in soup.find_all('td', class_='cell-med-text')
    ]

    # Remove exact duplicates (ignoring case).
    poke_flavor_text = remove_duplicate_flavor_text(poke_flavor_text)

    return poke_name, national_dex_num, poke_types, poke_flavor_text


In [5]:
def get_pokemon_and_info():
    """Scrape the full Pokedex index and then every Pokemon page it links to.

    The index page is fetched once, the ``href`` of every ``a.ent-name`` link
    is collected, exact duplicate links are dropped, and each remaining page
    is scraped with ``get_pokemon_info``. A random pause of 5.0 to 10.0
    seconds (in 0.1 s steps) is inserted between consecutive page requests.

    Returns:
        A list with one ``get_pokemon_info`` result per distinct link, in the
        order the links first appear on the index page.

    Raises:
        requests.HTTPError: If the index page responds with an error status.
    """
    base_url = 'https://pokemondb.net'
    headers = {"User-Agent": "Mozilla/5.0"}
    pokedex_url = base_url + '/pokedex/all'
    # Seconds to wait for the index page; without a timeout the request could
    # block forever.
    index_timeout = 30

    response = requests.get(pokedex_url, headers=headers, timeout=index_timeout)
    # Do not try to parse an error page as the Pokedex index.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Prepare a list of links to all the pokemon in the pokedex.
    poke_links = [
        link.attrs['href'] for link in soup.find_all('a', class_='ent-name')
    ]
    poke_links = remove_duplicate_link(poke_links)

    pokedex = []
    for position, link in enumerate(poke_links):
        if position > 0:
            # Wait between requests to avoid overloading the network. Pausing
            # before every request except the first means no time is wasted
            # sleeping after the final page.
            sleep_time = random.randint(50, 100)
            time.sleep(sleep_time / 10.0)

        pokedex.append(get_pokemon_info(base_url + link))

    return pokedex
        

# Retrieve Data And Store As CSV

In [6]:
# Run the full scrape. get_pokemon_and_info() sleeps 5-10 seconds per Pokemon
# page, so the run time of this cell grows with the number of links found.
pokedex = get_pokemon_and_info()

In [31]:
# Type names, in the exact order their 0/1 indicator columns appear in the
# output frame.
POKEMON_TYPES = [
    'Normal', 'Fire', 'Water', 'Grass', 'Electric', 'Psychic',
    'Ice', 'Dragon', 'Dark', 'Fairy', 'Fighting', 'Flying',
    'Poison', 'Ground', 'Rock', 'Bug', 'Ghost', 'Steel',
]

# Entries whose National Dex number is greater than or equal to this value are
# left out of the frame.
# NOTE(review): presumably one past the last dex number this dataset targets -
# confirm. Entries carrying the -1 "not found" sentinel from get_pokemon_info
# are below the cutoff and are therefore kept.
DEX_NUM_CUTOFF = 1009

# Column name -> list of values; one row is emitted per (pokemon, flavor text).
data = {'DexNumber': [], 'Name': [], 'FlavorText': []}
for type_name in POKEMON_TYPES:
    data[type_name] = []

for pokemon_entry in pokedex:
    pokemon_name, national_dex_num, pokemon_types, pokemon_text = pokemon_entry[:4]

    if national_dex_num >= DEX_NUM_CUTOFF:
        continue

    # The type indicators are the same for every flavor text of this pokemon,
    # so compute them once per entry.
    type_flags = [
        (type_name, 1 if type_name in pokemon_types else 0)
        for type_name in POKEMON_TYPES
    ]

    for flavor_text in pokemon_text:
        data['DexNumber'].append(national_dex_num)
        data['Name'].append(pokemon_name)
        data['FlavorText'].append(flavor_text)
        for type_name, flag in type_flags:
            data[type_name].append(flag)

df = pd.DataFrame(data)

In [32]:
# Write the frame twice: first with the row index as a leading column, then
# without it.
for csv_path, include_index in (
    ('pokemon_text.csv', True),
    ('pokemon_text_no_index.csv', False),
):
    df.to_csv(csv_path, index=include_index)