# 全国図鑑の全ポケモンのデータ取得

## 準備

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle
import re

In [2]:
# Load the pre-built lookup data used by the scraping loop below.
#
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load .pkl files that were produced by this project itself.

# URL list for every Pokemon. The file is opened in a `with` block so the
# handle is closed as soon as the data has been read.
with open('./all_gen_pokemon_urls.pkl', 'rb') as _pkl_file:
    url_list = pickle.load(_pkl_file)
# Type data frame.
df_type = pd.read_csv('./type.csv')
# Attackdex (move list) data frame.
df_attackdex = pd.read_csv('./attackdex.csv')
# Egg group names.
with open('./egg_groups_name.pkl', 'rb') as _pkl_file:
    egg_groups_name = pickle.load(_pkl_file)
# Hold item data frame.
df_holditem = pd.read_csv('./holditemdex.csv')
# Ability data frame. The variable and file name spelling ("abilirydex") is
# kept as-is because other cells reference it.
df_abilirydex = pd.read_csv('./abilirydex.csv')


In [3]:
# Helper that converts a type link element into the Japanese type name.
def type_func(elem):
    """Return the Japanese type name for a type link element.

    The last path segment of ``elem['href']`` is used as the lookup key
    against the ``link_text`` column of ``df_type``; the matching row's
    ``japanese`` value is returned.

    Raises:
        IndexError: if no row of ``df_type`` matches the link.
    """
    link_name = elem['href'].rsplit('/', 1)[-1]
    # Some links omit the file extension; the lookup key always carries it.
    if '.shtml' not in link_name:
        link_name = link_name + '.shtml'
    # The Psychic page is looked up under a different file name
    # (presumably how df_type lists it -- confirm against type.csv).
    link_name = 'psychict.shtml' if link_name == 'psychic.shtml' else link_name
    matched_rows = df_type.query(f'link_text == "{link_name}"')
    return matched_rows['japanese'].values[0]

In [4]:
# Helpers that normalise effort value (EV) text.

# English stat name -> Japanese stat name.
effort_val_name = dict([
    ('HP', 'HP'),
    ('Attack', 'こうげき'),
    ('Defense', 'ぼうぎょ'),
    ('Sp. Attack', 'とくこう'),
    ('Sp. Defense', 'とくぼう'),
    ('Speed', 'すばやさ'),
])


def effort_func(effort_text):
    """Split text such as ``"2 Sp. Attack"`` into (Japanese stat name, value).

    The first whitespace-separated token is the value and is returned
    unchanged as a string. The stat name is the second token, or the second
    and third tokens joined by a space when three or more tokens are present.

    Raises:
        IndexError: if the text has fewer than two tokens.
        KeyError: if the stat name is not a key of ``effort_val_name``.
    """
    tokens = effort_text.split()
    amount = tokens[0]
    if len(tokens) >= 3:
        # Two-word stat names ("Sp. Attack", "Sp. Defense").
        stat_name = " ".join(tokens[1:3])
    else:
        stat_name = tokens[1]
    return effort_val_name[stat_name], amount

In [5]:
# Connect to MongoDB and select the database the scraped documents go into.
from pymongo import MongoClient
# No arguments are passed, so pymongo's default connection settings are used.
client = MongoClient()
# Handle to the 'pokemon' database; the cells below read and write through `db`.
db = client['pokemon']

## 実行

In [17]:
# Scrape every Pokemon page listed in url_list and store one document per
# Pokemon in MongoDB (DB: pokemon / Collection: all_gen_pokedex).

# Seconds to wait on a single HTTP request before giving up, so that one
# stalled connection cannot hang the whole scrape.
REQUEST_TIMEOUT = 30


def _japanese_attack_name(english_name):
    """Return the Japanese move name from df_attackdex.

    Falls back to ``english_name`` when the move is not listed, so a move
    missing from the attackdex no longer aborts the scrape with IndexError.
    """
    matches = df_attackdex[df_attackdex['English_name'] == english_name]['Japanese_name']
    if len(matches):
        return matches.values[0]
    return english_name


def _move_rows(container):
    """Yield the <td> cell list of each child of ``container`` that is a move row.

    Children without text, or with fewer than three cells, are skipped.
    """
    for child in container.children:
        if not child.text.strip():
            continue
        cells = child.select('td')
        if len(cells) < 3:
            continue
        yield cells


for pokemon in url_list:
    item = {}

    url = f'https://www.serebii.net{pokemon["Gen_URLs"][0]["URL"]}'
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'html.parser')

    # National Pokedex number.
    item['No'] = pokemon['No'].replace('#', '')

    # English name.
    item['English_name'] = pokemon['English_name']

    # Sprite (dot) image URL.
    item['Dot_image'] = f'https://www.serebii.net{soup.select("table.dextab")[0].select("td")[1].img.attrs["src"]}'

    # Main image URL.
    item['Image'] = f'https://www.serebii.net{soup.select("table.dextable")[0].select("tr")[1].select("td")[0].img.attrs["src"]}'

    # Per-Pokemon accumulators.
    weakness = {}
    holditems = []
    egg_groups = []
    generation = ''

    for dextable_elem in soup.select('table.dextable, #swshbdsp, #legends'):
        # Elements carrying one of the section ids only mark which game
        # section the following tables belong to; they hold no data here.
        elem_id = dextable_elem.attrs.get('id')
        if elem_id == 'swshbdsp':
            # Sword/Shield and BDSP section.
            continue
        if elem_id == 'legends':
            # Legends section: level-up tables after this get a suffix.
            generation = '-legends'
            continue

        first_td = dextable_elem.select_one('td')

        # Japanese name and types.
        if first_td is not None and first_td.text.strip() == 'Name':
            # Japanese name (only the first occurrence on the page is kept).
            if 'Japanese_name' not in item:
                item['Japanese_name'] = dextable_elem.select('td.fooinfo')[1].select('td')[1].contents[-1].strip()
            # Types (only the first occurrence on the page is kept).
            if 'Type' not in item:
                type_cell = dextable_elem.select_one('td.cen')
                if type_cell.select_one('table'):
                    # A nested table lists the types per form, one row each.
                    for type_row in type_cell.select('tr'):
                        form_name = type_row.td.text.strip()
                        form_types = [type_func(elem) for elem in type_row.select('a')]
                        if form_name == 'Normal':
                            item['Type'] = form_types
                        else:
                            item[f'Type_{form_name}'] = form_types
                else:
                    item['Type'] = [type_func(elem) for elem in type_cell.select('a')]

        # Abilities.
        header_b = dextable_elem.select_one('b')
        if header_b is not None and 'Abilities' in header_b.text.strip():
            ab_key_name = 'Abilities'
            abilities = {
                'Base_abilities': [],
                'Hidden_ability': [],
            }
            ab_detail_key = 'Base_abilities'
            for ab_elem in dextable_elem.find_all(['tr'])[1].select('td>a, td>b'):
                ab_text = ab_elem.text.strip()
                if not ab_elem.select('b'):
                    # Elements without a nested <b> are section labels.
                    if 'Abilities' in ab_text:
                        # A new form starts: flush what was collected so far.
                        if abilities['Base_abilities']:
                            item[ab_key_name] = abilities
                        form_label = ab_text.replace(" Abilities", "").replace(" Ability", "").replace(" ", "_")
                        ab_key_name = f'Abilities-{form_label}'
                        ab_detail_key = 'Base_abilities'
                        abilities = {
                            'Base_abilities': [],
                            'Hidden_ability': [],
                        }
                    elif ab_text == 'Hidden Ability':
                        ab_detail_key = 'Hidden_ability'
                else:
                    # Elements with a nested <b> are ability names.
                    ab_matches = df_abilirydex[df_abilirydex['English_name'] == ab_text]
                    ab_val = ab_matches['Japanese_name'].values[0] if len(ab_matches) else None
                    # Use the Japanese name only when it is a non-empty string;
                    # a missing row or NaN cell falls back to the English name.
                    if isinstance(ab_val, str) and ab_val:
                        abilities[ab_detail_key].append(ab_val)
                    else:
                        abilities[ab_detail_key].append(ab_text)

            if abilities['Base_abilities']:
                item[ab_key_name] = abilities

        # Type match-ups (weakness) and base stats.
        if dextable_elem.select('h2'):
            h2_text = dextable_elem.h2.text.strip()
            # Type match-ups.
            if h2_text == 'Weakness':
                # NOTE(review): this reads the 4th table.dextable of the page
                # rather than dextable_elem -- confirm this is intended.
                weakness_rows = soup.select('table.dextable')[3].select('tr')
                for type_elem, val_elem in zip(weakness_rows[1].select('td'), weakness_rows[2].select('td')):
                    weakness[type_func(type_elem.a)] = val_elem.text.replace('*', '').strip()
                item['Weakness'] = weakness
            # Base stats.
            if 'Stats' in h2_text:
                stat_cells = dextable_elem.select('tr')[2].select('td')
                stat_names = ('HP', 'Attack', 'Defense', 'Sp_Attack', 'Sp_Defense', 'Speed')
                # Cell 0 is skipped; cells 1-6 hold the six stats in this order.
                stats = {
                    name: int(stat_cells[index].text.strip())
                    for index, name in enumerate(stat_names, start=1)
                }
                stats['Total'] = sum(stats.values())
                item[h2_text.replace(' - ', '-').replace(' ', '_')] = stats

        # Hold items and egg groups.
        if dextable_elem.select('td.footwo') and first_td.text.strip() == 'Wild Hold Item':
            # Hold items, grouped under the label (<b> without attributes)
            # that precedes them.
            holditem_elem = dextable_elem.find_all('tr')[1].select_one('td')
            holditem_list = []
            holditem_gen = ''
            for hold_elem in holditem_elem.find_all(['b', 'a']):
                hold_text = hold_elem.text.strip()
                if hold_elem.attrs:
                    hold_matches = df_holditem[df_holditem['English_name'] == hold_text]['Japanese_name']
                    if len(hold_matches):
                        holditem_list.append(hold_matches.values[0])
                    else:
                        # Not in the local item table: fetch the item's own
                        # page and read the name from its 'Sprites' table.
                        tmp_url = f'https://www.serebii.net{hold_elem.attrs["href"]}'
                        tmp_res = requests.get(tmp_url, timeout=REQUEST_TIMEOUT)
                        tmp_soup = BeautifulSoup(tmp_res.text, 'html.parser')
                        for tmp_dextable_elem in tmp_soup.select('table.dextable'):
                            tmp_first_td = tmp_dextable_elem.select_one('td')
                            if tmp_first_td is not None and tmp_first_td.text.strip() == 'Sprites':
                                name_cell = tmp_dextable_elem.select("td.cen")[2]
                                if name_cell.text:
                                    holditem_list.append(name_cell.contents[0].strip())
                                else:
                                    holditem_list.append(hold_text)
                else:
                    # A new label starts: flush the items of the previous one.
                    if holditem_list:
                        holditems.append({'Gen': holditem_gen, 'Hold_item': holditem_list})
                        holditem_list = []
                    holditem_gen = hold_text
            if holditem_list:
                holditems.append({'Gen': holditem_gen, 'Hold_item': holditem_list})

            # Egg groups: every second cell (1, 3, ...) holds a group name.
            egg_group_cell = dextable_elem.select_one('td.fooinfo:last-of-type')
            if egg_group_cell.select('table.dexitem'):
                egg_cells = egg_group_cell.select_one('table.dexitem').select('td')
                egg_groups = [egg_groups_name[cell.text.strip()] for cell in egg_cells[1::2]]

            item['Hold_items'] = holditems
            item['Egg_groups'] = egg_groups

        # Effort values.
        fooleft_elem = dextable_elem.select_one('.fooleft')
        if fooleft_elem is not None and 'Abilities' in fooleft_elem.contents[0].text.strip():
            effort_cell = dextable_elem.select('td.fooinfo')[3]
            if effort_cell.text:
                effort_values = {}
                effort_key_name = 'Effort_values'
                for string in effort_cell.strings:
                    effort_match = re.search(r'\d\s.*', string.strip().replace(' Point(s)', ''))
                    if effort_match:
                        effort_key, effort_value = effort_func(effort_match.group())
                        effort_values[effort_key] = effort_value
                    elif effort_values:
                        # A non-value string after some values is treated as
                        # the label of the next group: flush and start over.
                        item[effort_key_name] = effort_values
                        effort_values = {}
                        effort_key_name = f'Effort_values-{string.replace(" ", "_")}'
                if effort_values:
                    item[effort_key_name] = effort_values

        # Moves.
        if dextable_elem.select_one('h3'):
            h3_text = dextable_elem.h3.text.strip()
            moves_key = h3_text.replace(' - ', '-').replace(' ', '_')

            # Level-up moves.
            if 'Level Up' in h3_text:
                level_up_attacks = []
                for cells in _move_rows(dextable_elem):
                    attack = {}
                    if generation == '-legends':
                        # In the Legends section the first cell can hold two
                        # strings: the level and the mastery level.
                        levels = list(cells[0].strings)
                        attack['Level'] = levels[0]
                        if len(levels) > 1:
                            attack['Level_mastery'] = levels[1]
                    else:
                        attack['Level'] = cells[0].text.strip()
                    english_name = cells[1].a.text.strip()
                    attack['English_name'] = english_name
                    attack['Japanese_name'] = _japanese_attack_name(english_name)
                    level_up_attacks.append(attack)
                item[f'{moves_key}{generation}'] = level_up_attacks

            # TM / HM moves.
            if 'Technical Machine Attacks' in h3_text or 'TM & HM Attacks' in h3_text:
                tech_machine_attacks = []
                for cells in _move_rows(dextable_elem):
                    english_name = cells[1].a.text.strip()
                    tech_machine_attacks.append({
                        'No': cells[0].text.strip().replace('TM', ''),
                        'English_name': english_name,
                        'Japanese_name': _japanese_attack_name(english_name),
                    })
                item[moves_key] = tech_machine_attacks

            # Egg moves.
            if 'Egg Moves' in h3_text:
                egg_attacks = []
                for cells in _move_rows(dextable_elem):
                    english_name = cells[0].a.contents[0].strip()
                    egg_attacks.append({
                        'English_name': english_name,
                        'Japanese_name': _japanese_attack_name(english_name),
                    })
                item[moves_key] = egg_attacks

            # TR (Technical Record) moves.
            if 'Technical Record Attacks' in h3_text:
                tech_record_attacks = []
                for cells in _move_rows(dextable_elem):
                    english_name = cells[1].a.text.strip()
                    tech_record_attacks.append({
                        'No': cells[0].text.strip().replace('TR', ''),
                        'English_name': english_name,
                        'Japanese_name': _japanese_attack_name(english_name),
                    })
                item[moves_key] = tech_record_attacks

            # Move Shop moves; the rows are read from the table's <thead>.
            if 'Move Shop Attacks' in h3_text:
                move_shop_attacks = []
                for cells in _move_rows(dextable_elem.find('thead')):
                    english_name = cells[0].a.contents[0].strip()
                    move_shop_attacks.append({
                        'English_name': english_name,
                        'Japanese_name': _japanese_attack_name(english_name),
                    })
                item[moves_key] = move_shop_attacks

    ### Save to MongoDB < DB: pokemon / Collection: all_gen_pokedex > ###
    db.all_gen_pokedex.insert_one(item)


In [16]:
# Delete every document in the all_gen_pokedex collection
# (the empty filter {} matches all documents).
db.all_gen_pokedex.delete_many({})

# Print the number of documents left in the collection.
print(db.all_gen_pokedex.count_documents({}))

0
