# レース情報のスクレイピング(日付、距離、天候等)

## スクレイピング対象の調査

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
race_id = '201901010101'
url = 'https://db.netkeiba.com/race/' + race_id
html = requests.get(url)
html.encoding = 'EUC-JP'
soup = BeautifulSoup(html.text, 'html.parser')

In [3]:
soup

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html id="html" lang="ja" xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>2歳未勝利｜2019年7月27日 | 競馬データベース - netkeiba.com</title>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="ja" http-equiv="content-language">
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="text/javascript" http-equiv="content-script-type"/>
<meta content="text/css" http-equiv="content-style-type"/>
<meta content="競馬データベースです。競走馬、騎手、レース、調教師、馬主。" name="description"/>
<meta content="競馬情報,競走馬,騎手,レース,調教師,検索,データベース,JRA,netkeiba.com" name="keywords"/>
<link href="https://cdn.netkeiba.com/img.db/common/css/reset.css?20160421" media="all" rel="stylesheet" type="text/css"/>
<link href="https://cdn.netkeiba.com/img.db/common/css/common.css?2020" media="all" rel="stylesheet" type="text/css"/>
<link href="https://cdn.netkeiba.com

In [7]:
soup.find('div', attrs={'class':'data_intro'}).find_all('p')[0].text

'\n\n芝右1800m\xa0/\xa0天候 : 曇\xa0/\xa0芝 : 良\xa0/\xa0発走 : 09:50\n\n\n'

In [8]:
text = soup.find('div', attrs={'class':'data_intro'}).find_all('p')[0].text

In [11]:
import re

re.findall(r'\w+', text)

['芝右1800m', '天候', '曇', '芝', '良', '発走', '09', '50']

In [12]:
text2 = soup.find('div', attrs={'class':'data_intro'}).find_all('p')[1].text

In [13]:
re.findall(r'\w+', text2)

['2019年7月27日', '1回札幌1日目', '2歳未勝利', '混', '指', '馬齢']

# スクレイピング本番

In [3]:
import pandas as pd
results  = pd.read_pickle('2019_result_raw.pickle')

In [7]:
race_id_list = results.index.unique()[:50]

In [29]:
# 必要ライブラリインポート
import time
import re
from tqdm import tqdm

In [30]:
# 情報取得後、整形したデータを返す
def scrape_race_info(race_id_list):

    race_infos = {}
    for race_id in tqdm(race_id_list):
        try:
            url = 'https://db.netkeiba.com/race/' + race_id
            html = requests.get(url)
            html.encoding = 'EUC-JP'
            soup = BeautifulSoup(html.text, 'html.parser')

            texts = soup.find('div', attrs={'class':'data_intro'}).find_all('p')[0].text + soup.find('div', attrs={'class':'data_intro'}).find_all('p')[1].text
            info = re.findall(r'\w+', texts)
            race_infos[race_id] = info

            info_dict = {}
            for text in info:
                if text in ['芝', 'ダート']:
                    info_dict['race_type'] = text
                if '障' in text:
                    info_dict['race_type'] = '障害'
                if 'm' in text:
                    info_dict['course_len'] = re.findall(r'\d+', text)[0]
                if text in ['良','稍重','重','不良']:
                    info_dict['ground_state'] = text
                if text in ['曇','晴','雨','小雨','小雪','雪']:
                    info_dict['weather'] = text
                if '年' in text:
                    info_dict['date'] = text
                race_infos[race_id] = info_dict
                time.sleep(1)
                
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
        except:
                break
        
    return race_infos

In [None]:
scrape_race_info(race_id_list)

 30%|████████████▉                              | 15/50 [03:33<08:21, 14.32s/it]