In [1]:
# ! pip install mojimoji
# ! pip install wget
# ! pip install lhafile

In [9]:
# 圧縮ファイルをウェブからダウンロードし解凍 >> テキストファイルを保存
def download_file(obj, date):
    """
    obj (str): 'racelists' or 'results'
    """
    date = str(pd.to_datetime(date).date())
    ymd = date.replace('-', '')
    S, s = ('K', 'k') if obj == 'results' else ('B', 'b')
    if os.path.exists(f'downloads/{obj}/{ymd}.txt'):
        return
    else:
        os.makedirs(f'downloads/{obj}', exist_ok=True)
        try:
            url_t = f'http://www1.mbrace.or.jp/od2/{S}/'
            url_b = f'{ymd[:-2]}/{s}{ymd[2:]}.lzh'
            wget.download(url_t + url_b, f'downloads/{obj}/{ymd}.lzh')
            #archive = LhaFile(f'downloads/{obj}/{ymd}.lzh')
            archive = lhafile.LhaFile(f'downloads/{obj}/{ymd}.lzh')
            d = archive.read(archive.infolist()[0].filename)
            open(f'downloads/{obj}/{ymd}.txt', 'wb').write(d)
            subprocess.run(['rm', f'downloads/{obj}/{ymd}.lzh'])
        except (urllib.request.HTTPError ,FileNotFoundError):#リクエストが通らないor該当するデータがない
            print(f'There are no data for {date}')

In [10]:
# 結果ファイルのフォーマットを解析し、いい感じにテーブルに変形して出力
def get_results(date):
    conv_racetime = lambda x: np.nan if x == '.' else\
        sum([w * float(v) for w, v in zip((60, 1, 1/10), x.split('.'))])
    info_cols = ['title', 'day', 'date', 'place_cd', 'place']
    race_cols = ['race_no', 'race_type', 'distance']
    keys = ['toban', 'name', 'moter_no', 'boat_no',
            'ET', 'SC', 'ST', 'RT', 'position']
    racer_cols = [f'{k}_{i}' for k in keys for i in range(1, 7)]
    res_cols = []
    for k in ('tkt', 'odds', 'poprank'):
        for type_ in ('1t', '1f1', '1f2', '2t', '2f',
                      'w1', 'w2', 'w3', '3t', '3f'):
            if (k == 'poprank') & (type_ in ('1t', '1f1', '1f2')):
                pass
            else:
                res_cols.append(f'{k}_{type_}')
    res_cols.append('win_method')
    cols = info_cols + race_cols + racer_cols + res_cols

    stack = []
    date = str(pd.to_datetime(date).date())
    for place_cd, lines in read_file('results', date).items():
        min_lines = 26
        if len(lines) < min_lines:
            continue
        title = lines[4]
        day = int(re.findall('第(\d)日', lines[6].replace(' ', ''))[0])
        place = place_mapper[place_cd]
        info = {k: v for k, v in zip(
            info_cols, [title, day, date, place_cd, place])}

        head_list = []
        race_no = 1
        for i, l in enumerate(lines[min_lines:]):
            if f'{race_no}R' in l:
                head_list.append(min_lines + i)
                race_no += 1
        for race_no, head in enumerate(head_list, 1):
            try:
                race_type = lines[head].split()[1]
                distance = int(re.findall('H(\d*)m', lines[head])[0])
                win_method = lines[head + 1].split()[-1]
                _, tkt_1t, pb_1t = lines[head + 10].split()
                _, tkt_1f1, pb_1f1, tkt_1f2, pb_1f2 = lines[head + 11].split()
                _, tkt_2t, pb_2t, _, pr_2t = lines[head + 12].split()
                _, tkt_2f, pb_2f, _, pr_2f = lines[head + 13].split()
                _, tkt_w1, pb_w1, _, pr_w1 = lines[head + 14].split()
                tkt_w2, pb_w2, _, pr_w2 = lines[head + 15].split()
                tkt_w3, pb_w3, _, pr_w3 = lines[head + 16].split()
                _, tkt_3t, pb_3t, _, pr_3t = lines[head + 17].split()
                _, tkt_3f, pb_3f, _, pr_3f = lines[head + 18].split()
                race_vals = [race_no, race_type, distance]
                res_vals = [
                    tkt_1t, tkt_1f1, tkt_1f2, tkt_2t, tkt_2f,
                    tkt_w1, tkt_w2, tkt_w3, tkt_3t, tkt_3f,
                    pb_1t, pb_1f1, pb_1f2, pb_2t, pb_2f,
                    pb_w1, pb_w2, pb_w3, pb_3t, pb_3f,
                    pr_2t, pr_2f, pr_w1, pr_w2, pr_w3,
                    pr_3t, pr_3f, win_method
                ]
                dic = info.copy()
                dic.update(dict(zip(race_cols, race_vals)))
                dic.update(dict(zip(res_cols, res_vals)))
                dic = {k: float(v) / 100 if 'odds' in k else v
                       for k, v in dic.items()}
                for i in range(6):
                    bno, *vals = lines[head + 3 + i].split()[1:10]
                    vals.append(i + 1)
                    keys = ['toban', 'name', 'moter_no', 'boat_no',
                            'ET', 'SC', 'ST', 'RT', 'position']
                    dic.update(zip([f'{k}_{bno}' for k in keys], vals))
                stack.append(dic)
            except IndexError:
                continue
            except ValueError:
                continue
    if len(stack) > 0:
        df = pd.DataFrame(stack)[cols].dropna(how='all')
        repl_mapper = {'K': np.nan, '.': np.nan}
        for i in range(1, 7):
            df[f'ET_{i}'] = df[f'ET_{i}'].replace(repl_mapper)
            df[f'ST_{i}'] = df[f'ST_{i}'].replace(repl_mapper)\
                .str.replace('F', '-').str.replace('L', '1')
            df[f'RT_{i}'] = df[f'RT_{i}'].map(conv_racetime)
        waku = np.array([('{}'*6).format(*v) for v in df[
            [f'SC_{i}' for i in range(1, 7)]].values])
        df['wakunari'] = np.where(waku == '123456', 1, 0)
        df = df.replace({'K': np.nan})
        # return df.astype(get_dtype('results'))
        return df
    else:
        return None

In [11]:
# テキストファイルを読み込み、会場ごとのデータにテキストを区切って出力
def read_file(obj, date):
    """
    obj (str): 'racelists' or 'results'
    """
    date = str(pd.to_datetime(date).date())
    ymd = date.replace('-', '')
    f = open(f'downloads/{obj}/{ymd}.txt', 'r', encoding='shift-jis')
    Lines = [l.strip().replace('\u3000', '') for l in f]
    Lines = [mojimoji.zen_to_han(l, kana=False) for l in Lines][1:-1]
    lines_by_plc = {}
    for l in Lines:
        if 'BGN' in l:
            place_cd = int(l[:-4])
            lines = []
        elif 'END' in l:
            lines_by_plc[place_cd] = lines
        else:
            lines.append(l)
    return lines_by_plc

In [12]:
# 出走表ファイルのフォーマットを解析し、いい感じにテーブルに変形して出力
place_mapper = {
    1: '桐生', 2: '戸田', 3: '江戸川', 4: '平和島', 5: '多摩川',
    6: '浜名湖', 7: '蒲郡', 8: '常滑', 9: '津', 10: '三国',
    11: '琵琶湖', 12: '住之江', 13: '尼崎', 14: '鳴門', 15: '丸亀',
    16: '児島', 17: '宮島', 18: '徳山', 19: '下関', 20: '若松',
    21: '芦屋', 22: '福岡', 23: '唐津', 24: '大村'
}

def get_racelists(date):
    info_cols = ['title', 'day', 'date', 'place_cd', 'place']
    race_cols = ['race_no', 'race_type', 'distance', 'deadline']
    keys = ['toban', 'name', 'area', 'class', 'age', 'weight',
            'glob_win', 'glob_in2', 'loc_win', 'loc_in2',
            'moter_no', 'moter_in2', 'boat_no', 'boat_in2']
    racer_cols = [f'{k}_{i}' for k in keys for i in range(1, 7)]
    cols = info_cols + race_cols + racer_cols

    stack = []
    date = str(pd.to_datetime(date).date())
    for place_cd, lines in read_file('racelists', date).items():
        min_lines = 11
        if len(lines) < min_lines:
            continue
        title = lines[4]
        day = int(re.findall('第(\d)日', lines[6].replace(' ', ''))[0])
        place = place_mapper[place_cd]
        info = {k: v for k, v in zip(
            info_cols, [title, day, date, place_cd, place])}

        head_list = []
        race_no = 1
        for i, l in enumerate(lines[min_lines:]):
            if f'{race_no}R' in l:
                head_list.append(min_lines + i)
                race_no += 1
        for race_no, head in enumerate(head_list, 1):
            try:
                race_type = lines[head].split()[1]
                distance = int(re.findall('H(\d*)m', lines[head])[0])
                deadline = re.findall('電話投票締切予定(\d*:\d*)', lines[head])[0]
                arr = []
                for l in lines[head + 5: head + 11]:
                    split = re.findall('\d \d{4}.*\d\d\.\\d\d', l)[0].split()
                    bno = [0]
                    name, area, cls1 = [e for e in re.findall(
                        '[^\d]*', split[1]) if e != '']
                    toban, age, wght, cls2 = [e for e in re.findall(
                        '[\d]*', split[1]) if e != '']
                    tmp = [toban, name, area, cls1 + cls2, age, wght] + split[2:10]
                    if len(tmp) == 14:
                        arr.append(tmp)
                    else:
                        continue
                if len(arr) == 6:
                    dic = info.copy()
                    dic.update(zip(race_cols, [race_no, race_type, distance, deadline]))
                    dic.update(dict(zip(racer_cols, np.array(arr).T.reshape(-1))))
                    stack.append(dic)
            except IndexError:
                continue
            except ValueError:
                continue
    if len(stack) > 0:
        df = pd.DataFrame(stack)[cols].dropna()
        # return df.astype(get_dtype('racelists'))
        return df
    else:
        return None

In [13]:
import pandas as pd
import os
import urllib
import re
import wget
import lhafile# as LhaFile
import subprocess
import numpy as np
import mojimoji

In [26]:
# 2021/03/28の結果ファイルをcsvで取得

month = "02"

for day in range(1,29):#日にち連続取得
    print(str(day).zfill(2))

    date = '2021-{}-{}'.format( month,str(day).zfill(2) )
    download_file('results', date)
    df = get_results(date)
    # df.head()


    if(len(df)!=0):         #データがない場合はマージしない
        tmp = df
        try:                #2回目以降はそれ以前のデータとマージ
            RSdata = pd.concat([RSdata,tmp])
        except NameError:   #初回はスクレイピングしたデータをLMdataに格納
            RSdata = tmp


01
There are no data for 2021-01-01
02
There are no data for 2021-01-02
03
There are no data for 2021-01-03
04
There are no data for 2021-01-04
05
There are no data for 2021-01-05
06
There are no data for 2021-01-06
07
There are no data for 2021-01-07
08
There are no data for 2021-01-08
09
There are no data for 2021-01-09
10
There are no data for 2021-01-10
11
There are no data for 2021-01-11
12
There are no data for 2021-01-12
13
There are no data for 2021-01-13
14
There are no data for 2021-01-14
15
There are no data for 2021-01-15
16
There are no data for 2021-01-16
17
There are no data for 2021-01-17
18
There are no data for 2021-01-18
19
There are no data for 2021-01-19
20
There are no data for 2021-01-20
21
There are no data for 2021-01-21
22
There are no data for 2021-01-22
23
There are no data for 2021-01-23
24
There are no data for 2021-01-24
25
There are no data for 2021-01-25
26
There are no data for 2021-01-26
27
There are no data for 2021-01-27
28
There are no data for 202

In [28]:
RSdata

Unnamed: 0,title,day,date,place_cd,place,race_no,race_type,distance,toban_1,toban_2,...,odds_3f,poprank_2t,poprank_2f,poprank_w1,poprank_w2,poprank_w3,poprank_3t,poprank_3f,win_method,wakunari
0,年末年始感謝競走,4,2021-01-01,24,大村,1,一般,1800,4708,5053,...,3.2,1,1,2,1,5,1,1,逃げ,0
1,年末年始感謝競走,4,2021-01-01,24,大村,2,一般,1800,4657,4966,...,5.0,1,1,1,3,7,2,2,逃げ,0
2,年末年始感謝競走,4,2021-01-01,24,大村,3,一般,1800,4164,3722,...,14.9,4,4,4,7,10,13,6,逃げ,1
3,年末年始感謝競走,4,2021-01-01,24,大村,4,一般,1800,4394,4393,...,3.0,2,2,3,2,5,3,1,逃げ,1
4,年末年始感謝競走,4,2021-01-01,24,大村,5,一般,1800,3604,4764,...,33.0,15,4,4,15,5,57,11,まくり差し,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,第15回埼玉新聞社杯,3,2021-01-31,1,桐生,8,予選,1200,3107,4228,...,29.6,11,6,6,9,10,43,10,まくり,1
139,第15回埼玉新聞社杯,3,2021-01-31,1,桐生,9,予選特賞,1200,4650,4068,...,5.1,1,1,1,8,9,2,3,逃げ,1
140,第15回埼玉新聞社杯,3,2021-01-31,1,桐生,10,予選特賞,1200,4565,3790,...,3.3,2,1,1,4,3,7,1,まくり差し,1
141,第15回埼玉新聞社杯,3,2021-01-31,1,桐生,11,一般特選,1800,4579,4850,...,2.6,1,1,1,2,4,1,1,逃げ,0


In [30]:
RSdata.to_csv("./data/RSdata{}.csv".format(month),index=False)

# 直前情報

In [16]:
# 任意の日程のレースについて直前情報やオッズの情報が記載されたURLを取得する
def get_url(date, place_cd, race_no, content):
    """
    content (str): ['odds3t', 'odds3f', 'odds2tf', 'beforeinfo']
    """
    url_t = 'https://www.boatrace.jp/owpc/pc/race/'
    ymd = str(pd.to_datetime(date)).split()[0].replace('-', '')
    jcd = f'0{place_cd}' if place_cd < 10 else str(place_cd)
    url = f'{url_t}{content}?rno={race_no}&jcd={jcd}&hd={ymd}'
    return url

In [94]:
# 直前情報のサイトからHTMLを取得し解析する
def get_beforeinfo(date, place_cd, race_no):
    url = get_url(date, place_cd, race_no, 'beforeinfo')
    soup = BeautifulSoup(requests.get(url).text, 'lxml')

    try:#レースが無い時にエラーでフリーズしないようにする。
        arr1 = arr1 = [[tag('td')[4].text, tag('td')[5].text]
                    for tag in soup(class_='is-fs12')]
        arr1 = [[v if v != '\xa0' else '' for v in row] for row in arr1]
        arr2 = [[tag.find(class_=f'table1_boatImage1{k}').text
                for k in ('Number', 'Time')]
                for tag in soup(class_='table1_boatImage1')]
        arr2 = [[v.replace('F', '-') for v in row] for row in arr2]
        arr2 = [row + [i] for i, row in enumerate(arr2, 1)]
        arr2 = pd.DataFrame(arr2).sort_values(by=[0]).values[:, 1:]

        air_t, wind_v, water_t, wave_h = [
            tag.text for tag in soup(class_='weather1_bodyUnitLabelData')]
        wether = soup(class_='weather1_bodyUnitLabelTitle')[1].text
        wind_d = int(soup.select_one(
            'p[class*="is-wind"]').attrs['class'][1][7:])

        df = pd.DataFrame(np.concatenate([arr1, arr2], 1),
                        columns=['ET', 'tilt', 'EST', 'ESC'])\
            .replace('L', '1').astype('float')

    except KeyError:
        return pd.DataFrame([])  

    if len(df) < 6:
        return pd.DataFrame([])
    try:
        data = pd.concat([
            pd.Series(
                {'date': date, 'place_cd': place_cd, 'race_no': race_no}),
            pd.Series(df.values.T.reshape(-1),
                      index=[f'{col}_{i}' for col in df.columns
                             for i in range(1, 7)]),
            pd.Series({
                'wether': wether, 'air_t': float(air_t[:-1]),
                'wind_d': wind_d, 'wind_v': float(wind_v[:-1]),
                'water_t': float(water_t[:-1]),
                'wave_h': float(wave_h[:-2])})])
        for i in range(1, 7):
            data[f'ESC_{i}'] = int(data[f'ESC_{i}'])
        return data
    except ValueError:
        return pd.DataFrame([])

In [104]:
from bs4 import BeautifulSoup
import requests

date = '2021-03-28'
place_cd = 3
race_no = 1
bi = get_beforeinfo(date, place_cd, race_no)


if(len(bi)!=0):#データがない場合はマージしない
    tmp = pd.DataFrame(bi).T
    try:#2回目以降はそれ以前のデータとマージ
        LMdata = pd.concat([LMdata,tmp])
    except NameError:#初回はスクレイピングしたデータをLMdataに格納
        LMdata = tmp



In [105]:
tmp

Unnamed: 0,date,place_cd,race_no,ET_1,ET_2,ET_3,ET_4,ET_5,ET_6,tilt_1,...,ESC_3,ESC_4,ESC_5,ESC_6,wether,air_t,wind_d,wind_v,water_t,wave_h
0,2021-03-28,3,1,7.06,7.0,6.95,6.87,6.82,6.79,0.0,...,3,4,5,6,曇り,18.0,4,6.0,14.0,10.0


In [106]:
LMdata

Unnamed: 0,date,place_cd,race_no,ET_1,ET_2,ET_3,ET_4,ET_5,ET_6,tilt_1,...,ESC_3,ESC_4,ESC_5,ESC_6,wether,air_t,wind_d,wind_v,water_t,wave_h
0,2021-03-28,1,1,6.79,6.79,6.77,6.83,6.83,6.87,-0.5,...,3,4,5,6,雨,12.0,6,1.0,16.0,1.0
0,2021-03-28,3,1,7.06,7.0,6.95,6.87,6.82,6.79,0.0,...,3,4,5,6,曇り,18.0,4,6.0,14.0,10.0
