# モジュールインポート

In [17]:
import pandas as pd
import glob
import os
import time
import re
import datetime
from tqdm.notebook import tqdm
import dataclasses
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from abc import ABCMeta, abstractmethod
from types import MappingProxyType
from sklearn.preprocessing import LabelEncoder
import optuna.integration.lightgbm as lgb_o
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from selenium.webdriver.common.by import By

In [18]:
@dataclasses.dataclass(frozen=True)
class UrlPaths:
    DOMAIN: str = 'https://db.netkeiba.com/'
    # レース結果表、レース情報表、払い戻し表が含まれるページ
    RACE_URL: str = DOMAIN + 'race/'
    # 馬の過去成績表のページ
    HORSE_URL: str = DOMAIN + 'horse/'
    # 血統表のページ
    PED_URL: str = HORSE_URL + 'ped/'
    
    TOP: str = 'https://race.netkeiba.com/top/'
    # 開催日程ページ
    CALENDAR_URL: str = TOP + 'calendar.html'
    # レース一覧ページ
    RACE_LIST_URL: str = TOP + 'race_list.html'
    
    # 出馬表ページ
    SHUTUBA_TABLE: str = 'https://race.netkeiba.com/race/shutuba.html'

In [19]:
@dataclasses.dataclass(frozen=True)
class LocalPaths:
   
    BASE_PATH: str = os.path.abspath('./')

    DATA_PATH: str = os.path.join(os.path.abspath('./'),'data')
  
    HTML_PATH: str = os.path.join(DATA_PATH, 'html')
    HTML_RACE_PATH: str = os.path.join(HTML_PATH, 'race')
    HTML_HORSE_PATH: str = os.path.join(HTML_PATH, 'horse')
    HTML_PED_PATH: str = os.path.join(HTML_PATH, 'ped')
    
    RAW_PATH: str = os.path.join(DATA_PATH, 'raw')
    RAW_RESULTS_PATH: str = os.path.join(RAW_PATH, 'results')
    RAW_RACE_INFO_PATH: str = os.path.join(RAW_PATH, 'race_info')
    RAW_RETURN_PATH: str = os.path.join(RAW_PATH, 'return_tables')
    RAW_HORSE_RESULTS_PATH: str = os.path.join(RAW_PATH, 'horse_results')
    RAW_PEDS_PATH: str = os.path.join(RAW_PATH, 'peds')

@dataclasses.dataclass(frozen=True)
class Master:
    PLACE_DICT: dict = MappingProxyType({
        '札幌':'01',
        '函館':'02',
        '福島':'03',
        '新潟':'04',
        '東京':'05',
        '中山':'06',
        '中京':'07',
        '京都':'08',
        '阪神':'09',
        '小倉':'10',
        '門別':'30',
        '旭川':'34',
        '盛岡':'35',
        '水沢':'36',
        '浦和':'42',
        '船橋':'43',
        '大井':'44',
        '川崎':'45',
        '金沢':'46',
        '笠松':'47',
        '名古屋':'48',
        '園田':'50',
        '姫路':'51',
        '福山':'53',
        '高知':'54',
        '佐賀':'55',
        '荒尾':'56',
        '札幌(地)':'58',
        '香港':'60',
        'フランス':'61',
        'オースト':'62',
        'イギリス':'63',
        'シャティ':'64',
        'アラブ首':'65',
        'メイダン':'66',
        'ドイツ':'67',
        'アイルラ':'68',
        'アメリカ':'69',
        'ロンシャ':'70',
        'イタリア':'71',
        'シンガポ':'72',
        'カナダ':'73',
        'シャンテ':'74',
        '韓国':'75',
        'フレミン':'76',
        'ニュージ':'77',
        'アスコッ':'78',
        'デルマー':'79',
        'サンタア':'80',
        'コーフィ':'81',
        'ベルモン':'82',
        'ドーヴィ':'83',
        'ランドウ':'84',
        'ヨーク':'85',
        'レパーズ':'86',
        'チャーチ':'87',
        'サンダウ':'88'
        })

    RACE_TYPE_DICT: dict = MappingProxyType({
        '芝': '芝',
        'ダ': 'ダート',
        '障': '障害',
        })
    
    WEATHER_LIST: tuple = ('晴', '曇', '小雨', '雨', '小雪', '雪')
    
    GROUND_STATE_LIST: tuple = ('良', '稍重', '重', '不良')
    
    SEX_LIST: tuple = ('牡', '牝', 'セ')
    
    AROUND_LIST: tuple = ('右', '左', '直線', '障害')

    RACE_CLASS_LIST: tuple = ('新馬', '未勝利', '1勝クラス', '2勝クラス', '3勝クラス', 'オープン', 'G3', 'G2', 'G1', '障害')


In [None]:
class FeatureEngineering:
    def __init__(self, data_merger: DataMerger):
        self.__data = data_merger.merged_data.copy()
        
    @property
    def featured_data(self):
        return self.__data
    
    def add_interval(self):
        self.__data['interval'] = (self.__data['date'] - self.__data['latest']).dt.days
        self.__data.drop('latest', axis=1, inplace=True)
        return self

    def add_agedays(self):
        self.__data['age_days'] = (self.__data['date'] - self.__data['birthday']).dt.days
        self.__data.drop('birthday', axis=1, inplace=True)
        return self
    
    def dumminize_weather(self):
        self.__data['weather'] = pd.Categorical(self.__data['weather'], Master.WEATHER_LIST)
        self.__data = pd.get_dummies(self.__data, columns=['weather'])
        return self
    
    def dumminize_race_type(self):
        self.__data['race_type'] = pd.Categorical(
            self.__data['race_type'], list(Master.RACE_TYPE_DICT.values())
            )
        self.__data = pd.get_dummies(self.__data, columns=['race_type'])
        return self
    
    def dumminize_ground_state(self):
        self.__data['ground_state'] = pd.Categorical(
            self.__data['ground_state'], Master.GROUND_STATE_LIST
            )
        self.__data = pd.get_dummies(self.__data, columns=['ground_state'])
        return self
    
    def dumminize_sex(self):
        self.__data['性'] = pd.Categorical(self.__data['性'], Master.SEX_LIST)
        self.__data = pd.get_dummies(self.__data, columns=['性'])
        return self
    
    def __label_encode(self, target_col: str):
        csv_path = os.path.join(LocalPaths.MASTER_DIR, target_col + '.csv')
        # ファイルが存在しない場合、空のDataFrameを作成
        if not os.path.isfile(csv_path):
            target_master = pd.DataFrame(columns=[target_col, 'encoded_id'])
        else:
            target_master = pd.read_csv(csv_path, dtype=object)

        target_master['encoded_id'] = target_master['encoded_id'].astype(int)

        new_target = self.__data[[target_col]][
            ~self.__data[target_col].isin(target_master[target_col])
            ].drop_duplicates(subset=[target_col])
        # 新しい情報を登録
        if len(target_master) > 0:
            new_target['encoded_id'] = [
                i+max(target_master['encoded_id']) for i in range(1, len(new_target)+1)
                ]
            # 整数に変換
            new_target['encoded_id'] = new_target['encoded_id'].astype(int)
        else: # まだ1行も登録されていない場合の処理
            new_target['encoded_id'] = [i for i in range(len(new_target))]
        # 元のマスタと繋げる
        new_target_master = pd.concat([target_master, new_target]).set_index(target_col)['encoded_id']
        # マスタファイルを更新
        new_target_master.to_csv(csv_path)
        # ラベルエンコーディング実行
        self.__data[target_col] = pd.Categorical(self.__data[target_col].map(new_target_master))
        return self
    
    def encode_horse_id(self):
        self.__label_encode('horse_id')
        return self
    
    def encode_jockey_id(self):
        self.__label_encode('jockey_id')
        return self
    
    def encode_trainer_id(self):
        self.__label_encode('trainer_id')
        return self

    def encode_owner_id(self):
        self.__label_encode('owner_id')
        return self

    def encode_breeder_id(self):
        self.__label_encode('breeder_id')
        return self

    def dumminize_kaisai(self):
        self.__data[HorseResultsCols.PLACE] = pd.Categorical(
            self.__data[HorseResultsCols.PLACE], list(Master.PLACE_DICT.values())
            )
        self.__data = pd.get_dummies(self.__data, columns=[HorseResultsCols.PLACE])
        return self

    def dumminize_around(self):
        self.__data['around'] = pd.Categorical(self.__data['around'], Master.AROUND_LIST)
        self.__data = pd.get_dummies(self.__data, columns=['around'])
        return self

    def dumminize_race_class(self):
        self.__data['race_class'] = pd.Categorical(self.__data['race_class'], Master.RACE_CLASS_LIST)
        self.__data = pd.get_dummies(self.__data, columns=['race_class'])
        return self

# 関数置き場
## 各セルをそのまま実行してもらえれば大丈夫です

In [20]:
def get_kaisai_date(from_: str, to_: str):
    date_range = pd.date_range(start=from_, end=to_, freq="M")
    kaisai_date_list = []
    for year, month in tqdm(zip(date_range.year, date_range.month), total=len(date_range)):
        query = ['year=' + str(year),'month=' + str(month),]
        url = UrlPaths.CALENDAR_URL + '?' + '&'.join(query)
        html = urlopen(url).read()
        time.sleep(1)
        soup = BeautifulSoup(html, "html.parser")
        a_list = soup.find('table', class_='Calendar_Table').find_all('a')
        for a in a_list:
            kaisai_date_list.append(re.findall('(?<=kaisai_date=)\d+', a['href'])[0])
    return kaisai_date_list

In [21]:
def get_race_id_list(kaisai_date_list: list):
    race_id_list = []
    driver = webdriver.Chrome()
    for kaisai_date in tqdm(kaisai_date_list):
        try:
            query = [
                'kaisai_date=' + str(kaisai_date)
            ]
            url = UrlPaths.RACE_LIST_URL + '?' + '&'.join(query)
            print('scraping: {}'.format(url))
            driver.get(url)
            try:
                time.sleep(1) #取得の猶予として1秒設けた
                a_list = driver.find_element(By.CLASS_NAME, 'RaceList_Box').find_elements(By.TAG_NAME, 'a')
            except: #それでも取得できなかった場合
                print('waiting more 10 seconds')
                time.sleep(10)
                a_list = driver.find_element(By.CLASS_NAME, 'RaceList_Box').find_elements(By.TAG_NAME, 'a')
            for a in a_list:
                race_id = re.findall('(?<=result.html\?race_id=)\d+', a.get_attribute('href'))
                if len(race_id) > 0:
                    race_id_list.append(race_id[0])
        except Exception as e:
            print(e)
            break
    driver.close()
    return race_id_list

In [22]:
def get_html_horse(horse_id_list: list, skip: bool = True):
    html_path_list = []
    for horse_id in tqdm(horse_id_list):
        url = 'https://db.netkeiba.com/horse/' + horse_id #horse_idのurl作成
        html = urlopen(url).read() #スクレイピング実行
        filename = 'data/html/horse/' + horse_id + '.bin'
        html_path_list.append(filename)
        if skip and os.path.isfile(filename): 
            print('horse_id {} skipped'.format(horse_id))
            continue
        with open(filename, 'wb') as f: #保存するファイルパスを指定
            f.write(html) #保存
        time.sleep(1) #サーバーが落ちないようにするため必須
    return html_path_list

def get_html_race(race_id_list: list, skip: bool = True):
    html_path_list = []
    for race_id in tqdm(race_id_list):
        url = 'https://db.netkeiba.com/race/' + race_id #race_idからurlを作る
        html = urlopen(url).read() #スクレイピング実行
        filename = 'data/html/race/' + race_id + '.bin'
        html_path_list.append(filename)
        if skip and os.path.isfile(filename): 
            print('race_id {} skipped'.format(race_id))
            continue
        with open(filename, 'wb') as f: #保存するファイルパスを指定
            f.write(html) #保存
        time.sleep(1) #サーバーが落ちないようにするため必須
    return html_path_list

def get_html_ped(horse_id_list: list, skip: bool = True):
    html_path_list = []
    for horse_id in tqdm(horse_id_list):
        url = 'https://db.netkeiba.com/horse/ped/' + horse_id #horse_idからurlを作る
        html = urlopen(url).read() #スクレイピング実行
        filename = 'data/html/ped/' + horse_id + '.bin'
        html_path_list.append(filename)
        if skip and os.path.isfile(filename): 
            print('horse_id {} skipped'.format(horse_id))
            continue
        with open(filename, 'wb') as f: #保存するファイルパスを指定
            f.write(html) #保存
        time.sleep(1) #サーバーが落ちないようにするため必須
    return html_path_list

In [23]:
def get_rawdata_results(html_path_list: list):
    race_results = {}
    for html_path in tqdm(html_path_list):
        with open(html_path, 'rb') as f:
            try:
                html = f.read() 
                df = pd.read_html(html)[0] #メインとなるレース結果表データを取得
                
                soup = BeautifulSoup(html, "html.parser") #htmlをsoupオブジェクトに変換

                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                #インデックスをrace_idにする
                race_id = re.findall('(?<=race/)\d+', html_path)[0]
                df.index = [race_id] * len(df)

                race_results[race_id] = df
            except Exception as e:
                print('error at {}'.format(html_path))
                print(e)

    race_results_df = pd.concat([race_results[key] for key in race_results])

    return race_results_df

def get_rawdata_info(html_path_list: list):
    race_infos = {}
    for html_path in tqdm(html_path_list):
        with open(html_path, 'rb') as f:
            try:
                html = f.read() #保存してあるバイナリファイルを読み込む
                
                soup = BeautifulSoup(html, "html.parser") #htmlをsoupオブジェクトに変換

                texts = (
                    soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                    + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
                )
                info = re.findall(r'\w+', texts)
                df = pd.DataFrame()
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text]
                    if "障" in text:
                        df["race_type"] = ["障害"]
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[-1])] 
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text]
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text]
                    if "年" in text:
                        df["date"] = [text]
                
                #インデックスをrace_idにする
                race_id = re.findall('(?<=race/)\d+', html_path)[0]
                df.index = [race_id] * len(df)

                race_infos[race_id] = df
            except Exception as e:
                print('error at {}'.format(html_path))
                print(e)

    race_infos_df = pd.concat([race_infos[key] for key in race_infos])

    return race_infos_df

def get_rawdata_return(html_path_list: list):
    horse_results = {}
    for html_path in tqdm(html_path_list):
        with open(html_path, 'rb') as f:
            try: 
                html = f.read() #保存してあるバイナリファイルを読み込む
                
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                df = pd.concat([dfs[1], dfs[2]])
                
                race_id = re.findall('(?<=race/)\d+', html_path)[0]
                df.index = [race_id] * len(df)
                horse_results[race_id] = df
            except Exception as e:
                print('error at {}'.format(html_path))
                print(e)

    horse_results_df = pd.concat([horse_results[key] for key in horse_results])
    return horse_results_df

def get_rawdata_horse_results(html_path_list: list):
    horse_results = {}
    for html_path in tqdm(html_path_list):
        with open(html_path, 'rb') as f:
            html = f.read() #保存してあるバイナリファイルを読み込む
            
            df = pd.read_html(html)[3]
            if df.columns[0]=='受賞歴':
                df = pd.read_html(html)[4]
                
            horse_id = re.findall('(?<=horse/)\d+', html_path)[0]
            
            df.index = [horse_id] * len(df)
            horse_results[horse_id] = df
            
    horse_results_df = pd.concat([horse_results[key] for key in horse_results])
    return horse_results_df

def get_rawdata_peds(html_path_list: list):
    peds = {}
    for html_path in tqdm(html_path_list):
        with open(html_path, 'rb') as f:
            html = f.read() #保存してあるバイナリファイルを読み込む
            
            df = pd.read_html(html)[0]

            generations = {}
            horse_id = re.findall('(?<=ped/)\d+', html_path)[0]
            for i in reversed(range(5)):
                generations[i] = df[i]
                df.drop([i], axis=1, inplace=True)
                df = df.drop_duplicates()
            ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)
            peds[horse_id] = ped.reset_index(drop=True)

    peds_df = pd.concat([peds[key] for key in peds], axis=1).T.add_prefix('peds_')
    return peds_df

# A.データ取得

## レースID取得

## ※ここから自分で書き換える箇所あり
#### （書き換え方）from_とto_の所で取得したい日時の範囲を指定する

In [26]:
#開催日取得
kaisai_date_2023 = get_kaisai_date(from_="2021-01-01", to_="2022-01-01")

  0%|          | 0/12 [00:00<?, ?it/s]

#### （書き換え方）(kaisai_date_年数)として年数の部分を書き換える

In [27]:
# 開催日からレースIDを取得する
race_id_list = get_race_id_list(kaisai_date_2023)

  0%|          | 0/106 [00:00<?, ?it/s]

scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210105
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210109
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210110
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210111
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210116
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210117
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210123
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210124
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210130
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210131
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210206
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210207
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20210213
scraping: ht

## レースデータ取得

In [28]:
#https://db.netkeiba.com/race/のhtml(バイナリファイル)をスクレイピングして保存
html_files_race = get_html_race(race_id_list)
html_files_race[:5]

  0%|          | 0/3456 [00:00<?, ?it/s]

['data/html/race/202106010101.bin',
 'data/html/race/202106010102.bin',
 'data/html/race/202106010103.bin',
 'data/html/race/202106010104.bin',
 'data/html/race/202106010105.bin']

In [29]:
results = get_rawdata_results(html_files_race) #レース結果表の作成
race_info = get_rawdata_info(html_files_race) #レース情報表の作成
return_table = get_rawdata_return(html_files_race) #払戻表の作成

  0%|          | 0/3456 [00:00<?, ?it/s]

error at data/html/race/202110010405.bin
'NoneType' object has no attribute 'find_all'
error at data/html/race/202104010403.bin
'NoneType' object has no attribute 'find_all'
error at data/html/race/202105030103.bin
'NoneType' object has no attribute 'find_all'
error at data/html/race/202105030603.bin
'NoneType' object has no attribute 'find_all'
error at data/html/race/202104040305.bin
'NoneType' object has no attribute 'find_all'
error at data/html/race/202104040502.bin
'NoneType' object has no attribute 'find_all'
error at data/html/race/202104050202.bin
'NoneType' object has no attribute 'find_all'


  0%|          | 0/3456 [00:00<?, ?it/s]

  0%|          | 0/3456 [00:00<?, ?it/s]

#### （書き換え方）_年数の部分を書き換えると後で使いやすくなる

In [30]:
#保存（後で分かりやすいように年数を書き換える）
results.to_pickle('data/raw/results/results_2023.pickle')
race_info.to_pickle('data/raw/race_info/race_info_2023.pickle')
return_table.to_pickle('data/raw/return_tables/return_tables_2023.pickle')

## 馬データ取得

In [None]:
horse_id_list = results['horse_id'].unique()
html_files_horse = get_html_horse(horse_id_list) #htmlをスクレイピング

  0%|          | 0/11561 [00:00<?, ?it/s]

horse_id 2018106541 skipped
horse_id 2018103805 skipped
horse_id 2018101171 skipped
horse_id 2018103832 skipped
horse_id 2018102256 skipped
horse_id 2018102289 skipped
horse_id 2018104708 skipped
horse_id 2018110132 skipped
horse_id 2018102097 skipped
horse_id 2018102382 skipped
horse_id 2018100439 skipped
horse_id 2018103330 skipped
horse_id 2018104096 skipped
horse_id 2018102938 skipped
horse_id 2018104611 skipped
horse_id 2018106428 skipped
horse_id 2017104921 skipped
horse_id 2017100035 skipped
horse_id 2017101057 skipped
horse_id 2017101910 skipped
horse_id 2017103142 skipped
horse_id 2017102848 skipped
horse_id 2014106348 skipped
horse_id 2017104947 skipped
horse_id 2017102518 skipped
horse_id 2018101071 skipped
horse_id 2018102158 skipped
horse_id 2018101771 skipped
horse_id 2018106273 skipped
horse_id 2018104843 skipped
horse_id 2018101566 skipped
horse_id 2018105831 skipped
horse_id 2018104893 skipped
horse_id 2018105269 skipped
horse_id 2018100681 skipped
horse_id 2018105162 

horse_id 2018100818 skipped
horse_id 2018102986 skipped
horse_id 2018106571 skipped
horse_id 2018105009 skipped
horse_id 2018101012 skipped
horse_id 2018105738 skipped
horse_id 2018100858 skipped
horse_id 2018101087 skipped
horse_id 2018104965 skipped
horse_id 2018104630 skipped
horse_id 2018100994 skipped
horse_id 2018109002 skipped
horse_id 2018104829 skipped
horse_id 2018103368 skipped
horse_id 2018104406 skipped
horse_id 2018100420 skipped
horse_id 2018104713 skipped
horse_id 2018105418 skipped
horse_id 2018102899 skipped
horse_id 2018104136 skipped
horse_id 2018104578 skipped
horse_id 2018103377 skipped
horse_id 2018105273 skipped
horse_id 2018105337 skipped
horse_id 2018100493 skipped
horse_id 2018104595 skipped
horse_id 2018101573 skipped
horse_id 2018102605 skipped
horse_id 2018100410 skipped
horse_id 2018110013 skipped
horse_id 2018106146 skipped
horse_id 2018104327 skipped
horse_id 2018102060 skipped
horse_id 2018106473 skipped
horse_id 2018100897 skipped
horse_id 2018102117 

horse_id 2018101646 skipped
horse_id 2018103916 skipped
horse_id 2018103534 skipped
horse_id 2018110112 skipped
horse_id 2018110010 skipped
horse_id 2018102528 skipped
horse_id 2017102141 skipped
horse_id 2017104315 skipped
horse_id 2017105287 skipped
horse_id 2017103619 skipped
horse_id 2017101531 skipped
horse_id 2017101877 skipped
horse_id 2015102743 skipped
horse_id 2016103101 skipped
horse_id 2017105978 skipped
horse_id 2016104027 skipped
horse_id 2016106181 skipped
horse_id 2015103845 skipped
horse_id 2016105718 skipped
horse_id 2013105140 skipped
horse_id 2017103299 skipped
horse_id 2014101462 skipped
horse_id 2014100050 skipped
horse_id 2017103912 skipped
horse_id 2017102612 skipped
horse_id 2016101917 skipped
horse_id 2016106547 skipped
horse_id 2016104519 skipped
horse_id 2017109028 skipped
horse_id 2017103788 skipped
horse_id 2014106286 skipped
horse_id 2017105460 skipped
horse_id 2017101419 skipped
horse_id 2017104637 skipped
horse_id 2017105438 skipped
horse_id 2017105703 

horse_id 2016105198 skipped
horse_id 2017101792 skipped
horse_id 2016105101 skipped
horse_id 2016104611 skipped
horse_id 2016102775 skipped
horse_id 2016104470 skipped
horse_id 2016104807 skipped
horse_id 2017101415 skipped
horse_id 2015100524 skipped
horse_id 2017106018 skipped
horse_id 2017106279 skipped
horse_id 2015102824 skipped
horse_id 2017101303 skipped
horse_id 2017103473 skipped
horse_id 2018100531 skipped
horse_id 2018104310 skipped
horse_id 2018103116 skipped
horse_id 2018105647 skipped
horse_id 2018105954 skipped
horse_id 2018103870 skipped
horse_id 2018100679 skipped
horse_id 2018102238 skipped
horse_id 2018105259 skipped
horse_id 2018104340 skipped
horse_id 2018104780 skipped
horse_id 2018106305 skipped
horse_id 2018105617 skipped
horse_id 2018103314 skipped
horse_id 2018100710 skipped
horse_id 2018106443 skipped
horse_id 2018104455 skipped
horse_id 2018102544 skipped
horse_id 2018101679 skipped
horse_id 2017110146 skipped
horse_id 2013103569 skipped
horse_id 2014100645 

horse_id 2018106508 skipped
horse_id 2018101994 skipped
horse_id 2018105365 skipped
horse_id 2018105846 skipped
horse_id 2018104591 skipped
horse_id 2017103992 skipped
horse_id 2017105493 skipped
horse_id 2017104781 skipped
horse_id 2016104400 skipped
horse_id 2017103890 skipped
horse_id 2017101448 skipped
horse_id 2016101362 skipped
horse_id 2015103213 skipped
horse_id 2016105515 skipped
horse_id 2016103459 skipped
horse_id 2016100319 skipped
horse_id 2017105192 skipped
horse_id 2015106163 skipped
horse_id 2017100367 skipped
horse_id 2016100927 skipped
horse_id 2015105401 skipped
horse_id 2017102163 skipped
horse_id 2017101756 skipped
horse_id 2017104665 skipped
horse_id 2017101919 skipped
horse_id 2016102216 skipped
horse_id 2017100464 skipped
horse_id 2017101915 skipped
horse_id 2017104917 skipped
horse_id 2017101436 skipped
horse_id 2017101451 skipped
horse_id 2017100368 skipped
horse_id 2017102952 skipped
horse_id 2017104681 skipped
horse_id 2017101523 skipped
horse_id 2017104961 

horse_id 2018102226 skipped
horse_id 2018104165 skipped
horse_id 2018103232 skipped
horse_id 2018102963 skipped
horse_id 2017110174 skipped
horse_id 2017101560 skipped
horse_id 2016104253 skipped
horse_id 2017105293 skipped
horse_id 2017103688 skipped
horse_id 2017106203 skipped
horse_id 2017105111 skipped
horse_id 2017104963 skipped
horse_id 2017104989 skipped
horse_id 2016100634 skipped
horse_id 2017104103 skipped
horse_id 2016104193 skipped
horse_id 2016104340 skipped
horse_id 2015103374 skipped
horse_id 2017104619 skipped
horse_id 2014110107 skipped
horse_id 2016101455 skipped
horse_id 2014104259 skipped
horse_id 2017100378 skipped
horse_id 2015104571 skipped
horse_id 2016104624 skipped
horse_id 2016100349 skipped
horse_id 2015106156 skipped
horse_id 2014104052 skipped
horse_id 2016100285 skipped
horse_id 2017101707 skipped
horse_id 2016104717 skipped
horse_id 2017106141 skipped
horse_id 2018104986 skipped
horse_id 2018106193 skipped
horse_id 2018103159 skipped
horse_id 2018102186 

horse_id 2017102257 skipped
horse_id 2017102899 skipped
horse_id 2016104596 skipped
horse_id 2016105433 skipped
horse_id 2017100047 skipped
horse_id 2018102593 skipped
horse_id 2018106152 skipped
horse_id 2018105749 skipped
horse_id 2018105247 skipped
horse_id 2018103551 skipped
horse_id 2018100611 skipped
horse_id 2018106547 skipped
horse_id 2018102663 skipped
horse_id 2018102566 skipped
horse_id 2016103861 skipped
horse_id 2018104332 skipped
horse_id 2018104534 skipped
horse_id 2018100099 skipped
horse_id 2018100430 skipped
horse_id 2017104689 skipped
horse_id 2017105368 skipped
horse_id 2016105487 skipped
horse_id 2017103927 skipped
horse_id 2017105356 skipped
horse_id 2017102270 skipped
horse_id 2017102102 skipped
horse_id 2017102880 skipped
horse_id 2017105531 skipped
horse_id 2015105026 skipped
horse_id 2013105594 skipped
horse_id 2017102516 skipped
horse_id 2017106414 skipped
horse_id 2014105642 skipped
horse_id 2016104743 skipped
horse_id 2016101938 skipped
horse_id 2015103138 

horse_id 2016104373 skipped
horse_id 2016103595 skipped
horse_id 2018105322 skipped
horse_id 2018104790 skipped
horse_id 2018103883 skipped
horse_id 2018101046 skipped
horse_id 2018105030 skipped
horse_id 2018104011 skipped
horse_id 2018105301 skipped
horse_id 2018103506 skipped
horse_id 2018105079 skipped
horse_id 2018104643 skipped
horse_id 2018105226 skipped
horse_id 2018100753 skipped
horse_id 2018103485 skipped
horse_id 2018103296 skipped
horse_id 2018100592 skipped
horse_id 2017101566 skipped
horse_id 2017104606 skipped
horse_id 2017100275 skipped
horse_id 2017100746 skipped
horse_id 2016101560 skipped
horse_id 2017102466 skipped
horse_id 2017100141 skipped
horse_id 2016104887 skipped
horse_id 2017105477 skipped
horse_id 2017105198 skipped
horse_id 2017104060 skipped
horse_id 2016102550 skipped
horse_id 2016105673 skipped
horse_id 2015104699 skipped
horse_id 2014106117 skipped
horse_id 2015105082 skipped
horse_id 2018104903 skipped
horse_id 2018104922 skipped
horse_id 2018105554 

horse_id 2018102033 skipped
horse_id 2018101945 skipped
horse_id 2018105187 skipped
horse_id 2017105282 skipped
horse_id 2016102749 skipped
horse_id 2015100317 skipped
horse_id 2017110145 skipped
horse_id 2016104613 skipped
horse_id 2017105409 skipped
horse_id 2016104839 skipped
horse_id 2017105541 skipped
horse_id 2016110152 skipped
horse_id 2017106185 skipped
horse_id 2017105916 skipped
horse_id 2016103509 skipped
horse_id 2016104000 skipped
horse_id 2016106072 skipped
horse_id 2016103741 skipped
horse_id 2018100898 skipped
horse_id 2018102541 skipped
horse_id 2017102082 skipped
horse_id 2018104960 skipped
horse_id 2018104403 skipped
horse_id 2018102149 skipped
horse_id 2018110042 skipped
horse_id 2018100377 skipped
horse_id 2017101787 skipped
horse_id 2016106457 skipped
horse_id 2016105892 skipped
horse_id 2015106353 skipped
horse_id 2016100322 skipped
horse_id 2017106570 skipped
horse_id 2017106232 skipped
horse_id 2017103382 skipped
horse_id 2017106161 skipped
horse_id 2016101679 

horse_id 2018101608 skipped
horse_id 2018104386 skipped
horse_id 2018109100 skipped
horse_id 2018100576 skipped
horse_id 2018104786 skipped
horse_id 2018102127 skipped
horse_id 2018106153 skipped
horse_id 2018105328 skipped
horse_id 2018106087 skipped
horse_id 2018103429 skipped
horse_id 2017104267 skipped
horse_id 2016110159 skipped
horse_id 2015101855 skipped
horse_id 2017104744 skipped
horse_id 2017105689 skipped
horse_id 2017104850 skipped
horse_id 2017101537 skipped
horse_id 2017102920 skipped
horse_id 2016104894 skipped
horse_id 2016104234 skipped
horse_id 2017101230 skipped
horse_id 2016101745 skipped
horse_id 2015102764 skipped
horse_id 2016104817 skipped
horse_id 2016104420 skipped
horse_id 2016104989 skipped
horse_id 2017101832 skipped
horse_id 2013106055 skipped
horse_id 2017106572 skipped
horse_id 2016105472 skipped
horse_id 2016100279 skipped
horse_id 2016103584 skipped
horse_id 2014105439 skipped
horse_id 2015105143 skipped
horse_id 2016102372 skipped
horse_id 2018105853 

horse_id 2018106082 skipped
horse_id 2017100428 skipped
horse_id 2016106535 skipped
horse_id 2017106409 skipped
horse_id 2017106087 skipped
horse_id 2017104870 skipped
horse_id 2017105410 skipped
horse_id 2015105078 skipped
horse_id 2017103844 skipped
horse_id 2016104393 skipped
horse_id 2018104934 skipped
horse_id 2018105296 skipped
horse_id 2018110045 skipped
horse_id 2016105172 skipped
horse_id 2016105310 skipped
horse_id 2017106376 skipped
horse_id 2017101730 skipped
horse_id 2018104420 skipped
horse_id 2018100652 skipped
horse_id 2018105493 skipped
horse_id 2017103843 skipped
horse_id 2017102665 skipped
horse_id 2017105619 skipped
horse_id 2016105576 skipped
horse_id 2017105028 skipped
horse_id 2017105509 skipped
horse_id 2015101568 skipped
horse_id 2016110106 skipped
horse_id 2015100416 skipped
horse_id 2018104646 skipped
horse_id 2014103170 skipped
horse_id 2018105304 skipped
horse_id 2018103362 skipped
horse_id 2018101657 skipped
horse_id 2015104625 skipped
horse_id 2016101514 

horse_id 2017104105 skipped
horse_id 2014106120 skipped
horse_id 2013106474 skipped
horse_id 2012100517 skipped
horse_id 2011101125 skipped
horse_id 2016104492 skipped
horse_id 2017105558 skipped
horse_id 2018106464 skipped
horse_id 2018100239 skipped
horse_id 2018102035 skipped
horse_id 2018104558 skipped
horse_id 2018100537 skipped
horse_id 2018101966 skipped
horse_id 2017100543 skipped
horse_id 2016101901 skipped
horse_id 2016103962 skipped
horse_id 2018105192 skipped
horse_id 2017104823 skipped
horse_id 2017100808 skipped
horse_id 2017105091 skipped
horse_id 2017100265 skipped
horse_id 2018110113 skipped
horse_id 2018100849 skipped
horse_id 2017102031 skipped
horse_id 2017100859 skipped
horse_id 2017105874 skipped
horse_id 2016102333 skipped
horse_id 2017103666 skipped
horse_id 2018104955 skipped
horse_id 2018102345 skipped
horse_id 2016105913 skipped
horse_id 2017101528 skipped
horse_id 2017109123 skipped
horse_id 2014103367 skipped
horse_id 2018105366 skipped
horse_id 2018101007 

horse_id 2017100861 skipped
horse_id 2018100813 skipped
horse_id 2018102264 skipped
horse_id 2018103109 skipped
horse_id 2019101846 skipped
horse_id 2019105514 skipped
horse_id 2019103831 skipped
horse_id 2019100875 skipped
horse_id 2019101179 skipped
horse_id 2019105084 skipped
horse_id 2019103370 skipped
horse_id 2019106369 skipped
horse_id 2019101834 skipped
horse_id 2019104772 skipped
horse_id 2019104903 skipped
horse_id 2015106197 skipped
horse_id 2017105321 skipped
horse_id 2016100844 skipped
horse_id 2019105548 skipped
horse_id 2019105511 skipped
horse_id 2019100765 skipped
horse_id 2019106807 skipped
horse_id 2019101158 skipped
horse_id 2019104057 skipped
horse_id 2019104930 skipped
horse_id 2019104803 skipped
horse_id 2019106416 skipped
horse_id 2019104874 skipped
horse_id 2019109140 skipped
horse_id 2019104451 skipped
horse_id 2019104639 skipped
horse_id 2019104555 skipped
horse_id 2019101408 skipped
horse_id 2019105529 skipped
horse_id 2019103555 skipped
horse_id 2019100905 

horse_id 2019104841 skipped
horse_id 2019105075 skipped
horse_id 2019101910 skipped
horse_id 2019102628 skipped
horse_id 2019104532 skipped
horse_id 2019103858 skipped
horse_id 2017106420 skipped
horse_id 2018100047 skipped
horse_id 2019105347 skipped
horse_id 2019105204 skipped
horse_id 2019100935 skipped
horse_id 2019101983 skipped
horse_id 2019100571 skipped
horse_id 2019100641 skipped
horse_id 2019105344 skipped
horse_id 2019100938 skipped
horse_id 2019101630 skipped
horse_id 2019103937 skipped
horse_id 2019106901 skipped
horse_id 2019101042 skipped
horse_id 2019106300 skipped
horse_id 2019102152 skipped
horse_id 2019105167 skipped
horse_id 2019101311 skipped
horse_id 2019101889 skipped
horse_id 2019106322 skipped
horse_id 2019106214 skipped
horse_id 2019100803 skipped
horse_id 2019105800 skipped
horse_id 2019101313 skipped
horse_id 2019102164 skipped
horse_id 2019102469 skipped
horse_id 2019106777 skipped
horse_id 2019102937 skipped
horse_id 2019102514 skipped
horse_id 2019106340 

horse_id 2019104905 skipped
horse_id 2019100214 skipped
horse_id 2019100806 skipped
horse_id 2019109146 skipped
horse_id 2019106259 skipped
horse_id 2019100495 skipped
horse_id 2019100082 skipped
horse_id 2019103410 skipped
horse_id 2019100351 skipped
horse_id 2019105019 skipped
horse_id 2019103440 skipped
horse_id 2019100158 skipped
horse_id 2019106355 skipped
horse_id 2017109098 skipped
horse_id 2018100490 skipped
horse_id 2019105430 skipped
horse_id 2019103327 skipped
horse_id 2019105339 skipped
horse_id 2019105573 skipped
horse_id 2019100529 skipped
horse_id 2019103369 skipped
horse_id 2019104770 skipped
horse_id 2019105237 skipped
horse_id 2019103750 skipped
horse_id 2019101878 skipped
horse_id 2019101844 skipped
horse_id 2019104964 skipped
horse_id 2019105179 skipped
horse_id 2019101854 skipped
horse_id 2019106923 skipped
horse_id 2019106858 skipped
horse_id 2019105978 skipped
horse_id 2019101901 skipped
horse_id 2019102353 skipped
horse_id 2019102481 skipped
horse_id 2019105982 

horse_id 2019103427 skipped
horse_id 2019104984 skipped
horse_id 2019103488 skipped
horse_id 2019101930 skipped
horse_id 2019104624 skipped
horse_id 2019104521 skipped
horse_id 2019102515 skipped
horse_id 2019100747 skipped
horse_id 2019102122 skipped
horse_id 2013104055 skipped
horse_id 2019104976 skipped
horse_id 2019105365 skipped
horse_id 2019106411 skipped
horse_id 2019100795 skipped
horse_id 2019103099 skipped
horse_id 2019105643 skipped
horse_id 2019105124 skipped
horse_id 2019106808 skipped
horse_id 2019105389 skipped
horse_id 2019104529 skipped
horse_id 2019100611 skipped
horse_id 2019106876 skipped
horse_id 2019103665 skipped
horse_id 2019105891 skipped
horse_id 2019101906 skipped
horse_id 2019106190 skipped
horse_id 2019100783 skipped
horse_id 2019104207 skipped
horse_id 2019103880 skipped
horse_id 2019104469 skipped
horse_id 2019103145 skipped
horse_id 2019104150 skipped
horse_id 2019101504 skipped
horse_id 2019104995 skipped
horse_id 2019104774 skipped
horse_id 2019109011 

horse_id 2019100612 skipped
horse_id 2019102320 skipped
horse_id 2019103780 skipped
horse_id 2019104931 skipped
horse_id 2019104808 skipped
horse_id 2019103289 skipped
horse_id 2019110101 skipped
horse_id 2019105794 skipped
horse_id 2019103562 skipped
horse_id 2019101038 skipped
horse_id 2019101649 skipped
horse_id 2019102238 skipped
horse_id 2019104816 skipped
horse_id 2019105844 skipped
horse_id 2019105540 skipped
horse_id 2019101117 skipped
horse_id 2019103081 skipped
horse_id 2019105600 skipped
horse_id 2019104543 skipped
horse_id 2019100780 skipped
horse_id 2019100502 skipped
horse_id 2019100462 skipped
horse_id 2015103057 skipped
horse_id 2016101165 skipped
horse_id 2017105212 skipped
horse_id 2019105552 skipped
horse_id 2019104329 skipped
horse_id 2019101896 skipped
horse_id 2019104354 skipped
horse_id 2019105076 skipped
horse_id 2019110084 skipped
horse_id 2019110023 skipped
horse_id 2019100461 skipped
horse_id 2019103512 skipped
horse_id 2018105183 skipped
horse_id 2019106186 

horse_id 2019101803 skipped
horse_id 2019105370 skipped
horse_id 2019103586 skipped
horse_id 2019103211 skipped
horse_id 2019103561 skipped
horse_id 2019105354 skipped
horse_id 2019104862 skipped
horse_id 2019105598 skipped
horse_id 2019106966 skipped
horse_id 2019106546 skipped
horse_id 2019103250 skipped
horse_id 2016104078 skipped
horse_id 2016104364 skipped
horse_id 2017105588 skipped
horse_id 2018105280 skipped
horse_id 2019102350 skipped
horse_id 2019106888 skipped
horse_id 2019103506 skipped
horse_id 2019106087 skipped
horse_id 2019105388 skipped
horse_id 2019104129 skipped
horse_id 2019102554 skipped
horse_id 2019105428 skipped
horse_id 2019103418 skipped
horse_id 2019106120 skipped
horse_id 2019104630 skipped
horse_id 2019103920 skipped
horse_id 2019100586 skipped
horse_id 2019103036 skipped
horse_id 2019100874 skipped
horse_id 2019101414 skipped
horse_id 2019105308 skipped
horse_id 2019105542 skipped
horse_id 2019106856 skipped
horse_id 2019105480 skipped
horse_id 2019104477 

In [None]:
html_files_horse = []
for horse_id in tqdm(horse_id_list):
    file = glob.glob(os.path.join(LocalPaths.HTML_HORSE_PATH, horse_id+'*.bin'))[0]
    html_files_horse.append(file)
html_files_horse[:5]

#### （書き換え方）同様に_年数の部分を書き換えると後で扱いやすい

In [None]:
#後で分かりやすいように年数を書き換える
horse_results_2023 = get_rawdata_horse_results(html_files_horse) #馬の過去成績表の作成
horse_results_2023.to_pickle('data/raw/horse_results/horse_results_2023.pickle')

## 血統表データ取得

In [None]:
html_files_peds = get_html_ped(horse_id_list) #htmlをスクレイピング

#### （書き換え方）同様に_年数の部分を書き換えると後で扱いやすい

In [None]:
#後で分かりやすいように年数を書き換える
peds_2023 = get_rawdata_peds(html_files_peds) #血統表の作成
peds_2023.to_pickle('data/raw/peds/peds_2023.pickle')

# B. 学習

## 関数置き場

In [None]:
class AbstractBetPolicy(metaclass=ABCMeta):
    @abstractstaticmethod
    def judge(score_table, **params):
        pass

class AbstractScorePolicy(metaclass=ABCMeta):
    @staticmethod
    @abstractmethod
    def calc(model, X: pd.DataFrame) -> pd.DataFrame:
        raise NotImplementedError

class BasicScorePolicy(AbstractScorePolicy):
    """
    LightGBMの出力をそのままscoreとして計算。
    """
    @staticmethod
    def calc(model, X: pd.DataFrame) -> pd.DataFrame:
        return _calc(model, X)

class StdScorePolicy(AbstractScorePolicy):
    """
    レース内で標準化して、相対評価する。「レース内偏差値」のようなもの。
    """
    @staticmethod
    def calc(model, X: pd.DataFrame) -> pd.DataFrame:
        score_table = _calc(model, X)
        score_table[_SCORE] = _apply_scaler(score_table[_SCORE], _scaler_standard)
        return score_table

class MinMaxScorePolicy(AbstractScorePolicy):
    """
    レース内で標準化して、相対評価した後、全体を0~1にスケーリング。
    """
    @staticmethod
    def calc(model, X: pd.DataFrame) -> pd.DataFrame:
        score_table = _calc(model, X)
        score = _apply_scaler(score_table[_SCORE], _scaler_standard)
        min_ = score.min()
        score_table[_SCORE] = (score - min_) / (score.max() - min_)
        return score_table

In [None]:
class ResultsCols:
    RANK: str = '着順'
    WAKUBAN: str = '枠番'
    UMABAN: str = '馬番'
    HORSE_NAME: str = '馬名'
    SEX_AGE: str = '性齢'
    KINRYO: str = '斤量'
    JOCKEY: str = '騎手'
    TIME: str = 'タイム'
    RANK_DIFF: str = '着差'
    TANSHO_ODDS: str = '単勝'
    POPULARITY: str = '人気'
    WEIGHT_AND_DIFF: str = '馬体重'
    TRAINER: str = '調教師'

class DataSplitter:
    def __init__(self, featured_data, test_size, valid_size) -> None:
        self.__featured_data = featured_data
        self.train_valid_test_split(test_size, valid_size)

    def train_valid_test_split(self, test_size, valid_size):
        """
        訓練データとテストデータに分ける。さらに訓練データをoptuna用の訓練データと検証データに分ける。
        """
        self.__train_data, self.__test_data = self.__split_by_date(self.__featured_data, test_size=test_size)
        self.__train_data_optuna, self.__valid_data_optuna = self.__split_by_date(
            self.__train_data, test_size=valid_size
            )
        self.__lgb_train_optuna = lgb_o.Dataset(
            self.__train_data_optuna.drop(['rank', 'date', ResultsCols.TANSHO_ODDS], axis=1).values,
            self.__train_data_optuna['rank']
        )
        self.__lgb_valid_optuna = lgb_o.Dataset(
            self.__valid_data_optuna.drop(['rank', 'date', ResultsCols.TANSHO_ODDS], axis=1).values,
            self.__valid_data_optuna['rank']
        )
        # 説明変数と目的変数に分ける。開催はエラーなるので一度drop。
        self.__X_train = self.__train_data.drop(['rank', 'date', ResultsCols.TANSHO_ODDS], axis=1)
        self.__y_train = self.__train_data['rank']
        self.__X_test = self.__test_data.drop(['rank', 'date'], axis=1)
        self.__y_test = self.__test_data['rank']

    def __split_by_date(self, df, test_size):
        """
        時系列に沿って訓練データとテストデータに分ける関数。test_sizeは0~1。
        """
        sorted_id_list = df.sort_values("date").index.unique()
        train_id_list = sorted_id_list[: round(len(sorted_id_list) * (1 - test_size))]
        test_id_list = sorted_id_list[round(len(sorted_id_list) * (1 - test_size)) :]
        train = df.loc[train_id_list]
        test = df.loc[test_id_list]
        return train, test

    @property
    def featured_data(self):
        return self.__featured_data

    @property
    def train_data(self):
        return self.__train_data

    @property
    def test_data(self):
        return self.__test_data

    @property
    def train_data_optuna(self):
        return self.__train_data_optuna

    @property
    def valid_data_optuna(self):
        return self.__valid_data_optuna

    @property
    def lgb_train_optuna(self):
        return self.__lgb_train_optuna

    @property
    def lgb_valid_optuna(self):
        return self.__lgb_valid_optuna

    @property
    def X_train(self):
        return self.__X_train

    @property
    def y_train(self):
        return self.__y_train

    @property
    def X_test(self):
        return self.__X_test

    @property
    def y_test(self):
        return self.__y_test

class ModelWrapper:
    """
    モデルのハイパーパラメータチューニング・学習の処理が記述されたクラス。
    """
    def __init__(self):
        self.__lgb_model = lgb.LGBMClassifier(objective='binary')
        self.__feature_importance = None

    def tune_hyper_params(self, datasets: DataSplitter):
        """
        optunaによるチューニングを実行。
        """

        params = {'objective': 'binary'}

        # チューニング実行
        lgb_clf_o = lgb_o.train(
            params,
            datasets.lgb_train_optuna,
            valid_sets=(datasets.lgb_train_optuna, datasets.lgb_valid_optuna),
            verbose_eval=100,
            early_stopping_rounds=10,
            optuna_seed=100 # optunaのseed固定
            )

        # num_iterationsとearly_stopping_roundは今は使わないので削除
        tunedParams = {
            k: v for k, v in lgb_clf_o.params.items() if k not in ['num_iterations', 'early_stopping_round']
            }

        self.__lgb_model.set_params(**tunedParams)

    @property
    def params(self):
        return self.__lgb_model.get_params()

    def set_params(self, ex_params):
        """
        外部からハイパーパラメータを設定する場合。
        """
        self.__lgb_model.set_params(**ex_params)

    def train(self, datasets: DataSplitter):
        # 学習
        self.__lgb_model.fit(datasets.X_train.values, datasets.y_train.values)
        # AUCを計算して出力
        auc_train = roc_auc_score(
            datasets.y_train, self.__lgb_model.predict_proba(datasets.X_train)[:, 1]
            )
        auc_test = roc_auc_score(
            datasets.y_test,
            self.__lgb_model.predict_proba(datasets.X_test.drop([ResultsCols.TANSHO_ODDS], axis=1))[:, 1]
            )
        # 特徴量の重要度を記憶しておく
        self.__feature_importance = pd.DataFrame({
            "features": datasets.X_train.columns,
            "importance": self.__lgb_model.feature_importances_
            }).sort_values("importance", ascending=False)
        print('AUC: {:.3f}(train), {:.3f}(test)'.format(auc_train, auc_test))

    @property
    def feature_importance(self):
        return self.__feature_importance

    @property
    def lgb_model(self):
        return self.__lgb_model

    @lgb_model.setter
    def lgb_model(self, loaded):
        self.__lgb_model = loaded


In [None]:

from ._model_wrapper import ModelWrapper
from ._data_splitter import DataSplitter
from modules.policies import AbstractScorePolicy

class KeibaAI:
    """
    モデルの訓練や読み込み、実際に賭けるなどの処理を実行するクラス。
    """
    def __init__(self, datasets: DataSplitter):
        self.__datasets = datasets
        self.__model_wrapper = ModelWrapper()

    @property
    def datasets(self):
        return self.__datasets

    def train_with_tuning(self):
        self.__model_wrapper.tune_hyper_params(self.__datasets)
        self.__model_wrapper.train(self.__datasets)

    def get_params(self):
        """
        ハイパーパラメータを取得
        """
        return self.__model_wrapper.params

    def set_params(self, params):
        self.__model_wrapper.set_params(params)

    def calc_score(self, X: pd.DataFrame, score_policy: AbstractScorePolicy):
        return score_policy.calc(self.__model_wrapper.lgb_model, X)

    def decide_action(self, score_table: pd.DataFrame,
        bet_policy: AbstractBetPolicy, **params) -> dict:
        actions = bet_policy.judge(score_table, **params)

        return actions


In [None]:
class KeibaAIFactory:

    @staticmethod
    def create(featured_data, test_size = 0.3, valid_size = 0.3) -> KeibaAI:
        datasets = DataSplitter(featured_data, test_size, valid_size)
        return KeibaAI(datasets)

    @staticmethod
    def save(keibaAI: KeibaAI, version_name: str) -> None:
        yyyymmdd = datetime.date.today().strftime('%Y%m%d')
        # ディレクトリ作成
        os.makedirs(os.path.join('models', yyyymmdd), exist_ok=True)
        filepath_pickle = os.path.join('models', yyyymmdd, '{}.pickle'.format(version_name))
        with open(filepath_pickle, mode='wb') as f:
            dill.dump(keibaAI, f)
    
    @staticmethod
    def load(filepath: str) -> KeibaAI:
        with open(filepath, mode='rb') as f:
            return dill.load(f)

### 以下で学習させる

In [None]:
keiba_ai = training.KeibaAIFactory.create(feature_enginnering.featured_data) #モデル作成
keiba_ai.train_with_tuning() #パラメータチューニングをして学習

In [None]:
#モデル保存。pickle形式でモデルとデータセットが保存される。
training.KeibaAIFactory.save(keiba_ai, version_name='basemodel_2022_2023')

# C.予測

## 関数置き場

In [None]:
class ResultsProcessor(AbstractDataProcessor):
    def __init__(self, filepath):
        super().__init__(filepath)
    
    def _preprocess(self):
        """
        前処理
        """
        df = self.raw_data.copy()
        
        # 着順の前処理
        df = self._preprocess_rank(df)
        
        # 性齢を性と年齢に分ける
        # サイト上のテーブルに存在する列名は、ResultsColsクラスで定数化している。
        df["性"] = df[ResultsCols.SEX_AGE].map(lambda x: str(x)[0])
        df["年齢"] = df[ResultsCols.SEX_AGE].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df["体重"] = df[ResultsCols.WEIGHT_AND_DIFF].str.split("(", expand=True)[0]
        df["体重変化"] = df[ResultsCols.WEIGHT_AND_DIFF].str.split("(", expand=True)[1].str[:-1]
        
        # errors='coerce'で、"計不"など変換できない時に欠損値にする
        df['体重'] = pd.to_numeric(df['体重'], errors='coerce')
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')

        # 各列を数値型に変換
        df[ResultsCols.TANSHO_ODDS] = df[ResultsCols.TANSHO_ODDS].astype(float)
        df[ResultsCols.KINRYO] = df[ResultsCols.KINRYO].astype(float)
        df[ResultsCols.WAKUBAN] = df[ResultsCols.WAKUBAN].astype(int)
        df[ResultsCols.UMABAN] = df[ResultsCols.UMABAN].astype(int)
        
        # 6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())
        
        # カラム抽出
        df = self._select_columns(df)
        
        return df
        
        
    def _preprocess_rank(self, raw):
        """
        着順の前処理
        """
        df = raw.copy()
        # 着順に数字以外の文字列が含まれているものを取り除く
        df[ResultsCols.RANK] = pd.to_numeric(df[ResultsCols.RANK], errors='coerce')
        df.dropna(subset=[ResultsCols.RANK], inplace=True)
        df[ResultsCols.RANK] = df[ResultsCols.RANK].astype(int)
        df['rank'] = df[ResultsCols.RANK].map(lambda x:1 if x<4 else 0)
        return df
    
    def _select_columns(self, raw):
        """
        カラム抽出
        """
        df = raw.copy()[[
            ResultsCols.WAKUBAN, # 枠番
            ResultsCols.UMABAN, # 馬番
            ResultsCols.KINRYO, # 斤量
            ResultsCols.TANSHO_ODDS, # 単勝
            'horse_id',
            'jockey_id',
            'trainer_id',
            'owner_id',
            '性',
            '年齢',
            '体重',
            '体重変化',
            'n_horses',
            'rank'
            ]]
        return df

class ShutubaTableProcessor(ResultsProcessor):
    def __init__(self, filepath: str):
        super().__init__(filepath)

    def _preprocess(self):
        df = super()._preprocess()
        
        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100
        
        # 開催場所
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        # 日付型に変更
        df["date"] = pd.to_datetime(df["date"])
        return df
    
    def _preprocess_rank(self, raw):
        return raw
    
    def _select_columns(self, raw):
        df = raw.copy()[[\
            ResultsCols.WAKUBAN, # 枠番
            ResultsCols.UMABAN, # 馬番
            ResultsCols.KINRYO, # 斤量
            ResultsCols.TANSHO_ODDS, # 単勝
            'horse_id',
            'jockey_id',
            'trainer_id',
            '性',
            '年齢',
            '体重',
            '体重変化',
            'n_horses',
            'course_len',
            'weather',
            'race_type',
            'ground_state',
            'date',
            'around',
            'race_class'
            ]]
        return df

In [None]:
def create_active_race_id_list(minus_time=-50):
    # 現在時刻を取得
    now_date = datetime.datetime.now().date().strftime('%Y%m%d')
    hhmm = datetime.datetime.now().strftime("%H:%M")
    print(now_date, hhmm)

    # レースidとレース時刻の一覧を取得
    race_id_list, race_time_list = scrape_race_id_race_time_list(now_date)

    # 現在時刻マイナス馬体重時刻を取得
    t_delta30 = datetime.timedelta(hours = 9, minutes = minus_time)
    JST30 = datetime.timezone(t_delta30, 'JST')
    now30 = datetime.datetime.now(JST30)
    hhmm_minus_time = now30.strftime("%H:%M")

    target_race_id_list = []
    target_race_time_list = []
    from_time = '09:15'

    for (race_id, race_time) in zip(race_id_list, race_time_list):

        # レース時刻より馬体重発表時刻を算出
        dt1 = datetime.datetime(int(now_date[:4]), int(now_date[4:6]),
            int(now_date[6:8]), int(race_time[0:2]), int(race_time[3:5]))
        dt2 = dt1 + datetime.timedelta(minutes = minus_time)
        announce_weight_time = dt2.strftime("%H:%M")

        # 1Rの場合は、前回のレース時刻を馬体重発表時刻に設定
        if '01' == race_id_list[10:12]:
            from_time = announce_weight_time

        # 前回のレース時刻 ＜ 現在時刻 ＜ レース時刻
        if (from_time < hhmm < race_time):
            target_race_id_list.append(race_id)
            target_race_time_list.append(race_time)
        # 現在時刻マイナス馬体重時刻 ＜ 馬体重発表時刻 ＜＝ 現在時刻
        elif (hhmm_minus_time < announce_weight_time <= hhmm):
            target_race_id_list.append(race_id)
            target_race_time_list.append(race_time)
        # 前回のレース時刻を退避
        from_time = race_time

    return target_race_id_list, target_race_time_list

In [None]:
class ShutubaDataMerger(DataMerger):
        
    def merge(self):
        self._merge_horse_results()
        self._merge_horse_info()
        self._merge_peds()

class ShutubaTableProcessor(ResultsProcessor):
    def __init__(self, filepath: str):
        super().__init__(filepath)

    def _preprocess(self):
        df = super()._preprocess()
        
        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100
        
        # 開催場所
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        # 日付型に変更
        df["date"] = pd.to_datetime(df["date"])
        return df
    
    def _preprocess_rank(self, raw):
        return raw
    
    def _select_columns(self, raw):
        df = raw.copy()[[\
            Cols.WAKUBAN, # 枠番
            Cols.UMABAN, # 馬番
            Cols.KINRYO, # 斤量
            Cols.TANSHO_ODDS, # 単勝
            'horse_id',
            'jockey_id',
            'trainer_id',
            '性',
            '年齢',
            '体重',
            '体重変化',
            'n_horses',
            'course_len',
            'weather',
            'race_type',
            'ground_state',
            'date',
            'around',
            'race_class'
            ]]
        return df

In [None]:
#モデルの準備
keiba_ai = KeibaAIFactory.load('models/20231102/basemodel_2022_2023.pickle')

In [None]:
# 前日全レース予想用のレースidとレース発走時刻を取得
target_race_id_list, target_race_time_list = scrape_race_id_race_time_list('20231104')
print(len(target_race_id_list))
print(len(target_race_time_list))
yesterday = True

In [None]:
# 馬体重の発表されたレースID、レース時刻を取得（レース当日用）
target_race_id_list, target_race_time_list = create_active_race_id_list()
print((target_race_id_list))
print((target_race_time_list))
yesterday = False

In [None]:
# 一時的に出馬表を保存するパスを指定
filepath = 'data/tmp/shutuba.pickle'
today = '2023/11/04'

for race_id, race_time in zip(target_race_id_list, target_race_time_list):
    # 出馬表の取得
    scrape_shutuba_table(race_id, today, filepath)

    # 前日予想の場合
    if yesterday:
        # 前日予想の場合、馬体重を0（0）に補正
        pd2 = pd.read_pickle(filepath)
        pd2[ResultsCols.WEIGHT_AND_DIFF] = '0(0)'
        # 前日予想の場合、天候と馬場状態が公開されていない場合はこちらを有効にする
        pd2['weather'] = '晴'
        pd2['ground_state'] = '良'
        pd2.to_pickle(filepath)

    # 出馬表の加工
    shutuba_table_processor = ShutubaTableProcessor(filepath)

    # テーブルのマージ
    shutuba_data_merger = ShutubaDataMerger(
        shutuba_table_processor,
        horse_results_processor,
        horse_info_processor,
        peds_processor,
        target_cols=TARGET_COLS,
        group_cols=GROUP_COLS
    )
    shutuba_data_merger.merge()

    # 特徴量エンジニアリング
    feature_enginnering_shutuba = FeatureEngineering(shutuba_data_merger) \
        .add_interval()\
        .add_agedays()\
        .dumminize_ground_state()\
        .dumminize_race_type()\
        .dumminize_sex()\
        .dumminize_weather()\
        .encode_horse_id()\
        .encode_jockey_id()\
        .encode_trainer_id()\
        .encode_owner_id()\
        .encode_breeder_id()\
        .dumminize_kaisai()\
        .dumminize_around()\
        .dumminize_race_class()

    # 予測
    X = feature_enginnering_shutuba.featured_data.drop(['date'], axis=1)

    # 当日の出走情報テーブル（前処理前）
    df_tmp = shutuba_table_processor.raw_data[:1]

    i = 0
    for num in list(PLACE_DICT.values()):
        if num == race_id[4:6]:
            print(list(PLACE_DICT)[i] + race_id[10:12] + 'R ' + race_time + '発走 ' + str(df_tmp.iat[0, 12])
                + str(df_tmp.iat[0, 10]) + 'm ' + str(df_tmp.iat[0, 13]) + ' ' + str(df_tmp.iat[0, 15]))
            break
        i += 1

    print(keiba_ai.calc_score(X, StdScorePolicy).sort_values('score', ascending=False))