In [19]:
import traceback
import pandas as pd
import numpy as np
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import warnings
warnings.simplefilter('ignore')
from concurrent import futures
import lightgbm as lgb
import pickle
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
options = ChromeOptions()
options.add_argument('--headless')   

def id2kaisai(race_id):
    num = int(race_id[4:6])
    if num == 1:
        return race_id[0:4]+race_id[7]+"札幌"+race_id[9]
    elif num == 2:
        return race_id[0:4]+race_id[7]+"函館"+race_id[9]
    elif num == 3:
        return race_id[0:4]+race_id[7]+"福島"+race_id[9]
    elif num == 4:
        return race_id[0:4]+race_id[7]+"新潟"+race_id[9]
    elif num == 5:
        return race_id[0:4]+race_id[7]+"東京"+race_id[9]
    elif num == 6:
        return race_id[0:4]+race_id[7]+"中山"+race_id[9]
    elif num == 7:
        return race_id[0:4]+race_id[7]+"中京"+race_id[9]
    elif num == 8:
        return race_id[0:4]+race_id[7]+"京都"+race_id[9]
    elif num == 9:
        return race_id[0:4]+race_id[7]+"阪神"+race_id[9]
    else:
        return race_id[0:4]+race_id[7]+"小倉"+race_id[9]
    
def get_table(target_url):
    with urllib.request.urlopen(target_url) as response:
        html = response.read()
        return pd.read_html(html)
    
def make_data(race_id):
    target_url = 'https://race.netkeiba.com/race/shutuba.html?race_id='+race_id+'&rf=race_submenu' 
    driver = Chrome(options=options)
    driver.get(target_url)
    #time.sleep(3)
    html = driver.page_source.encode('euc-jp',"ignore")  
    driver.quit()
    df_ = pd.read_html(html)[0]
    df = pd.DataFrame()
    with urllib.request.urlopen(target_url) as response:
        html = response.read()
        # 馬場距離天気
        soup = BeautifulSoup(html, "html.parser")
   
    #レース情報作成
    # "race_id;siba_da;length;weather;baba;prize"
    data_1 = soup.select_one("#page > div.RaceColumn01 > div > div.RaceMainColumn > div.RaceList_NameBox > div.RaceList_Item02 > div.RaceData01").text
    result_1 = re.findall(r'/\s([^\s]+)',data_1)
    siba_da = result_1[0][0]
    length = result_1[0][1:-1]
    weather = result_1[1][-1]
    baba = result_1[2][-1]
    
    data_2 = soup.select_one("#page > div.RaceColumn01 > div > div.RaceMainColumn > div.RaceList_NameBox > div.RaceList_Item02 > div.RaceData02 > span:nth-child(10)").text
    result_2 = re.findall(r'本賞金:([^,]+),',data_2)
    prize = result_2[0]
    # 着順・馬名・性齢・騎手名
    
    #new_columns = ["レース番号","芝/ダ","距離","天気","馬場","賞金","odds_all"]
    list_race = [race_id,siba_da,length,weather,baba,prize]
    race_info = ";".join(list_race)
    
    #各馬情報作成
    # "着順;馬番;馬名;馬齢;騎手;単勝オッズ;調教師;馬主;生産者;父馬;母馬;それまでの獲得賞金
    df["馬齢"] = [x[1] for x in df_["性齢"]["性齢"]]
    df["馬名"] = [x for x in df_["馬名"]["馬名"]]
    df["馬番"] = df.index + 1
    df["騎手"] = [x for x in df_["騎手"]["騎手"]]
    try:
        df["単勝オッズ"] = [x for x in df_["オッズ 更新"]["オッズ 更新"]]
    except:
        df["単勝オッズ"] = [x for x in df_["オッズ"]["オッズ"]]
    df["馬体重"] = [str(x)[0:3] for x in df_["馬体重(増減)"]["馬体重(増減)"]]
    df["増減"] = [str(x)[4:-1] for x in df_["馬体重(増減)"]["馬体重(増減)"]]
    df = df[["馬番","馬名","馬齢","騎手","単勝オッズ","馬体重","増減"]]
    df = df.sort_values('馬番')
    horse_urls = re.findall(r'<a href="([^"]*/horse/[^"]+)"', str(html))
    with futures.ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(get_table,horse_urls))
        
    horses_list = []
    for i in range(len(df)):
        horse_list = list(map(str,df.iloc[i,:].values))
        table_0 = results[i][1]
    
        table_0.index = table_0.iloc[:,0]
        horse_list.append(table_0.loc["調教師",1])
        horse_list.append(table_0.loc["馬主",1])
        horse_list.append(table_0.loc["生産者",1])
        table_1 = results[i][2]
        horse_list.append(table_1.loc[0,0])
    
        horse_list.append(table_1.loc[2,0])
        
        try:
            table_2 = results[i][3]

            if table_2.columns[0] == "受賞歴":
                table_2 = results[i][4]

            table_2.index = [table_2.iloc[i,0][0:4]+table_2.iloc[i,1] for i in range(len(table_2))]
            try:
                a = table_2.index.get_loc(id2kaisai(race_id))
                table_2 = table_2.iloc[a+1:,:]
            except:
                pass
            #table_2["賞金"].fillna('0',inplace=True)
            #table_2["賞金"] = list(map(lambda x: x.replace(",",""),table_2["賞金"]))
            horse_list.append(str(np.sum(table_2["賞金"])))
        except:
            horse_list.append("0")
        horse_info = ";".join(horse_list)
        horses_list.append(horse_info)
    horses_info = ",".join(horses_list)
    
    info = ",".join([race_info,horses_info])+"\n"
    return info

def predict(race_id,odds=False):
    X = []
    line = make_data(race_id)
    model = pickle.load(open('trained_model.pkl', 'rb'))
    
    line_list = line.split(",")
    race_info,horses_info = line_list[0],line_list[1:]
    race_list = race_info.split(";")
    
    for horse_info in horses_info:
        horse_list = horse_info.split(";")
        data = race_list + horse_list + [race_list[0][4:6],race_list[0][-2:]]
        
        if data[10] == "--":
            continue
            
            
        for i in [2,5,8,10,11,12,18]:
            try:
                data[i] = float(data[i])
            except:
                data[i] = 0
        X.append(data)
    X = np.array(X)
    index = X[:,6]
    category = [0,1,3,4,6,7,9,13,14,15,16,17,19,20]
    number =[2,5,8,10,11,12,18]
    X = pd.DataFrame(X)

    for i in category:
        X[i] = X[i].astype('category')
    for i in number:
        X[i] = X[i].astype('float')
    if odds != True:
        X.drop(columns=10,inplace=True)
    
    pred = model.predict(X,num_iteration=model.best_iteration)
    pred = pd.DataFrame(pred,columns=[race_id])
    pred.index = index
    pred = pred.sort_values(race_id,ascending=False)   
    
    baken = pred.index[0:3]
    return baken


In [21]:
for i in [str(100+j)[1:] for j in range(1,13)]:
    print(predict("2020060303"+i),predict("2020090203"+i))
   
  

Index(['2', '14', '15'], dtype='object') Index(['1', '9', '14'], dtype='object')
Index(['8', '12', '3'], dtype='object') Index(['4', '7', '5'], dtype='object')
Index(['10', '12', '7'], dtype='object') Index(['7', '2', '4'], dtype='object')
Index(['10', '5', '12'], dtype='object') Index(['6', '3', '13'], dtype='object')
Index(['1', '4', '7'], dtype='object') Index(['3', '11', '4'], dtype='object')
Index(['11', '4', '2'], dtype='object') Index(['3', '1', '5'], dtype='object')
Index(['2', '8', '7'], dtype='object') Index(['11', '1', '2'], dtype='object')
Index(['10', '7', '4'], dtype='object') Index(['8', '11', '14'], dtype='object')
Index(['8', '9', '5'], dtype='object') Index(['1', '6', '4'], dtype='object')
Index(['5', '14', '10'], dtype='object') Index(['6', '5', '11'], dtype='object')
Index(['14', '10', '15'], dtype='object') Index(['15', '5', '11'], dtype='object')
Index(['15', '1', '7'], dtype='object') Index(['3', '2', '4'], dtype='object')


In [22]:
predict("202009020411")

Index(['3', '1', '4'], dtype='object')

In [13]:
race_id = "202006020612"
target_url = 'https://race.netkeiba.com/race/shutuba.html?race_id='+race_id+'&rf=race_submenu' 
driver = Chrome(options=options)
driver.get(target_url)
html = driver.page_source.encode('euc-jp',"ignore")  
driver.quit()
df_ = pd.read_html(html)[0]

df = pd.DataFrame()
with urllib.request.urlopen(target_url) as response:
    html = response.read()
    # 馬場距離天気
    soup = BeautifulSoup(html, "html.parser")

#レース情報作成
# "race_id;siba_da;length;weather;baba;prize"
data_1 = soup.select_one("#page > div.RaceColumn01 > div > div.RaceMainColumn > div.RaceList_NameBox > div.RaceList_Item02 > div.RaceData01").text
result_1 = re.findall(r'/\s([^\s]+)',data_1)
siba_da = result_1[0][0]
length = result_1[0][1:-1]
weather = result_1[1][-1]
baba = result_1[2][-1]

data_2 = soup.select_one("#page > div.RaceColumn01 > div > div.RaceMainColumn > div.RaceList_NameBox > div.RaceList_Item02 > div.RaceData02 > span:nth-child(10)").text
result_2 = re.findall(r'本賞金:([^,]+),',data_2)
prize = result_2[0]
# 着順・馬名・性齢・騎手名

#new_columns = ["レース番号","芝/ダ","距離","天気","馬場","賞金","odds_all"]
list_race = [race_id,siba_da,length,weather,baba,prize]
race_info = ";".join(list_race)
display(df_)

df["馬齢"] = [x[1] for x in df_["性齢"]["性齢"]]
df["馬名"] = [x for x in df_["馬名"]["馬名"]]
df["馬番"] = df.index + 1
df["騎手"] = [x for x in df_["騎手"]["騎手"]]
df["単勝オッズ"] = [x for x in df_["オッズ"]["オッズ"]]
df["馬体重"] = [x[0:3] for x in df_["馬体重(増減)"]["馬体重(増減)"]]
df["増減"] = [x[4:-1] for x in df_["馬体重(増減)"]["馬体重(増減)"]]
df = df[["馬番","馬名","馬齢","騎手","単勝オッズ","馬体重","増減"]]
df = df.sort_values('馬番')



Unnamed: 0_level_0,枠,馬番,印,馬名,性齢,斤量,騎手,厩舎,馬体重(増減),オッズ 更新,人気,お気に入り馬,お気に入り馬
Unnamed: 0_level_1,枠,馬番,印,馬名,性齢,斤量,騎手,厩舎,馬体重(増減),オッズ 更新,人気,登録,メモ
0,1,1,--,イナズママンボ,牝6,55.0,津村,栗東高柳大,,10.8,7,,
1,1,2,--,トウカイパシオン,牝5,55.0,江田照,美浦菊川,,3.5,1,,
2,2,3,--,アオイサンシャイン,牝7,55.0,石橋脩,美浦古賀慎,,15.1,8,,
3,2,4,--,シルバーストーン,牡6,57.0,丸田,美浦石栗,,19.4,10,,
4,3,5,--,アメリカンツイスト,セ5,57.0,北村宏,美浦中舘,,8.9,4,,
5,3,6,--,ポッドグレイシー,牡5,57.0,菅原隆,美浦小野,,135.7,16,,
6,4,7,--,トッカータ,牡5,57.0,武藤,美浦和田雄,,20.3,11,,
7,4,8,--,ウィンターリリー,牝4,54.0,川又,美浦大江原,,30.4,12,,
8,5,9,--,ローレルジャック,牡7,57.0,武士沢,美浦高橋裕,,41.9,13,,
9,5,10,--,アイアムピッカピカ,牝4,55.0,柴田善,美浦奥平,,8.0,3,,


KeyError: 'オッズ'