# ランダムフォレストによる予測

In [41]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

### 読み込むデータファイルの年を入力

In [81]:
# Year of the result file to load; it is prepended to '_result.pickle' below.
target_year = "2019"

In [82]:
# Build the per-year file name (e.g. '2019_result.pickle') and load it.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that were produced by a trusted source.
read_file_name = ''.join((target_year, '_result.pickle'))
data = pd.read_pickle(read_file_name)

In [83]:
# Preview the first two rows to check which columns the loaded frame has.
data.head(2)

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,course_len,weather,race_type,ground_state,date,性,年齢,体重,増減
201901010101,1,1,1,ゴルコンダ,54.0,ルメール,1.4,1.0,1800,曇,芝,良,2019-07-27,牡,2,518,-16
201901010101,2,3,3,プントファイヤー,54.0,岩田康誠,3.5,2.0,1800,曇,芝,良,2019-07-27,牡,2,496,-8


### 不要データと型の調整

In [84]:
# Remove the '馬名' (horse name) column, which this notebook treats as unneeded.
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps the cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
data = data.drop(columns='馬名', errors='ignore')

In [99]:
# Cast the course length and the horse age ('年齢') columns to integers.
for int_col in ('course_len', '年齢'):
    data[int_col] = data[int_col].astype(int)

# Print the column dtypes and non-null counts to confirm the casts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47118 entries, 201901010101 to 201910021212
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   着順            47118 non-null  int64         
 1   枠番            47118 non-null  int64         
 2   馬番            47118 non-null  int64         
 3   斤量            47118 non-null  float64       
 4   騎手            47118 non-null  object        
 5   単勝            47118 non-null  float64       
 6   人気            47118 non-null  float64       
 7   course_len    47118 non-null  int64         
 8   weather       47118 non-null  object        
 9   race_type     47118 non-null  object        
 10  ground_state  47118 non-null  object        
 11  date          47118 non-null  datetime64[ns]
 12  性             47118 non-null  object        
 13  年齢            47118 non-null  int64         
 14  体重            47118 non-null  int64         
 15  増減            47118 non

In [100]:
# One-hot encode the frame with pandas' defaults: string/categorical columns
# are expanded into indicator columns, other columns are passed through.
data_d = pd.get_dummies(data)

In [101]:
# Target label: finishing positions 1, 2 and 3 keep their value and every
# position of 4 or worse is collapsed into class 4.
# Series.clip is the vectorised equivalent of the former per-row lambda
# (x if x < 4 else 4) for the integer '着順' column.
data_d['rank'] = data_d['着順'].clip(upper=4)

### 訓練とテストデータを分ける関数定義

In [None]:
def split_data(df, test_size):
    """Split a frame chronologically into train and test parts.

    The unique index labels of ``df`` are ordered by the ``date`` column and
    the last ``test_size`` fraction of those labels goes to the test part.
    Rows that share an index label always end up on the same side.

    Args:
        df: DataFrame with a ``date`` column; index labels may repeat.
        test_size: Fraction (0 to 1) of the unique index labels to put in
            the test part.

    Returns:
        Tuple ``(train, test)`` of DataFrames selected from ``df``.

    Raises:
        ValueError: If ``test_size`` is not within [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError('test_size must be between 0 and 1, got {!r}'.format(test_size))

    # Order the labels using the frame that was passed in. The previous
    # version read the notebook-level ``data`` variable here, which ignored
    # ``df`` and broke the function outside this exact kernel state.
    sorted_id_list = df.sort_values('date').index.unique()

    # Compute the cut position once so both slices are guaranteed to agree.
    split_at = round(len(sorted_id_list) * (1 - test_size))

    train = df.loc[sorted_id_list[:split_at]]
    test = df.loc[sorted_id_list[split_at:]]
    return train, test

In [116]:
# Hold out the most recent 30% of the index labels (by date) as the test set.
train, test = split_data(data_d, test_size=0.3)

In [111]:
# Share of all rows that landed in the training split.
train.shape[0] / data.shape[0]

0.7030646462073942

In [112]:
# Share of all rows that landed in the test split.
test.shape[0] / data.shape[0]

0.29693535379260577

### アンダーサンプリングして、ランダムフォレストで解析

In [134]:
from imblearn.under_sampling import RandomUnderSampler

# Tally the training rows per rank class once and reuse the counts.
rank_counts = train['rank'].value_counts()
rank_1, rank_2, rank_3 = rank_counts[1], rank_counts[2], rank_counts[3]

# Classes 1-3 keep their current row counts; class 4 is reduced to the
# same number of rows as class 1.
target_sizes = {1: rank_1, 2: rank_2, 3: rank_3, 4: rank_1}
rus = RandomUnderSampler(sampling_strategy=target_sizes, random_state=71)

# Excluded from the features: the finishing position, the label derived
# from it, and the race date.
non_feature_cols = ['着順', 'rank', 'date']
X_train, y_train = train.drop(columns=non_feature_cols), train['rank']
X_test, y_test = test.drop(columns=non_feature_cols), test['rank']

In [135]:
# Undersample the training features/labels only, using the sampler `rus`
# configured earlier; the test split is not resampled.
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [138]:
from sklearn.ensemble import RandomForestClassifier

# Train the forest on the undersampled rows (fit returns the estimator).
clf = RandomForestClassifier(random_state=0).fit(X_train_rus, y_train_rus)

# Mean accuracy on the full, non-resampled training split and on the
# held-out test split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(train_accuracy, test_accuracy)

0.7229148428774111 0.5393467228932886
