2. データ整形

In [1]:
import pandas as pd

In [2]:
results0 = pd.read_pickle("../etc/data/results.pickle")

In [3]:
def preprocessing(results):
    df = results.copy()
    
    #着順に数字以外の文字が含まれているものを削除
    df = df[~(results["着順"].astype(str).str.contains("\D"))]
    df["着順"] = df["着順"].astype(int)
    
    #性齢を性と年齢に分割
    df["性"] = df["性齢"].map(lambda x: str(x)[0])
    df["年齢"] = df["性齢"].map(lambda x: str(x)[1]).astype(int)
    
    #馬体重を体重と体重変化に分割
    df["体重"] = df["馬体重"].str.split("(", expand = True)[0].astype(int)
    df["体重変化"] = df["馬体重"].str.split("(", expand = True)[1].str[:-1].astype(int)
    
    #データ型をintやfloatに変換
    df["単勝"] = df["単勝"].astype(float)
    
    #不要な列を削除
    df.drop(["タイム", "着差", "調教師", "性齢", "馬体重"], axis=1, inplace=True)
    
    return df

In [4]:
results0

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師
202101010101,1,5,9,ディーバサンライズ,牝3,52.0,秋山稔樹,1:08.7,,13.5,7.0,474(-22),[西] 小林真也
202101010101,2,3,6,バンベルク,牡3,56.0,横山和生,1:08.8,3/4,3.9,2.0,470(-2),[西] 安田隆行
202101010101,3,7,14,リツィタル,牝3,54.0,大野拓弥,1:08.9,1/2,124.7,14.0,394(+2),[西] 牧田和弥
202101010101,4,5,10,クレマチステソーロ,牝3,54.0,鮫島克駿,1:09.0,1/2,7.6,5.0,468(+2),[東] 高木登
202101010101,5,3,5,ペイシャケイティー,牝3,54.0,古川吉洋,1:09.0,ハナ,7.5,4.0,456(+8),[東] 土田稔
...,...,...,...,...,...,...,...,...,...,...,...,...,...
202110040812,7,2,2,ジオルティ,牡3,51.0,角田大和,2:42.0,1.1/4,49.6,10.0,502(+8),[西] 杉山晴紀
202110040812,8,7,9,ロックグラス,牡4,57.0,幸英明,2:42.2,1.1/4,43.7,8.0,464(0),[西] 谷潔
202110040812,9,7,8,サマーカナロア,牡3,53.0,富田暁,2:42.2,ハナ,46.5,9.0,502(-10),[西] 昆貢
202110040812,10,8,10,レッドレイル,セ5,57.0,浜中俊,2:42.4,1.1/4,14.0,6.0,436(+2),[西] 辻野泰之


In [5]:
results = preprocessing(results0)

In [6]:
results

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,性,年齢,体重,体重変化
202101010101,1,5,9,ディーバサンライズ,52.0,秋山稔樹,13.5,7.0,牝,3,474,-22
202101010101,2,3,6,バンベルク,56.0,横山和生,3.9,2.0,牡,3,470,-2
202101010101,3,7,14,リツィタル,54.0,大野拓弥,124.7,14.0,牝,3,394,2
202101010101,4,5,10,クレマチステソーロ,54.0,鮫島克駿,7.6,5.0,牝,3,468,2
202101010101,5,3,5,ペイシャケイティー,54.0,古川吉洋,7.5,4.0,牝,3,456,8
...,...,...,...,...,...,...,...,...,...,...,...,...
202110040812,7,2,2,ジオルティ,51.0,角田大和,49.6,10.0,牡,3,502,8
202110040812,8,7,9,ロックグラス,57.0,幸英明,43.7,8.0,牡,4,464,0
202110040812,9,7,8,サマーカナロア,53.0,富田暁,46.5,9.0,牡,3,502,-10
202110040812,10,8,10,レッドレイル,57.0,浜中俊,14.0,6.0,セ,5,436,2


In [7]:
results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41278 entries, 202101010101 to 202110040812
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   着順      41278 non-null  int64  
 1   枠番      41278 non-null  int64  
 2   馬番      41278 non-null  int64  
 3   馬名      41278 non-null  object 
 4   斤量      41278 non-null  float64
 5   騎手      41278 non-null  object 
 6   単勝      41278 non-null  float64
 7   人気      41278 non-null  float64
 8   性       41278 non-null  object 
 9   年齢      41278 non-null  int64  
 10  体重      41278 non-null  int64  
 11  体重変化    41278 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 4.1+ MB


3. ロジスティック回帰予測

In [8]:
clip_rank = lambda x: x if x<4 else 4

In [10]:
results["rank"] = results["着順"].map(clip_rank)
results["rank"].value_counts()

4    32272
1     3008
3     3002
2     2996
Name: rank, dtype: int64

In [11]:
results.drop(["着順", "馬名"], axis=1, inplace=True)
results

Unnamed: 0,枠番,馬番,斤量,騎手,単勝,人気,性,年齢,体重,体重変化,rank
202101010101,5,9,52.0,秋山稔樹,13.5,7.0,牝,3,474,-22,1
202101010101,3,6,56.0,横山和生,3.9,2.0,牡,3,470,-2,2
202101010101,7,14,54.0,大野拓弥,124.7,14.0,牝,3,394,2,3
202101010101,5,10,54.0,鮫島克駿,7.6,5.0,牝,3,468,2,4
202101010101,3,5,54.0,古川吉洋,7.5,4.0,牝,3,456,8,4
...,...,...,...,...,...,...,...,...,...,...,...
202110040812,2,2,51.0,角田大和,49.6,10.0,牡,3,502,8,4
202110040812,7,9,57.0,幸英明,43.7,8.0,牡,4,464,0,4
202110040812,7,8,53.0,富田暁,46.5,9.0,牡,3,502,-10,4
202110040812,8,10,57.0,浜中俊,14.0,6.0,セ,5,436,2,4


In [12]:
results_d = pd.get_dummies(results)
results_d

Unnamed: 0,枠番,馬番,斤量,単勝,人気,年齢,体重,体重変化,rank,騎手_ムーア,...,騎手_鮫島克駿,騎手_鮫島良太,騎手_鴨宮祥行,騎手_黒岩悠,騎手_黛弘人,騎手_Ｃ．デム,騎手_Ｍ．デム,性_セ,性_牝,性_牡
202101010101,5,9,52.0,13.5,7.0,3,474,-22,1,0,...,0,0,0,0,0,0,0,0,1,0
202101010101,3,6,56.0,3.9,2.0,3,470,-2,2,0,...,0,0,0,0,0,0,0,0,0,1
202101010101,7,14,54.0,124.7,14.0,3,394,2,3,0,...,0,0,0,0,0,0,0,0,1,0
202101010101,5,10,54.0,7.6,5.0,3,468,2,4,0,...,1,0,0,0,0,0,0,0,1,0
202101010101,3,5,54.0,7.5,4.0,3,456,8,4,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202110040812,2,2,51.0,49.6,10.0,3,502,8,4,0,...,0,0,0,0,0,0,0,0,0,1
202110040812,7,9,57.0,43.7,8.0,4,464,0,4,0,...,0,0,0,0,0,0,0,0,0,1
202110040812,7,8,53.0,46.5,9.0,3,502,-10,4,0,...,0,0,0,0,0,0,0,0,0,1
202110040812,8,10,57.0,14.0,6.0,5,436,2,4,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [15]:
#データの"rank"以外を入力X，"rank"を正解ラベルとする
X = results_d.drop(["rank"], axis=1)
y = results_d["rank"]

#入力データと正解ラベルを、訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [16]:
#ロジスティック回帰モデルを学習させる
moodel = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [17]:
print(model.score(X_train, y_train), model.score(X_test, y_test))

0.7818232158925729 0.7818152454780362


In [24]:
y_pred = model.predict(X_test)
pd.Series(y_pred).value_counts()

4    12384
dtype: int64

In [44]:
from imblearn.under_sampling import RandomUnderSampler

rank_1 = y_train.value_counts()[1]
rank_2 = y_train.value_counts()[2]
rank_3 = y_train.value_counts()[3]

rus = RandomUnderSampler(random_state=71)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [45]:
pd.Series(y_train_rus).value_counts()

1    2097
2    2097
3    2097
4    2097
Name: rank, dtype: int64

In [48]:
moodel = LogisticRegression()
model.fit(X_train_rus, y_train_rus)

print(model.score(X_train, y_train), model.score(X_test, y_test))

0.5634733854779539 0.5510335917312662


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
y_pred = model.predict(X_test)

In [52]:
pred_df = pd.DataFrame({"pred":y_pred, "actual":y_test})

In [55]:
pred_df

Unnamed: 0,pred,actual
202110030303,3,3
202110040603,4,4
202107050211,1,4
202107030405,4,4
202107020504,4,4
...,...,...
202106030801,4,4
202105010502,4,4
202109050204,3,2
202107010801,4,4


In [59]:
pred_df[pred_df["pred"]==1]["actual"].value_counts()

4    1554
1     594
2     502
3     408
Name: actual, dtype: int64

In [61]:
len(pred_df[pred_df["pred"]==1])

3058

In [62]:
pred_df[pred_df["pred"]==4]["actual"].value_counts()

4    5950
3     200
2     146
1     110
Name: actual, dtype: int64

In [64]:
pd.Series(model.coef_[0], index=X.columns).sort_values()

人気        -0.122805
年齢        -0.016136
単勝        -0.007703
性_牡       -0.006078
枠番        -0.004947
             ...   
馬番         0.003031
騎手_川田将雅    0.003827
体重変化       0.004657
騎手_ルメール    0.004705
性_牝        0.006832
Length: 159, dtype: float64

In [65]:
results[results["騎手"]=="ルメール"]["rank"].value_counts()

4    328
1    173
2    109
3     71
Name: rank, dtype: int64

In [67]:
results["馬番"].value_counts()

2     2986
7     2978
4     2975
3     2974
5     2974
6     2973
1     2963
8     2915
9     2823
10    2695
11    2533
12    2336
13    2087
14    1873
15    1561
16    1203
17     238
18     191
Name: 馬番, dtype: int64