# 3 ロジスティック回帰で予想

In [1]:
import pandas as pd

In [2]:
results  = pd.read_pickle('2019_result.pickle')

### 4着以下は4着としてまとめる

In [3]:
clip_rank = lambda x: x if x<4 else 4

In [4]:
results['rank'] = results['着順'].map(clip_rank)

In [5]:
results['rank'].value_counts()

4    36759
1     3457
3     3453
2     3449
Name: rank, dtype: int64

### とりあえず着順、馬名、調教師、着差データは使用しないので削除

In [6]:
results.drop(['着順','馬名','調教師','着差'], axis=1, inplace=True)

### 騎手や年齢をダミー変数化

In [7]:
results_d = pd.get_dummies(results)

In [8]:
results_d

Unnamed: 0,枠番,馬番,斤量,単勝,人気,体重,増減,rank,騎手_アヴドゥ,騎手_オドノヒ,...,年齢_11,年齢_12,年齢_2,年齢_3,年齢_4,年齢_5,年齢_6,年齢_7,年齢_8,年齢_9
201901010101,1,1,54.0,1.4,1.0,518,-16,1,0,0,...,0,0,1,0,0,0,0,0,0,0
201901010101,3,3,54.0,3.5,2.0,496,-8,2,0,0,...,0,0,1,0,0,0,0,0,0,0
201901010101,4,4,51.0,46.6,6.0,546,6,3,0,0,...,0,0,1,0,0,0,0,0,0,0
201901010101,8,9,51.0,56.8,7.0,458,-8,4,0,0,...,0,0,1,0,0,0,0,0,0,0
201901010101,5,5,54.0,140.3,9.0,436,0,4,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201910021212,6,11,56.0,120.3,15.0,458,8,4,0,0,...,0,0,0,0,0,0,1,0,0,0
201910021212,1,1,54.0,7.5,4.0,460,2,4,0,0,...,0,0,0,0,0,1,0,0,0,0
201910021212,2,3,54.0,99.2,12.0,478,14,4,0,0,...,0,0,0,0,1,0,0,0,0,0
201910021212,6,12,52.0,17.5,8.0,468,2,4,0,0,...,0,0,0,1,0,0,0,0,0,0


### 回帰モデルのライブラリインポート

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### 説明変数から目的変数であるrankを削除、また目的変数としてrankを抽出
### そして、データを訓練用と検証用で7:3の割合で分割

In [10]:
X = results_d.drop(['rank'], axis=1)
y = results_d['rank']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3,random_state=0)

### モデルで学習

In [11]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [12]:
print(model.score(X_train, y_train), model.score(X_test, y_test))

0.7801528106239767 0.7801358234295416


### 4着以下の数が多く、ほとんどが4着と予測されてしまうので、各データのサンプル数を
### UnderSamplerで合わせる

In [13]:
from imblearn.under_sampling import RandomUnderSampler

In [14]:
rank_1 = y_train.value_counts()[1]
rank_2 = y_train.value_counts()[2]
rank_3 = y_train.value_counts()[3]

rus = RandomUnderSampler(sampling_strategy={1:rank_1, 2:rank_2, 3:rank_3, 4:rank_1}, random_state=71)

In [15]:
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

### 再度学習

In [16]:
model = LogisticRegression()
model.fit(X_train_rus, y_train_rus)

print(model.score(X_train, y_train), model.score(X_test, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5661269783518282 0.5658602150537635


### 予測

In [17]:
y_pred = model.predict(X_test)
pred_df = pd.DataFrame({'pred':y_pred, 'actual':y_test})

In [18]:
pred_df

Unnamed: 0,pred,actual
201907030606,2,2
201904020801,1,1
201904021108,4,4
201901020302,2,4
201902020103,3,4
...,...,...
201905040708,4,4
201909050601,4,4
201904020404,4,4
201906010301,4,4


In [19]:
pd.Series(model.coef_[0], index=X.columns).sort_values()

人気        -0.113718
単勝        -0.008636
年齢_6      -0.005326
枠番        -0.004220
騎手_岩田康誠   -0.002420
             ...   
騎手_レーン     0.001742
騎手_丸山元気    0.002377
騎手_ルメール    0.003049
騎手_川田将雅    0.003985
年齢_3       0.006068
Length: 1834, dtype: float64