## 機械学習フロー 

>Kaggleの Home Credit Default Risk コンペティションを題材に、機械学習の実践的な流れを学びます。  
>特に適切な 検証 を行い、高い 汎化性能 のあるモデルを完成させることを目指します。



## 【問題1】クロスバリデーション  
>事前学習期間では検証データをはじめに分割しておき、それに対して指標値を計算することで検証を行っていました。（ホールドアウト法）  
>しかし、分割の仕方により精度は変化します。実践的には クロスバリデーション（交差検証） を行います。  
>分割を複数回行い、それぞれに対して学習と検証を行う方法です。複数回の分割のためにscikit-learnにはKFoldクラスが用意されています。
>
>事前学習期間の課題で作成したベースラインモデルに対してKFoldクラスによるクロスバリデーションを行うコードを作成し実行してください。

In [128]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [129]:
# データセットcsvをpandasに読み込む

# 学習データの読み込み
csv_path = "./Kaggle_data/HomeCredit/application_train.csv" 
df = pd.read_csv(csv_path)

# テストデータの読み込み
csv_path = "./Kaggle_data/HomeCredit/application_test.csv" 
df_test = pd.read_csv(csv_path)

display(df_test)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,,,,,,
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


### ベースラインモデルの作成

week4で作成したベースラインモデルのコードを使用して、再作成。

In [130]:
#　データ欠損の確認
df[['DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].isnull().sum()

DAYS_BIRTH           0
EXT_SOURCE_1    173378
EXT_SOURCE_2       660
EXT_SOURCE_3     60965
dtype: int64

In [131]:
# 必要なデータを抜き出す
df_X = df[['DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
df_y = df[['TARGET']]

df2 = pd.concat([df_X,df_y], axis=1)

# テストデータを抜き出す
df2_t = df_test[['SK_ID_CURR','DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]

# テストデータは欠損値に対して行削除で対応できないため、NaN値を０に変換
df2_t = df2_t.fillna(0)

In [132]:
# 学習データの欠損値がある行を削除
display(df2.isnull().sum())
df_drop = df2.dropna(how="any")
display(df_drop.isnull().sum())

DAYS_BIRTH           0
EXT_SOURCE_1    173378
EXT_SOURCE_2       660
EXT_SOURCE_3     60965
TARGET               0
dtype: int64

DAYS_BIRTH      0
EXT_SOURCE_1    0
EXT_SOURCE_2    0
EXT_SOURCE_3    0
TARGET          0
dtype: int64

In [133]:
# sickit-learnに入れるために特徴量をX、目的変数をyというndarrayに格納
X = np.array(df_drop[['DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])
y = np.array(df_drop['TARGET'])

# 回答作成用のテストデータを用意
X_te = np.array(df2_t[['DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])
print(X_te)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [134]:
from sklearn.model_selection import train_test_split

# 学習データとテストデータを７５：２５で分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(82191, 4)
(82191,)
(27398, 4)
(27398,)


In [135]:
from sklearn.preprocessing import StandardScaler

# 標準化クラスをインスタンス化
scaler = StandardScaler()

# 配列Xの平均と分散を計算して記憶する
scaler.fit(X_train)

# 標準化
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_te_scaled = scaler.transform(X_te)

#表示行数を１０行に設定
np.set_printoptions(threshold=10)

print(X_train_scaled)
print(X_test_scaled)
print(X_te_scaled)

[[ 0.9414237   0.12089212  0.17805643 -0.54442099]
 [ 0.81246831 -0.11377564  0.76685882 -0.36505365]
 [-2.07121013  1.51261204  0.30751971 -1.3924383 ]
 ...
 [ 0.92578498 -0.43315546 -0.01019387 -1.63744548]
 [-0.94035081  1.28792554 -0.45127403 -0.58845805]
 [ 0.59557713  0.34436941  0.5187824  -0.71824731]]
[[-0.25378512 -1.03959966 -2.64195922  1.02367955]
 [-1.6082013   0.31612723 -0.3116659   0.0220315 ]
 [ 0.44406096 -1.7118136  -0.39401592 -1.1640454 ]
 ...
 [ 0.06129674  0.3719305  -1.45409286  0.3542867 ]
 [-0.48734052  0.37369729 -1.65738643 -0.03366449]
 [ 1.43340263 -1.26291866  1.03717696 -0.71824731]]
[[-1.13134836  1.16780826  1.42958234 -1.73650975]
 [-0.82959787  0.27452283 -1.30490442 -0.32865922]
 [-1.33567728 -2.41541416  0.93612425  0.58794846]
 ...
 [-0.28044787  1.0768202   0.5681357  -1.09709016]
 [ 0.22050409 -0.63911997 -0.45904851  0.50796395]
 [ 0.22204233 -2.41541416 -0.39952834 -1.15670194]]


In [136]:
#ロジスティック回帰で学習および推定

from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()#random_state=0
logistic.fit(X_train_scaled, y_train)
y_pred = logistic.predict(X_test_scaled)

In [137]:
base_line = ['ベースライン']

#Accuracy（正解率）
from sklearn.metrics import accuracy_score
print("Accuracy（正解率）： {}".format(accuracy_score(y_test, y_pred)))
base_line.append(accuracy_score(y_test, y_pred))


#Precision（適合率）
from sklearn.metrics import precision_score
print("recision（適合率）： {}".format(precision_score(y_test, y_pred)))
base_line.append(precision_score(y_test, y_pred))

#Recall（再現率）
from sklearn.metrics import recall_score
print("ecall（再現率）： {}".format(recall_score(y_test, y_pred)))
base_line.append(recall_score(y_test, y_pred))

#F値
from sklearn.metrics import f1_score
print("F値： {}".format(f1_score(y_test, y_pred)))
base_line.append(f1_score(y_test, y_pred))


from sklearn.metrics import confusion_matrix
print("混同行列：\n{}".format(confusion_matrix(y_test, y_pred)))
base_line_matrix = confusion_matrix(y_test, y_pred)

Accuracy（正解率）： 0.9267829768596247
recision（適合率）： 0.5909090909090909
ecall（再現率）： 0.006467661691542288
F値： 0.01279527559055118
混同行列：
[[25379     9]
 [ 1997    13]]


In [138]:
# ロジスティック回帰でベースラインを作成

log_reg_pred = logistic.predict_proba(X_te_scaled)[:, 1]

# Submission dataframe
submit_ID = df2_t.loc[:]['SK_ID_CURR']
submit_TARGET = pd.DataFrame(log_reg_pred)

submit = pd.concat([submit_ID, submit_TARGET], axis=1)
submit.columns = ['SK_ID_CURR','TARGET']

display(submit)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.053942
1,100005,0.098597
2,100013,0.138032
3,100028,0.035478
4,100038,0.368593
...,...,...
48739,456221,0.138662
48740,456222,0.354003
48741,456223,0.044679
48742,456224,0.062867


### クロスバリデーションの実施

In [139]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [140]:
import copy
from sklearn.model_selection import KFold

#np.set_printoptions(threshold=100)
np.set_printoptions(threshold=np.inf)

kf = KFold(n_splits=3)

# データ分割を実施
kf.get_n_splits(X)


print(kf)

verification=[]
for train_index, test_index in kf.split(X):
    # train_index : シャッフルしたトレーニングデータのインデックス番号
    # test_index : シャッフルしたテストデータのインデックス番号
    
    #print("TRAIN:", train_index, "TEST:", test_index)
    # インデックス番号を使って、テストデータと検証用データを作成する
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # モデルに直接fitさせないよう別のインスタンスにコピーして学習
    logistic_test = copy.deepcopy(logistic)
    logistic_test.fit(X_train, y_train)
    y_pred = logistic_test.predict(X_test)
    
    verification_result=[]
    
    #log_reg_pred = logistic_test.predict_proba(X_te_scaled)[:, 1]
    #print(roc_auc_score(y_test, log_reg_pred))
    #verification_result.append(roc_auc_score(y_test, log_reg_pred))
    
    #Accuracy（正解率）
    #print("Accuracy（正解率）： {}".format(accuracy_score(y_test, y_pred)))
    verification_result.append(accuracy_score(y_test, y_pred))
    #Precision（適合率）
    #print("recision（適合率）： {}".format(precision_score(y_test, y_pred)))
    verification_result.append(precision_score(y_test, y_pred))
    #Recall（再現率）
    #print("ecall（再現率）： {}".format(recall_score(y_test, y_pred)))
    verification_result.append(recall_score(y_test, y_pred))
    #F値
    #print("F値： {}".format(f1_score(y_test, y_pred)))
    verification_result.append(f1_score(y_test, y_pred))
    #混同行列
    #print("混同行列：\n{}".format(confusion_matrix(y_test, y_pred)))
    #verification_result.append(confusion_matrix(y_test, y_pred))
    #print()
    verification.append(verification_result)

    # 行と列のインデックスようのリストを用意  'ROC_AUC', のぞく
    data_columns=['Accuracy（正解率）', 'Precision（適合率）', 'Recall（再現率）', 'F値' ]

# pandas のデータフレームにする
df_verification = pd.DataFrame(data=verification, columns=data_columns)

display(df_verification)

KFold(n_splits=3, random_state=None, shuffle=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy（正解率）,Precision（適合率）,Recall（再現率）,F値
0,0.927156,0.0,0.0,0.0
1,0.926335,0.0,0.0,0.0
2,0.927564,0.0,0.0,0.0


## 【問題2】グリッドサーチ  

>これまで分類器のパラメータには触れず、デフォルトの設定を使用していました。パラメータの詳細は今後のSprintで学んでいくことになります。機械学習の前提として、パラメータは状況に応じて最適なものを選ぶ必要があります。最適なパラメータを探していくことを パラメータチューニング と呼びます。パラメータチューニングをある程度自動化する単純な方法としては グリッドサーチ があります。
>
>scikit-learnのGridSearchCVを使い、グリッドサーチを行うコードを作成してください。そして、ベースラインモデルに対して何らかしらのパラメータチューニングを行なってください。どのパラメータをチューニングするかは、使用した手法の公式ドキュメントを参考にしてください。
>
>sklearn.model_selection.GridSearchCV — scikit-learn 0.21.3 documentation
>
>GridSearchCVクラスには引数としてモデル、探索範囲、さらにクロスバリデーションを何分割で行うかを与えます。クロスバリデーションの機能も含まれているため、これを使用する場合はKFoldクラスを利用する必要はありません。

In [141]:
# sickit-learnに入れるために特徴量をX、目的変数をyというndarrayに格納
X = np.array(df_drop[['DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])
y = np.array(df_drop['TARGET'])

In [142]:
# 学習データとテストデータを７５：２５で分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(82191, 4)
(82191,)
(27398, 4)
(27398,)


In [143]:
from sklearn.preprocessing import StandardScaler

# 標準化クラスをインスタンス化
scaler = StandardScaler()

# 配列Xの平均と分散を計算して記憶する
scaler.fit(X_train)

# 標準化
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_te_scaled = scaler.transform(X_te)

#表示行数を１０行に設定
np.set_printoptions(threshold=10)

print(X_train_scaled)
print(X_test_scaled)
print(X_te_scaled)

[[ 0.9414237   0.12089212  0.17805643 -0.54442099]
 [ 0.81246831 -0.11377564  0.76685882 -0.36505365]
 [-2.07121013  1.51261204  0.30751971 -1.3924383 ]
 ...
 [ 0.92578498 -0.43315546 -0.01019387 -1.63744548]
 [-0.94035081  1.28792554 -0.45127403 -0.58845805]
 [ 0.59557713  0.34436941  0.5187824  -0.71824731]]
[[-0.25378512 -1.03959966 -2.64195922  1.02367955]
 [-1.6082013   0.31612723 -0.3116659   0.0220315 ]
 [ 0.44406096 -1.7118136  -0.39401592 -1.1640454 ]
 ...
 [ 0.06129674  0.3719305  -1.45409286  0.3542867 ]
 [-0.48734052  0.37369729 -1.65738643 -0.03366449]
 [ 1.43340263 -1.26291866  1.03717696 -0.71824731]]
[[-1.13134836  1.16780826  1.42958234 -1.73650975]
 [-0.82959787  0.27452283 -1.30490442 -0.32865922]
 [-1.33567728 -2.41541416  0.93612425  0.58794846]
 ...
 [-0.28044787  1.0768202   0.5681357  -1.09709016]
 [ 0.22050409 -0.63911997 -0.45904851  0.50796395]
 [ 0.22204233 -2.41541416 -0.39952834 -1.15670194]]


In [144]:
#ロジスティック回帰で学習および推定

from sklearn.linear_model import LogisticRegression

# L1正規化を実行するために　solver='saga'　に変更。ワーニング防止のため、max_iter=500
logistic = LogisticRegression(solver='saga', max_iter=500)
logistic.fit(X_train_scaled, y_train)
y_pred = logistic.predict(X_test_scaled)

LogisticRegressionのパラメーターの一覧を表示

In [145]:
logistic.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

現在のデフォルトパラメーターの一覧を表示

In [146]:
logistic.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 500,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'saga',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

'penalty'と'C'を用いてグリッドサーチを実施。

In [147]:
import pprint


from sklearn.model_selection import GridSearchCV

# 作成したベースラインモデルをコピー
logistic_grid = copy.deepcopy(logistic)

parameters = {'penalty':("l1", "l2"), 'C':[0.01, 10, 50, 100]}
# logistic_grid.get_params().keys()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

clf = GridSearchCV(logistic_grid, parameters, scoring='roc_auc')

clf.fit(X_train_scaled, y_train)

clf_df = pd.DataFrame(clf.cv_results_)
display(clf_df.sort_values('rank_test_score'))


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.128268,0.010157,0.004896,0.000298,0.01,l2,"{'C': 0.01, 'penalty': 'l2'}",0.755339,0.739959,0.741482,0.752452,0.746901,0.747226,0.005984,1
4,0.189871,0.048436,0.005981,0.000776,50.0,l1,"{'C': 50, 'penalty': 'l1'}",0.755168,0.739947,0.741394,0.752553,0.747007,0.747214,0.005975,2
7,0.239178,0.052316,0.006363,0.000747,100.0,l2,"{'C': 100, 'penalty': 'l2'}",0.755166,0.739947,0.741395,0.752552,0.747006,0.747213,0.005974,3
3,0.161028,0.010298,0.00546,0.000476,10.0,l2,"{'C': 10, 'penalty': 'l2'}",0.755166,0.739947,0.741395,0.752552,0.747006,0.747213,0.005974,4
2,0.215254,0.034656,0.005409,0.000589,10.0,l1,"{'C': 10, 'penalty': 'l1'}",0.755167,0.739947,0.741395,0.752552,0.747006,0.747213,0.005974,5
5,0.196486,0.037279,0.006174,0.000358,50.0,l2,"{'C': 50, 'penalty': 'l2'}",0.755166,0.739947,0.741394,0.752552,0.747006,0.747213,0.005974,6
6,0.23228,0.0627,0.005724,0.000595,100.0,l1,"{'C': 100, 'penalty': 'l1'}",0.755166,0.739947,0.741393,0.752552,0.747006,0.747213,0.005974,7
0,0.144809,0.020234,0.004972,0.000226,0.01,l1,"{'C': 0.01, 'penalty': 'l1'}",0.755573,0.73978,0.741544,0.752025,0.746357,0.747056,0.006017,8


roc_aucにて比較したところ、今回のベースラインモデルでは、全体としては大きな差はないものの、  
C は０.０１〜１０程度が好ましく、大きすぎるとよくないことがわかった。  
正則化についてはCとの組み合わせによって優劣が変わっているため、明確な傾向はみれらない。

## 【問題3】Kaggle Notebooksからの調査  
>KaggleのNotebooksから様々なアイデアを見つけ出して、列挙してください。

- 過学習防止のため早期打ち切りを実施（ロジスティック回帰に有効か不明のため検証）
- ランダムサーチを使用（精度に差が出るかは不明）
- クロスバリデーションをStratifiedKFoldに変更する
- 特徴量にDomain Knowledge Features を追加する（先週の課題では１つだけしか増やしていないので）

https://www.kaggle.com/willkoehrsen/intro-to-model-tuning-grid-and-random-search

早期打ち切りについて

Early Stopping¶  
One of the most important hyperparameters in a Gradient Boosting Machine is the number of estimators (the number of decision trees trained sequentially). We could set this as another hyperparameter in our search, but there's a better method: early stopping. Early stopping means training until the validation error does not decrease for a specified number of iterations. In the case of the GBM, this means training more decision trees, and in this example, we will use early stopping with 100 rounds, meaning that the training will continue until validation error has not decreased for 100 rounds. Then, the number of estimators that yielded the best score on the validation data will be chosen as the number of estimators to use in the final model.

早期打ち切り¶  
勾配ブースティングマシンで最も重要なハイパーパラメータの1つは、推定量の数（順次トレーニングされる決定木の数）です。 これを検索の別のハイパーパラメータとして設定することもできますが、より良い方法があります。早期停止です。 早期停止とは、指定された反復回数で検証エラーが減少しなくなるまでトレーニングすることを意味します。 GBMの場合、これはより多くの決定木をトレーニングすることを意味します。この例では、100ラウンドで早期停止を使用します。つまり、検証エラーが100ラウンド減少しなくなるまでトレーニングが続行されます。 次に、検証データで最高のスコアが得られた推定量の数が、最終モデルで使用する推定量の数として選択されます。

LightGBMとは
LightGBMとは決定木アルゴリズムに基づいた勾配ブースティング（Gradient Boosting）の機械学習フレームワークです。
LightGBMは米マイクロソフト社がスポンサーをしています。

グリッドサーチの他にランダムサーチあり

Random Search  
Random search is surprisingly efficient compared to grid search. Although grid search will find the optimal value of hyperparameters (assuming they are in your grid) eventually, random search will usually find a "close-enough" value in far fewer iterations. This great paper explains why this is so: grid search spends too much time evaluating unpromising regions of the hyperparameter search space because it has to evaluate every single combination in the grid. Random search in contrast, does a better job of exploring the search space and therefore can usually find a good combination of hyperparameters in far fewer iterations.


ランダム検索  
ランダム検索は、グリッド検索と比較して驚くほど効率的です。 グリッド検索は最終的にハイパーパラメータの最適値を見つけますが（グリッド内にあると仮定）、ランダム検索は通常、はるかに少ない反復で「十分に近い」値を見つけます。 この優れた論文では、その理由を説明しています。グリッド検索では、グリッド内のすべての組み合わせを評価する必要があるため、ハイパーパラメータ検索スペースの見込みのない領域の評価に時間がかかりすぎます。 対照的に、ランダム検索は、検索空間を探索する上で優れた仕事をするため、通常、はるかに少ない反復でハイパーパラメーターの適切な組み合わせを見つけることができます。

https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction  


Domain Knowledge Features
専門分野の知識を利用した新しい特徴量の作成

app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT']
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH']


app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain['AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['CREDIT_TERM'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']

https://www.kaggle.com/willkoehrsen/intro-to-model-tuning-grid-and-random-search

## 【問題4】高い汎化性能のモデル作成  
>問題3で見つけたアイデアと、独自のアイデアを組み合わせ高い汎化性能のモデル作りを進めてください。  
>その過程として、何を行うことで、クロスバリデーションの結果がどの程度変化したかを表にまとめてください。

1. ベースラインと同じデータに対してパラメータを増やしてグリッドサーチを実施。  
→最も精度が高いパラメータで学習させ、KFoldとStratifiedKFoldを使用してベースラインと比較しながら汎化性能の高いモデルを構築。
2. ベースラインと同じデータに対して学習回数のパラメーターをグリッドサーチにかけて何回が最適な学習回数なのか検証する。  
（max_iter int、default = 100  ソルバーが収束するために必要な最大反復回数。）

### グリッドサーチのパラメータについて  
Notebookではロジスティック回帰を用いている人が少なく、良い情報が得られなかった。  
インターネットで検索したところ、"C"や "random_state"を使用しているものを見かけた。  
テキストのp.13では"penalty"と"C"が推奨されていた。  
ロジスティック回帰のリファレンスから検討を行う。 

https://qiita.com/FujiedaTaro/items/5784eda386146f1fd6e7

In [25]:
from sklearn.preprocessing import LabelEncoder

In [56]:
from sklearn.model_selection import GridSearchCV

# 作成したベースラインモデルをコピー
logistic_grid = copy.deepcopy(logistic)

parameters = {'penalty':("l1", "l2"), 
              'C':[0.01, 5, 10],
              'fit_intercept':(True, False),
              'class_weight':("balanced", None),
              'random_state':[0, 50],
              'multi_class':("ovr", "multinomial"),
              'n_jobs':[1, 2, 3]
             }

# logistic_grid.get_params().keys()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

clf = GridSearchCV(logistic_grid, parameters, cv=5, scoring='roc_auc')
clf.fit(X_train_scaled, y_train)

pd.set_option('display.max_rows', 300)

clf_df = pd.DataFrame(clf.cv_results_)
display(clf_df.sort_values('rank_test_score'))






Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_fit_intercept,param_multi_class,param_n_jobs,param_penalty,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,0.407797,0.021655,0.007201,0.000921,0.01,balanced,True,ovr,3,l2,0,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755157,0.740038,0.741363,0.752628,0.747014,0.74724,0.00597,1
2,0.446661,0.108773,0.008542,0.002689,0.01,balanced,True,ovr,1,l2,0,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755157,0.740038,0.741363,0.752628,0.747014,0.74724,0.00597,1
6,0.395901,0.015093,0.007224,0.000544,0.01,balanced,True,ovr,2,l2,0,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755157,0.740038,0.741363,0.752628,0.747014,0.74724,0.00597,1
11,0.477764,0.195026,0.007967,0.001912,0.01,balanced,True,ovr,3,l2,50,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755156,0.740037,0.741363,0.752627,0.747013,0.747239,0.005969,4
3,0.374645,0.040786,0.007879,0.001408,0.01,balanced,True,ovr,1,l2,50,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755156,0.740037,0.741363,0.752627,0.747013,0.747239,0.005969,4
7,0.358918,0.014825,0.006709,0.000421,0.01,balanced,True,ovr,2,l2,50,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755156,0.740037,0.741363,0.752627,0.747013,0.747239,0.005969,4
8,0.863062,0.420376,0.008252,0.001471,0.01,balanced,True,ovr,3,l1,0,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755274,0.740044,0.741407,0.752558,0.746907,0.747238,0.005979,7
0,0.645971,0.130614,0.010224,0.000956,0.01,balanced,True,ovr,1,l1,0,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755274,0.740044,0.741407,0.752558,0.746907,0.747238,0.005979,7
4,1.087697,0.585163,0.009759,0.002056,0.01,balanced,True,ovr,2,l1,0,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.755274,0.740044,0.741407,0.752558,0.746907,0.747238,0.005979,7
14,1.163186,0.036169,0.007582,0.001118,0.01,balanced,True,multinomial,1,l2,0,"{'C': 0.01, 'class_weight': 'balanced', 'fit_i...",0.75513,0.740028,0.741349,0.752653,0.747031,0.747238,0.005972,10


最もroc_aucが高かったパラメーターの組み合わせは以下の通り。  
param_C	0.01	  
param_class_weight	balanced	 
param_fit_intercept	True  
param_multi_class  	ovr  
param_n_jobs	３ 
param_penalty	l2  
param_random_state  ０
						

### 最適パラメータで交差検証用のモデルを作成

In [148]:
# ロジスティック回帰

logistic_grid1 = LogisticRegression(C=0.01, class_weight='balanced', fit_intercept=True, multi_class='ovr', n_jobs=3, penalty='l2', random_state=0)

#### KFoldによる検証

In [149]:
import copy
from sklearn.model_selection import KFold

#np.set_printoptions(threshold=100)
np.set_printoptions(threshold=np.inf)

kf = KFold(n_splits=5)

# データ分割を実施
kf.get_n_splits(X)


print(kf)


verification = []
for train_index, test_index in kf.split(X):
    # train_index : シャッフルしたトレーニングデータのインデックス番号
    # test_index : シャッフルしたテストデータのインデックス番号
    
    #print("TRAIN:", train_index, "TEST:", test_index)
    # インデックス番号を使って、テストデータと検証用データを作成する
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # モデルに直接fitさせないよう別のインスタンスにコピーして学習
    logistic_test = copy.deepcopy(logistic_grid1)
    logistic_test.fit(X_train, y_train)
    y_pred = logistic_test.predict(X_test)
    
    verification_result=[]
    
    #log_reg_pred = logistic_test.predict_proba(X_te_scaled)[:, 1]
    #print(roc_auc_score(y_test, log_reg_pred))
    #verification_result.append(roc_auc_score(y_test, log_reg_pred))
    
    #Accuracy（正解率）
    #print("Accuracy（正解率）： {}".format(accuracy_score(y_test, y_pred)))
    verification_result.append(accuracy_score(y_test, y_pred))
    #Precision（適合率）
    #print("recision（適合率）： {}".format(precision_score(y_test, y_pred)))
    verification_result.append(precision_score(y_test, y_pred))
    #Recall（再現率）
    #print("ecall（再現率）： {}".format(recall_score(y_test, y_pred)))
    verification_result.append(recall_score(y_test, y_pred))
    #F値
    #print("F値： {}".format(f1_score(y_test, y_pred)))
    verification_result.append(f1_score(y_test, y_pred))
    #混同行列
    #print("混同行列：\n{}".format(confusion_matrix(y_test, y_pred)))
    #verification_result.append(confusion_matrix(y_test, y_pred))
    #print()
    verification.append(verification_result)
    
    
    # 行と列のインデックスようのリストを用意 'ROC_AUC', 
    data_columns=['Accuracy（正解率）', 'Precision（適合率）', 'Recall（再現率）', 'F値' ]

    # pandas のデータフレームにする
df_verification = pd.DataFrame(data=verification, columns=data_columns)

display(df_verification)

KFold(n_splits=5, random_state=None, shuffle=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy（正解率）,Precision（適合率）,Recall（再現率）,F値
0,0.927639,0.0,0.0,0.0
1,0.927229,0.0,0.0,0.0
2,0.925586,0.0,0.0,0.0
3,0.926773,0.0,0.0,0.0
4,0.927864,0.0,0.0,0.0


#### StratifiedKFoldによる検証

In [163]:
import copy
from sklearn.model_selection import StratifiedKFold

#np.set_printoptions(threshold=100)
np.set_printoptions(threshold=np.inf)

skf = StratifiedKFold(n_splits=5)

# データ分割を実施
skf.get_n_splits(X, y)


print(skf)


verification = []
for train_index, test_index in skf.split(X, y):
    # train_index : シャッフルしたトレーニングデータのインデックス番号
    # test_index : シャッフルしたテストデータのインデックス番号
    
    #print("TRAIN:", train_index, "TEST:", test_index)
    # インデックス番号を使って、テストデータと検証用データを作成する
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # モデルに直接fitさせないよう別のインスタンスにコピーして学習
    logistic_test = copy.deepcopy(logistic_grid1)
    logistic_test.fit(X_train, y_train)
    y_pred = logistic_test.predict(X_test)
    
    verification_result=[]

    #log_reg_pred = logistic_test.predict_proba(X_te_scaled_bin)[:, 1]
    #print(roc_auc_score(y_test, log_reg_pred))
    #verification_result.append(roc_auc_score(y_test, log_reg_pred))
    
    #Accuracy（正解率）
    #print("Accuracy（正解率）： {}".format(accuracy_score(y_test, y_pred)))
    verification_result.append(accuracy_score(y_test, y_pred))
    #Precision（適合率）
    #print("recision（適合率）： {}".format(precision_score(y_test, y_pred)))
    verification_result.append(precision_score(y_test, y_pred))
    #Recall（再現率）
    #print("ecall（再現率）： {}".format(recall_score(y_test, y_pred)))
    verification_result.append(recall_score(y_test, y_pred))
    #F値
    #print("F値： {}".format(f1_score(y_test, y_pred)))
    verification_result.append(f1_score(y_test, y_pred))
    #混同行列
    #print("混同行列：\n{}".format(confusion_matrix(y_test, y_pred)))
    #verification_result.append(confusion_matrix(y_test, y_pred))
    #print()
    
    verification.append(verification_result)
    

    # 行と列のインデックスようのリストを用意  'ROC_AUC', 
    data_columns=['Accuracy（正解率）', 'Precision（適合率）', 'Recall（再現率）', 'F値' ]

    # pandas のデータフレームにする
df_verification = pd.DataFrame(data=verification, columns=data_columns)

display(df_verification)

KFold(n_splits=5, random_state=None, shuffle=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy（正解率）,Precision（適合率）,Recall（再現率）,F値
0,0.927001,0.0,0.0,0.0
1,0.927001,0.0,0.0,0.0
2,0.927001,0.0,0.0,0.0
3,0.927046,0.0,0.0,0.0
4,0.927043,0.0,0.0,0.0


#### 早期打ち切りの検証 

In [155]:
# sickit-learnに入れるために特徴量をX、目的変数をyというndarrayに格納
X = np.array(df_drop[['DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])
y = np.array(df_drop['TARGET'])

# 回答作成用のテストデータを用意
X_te = np.array(df2_t[['DAYS_BIRTH','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']])
print(X_te)

# 学習データとテストデータを７５：２５で分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# 標準化クラスをインスタンス化
scaler = StandardScaler()

# 配列Xの平均と分散を計算して記憶する
scaler.fit(X_train)

# 標準化
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_te_scaled = scaler.transform(X_te)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [156]:
from sklearn.model_selection import GridSearchCV

# 作成したモデルをコピー
logistic_grid = copy.deepcopy(logistic_grid1)

parameters = {'max_iter':[25, 50, 75,100]}

# logistic_grid.get_params().keys()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

clf = GridSearchCV(logistic_grid, parameters, cv=5, scoring='roc_auc')
clf.fit(X_train_scaled, y_train)

pd.set_option('display.max_rows', 300)

clf_df = pd.DataFrame(clf.cv_results_)
display(clf_df.sort_values('rank_test_score'))


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.081866,0.006101,0.005364,0.000497,25,{'max_iter': 25},0.755157,0.740037,0.741363,0.752627,0.747014,0.74724,0.005969,1
1,0.093411,0.01121,0.006667,0.000494,50,{'max_iter': 50},0.755157,0.740037,0.741363,0.752627,0.747014,0.74724,0.005969,1
2,0.108677,0.007304,0.007369,0.001083,75,{'max_iter': 75},0.755157,0.740037,0.741363,0.752627,0.747014,0.74724,0.005969,1
3,0.114677,0.004517,0.007937,0.000689,100,{'max_iter': 100},0.755157,0.740037,0.741363,0.752627,0.747014,0.74724,0.005969,1


### 最適パラメータで交差検証用のモデルを作成

In [160]:
# ロジスティック回帰

logistic_grid1 = LogisticRegression(max_iter=25, C=0.01, class_weight='balanced', fit_intercept=True, multi_class='ovr', n_jobs=3, penalty='l2', random_state=0)

#### KFoldによる検証

In [161]:
import copy
from sklearn.model_selection import KFold

#np.set_printoptions(threshold=100)
np.set_printoptions(threshold=np.inf)

kf = KFold(n_splits=5)

# データ分割を実施
kf.get_n_splits(X)


print(kf)


verification = []
for train_index, test_index in kf.split(X):
    # train_index : シャッフルしたトレーニングデータのインデックス番号
    # test_index : シャッフルしたテストデータのインデックス番号
    
    #print("TRAIN:", train_index, "TEST:", test_index)
    # インデックス番号を使って、テストデータと検証用データを作成する
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # モデルに直接fitさせないよう別のインスタンスにコピーして学習
    logistic_test = copy.deepcopy(logistic_grid1)
    logistic_test.fit(X_train, y_train)
    y_pred = logistic_test.predict(X_test)
    
    verification_result=[]
    
    #log_reg_pred = logistic_test.predict_proba(X_te_scaled_bin)[:, 1]
    #print(roc_auc_score(y_test, log_reg_pred))
    #verification_result.append(roc_auc_score(y_test, log_reg_pred))
    
    #Accuracy（正解率）
    #print("Accuracy（正解率）： {}".format(accuracy_score(y_test, y_pred)))
    verification_result.append(accuracy_score(y_test, y_pred))
    #Precision（適合率）
    #print("recision（適合率）： {}".format(precision_score(y_test, y_pred)))
    verification_result.append(precision_score(y_test, y_pred))
    #Recall（再現率）
    #print("ecall（再現率）： {}".format(recall_score(y_test, y_pred)))
    verification_result.append(recall_score(y_test, y_pred))
    #F値
    #print("F値： {}".format(f1_score(y_test, y_pred)))
    verification_result.append(f1_score(y_test, y_pred))
    #混同行列
    #print("混同行列：\n{}".format(confusion_matrix(y_test, y_pred)))
    #verification_result.append(confusion_matrix(y_test, y_pred))
    #print()
    verification.append(verification_result)
    
    
    # 行と列のインデックスようのリストを用意 'ROC_AUC', 
    data_columns=['Accuracy（正解率）', 'Precision（適合率）', 'Recall（再現率）', 'F値' ]

    # pandas のデータフレームにする
df_verification = pd.DataFrame(data=verification, columns=data_columns)

display(df_verification)

KFold(n_splits=5, random_state=None, shuffle=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy（正解率）,Precision（適合率）,Recall（再現率）,F値
0,0.927639,0.0,0.0,0.0
1,0.927229,0.0,0.0,0.0
2,0.925586,0.0,0.0,0.0
3,0.926773,0.0,0.0,0.0
4,0.927864,0.0,0.0,0.0


#### StratifiedKFoldによる検証

In [164]:
import copy
from sklearn.model_selection import StratifiedKFold

#np.set_printoptions(threshold=100)
np.set_printoptions(threshold=np.inf)

skf = StratifiedKFold(n_splits=5)

# データ分割を実施
skf.get_n_splits(X, y)


print(skf)


verification = []
for train_index, test_index in skf.split(X, y):
    # train_index : シャッフルしたトレーニングデータのインデックス番号
    # test_index : シャッフルしたテストデータのインデックス番号
    
    #print("TRAIN:", train_index, "TEST:", test_index)
    # インデックス番号を使って、テストデータと検証用データを作成する
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # モデルに直接fitさせないよう別のインスタンスにコピーして学習
    logistic_test = copy.deepcopy(logistic_grid1)
    logistic_test.fit(X_train, y_train)
    y_pred = logistic_test.predict(X_test)
    
    verification_result=[]

    #log_reg_pred = logistic_test.predict_proba(X_te_scaled_bin)[:, 1]
    #print(roc_auc_score(y_test, log_reg_pred))
    #verification_result.append(roc_auc_score(y_test, log_reg_pred))
    
    #Accuracy（正解率）
    #print("Accuracy（正解率）： {}".format(accuracy_score(y_test, y_pred)))
    verification_result.append(accuracy_score(y_test, y_pred))
    #Precision（適合率）
    #print("recision（適合率）： {}".format(precision_score(y_test, y_pred)))
    verification_result.append(precision_score(y_test, y_pred))
    #Recall（再現率）
    #print("ecall（再現率）： {}".format(recall_score(y_test, y_pred)))
    verification_result.append(recall_score(y_test, y_pred))
    #F値
    #print("F値： {}".format(f1_score(y_test, y_pred)))
    verification_result.append(f1_score(y_test, y_pred))
    #混同行列
    #print("混同行列：\n{}".format(confusion_matrix(y_test, y_pred)))
    #verification_result.append(confusion_matrix(y_test, y_pred))
    #print()
    
    verification.append(verification_result)
    

    # 行と列のインデックスようのリストを用意  'ROC_AUC', 
    data_columns=['Accuracy（正解率）', 'Precision（適合率）', 'Recall（再現率）', 'F値' ]

    # pandas のデータフレームにする
df_verification = pd.DataFrame(data=verification, columns=data_columns)

display(df_verification)

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy（正解率）,Precision（適合率）,Recall（再現率）,F値
0,0.927001,0.0,0.0,0.0
1,0.927001,0.0,0.0,0.0
2,0.927001,0.0,0.0,0.0
3,0.927046,0.0,0.0,0.0
4,0.927043,0.0,0.0,0.0
