# Sprint課題 機械学習フロー

In [99]:
import pandas as pd
import numpy as np
import statistics

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, KFold, ShuffleSplit

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_curve, auc

#### データ前処理をしてから課題に移る

In [2]:
test_data_raw = pd.read_csv("application_test.csv")
train_data_raw = pd.read_csv("application_train.csv")

In [3]:
train_data = train_data_raw.copy(deep=True)
test_data = test_data_raw.copy(deep=True)

In [4]:
#欠損値がある列を削除
droped_train = train_data.dropna(axis=1)
droped_test = test_data.dropna(axis=1)

In [5]:
#train, testのデータで共通する列を抜きだす
drop_column_intersection = droped_train.columns & droped_test.columns

In [6]:
droped_train_inter = train_data[drop_column_intersection]
droped_train_inter.insert(1, "TARGET", train_data.loc[:, "TARGET"])
droped_test_inter = test_data[drop_column_intersection]

In [7]:
#機械学習させる上で文字列を使えないので、objectタイプを削除する
on_train = droped_train_inter.select_dtypes(include="number")
on_test = droped_test_inter.select_dtypes(include="number")

In [8]:
#機械学習用のtrain_test_splitのために説明変数と目的変数を分ける
X = on_train.drop(columns="TARGET")
y = on_train["TARGET"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [10]:
RFC = RandomForestClassifier(n_estimators=10)
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
y_pred_1 = RFC.predict(X_test)
fpr_1, tpr_1, thresholds_1 = roc_curve(y_test, y_pred_1)
auc_1 = auc(fpr_1, tpr_1)
print("accuracy：", accuracy_score(y_test, y_pred_1))
print("recall:", recall_score(y_test, y_pred_1))
print("precision:", precision_score(y_test, y_pred_1))
print("f1:",f1_score(y_test, y_pred_1))
print("matrix:\n", confusion_matrix(y_test, y_pred_1))
print("-"*20)
print("auc:",auc_1)

accuracy： 0.9200811675641927
recall: 0.0016417665407978984
precision: 0.136986301369863
f1: 0.0032446463335496427
matrix:
 [[70724    63]
 [ 6081    10]]
--------------------
auc: 0.5003758863076798


In [14]:
features = X_train.columns
importances = RFC.feature_importances_

print("重要度が高い特徴量順:")
print(sorted(zip(map(lambda x: round(x, 2), RFC.feature_importances_), features), 
             reverse=True))

重要度が高い特徴量順:
[(0.12, 'SK_ID_CURR'), (0.12, 'DAYS_BIRTH'), (0.11, 'DAYS_REGISTRATION'), (0.11, 'DAYS_ID_PUBLISH'), (0.1, 'DAYS_EMPLOYED'), (0.1, 'AMT_CREDIT'), (0.08, 'REGION_POPULATION_RELATIVE'), (0.08, 'AMT_INCOME_TOTAL'), (0.07, 'HOUR_APPR_PROCESS_START'), (0.02, 'CNT_CHILDREN'), (0.01, 'REG_CITY_NOT_WORK_CITY'), (0.01, 'REGION_RATING_CLIENT_W_CITY'), (0.01, 'REGION_RATING_CLIENT'), (0.01, 'FLAG_WORK_PHONE'), (0.01, 'FLAG_PHONE'), (0.01, 'FLAG_EMAIL'), (0.01, 'FLAG_DOCUMENT_3'), (0.0, 'REG_REGION_NOT_WORK_REGION'), (0.0, 'REG_REGION_NOT_LIVE_REGION'), (0.0, 'REG_CITY_NOT_LIVE_CITY'), (0.0, 'LIVE_REGION_NOT_WORK_REGION'), (0.0, 'LIVE_CITY_NOT_WORK_CITY'), (0.0, 'FLAG_MOBIL'), (0.0, 'FLAG_EMP_PHONE'), (0.0, 'FLAG_DOCUMENT_9'), (0.0, 'FLAG_DOCUMENT_8'), (0.0, 'FLAG_DOCUMENT_7'), (0.0, 'FLAG_DOCUMENT_6'), (0.0, 'FLAG_DOCUMENT_5'), (0.0, 'FLAG_DOCUMENT_4'), (0.0, 'FLAG_DOCUMENT_21'), (0.0, 'FLAG_DOCUMENT_20'), (0.0, 'FLAG_DOCUMENT_2'), (0.0, 'FLAG_DOCUMENT_19'), (0.0, 'FLAG_DOCUMENT_18'),

<pre>
使う特徴量を5つにする
'DAYS_REGISTRATION'
'DAYS_BIRTH'
'DAYS_ID_PUBLISH'
'DAYS_EMPLOYED'
'AMT_CREDIT'

In [15]:
importance_feature = ["DAYS_REGISTRATION", "DAYS_BIRTH", "DAYS_ID_PUBLISH", "DAYS_EMPLOYED", "AMT_CREDIT"]
X_5 = on_train[importance_feature]
y = on_train["TARGET"]
test_data_5 = on_test[importance_feature]

## 【問題1】クロスバリデーション
事前学習期間では検証用データをはじめに分割しておき、それに対して指標値を計算することで検証を行っていました。（ホールドアウト法）しかし、分割の仕方により精度は変化します。実践的には クロスバリデーション（交差検証） を行います。分割を複数回行い、それぞれに対して学習と検証を行う方法です。複数回の分割のためにscikit-learnにはKFoldクラスが用意されています。

事前学習期間の課題で作成したベースラインモデルに対してKFoldクラスによるクロスバリデーションを行うコードを作成し実行してください。

In [16]:
SKF = StratifiedKFold(n_splits=3)
for train_index, test_index in SKF.split(X_5, y):
    print("TRAIN:", train_index, "TEST:", train_index)

TRAIN: [101986 101997 102002 ... 307508 307509 307510] TEST: [101986 101997 102002 ... 307508 307509 307510]
TRAIN: [     0      1      2 ... 307508 307509 307510] TEST: [     0      1      2 ... 307508 307509 307510]
TRAIN: [     0      1      2 ... 205109 205110 205111] TEST: [     0      1      2 ... 205109 205110 205111]


In [17]:
result_1 = cross_val_score(RFC ,X_5 ,y , cv = SKF , scoring = "roc_auc")
result_1.mean()

0.5474147875075216

## 【問題2】グリッドサーチ
<pre>
これまで分類器のパラメータには触れず、デフォルトの設定を使用していました。パラメータの詳細は今後のSprintで学んでいくことになります。機械学習の前提として、パラメータは状況に応じて最適なものを選ぶ必要があります。最適なパラメータを探していくことを パラメータチューニング と呼びます。パラメータチューニングをある程度自動化する単純な方法としては グリッドサーチ があります。

scikit-learnのGridSearchCVを使い、グリッドサーチを行うコードを作成してください。そして、ベースラインモデルに対して何らかしらのパラメータチューニングを行なってください。どのパラメータをチューニングするかは、使用した手法の公式ドキュメントを参考にしてください。

sklearn.model_selection.GridSearchCV — scikit-learn 0.21.3 documentation

GridSearchCVクラスには引数としてモデル、探索範囲、さらにクロスバリデーションを何分割で行うかを与えます。クロスバリデーションの機能も含まれているため、これを使用する場合はKFoldクラスを利用する必要はありません。

In [18]:
grid_param = {"n_estimators": [10, 20, 30],
                          "criterion": ["gini", "entropy"]}
gscv_1 = GridSearchCV(RFC , param_grid=grid_param , cv=3 , scoring="roc_auc", return_train_score=True)

In [19]:
gscv_1.fit(X_5, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 20, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [20]:
#結果をデータフレームとして保存
gscv_result_1 = pd.DataFrame.from_dict(gscv_1.cv_results_)
#最も良いパラメータを保存
best_params_1 = gscv_1.best_params_
#ベストスコアを保存
gscv_1.best_score_
#ベストスコアを出したモデルを保存
best_RFC = gscv_1.best_estimator_

In [27]:
y_pred = best_RFC.predict_proba(test_data_5)

submit_data_1 = test_data_5.copy(deep=True)

submit_data_1.insert(0, "SK_ID_CURR", on_test.loc[:, "SK_ID_CURR"])

submit_data_1.loc[:, "TARGET"] = y_pred[:,1]

submit_data_1 = submit_data_1.loc[:, ["SK_ID_CURR", "TARGET"]]

submit_1.to_csv("submit_1.csv", index=False)

In [30]:
print("ローカルのAUC:", gscv_1.best_score_)
print("kaggleに提出した結果の１AUC:", 0.56855)

ローカルのAUC: 0.5688849571345059
kaggleに提出した結果の１AUC: 0.56855


### 【問題3】Kernelからの調査
KaggleのKernelから様々なアイデアを見つけ出して、列挙してください。

#### Improved Model: Random Forest¶
<pre>
To try and beat the poor performance of our baseline, we can update the algorithm.
Let's try using a Random Forest on the same training data to see how that affects performance. 
The Random Forest is a much more powerful model especially when we use hundreds of trees. 
We will use 100 trees in the random forest.

ランダムフォレストのn-estimators＝100を試しているので、n-estimatorsを変えてみる

## 【問題4】高い汎化性能のモデル作成
問題3で見つけたアイデアと、独自のアイデアを組み合わせ高い汎化性能のモデル作りを進めてください。

その過程として、何を行うことで、クロスバリデーションの結果がどの程度変化したかを表にまとめてください。

In [32]:
grid_param_2 = {"n_estimators": [10, 100, 300]}
gscv_2 = GridSearchCV(RFC , param_grid=grid_param_2 , cv=3 , scoring="roc_auc", return_train_score=True)

In [33]:
gscv_2.fit(X_5, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 100, 300]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [34]:
gscv_result_2 = pd.DataFrame.from_dict(gscv_2.cv_results_)

best_params_2 = gscv_2.best_params_

gscv_2.best_score_

best_RFC_2 = gscv_2.best_estimator_

In [49]:
y_pred_2 = best_RFC_2.predict_proba(test_data_5)

submit_data_2 = test_data_5.copy(deep=True)

submit_data_2.insert(0, "SK_ID_CURR", on_test.loc[:, "SK_ID_CURR"])

submit_data_2.loc[:, "TARGET"] = y_pred_2[:,1]

submit_data_2 = submit_data_2.loc[:, ["SK_ID_CURR", "TARGET"]]

submit_data_2.to_csv("submit_2.csv", index=False)

In [42]:
print("ローカルのAUC:", gscv_2.best_score_)
print("kaggleに提出した結果の１AUC:", 0.59282)

ローカルのAUC: 0.5868421136799019
kaggleに提出した結果の１AUC: 0.59282


In [38]:
grid_param_3 ={"max_depth":[5, 8, 15],
             "max_leaf_nodes":[2, None]}
gscv_3 = GridSearchCV(RFC , param_grid=grid_param_3 , cv=3 , scoring="roc_auc", return_train_score=True)

In [39]:
gscv_3.fit(X_5, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [5, 8, 15], 'max_leaf_nodes': [2, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [40]:
gscv_result_3 = pd.DataFrame.from_dict(gscv_3.cv_results_)

best_params_3 = gscv_3.best_params_

gscv_3.best_score_

best_RFC_3 = gscv_3.best_estimator_

In [50]:
y_pred_3 = best_RFC_3.predict_proba(test_data_5)

submit_data_3 = test_data_5.copy(deep=True)

submit_data_3.insert(0, "SK_ID_CURR", on_test.loc[:, "SK_ID_CURR"])

submit_data_3.loc[:, "TARGET"] = y_pred_3[:,1]

submit_data_3 = submit_data_3.loc[:, ["SK_ID_CURR", "TARGET"]]

submit_data_3.to_csv("submit_3.csv", index=False)

In [45]:
print("ローカルのAUC:", gscv_3.best_score_)
print("kaggleに提出した結果のAUC:", 0.62332)

ローカルのAUC: 0.620469089921196
kaggleに提出した結果のAUC: 0.62332


## 【問題5】最終的なモデルの選定
最終的にこれは良いというモデルを選び、推定した結果をKaggleに提出してスコアを確認してください。どういったアイデアを取り入れ、どの程度のスコアになったかを記載してください。

In [46]:
best_RF = RandomForestClassifier(n_estimators=300, 
                                criterion="gini",
                                max_depth=8,
                                 max_leaf_nodes=None)

In [47]:
best_RF.fit(X_5, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [51]:
y_pred_best = best_RF.predict_proba(test_data_5)

submit_data_best = test_data_5.copy(deep=True)

submit_data_best.insert(0, "SK_ID_CURR", on_test.loc[:, "SK_ID_CURR"])

submit_data_best.loc[:, "TARGET"] = y_pred_best[:,1]

submit_data_best = submit_data_best.loc[:, ["SK_ID_CURR", "TARGET"]]

submit_data_best.to_csv("submit_4.csv", index=False)

In [53]:
result_cvs_best_RF = cross_val_score(best_RF ,X_5 ,y , cv = 3 , scoring = "roc_auc")

In [55]:
print("ローカルのAUC:", result_cvs_best_RF.mean())
print("kaggleに提出した結果のAUC:", 0.62931)

ローカルのAUC: 0.625736438551944
kaggleに提出した結果のAUC: 0.62931


## まとめ

In [61]:
cross_val_result = pd.DataFrame({"number of features": [44, 5, 5, 5, 5, 5],
                                "parameters": [str(RFC.get_params()), str(RFC.get_params()),str(best_RFC.get_params()), 
                                               str(best_RFC_2.get_params()), str(best_RFC_3.get_params()),str(best_RF.get_params())],
                                "AUC": [auc_1, result_1.mean(), 0.56855, 0.59282,0.62332, 0.62931]},
                                index=["Normal RF", "Normal RF 5 features", "1st GSCV RF", "2nd GSCV RF", "3rd GSCV RF", "Final GSCV RF"])

In [63]:
cross_val_result

Unnamed: 0,number of features,parameters,AUC
Normal RF,44,"{'bootstrap': True, 'class_weight': None, 'cri...",0.500376
Normal RF 5 features,5,"{'bootstrap': True, 'class_weight': None, 'cri...",0.547415
1st GSCV RF,5,"{'bootstrap': True, 'class_weight': None, 'cri...",0.56855
2nd GSCV RF,5,"{'bootstrap': True, 'class_weight': None, 'cri...",0.59282
3rd GSCV RF,5,"{'bootstrap': True, 'class_weight': None, 'cri...",0.62332
Final GSCV RF,5,"{'bootstrap': True, 'class_weight': None, 'cri...",0.62931


In [66]:
print("最後のモデルが最も高いAUCを返すモデルだった\n","パラメータ\n", str(best_RF.get_params()), "\n")
print("-"*30)
print("AUC:", 0.62931)

最後のモデルが最も高いAUCを返すモデルだった
 パラメータ
 {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 

------------------------------
AUC: 0.62931


### おまけ

In [73]:
rfc_sub = RandomForestClassifier(n_estimators=10)

In [95]:
k=1
splits=10
avg_score_1= []
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=200)
print('\nStarting KFold iterations...')
for train_index,test_index in skf.split(X_5,y):
    df_X=X_5.iloc[train_index,:]
    df_y=y.iloc[train_index]
    val_X=X_5.iloc[test_index,:]
    val_y=y.iloc[test_index]

    rfc_sub.fit(df_X,df_y)

    pred_x = rfc_sub.predict(val_X)
    fpr_sub, tpr_sub, thresholds_1 = roc_curve(val_y, pred_x)
    auc_sub = auc(fpr_sub, tpr_sub)
    
    avg_score_1.append(auc_sub)
    
    print('Iteration:',k,'  roc_auc_score:',auc_sub)
    k+= 1
print("平均値(AUC):", statistics.mean(avg_score_1))
print("中央値:", statistics.median(avg_score_1))
print("分散:", statistics.variance(avg_score_1))
print("標準偏差:", statistics.stdev(avg_score_1))


Starting KFold iterations...
Iteration: 1   roc_auc_score: 0.500323854907132
Iteration: 2   roc_auc_score: 0.5004014136839413
Iteration: 3   roc_auc_score: 0.5004122910032089
Iteration: 4   roc_auc_score: 0.50204093271296
Iteration: 5   roc_auc_score: 0.5000340494997381
Iteration: 6   roc_auc_score: 0.5002005311638683
Iteration: 7   roc_auc_score: 0.5004304374829749
Iteration: 8   roc_auc_score: 0.5009286617430699
Iteration: 9   roc_auc_score: 0.5010347888125478
Iteration: 10   roc_auc_score: 0.500727211299879
平均値(AUC): 0.500653417230932
中央値: 0.5004213642430919
分散: 3.356166352277129e-07
標準偏差: 0.0005793242919364878


In [97]:
k=1
splits=10
avg_score_2= []
k_fold = KFold(n_splits = splits, shuffle = True, random_state = 200)
print('\nStarting KFold iterations...')
for train_index,test_index in k_fold.split(X_5,y):
    df_X=X_5.iloc[train_index,:]
    df_y=y.iloc[train_index]
    val_X=X_5.iloc[test_index,:]
    val_y=y.iloc[test_index]

    rfc_sub.fit(df_X,df_y)

    pred_x = rfc_sub.predict(val_X)
    fpr_sub, tpr_sub, thresholds_1 = roc_curve(val_y, pred_x)
    auc_sub = auc(fpr_sub, tpr_sub)
    
    avg_score_2.append(auc_sub)
    
    print('Iteration:',k,'  roc_auc_score:',auc_sub)
    k+= 1
print("-"*50)
print("平均値(AUC):", statistics.mean(avg_score_2))
print("中央値:", statistics.median(avg_score_2))
print("分散:", statistics.variance(avg_score_2))
print("標準偏差:", statistics.stdev(avg_score_2))


Starting KFold iterations...
Iteration: 1   roc_auc_score: 0.49998799582016157
Iteration: 2   roc_auc_score: 0.5011869140283971
Iteration: 3   roc_auc_score: 0.500487530352102
Iteration: 4   roc_auc_score: 0.5003300060145056
Iteration: 5   roc_auc_score: 0.5006521932081408
Iteration: 6   roc_auc_score: 0.4995519971563201
Iteration: 7   roc_auc_score: 0.5012948809188171
Iteration: 8   roc_auc_score: 0.5003321599318328
Iteration: 9   roc_auc_score: 0.5004330424898812
Iteration: 10   roc_auc_score: 0.5011547042162539
--------------------------------------------------
平均値(AUC): 0.5005411424136412
中央値: 0.5004602864209916
分散: 3.0679163050607624e-07
標準偏差: 0.0005538877417907678


In [100]:
k=1
splits=10
avg_score_3= []
ss = ShuffleSplit(n_splits=splits, random_state=200)
print('\nStarting ShuffleSplit iterations...')
for train_index,test_index in ss.split(X_5,y):
    df_X=X_5.iloc[train_index,:]
    df_y=y.iloc[train_index]
    val_X=X_5.iloc[test_index,:]
    val_y=y.iloc[test_index]

    rfc_sub.fit(df_X,df_y)

    pred_x = rfc_sub.predict(val_X)
    fpr_sub, tpr_sub, thresholds_1 = roc_curve(val_y, pred_x)
    auc_sub = auc(fpr_sub, tpr_sub)
    
    avg_score_3.append(auc_sub)
    
    print('Iteration:',k,'  roc_auc_score:',auc_sub)
    k+= 1

print("-"*50)
print("平均値(AUC):", statistics.mean(avg_score_3))
print("中央値:", statistics.median(avg_score_3))
print("分散:", statistics.variance(avg_score_3))
print("標準偏差:", statistics.stdev(avg_score_3))


Starting ShuffleSplit iterations...
Iteration: 1   roc_auc_score: 0.499822902874039
Iteration: 2   roc_auc_score: 0.49996065197572365
Iteration: 3   roc_auc_score: 0.5015710126549928
Iteration: 4   roc_auc_score: 0.5014367525856497
Iteration: 5   roc_auc_score: 0.5003037010399796
Iteration: 6   roc_auc_score: 0.501236691654067
Iteration: 7   roc_auc_score: 0.5004905720625781
Iteration: 8   roc_auc_score: 0.5009261590809692
Iteration: 9   roc_auc_score: 0.5028851450986765
Iteration: 10   roc_auc_score: 0.5018463733015907
--------------------------------------------------
平均値(AUC): 0.5010479962328266
中央値: 0.5010814253675181
分散: 8.92835349264487e-07
標準偏差: 0.0009448996503674276


In [107]:
k=1
splits=10
avg_score_4= []
ss = ShuffleSplit(n_splits=splits, random_state=200, test_size=0.25)
print('\nStarting ShuffleSplit_testsize0.25 iterations...')
for train_index,test_index in ss.split(X_5,y):
    df_X=X_5.iloc[train_index,:]
    df_y=y.iloc[train_index]
    val_X=X_5.iloc[test_index,:]
    val_y=y.iloc[test_index]

    rfc_sub.fit(df_X,df_y)

    pred_x = rfc_sub.predict(val_X)
    fpr_sub, tpr_sub, thresholds_1 = roc_curve(val_y, pred_x)
    auc_sub = auc(fpr_sub, tpr_sub)
    
    avg_score_4.append(auc_sub)
    
    print('Iteration:',k,'  roc_auc_score:',auc_sub)
    k+= 1

print("-"*50)
print("平均値(AUC):", statistics.mean(avg_score_4))
print("中央値:", statistics.median(avg_score_4))
print("分散:", statistics.variance(avg_score_4))
print("標準偏差:", statistics.stdev(avg_score_4))


Starting ShuffleSplit_testsize0.25 iterations...
Iteration: 1   roc_auc_score: 0.5009026306493146
Iteration: 2   roc_auc_score: 0.5013500848740691
Iteration: 3   roc_auc_score: 0.5010147941528093
Iteration: 4   roc_auc_score: 0.5008445152230082
Iteration: 5   roc_auc_score: 0.5009329437558859
Iteration: 6   roc_auc_score: 0.5004018973041269
Iteration: 7   roc_auc_score: 0.5002058658076307
Iteration: 8   roc_auc_score: 0.5006600314982652
Iteration: 9   roc_auc_score: 0.5000792709285631
Iteration: 10   roc_auc_score: 0.5011710514398945
--------------------------------------------------
平均値(AUC): 0.5007563085633567
中央値: 0.5008735729361614
分散: 1.7200345775912387e-07
標準偏差: 0.0004147329957443992


In [108]:
cross_val_method= pd.DataFrame({"平均値": [statistics.mean(avg_score_1), statistics.mean(avg_score_2), statistics.mean(avg_score_3),
                                      statistics.mean(avg_score_4)],
                                "中央値": [statistics.median(avg_score_1), statistics.median(avg_score_1), statistics.median(avg_score_3),
                                       statistics.median(avg_score_4)],
                                "分散": [statistics.variance(avg_score_1), statistics.variance(avg_score_2), statistics.variance(avg_score_3),
                                      statistics.variance(avg_score_4)],
                               "標準偏差":[statistics.stdev(avg_score_1), statistics.stdev(avg_score_2), statistics.stdev(avg_score_3),
                                      statistics.stdev(avg_score_4)]},
                                index=["StratifiedKFold", "KFold", "ShuffleSplit", "ShuffleSplit test_size0.25"])

In [109]:
cross_val_method

Unnamed: 0,平均値,中央値,分散,標準偏差
StratifiedKFold,0.500653,0.500421,3.356166e-07,0.000579
KFold,0.500541,0.500421,3.067916e-07,0.000554
ShuffleSplit,0.501048,0.501081,8.928353e-07,0.000945
ShuffleSplit test_size0.25,0.500756,0.500874,1.720035e-07,0.000415
