In [37]:
#ライブラリ
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

#検証方法：クロスバリデーション
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#モデル
from sklearn.ensemble import GradientBoostingClassifier

In [14]:
#データ準備
train =pd.read_table('data/train.tsv', index_col=0)
test = pd.read_table('data/test.tsv',index_col=0)
sample = pd.read_table('data/sample_submit.tsv' , index_col=0, header=None)

In [15]:
#データの確認と前処理　テストデータ
print(train.shape)
train.head()

(445, 8)


Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
7,0,3,male,2.0,3,1,21.075,S
9,1,2,female,14.0,1,0,30.0708,C
11,1,1,female,58.0,0,0,26.55,S


In [16]:
#説明変数と目的変数に分ける
train_x = train.drop('survived',axis = 1)
y = train.iloc[:,0]
print(train_x.shape)
print(y.shape)

(445, 7)
(445,)


In [17]:
#trainデータ：one-hot-encoding
train_ohe = pd.get_dummies(train_x,dummy_na=True)
train_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,1,35.0,1,0,53.1,1,0,0,0,0,1,0
4,3,35.0,0,0,8.05,0,1,0,0,0,1,0
7,3,2.0,3,1,21.075,0,1,0,0,0,1,0
9,2,14.0,1,0,30.0708,1,0,0,1,0,0,0
11,1,58.0,0,0,26.55,1,0,0,0,0,1,0


In [18]:
#trainデータ：欠損値の補完
#インピュータークラスのインスタンス化と（列平均）の学習
imp = SimpleImputer()
imp.fit(train_ohe)

#学習済みimputerの適用：各列欠損値の置換
train_fin = pd.DataFrame(imp.transform(train_ohe) , columns = train_ohe.columns.values)

#結果表示
display(train_fin.iloc[10:20,1])

10    38.000000
11    29.211583
12    29.211583
13    29.211583
14    29.211583
15    66.000000
16    29.211583
17    27.000000
18    29.211583
19     3.000000
Name: age, dtype: float64

In [19]:
#testデータ：one_hot_encoding
test_ohe = pd.get_dummies(test,dummy_na=True)
test_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,22.0,1,0,7.25,0,1,0,0,0,1,0
1,1,38.0,1,0,71.2833,1,0,0,1,0,0,0
2,3,26.0,0,0,7.925,1,0,0,0,0,1,0
5,3,,0,0,8.4583,0,1,0,0,1,0,0
6,1,54.0,0,0,51.8625,0,1,0,0,0,1,0


In [20]:
#testデータ：欠損値の補完
imp2 = SimpleImputer()
imp2.fit(test_ohe)

#学習ずみimputerの適用
test_fin = pd.DataFrame(imp2.transform(test_ohe) , columns = test_ohe.columns.values)

#結果表示
display(test_fin.iloc[10:20,1])

10    30.194915
11    31.000000
12    35.000000
13    34.000000
14    15.000000
15    19.000000
16    40.000000
17    30.194915
18    28.000000
19    42.000000
Name: age, dtype: float64

In [21]:
#columnsのデータ項目を確認
cols_train = set(train_ohe.columns.values)
cols_test = set(test_ohe.columns.values)

#trainにあってtestにないデータ項目
diff1 = cols_train -cols_test
print('trainのみ：%s' % diff1)

#testにあってtrainにないデータ項目
diff2 = cols_test - cols_train
print('testのみ：%s' % diff2)

trainのみ：set()
testのみ：set()


#set pipelines for two different algorithms
#ハイパーパラメータの調整：①n_estimaters(決定木の数) , ②learning_rate（学習率） , ③max_depth（決定木ノードの深さ制限）
pipelines = {

    #デフォルト
    'gb':GradientBoostingClassifier(n_estimators=100 , learning_rate=0.01 , max_depth= 3 ,random_state=1),
    
    #①n_estimators変更(Up:150,200) = 100~500
    'gb2':GradientBoostingClassifier(n_estimators=50 ,learning_rate=0.01 , max_depth= 3 , random_state=1),

    #②learning_rate変更(up:0.05,down：０.1,0.005) = 0.01~0.1
    'gb3':GradientBoostingClassifier(n_estimators=100 , learning_rate=0.005 , max_depth= 3 ,random_state=1),

    #③max_depth変更(Up:2~6 , down:10) = 2~8
    'gb4':GradientBoostingClassifier(n_estimators=100 , learning_rate=0.03 , max_depth= 2 ,random_state=1),
    
}

%precision 3

#クロスバリデーションの実装
for pipe_name , est in pipelines.items():
    cv_results = cross_val_score(est,
                                train_fin,
                                y,
                                cv=5,
                                scoring='accuracy')
    print('Algorithm:',pipe_name)
    print('Scores:',cv_results)
    print('accuracy:%.3f +- %.3f' %(cv_results.mean(),cv_results.std()))
    print()

In [48]:
#set pipelines for two different algorithms グリッドサーチクロスバリデーションに利用
gb = GradientBoostingClassifier(random_state=1)

In [49]:
# パラメータグリッドの設定
param_grid_gb= {'n_estimators':[100,200,300,400,500],
                'learning_rate':[0.01,0.03,0.06,0.09,0.12],
               'max_depth':[2,4,6,8]}

In [50]:
#グリッドサーチクロスバリデーションの実装
print('探索空間:%s' %param_grid_gb)

gs = GridSearchCV(gb,
                 param_grid=param_grid_gb,
                 scoring='accuracy',
                 cv=10,
                 return_train_score=False)
gs.fit(train_fin,y)

探索空間:{'n_estimators': [100, 200, 300, 400, 500], 'learning_rate': [0.01, 0.03, 0.06, 0.09, 0.12], 'max_depth': [2, 4, 6, 8]}


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(random_state=1),
             param_grid={'learning_rate': [0.01, 0.03, 0.06, 0.09, 0.12],
                         'max_depth': [2, 4, 6, 8],
                         'n_estimators': [100, 200, 300, 400, 500]},
             scoring='accuracy')

In [51]:
gs.predict_proba(test_fin)

array([[0.868, 0.132],
       [0.083, 0.917],
       [0.406, 0.594],
       [0.823, 0.177],
       [0.657, 0.343],
       [0.488, 0.512],
       [0.159, 0.841],
       [0.856, 0.144],
       [0.422, 0.578],
       [0.267, 0.733],
       [0.839, 0.161],
       [0.35 , 0.65 ],
       [0.839, 0.161],
       [0.851, 0.149],
       [0.318, 0.682],
       [0.468, 0.532],
       [0.589, 0.411],
       [0.083, 0.917],
       [0.51 , 0.49 ],
       [0.588, 0.412],
       [0.856, 0.144],
       [0.35 , 0.65 ],
       [0.273, 0.727],
       [0.427, 0.573],
       [0.856, 0.144],
       [0.735, 0.265],
       [0.763, 0.237],
       [0.35 , 0.65 ],
       [0.83 , 0.17 ],
       [0.856, 0.144],
       [0.086, 0.914],
       [0.59 , 0.41 ],
       [0.588, 0.412],
       [0.849, 0.151],
       [0.59 , 0.41 ],
       [0.589, 0.411],
       [0.735, 0.265],
       [0.209, 0.791],
       [0.915, 0.085],
       [0.912, 0.088],
       [0.79 , 0.21 ],
       [0.868, 0.132],
       [0.856, 0.144],
       [0.8

In [54]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.079217,0.016822,0.002246,0.000251,0.01,2,100,"{'learning_rate': 0.01, 'max_depth': 2, 'n_est...",0.911111,0.822222,...,0.866667,0.711111,0.750000,0.750000,0.772727,0.795455,0.863636,0.804293,0.059004,6
1,0.144454,0.007589,0.002470,0.000370,0.01,2,200,"{'learning_rate': 0.01, 'max_depth': 2, 'n_est...",0.866667,0.844444,...,0.866667,0.733333,0.750000,0.750000,0.772727,0.795455,0.863636,0.802071,0.050570,9
2,0.212436,0.008204,0.002599,0.000464,0.01,2,300,"{'learning_rate': 0.01, 'max_depth': 2, 'n_est...",0.866667,0.822222,...,0.911111,0.711111,0.750000,0.750000,0.772727,0.818182,0.909091,0.808889,0.065471,2
3,0.296522,0.016439,0.002805,0.000464,0.01,2,400,"{'learning_rate': 0.01, 'max_depth': 2, 'n_est...",0.844444,0.800000,...,0.911111,0.711111,0.750000,0.750000,0.772727,0.840909,0.909091,0.806717,0.064485,3
4,0.378960,0.049381,0.002806,0.000318,0.01,2,500,"{'learning_rate': 0.01, 'max_depth': 2, 'n_est...",0.844444,0.800000,...,0.911111,0.711111,0.750000,0.750000,0.795455,0.886364,0.909091,0.813535,0.067279,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.398663,0.014524,0.002646,0.000250,0.12,8,100,"{'learning_rate': 0.12, 'max_depth': 8, 'n_est...",0.711111,0.800000,...,0.844444,0.711111,0.681818,0.818182,0.704545,0.886364,0.795455,0.773081,0.064741,65
96,0.834056,0.050524,0.003094,0.000303,0.12,8,200,"{'learning_rate': 0.12, 'max_depth': 8, 'n_est...",0.711111,0.822222,...,0.866667,0.688889,0.681818,0.840909,0.659091,0.840909,0.795455,0.768485,0.072758,75
97,1.299872,0.059803,0.003721,0.000351,0.12,8,300,"{'learning_rate': 0.12, 'max_depth': 8, 'n_est...",0.733333,0.800000,...,0.866667,0.688889,0.681818,0.795455,0.659091,0.795455,0.795455,0.757172,0.062251,96
98,1.512178,0.082437,0.003985,0.000297,0.12,8,400,"{'learning_rate': 0.12, 'max_depth': 8, 'n_est...",0.733333,0.777778,...,0.866667,0.688889,0.681818,0.840909,0.636364,0.772727,0.795455,0.754949,0.068257,98


# fit & evaluation
scores = {}
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(train_x, train_y)
    scores[(pipe_name,'train')] = accuracy_score(train_y, pipeline.predict(train_x))
    scores[(pipe_name,'test')] = accuracy_score(test_y, pipeline.predict(test_x))

pd.Series(scores).unstack()

In [58]:
#提出用関数
pred = gs.predict_proba(test_fin)[:,1]

In [59]:
sample[1] = pred
sample.to_csv(f'submit/gb_cross.tsv',sep='\t', header=None)