In [115]:
#ライブラリ
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

#検証方法：クロスバリデーション
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#モデル
from sklearn.ensemble import GradientBoostingClassifier

In [86]:
#データ準備
train =pd.read_table('data/train.tsv', index_col=0)
test = pd.read_table('data/test.tsv',index_col=0)
sample = pd.read_table('data/sample_submit.tsv' , index_col=0, header=None)

In [87]:
#データの確認と前処理　テストデータ
print(train.shape)
train.head()

(445, 8)


Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
7,0,3,male,2.0,3,1,21.075,S
9,1,2,female,14.0,1,0,30.0708,C
11,1,1,female,58.0,0,0,26.55,S


In [88]:
#説明変数と目的変数に分ける
train_x = train.drop('survived',axis = 1)
y = train.iloc[:,0]
print(train_x.shape)
print(y.shape)

(445, 7)
(445,)


In [89]:
#trainデータ：one-hot-encoding
train_ohe = pd.get_dummies(train_x,dummy_na=True)
train_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,1,35.0,1,0,53.1,1,0,0,0,0,1,0
4,3,35.0,0,0,8.05,0,1,0,0,0,1,0
7,3,2.0,3,1,21.075,0,1,0,0,0,1,0
9,2,14.0,1,0,30.0708,1,0,0,1,0,0,0
11,1,58.0,0,0,26.55,1,0,0,0,0,1,0


In [90]:
#trainデータ：欠損値の補完
#インピュータークラスのインスタンス化と（列平均）の学習
imp = SimpleImputer()
imp.fit(train_ohe)

#学習済みimputerの適用：各列欠損値の置換
train_fin = pd.DataFrame(imp.transform(train_ohe) , columns = train_ohe.columns.values)

#結果表示
display(train_fin.iloc[10:20,1])

10    38.000000
11    29.211583
12    29.211583
13    29.211583
14    29.211583
15    66.000000
16    29.211583
17    27.000000
18    29.211583
19     3.000000
Name: age, dtype: float64

## テストデータ前処理

In [91]:
#testデータ：one_hot_encoding
test_ohe = pd.get_dummies(test,dummy_na=True)
test_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,22.0,1,0,7.25,0,1,0,0,0,1,0
1,1,38.0,1,0,71.2833,1,0,0,1,0,0,0
2,3,26.0,0,0,7.925,1,0,0,0,0,1,0
5,3,,0,0,8.4583,0,1,0,0,1,0,0
6,1,54.0,0,0,51.8625,0,1,0,0,0,1,0


In [92]:
#testデータ：欠損値の補完
imp2 = SimpleImputer()
imp2.fit(test_ohe)

#学習ずみimputerの適用
test_fin = pd.DataFrame(imp2.transform(test_ohe) , columns = test_ohe.columns.values)

#結果表示
display(test_fin.iloc[10:20,1])

10    30.194915
11    31.000000
12    35.000000
13    34.000000
14    15.000000
15    19.000000
16    40.000000
17    30.194915
18    28.000000
19    42.000000
Name: age, dtype: float64

In [93]:
#columnsのデータ項目を確認
cols_train = set(train_ohe.columns.values)
cols_test = set(test_ohe.columns.values)

#trainにあってtestにないデータ項目
diff1 = cols_train -cols_test
print('trainのみ：%s' % diff1)

#testにあってtrainにないデータ項目
diff2 = cols_test - cols_train
print('testのみ：%s' % diff2)

trainのみ：set()
testのみ：set()


In [94]:
#特徴量エンジニアリング
train_fin.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
count,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0
mean,2.296629,29.211583,0.546067,0.431461,33.959971,0.350562,0.649438,0.0,0.177528,0.08764,0.730337,0.004494
std,0.834024,12.72753,1.195247,0.850489,52.079492,0.477683,0.477683,0.0,0.382545,0.28309,0.444284,0.066965
min,1.0,0.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,29.211583,0.0,0.0,15.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,35.0,1.0,1.0,31.3875,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,3.0,80.0,8.0,5.0,512.3292,1.0,1.0,0.0,1.0,1.0,1.0,1.0


In [95]:
test_fin.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
count,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0
mean,2.320628,30.194915,0.5,0.331839,30.452381,0.35426,0.64574,0.0,0.199552,0.085202,0.715247,0.0
std,0.838873,13.269927,1.002805,0.756823,47.186192,0.478826,0.478826,0.0,0.400112,0.279495,0.451804,0.0
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,30.194915,0.0,0.0,13.5,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,35.0,1.0,0.0,30.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,3.0,74.0,8.0,6.0,512.3292,1.0,1.0,0.0,1.0,1.0,1.0,0.0


In [96]:
#family sizeのみ追加
train_fin['FamilySize'] = train_fin['sibsp'] + train_fin['parch'] + 1
test_fin['FamilySize'] = test_fin['sibsp'] + test_fin['parch'] + 1

In [97]:
#①家族サイズ・配偶者フラグの2つ追加
train_fin['IsAlone'] = 0
test_fin['IsAlone'] = 0
train_fin.loc[train_fin['FamilySize'] == 1 , 'IsAlone']  = 1
test_fin.loc[test_fin['FamilySize'] == 1 , 'IsAlone']  = 1

In [98]:
train_fin.iloc[30:40,-1]

30    0
31    1
32    1
33    0
34    1
35    1
36    1
37    1
38    0
39    1
Name: IsAlone, dtype: int64

In [102]:
#家族サイズのみ
train_fam = train_fin.drop('IsAlone',axis = 1)
test_fam = test_fin.drop('IsAlone',axis = 1)
test_fam.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan,FamilySize
0,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0
1,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
2,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,3.0,30.194915,0.0,0.0,8.4583,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,54.0,0.0,0.0,51.8625,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [104]:
#独身フラグのみ
train_alone = train_fin.drop('FamilySize',axis=1)
test_alone = test_fin.drop('FamilySize',axis=1)
test_alone.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan,IsAlone
0,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,3.0,30.194915,0.0,0.0,8.4583,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
4,1.0,54.0,0.0,0.0,51.8625,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [121]:
#set pipelines for two different algorithms グリッドサーチクロスバリデーションに利用
gb = GradientBoostingClassifier(random_state=1)

In [122]:
# パラメータグリッドの設定
param_grid_gb= {'n_estimators':[100,200,300,400,500],
                'learning_rate':[0.003,0.009,0.01,0.03,0.1],
               'max_depth':[2,4,6,8]}

In [123]:
train_fin.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan,FamilySize,IsAlone
0,1.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0
1,3.0,35.0,0.0,0.0,8.05,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1
2,3.0,2.0,3.0,1.0,21.075,0.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,0
3,2.0,14.0,1.0,0.0,30.0708,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0
4,1.0,58.0,0.0,0.0,26.55,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1


In [137]:
#グリッドサーチクロスバリデーションの実装
print('探索空間:%s' %param_grid_gb)

gs = GridSearchCV(gb,
                 param_grid=param_grid_gb,
                 scoring='neg_log_loss',
                 cv=10,
                 return_train_score=False)
gs.fit(train_alone,y)

探索空間:{'n_estimators': [100, 200, 300, 400, 500], 'learning_rate': [0.003, 0.009, 0.01, 0.03, 0.1], 'max_depth': [2, 4, 6, 8]}


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(random_state=1),
             param_grid={'learning_rate': [0.003, 0.009, 0.01, 0.03, 0.1],
                         'max_depth': [2, 4, 6, 8],
                         'n_estimators': [100, 200, 300, 400, 500]},
             scoring='neg_log_loss')

In [138]:
result = pd.DataFrame(gs.cv_results_)
result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.091966,0.010248,0.003861,0.000935,0.003,2,100,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.578305,-0.589109,...,-0.590294,-0.611728,-0.611277,-0.603635,-0.609269,-0.587025,-0.570865,-0.594319,0.013457,73
1,0.162798,0.004746,0.003741,0.000710,0.003,2,200,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.512072,-0.535909,...,-0.530307,-0.580479,-0.576802,-0.557923,-0.566679,-0.529476,-0.508459,-0.544309,0.024210,64
2,0.242683,0.007617,0.003844,0.000620,0.003,2,300,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.469516,-0.504545,...,-0.492855,-0.565261,-0.558364,-0.530149,-0.540194,-0.492655,-0.467954,-0.513875,0.032626,50
3,0.327678,0.009789,0.004089,0.000595,0.003,2,400,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.445615,-0.487217,...,-0.468158,-0.558359,-0.549558,-0.513943,-0.524196,-0.473281,-0.440961,-0.496324,0.038529,39
4,0.411194,0.007818,0.004033,0.000668,0.003,2,500,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.436579,-0.477631,...,-0.452133,-0.555250,-0.544018,-0.501002,-0.513163,-0.460498,-0.422594,-0.485456,0.041814,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.458675,0.017358,0.004323,0.000448,0.1,8,100,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-1.288764,-1.022356,...,-0.453408,-1.080452,-1.301387,-0.831075,-1.169032,-0.545152,-0.891778,-0.984220,0.286382,91
96,0.960638,0.047028,0.004759,0.000464,0.1,8,200,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-1.854024,-1.540470,...,-0.846411,-1.632917,-2.262694,-1.093641,-1.930482,-0.742511,-1.466978,-1.532906,0.477349,95
97,1.421198,0.046954,0.005758,0.000485,0.1,8,300,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-2.633094,-2.084922,...,-1.274646,-2.141370,-3.559810,-1.482208,-2.771434,-0.854600,-2.052866,-2.160222,0.765951,98
98,1.743928,0.062326,0.006113,0.000395,0.1,8,400,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-3.337281,-2.572962,...,-1.571347,-2.525525,-4.445421,-1.868475,-3.590114,-1.122757,-2.415698,-2.664823,0.951924,99


In [139]:
pd.DataFrame(gs.cv_results_).loc[1,'rank_test_score']

64

In [141]:
#提出用関数
best = gs.best_estimator_
pred = best.predict_proba(test_alone)[:,1]
sample[1] = pred
sample.to_csv(f'submit/gb_cross_alone.tsv',sep='\t', header=None)