In [2]:
#ライブラリ
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

#検証方法：クロスバリデーション
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#モデル
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
#データ準備
train =pd.read_table('data/train.tsv', index_col=0)
test = pd.read_table('data/test.tsv',index_col=0)
sample = pd.read_table('data/sample_submit.tsv' , index_col=0, header=None)

In [4]:
#データの確認と前処理　テストデータ
print(train.shape)
train.head()

(445, 8)


Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
7,0,3,male,2.0,3,1,21.075,S
9,1,2,female,14.0,1,0,30.0708,C
11,1,1,female,58.0,0,0,26.55,S


In [5]:
#説明変数と目的変数に分ける
train_x = train.drop('survived',axis = 1)
y = train.iloc[:,0]
print(train_x.shape)
print(y.shape)

(445, 7)
(445,)


In [6]:
#trainデータ：one-hot-encoding
train_ohe = pd.get_dummies(train_x,dummy_na=True)
train_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,1,35.0,1,0,53.1,1,0,0,0,0,1,0
4,3,35.0,0,0,8.05,0,1,0,0,0,1,0
7,3,2.0,3,1,21.075,0,1,0,0,0,1,0
9,2,14.0,1,0,30.0708,1,0,0,1,0,0,0
11,1,58.0,0,0,26.55,1,0,0,0,0,1,0


In [7]:
#trainデータ：欠損値の補完
#インピュータークラスのインスタンス化と（列平均）の学習
imp = SimpleImputer()
imp.fit(train_ohe)

#学習済みimputerの適用：各列欠損値の置換
train_fin = pd.DataFrame(imp.transform(train_ohe) , columns = train_ohe.columns.values)

#結果表示
display(train_fin.iloc[10:20,1])

10    38.000000
11    29.211583
12    29.211583
13    29.211583
14    29.211583
15    66.000000
16    29.211583
17    27.000000
18    29.211583
19     3.000000
Name: age, dtype: float64

## テストデータ前処理

In [8]:
#testデータ：one_hot_encoding
test_ohe = pd.get_dummies(test,dummy_na=True)
test_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,22.0,1,0,7.25,0,1,0,0,0,1,0
1,1,38.0,1,0,71.2833,1,0,0,1,0,0,0
2,3,26.0,0,0,7.925,1,0,0,0,0,1,0
5,3,,0,0,8.4583,0,1,0,0,1,0,0
6,1,54.0,0,0,51.8625,0,1,0,0,0,1,0


In [9]:
#testデータ：欠損値の補完
imp2 = SimpleImputer()
imp2.fit(test_ohe)

#学習ずみimputerの適用
test_fin = pd.DataFrame(imp2.transform(test_ohe) , columns = test_ohe.columns.values)

#結果表示
display(test_fin.iloc[10:20,1])

10    30.194915
11    31.000000
12    35.000000
13    34.000000
14    15.000000
15    19.000000
16    40.000000
17    30.194915
18    28.000000
19    42.000000
Name: age, dtype: float64

In [10]:
#columnsのデータ項目を確認
cols_train = set(train_ohe.columns.values)
cols_test = set(test_ohe.columns.values)

#trainにあってtestにないデータ項目
diff1 = cols_train -cols_test
print('trainのみ：%s' % diff1)

#testにあってtrainにないデータ項目
diff2 = cols_test - cols_train
print('testのみ：%s' % diff2)

trainのみ：set()
testのみ：set()


In [11]:
#特徴量エンジニアリング
train_fin.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
count,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0
mean,2.296629,29.211583,0.546067,0.431461,33.959971,0.350562,0.649438,0.0,0.177528,0.08764,0.730337,0.004494
std,0.834024,12.72753,1.195247,0.850489,52.079492,0.477683,0.477683,0.0,0.382545,0.28309,0.444284,0.066965
min,1.0,0.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,29.211583,0.0,0.0,15.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,35.0,1.0,1.0,31.3875,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,3.0,80.0,8.0,5.0,512.3292,1.0,1.0,0.0,1.0,1.0,1.0,1.0


In [12]:
test_fin.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
count,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0,446.0
mean,2.320628,30.194915,0.5,0.331839,30.452381,0.35426,0.64574,0.0,0.199552,0.085202,0.715247,0.0
std,0.838873,13.269927,1.002805,0.756823,47.186192,0.478826,0.478826,0.0,0.400112,0.279495,0.451804,0.0
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,30.194915,0.0,0.0,13.5,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,35.0,1.0,0.0,30.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,3.0,74.0,8.0,6.0,512.3292,1.0,1.0,0.0,1.0,1.0,1.0,0.0


In [13]:
#family sizeのみ追加
train_fin['FamilySize'] = train_fin['sibsp'] + train_fin['parch'] + 1
test_fin['FamilySize'] = test_fin['sibsp'] + test_fin['parch'] + 1

In [14]:
#①家族サイズ・配偶者フラグの2つ追加
train_fin['IsAlone'] = 0
test_fin['IsAlone'] = 0
train_fin.loc[train_fin['FamilySize'] == 1 , 'IsAlone']  = 1
test_fin.loc[test_fin['FamilySize'] == 1 , 'IsAlone']  = 1

In [15]:
train_fin.iloc[30:40,-1]

30    0
31    1
32    1
33    0
34    1
35    1
36    1
37    1
38    0
39    1
Name: IsAlone, dtype: int64

In [16]:
#家族サイズのみ
train_fam = train_fin.drop('IsAlone',axis = 1)
test_fam = test_fin.drop('IsAlone',axis = 1)
test_fam.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan,FamilySize
0,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0
1,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
2,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,3.0,30.194915,0.0,0.0,8.4583,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,54.0,0.0,0.0,51.8625,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [17]:
#独身フラグのみ
train_alone = train_fin.drop('FamilySize',axis=1)
test_alone = test_fin.drop('FamilySize',axis=1)
test_alone.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan,IsAlone
0,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,3.0,30.194915,0.0,0.0,8.4583,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
4,1.0,54.0,0.0,0.0,51.8625,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [18]:
#set pipelines for two different algorithms グリッドサーチクロスバリデーションに利用
gb = GradientBoostingClassifier(random_state=1)

In [20]:
# パラメータグリッドの設定
param_grid_gb= {'n_estimators':[100,200,300,400,500],
                'learning_rate':[0.003,0.005,0.009,0.01,0.03,0.1],
               'max_depth':[2,4,6,8]}

In [21]:
train_fin.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan,FamilySize,IsAlone
0,1.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0
1,3.0,35.0,0.0,0.0,8.05,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1
2,3.0,2.0,3.0,1.0,21.075,0.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,0
3,2.0,14.0,1.0,0.0,30.0708,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0
4,1.0,58.0,0.0,0.0,26.55,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1


In [22]:
#グリッドサーチクロスバリデーションの実装
print('探索空間:%s' %param_grid_gb)

gs = GridSearchCV(gb,
                 param_grid=param_grid_gb,
                 scoring='neg_log_loss',
                 cv=10,
                 return_train_score=False)
gs.fit(train_fin,y)

探索空間:{'n_estimators': [100, 200, 300, 400, 500], 'learning_rate': [0.003, 0.005, 0.009, 0.01, 0.03, 0.1], 'max_depth': [2, 4, 6, 8]}


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(random_state=1),
             param_grid={'learning_rate': [0.003, 0.005, 0.009, 0.01, 0.03,
                                           0.1],
                         'max_depth': [2, 4, 6, 8],
                         'n_estimators': [100, 200, 300, 400, 500]},
             scoring='neg_log_loss')

In [23]:
result = pd.DataFrame(gs.cv_results_)
result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.088770,0.010799,0.003373,0.000273,0.003,2,100,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.578933,-0.589109,...,-0.590294,-0.615300,-0.611277,-0.603635,-0.609269,-0.587025,-0.570865,-0.594739,0.013880,93
1,0.159838,0.005392,0.003799,0.000663,0.003,2,200,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.517587,-0.535909,...,-0.530287,-0.585926,-0.577174,-0.557923,-0.566615,-0.529477,-0.508508,-0.545458,0.024424,80
2,0.241566,0.012133,0.003979,0.000667,0.003,2,300,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.479282,-0.504545,...,-0.491741,-0.570906,-0.559267,-0.530149,-0.539682,-0.492219,-0.468177,-0.515378,0.032508,63
3,0.313023,0.016294,0.003854,0.000572,0.003,2,400,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.459669,-0.487217,...,-0.466255,-0.563088,-0.550386,-0.514030,-0.524024,-0.471813,-0.441269,-0.498054,0.037994,52
4,0.380025,0.008791,0.003816,0.000557,0.003,2,500,"{'learning_rate': 0.003, 'max_depth': 2, 'n_es...",-0.449764,-0.477314,...,-0.449910,-0.559781,-0.544536,-0.499870,-0.512359,-0.458745,-0.422288,-0.486663,0.041572,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.400703,0.015035,0.003633,0.000300,0.1,8,100,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-1.223437,-0.979610,...,-0.466629,-1.091633,-1.408830,-0.822516,-1.165415,-0.563970,-0.848663,-0.975127,0.284371,111
116,0.855352,0.029654,0.004302,0.000388,0.1,8,200,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-1.828934,-1.481761,...,-0.802790,-1.721928,-2.514029,-1.209662,-1.969072,-0.876363,-1.316270,-1.564723,0.503893,115
117,1.283400,0.027760,0.005218,0.000778,0.1,8,300,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-2.629071,-2.080341,...,-1.129664,-2.436346,-3.526953,-1.718595,-2.805687,-1.207045,-2.017051,-2.228663,0.711174,118
118,1.585571,0.039458,0.005426,0.000531,0.1,8,400,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",-3.355833,-2.622313,...,-1.419950,-2.892786,-4.313529,-2.160817,-3.483910,-1.504818,-2.497201,-2.744911,0.855449,119


In [24]:
pd.DataFrame(gs.cv_results_).loc[1,'rank_test_score']

80

In [25]:
#提出用関数
best = gs.best_estimator_
pred = best.predict_proba(test_fin)[:,1]
sample[1] = pred
sample.to_csv(f'submit/gb_cross_fam_alone3.tsv',sep='\t', header=None)