In [103]:
#ライブラリ
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC


In [72]:
#データ準備
train =pd.read_table('data/train.tsv', index_col=0)
test = pd.read_table('data/test.tsv',index_col=0)
sample = pd.read_table('data/sample_submit.tsv' , index_col=0, header=None)

# trainデータの前処理

In [73]:
#データの確認と前処理　テストデータ
#"sex"と"embarked"はone-hot-encoding が必要
print(train.shape)
train.head()

(445, 8)


Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
7,0,3,male,2.0,3,1,21.075,S
9,1,2,female,14.0,1,0,30.0708,C
11,1,1,female,58.0,0,0,26.55,S


In [74]:
train.isnull().sum()

survived     0
pclass       0
sex          0
age         85
sibsp        0
parch        0
fare         0
embarked     2
dtype: int64

In [75]:
#説明変数と目的変数に分ける
train_x = train.drop('survived',axis = 1)
y = train.iloc[:,0]
print(train_x.shape)
print(y.shape)

(445, 7)
(445,)


In [76]:
#one-hot-encoding
train_ohe = pd.get_dummies(train_x,dummy_na=True)
train_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,1,35.0,1,0,53.1,1,0,0,0,0,1,0
4,3,35.0,0,0,8.05,0,1,0,0,0,1,0
7,3,2.0,3,1,21.075,0,1,0,0,0,1,0
9,2,14.0,1,0,30.0708,1,0,0,1,0,0,0
11,1,58.0,0,0,26.55,1,0,0,0,0,1,0


In [77]:
#欠損値の確認　"age","embarked"
print(train_ohe.isnull().sum())

pclass           0
age             85
sibsp            0
parch            0
fare             0
sex_female       0
sex_male         0
sex_nan          0
embarked_C       0
embarked_Q       0
embarked_S       0
embarked_nan     0
dtype: int64


In [78]:
train_ohe.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
count,445.0,360.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0
mean,2.296629,29.211583,0.546067,0.431461,33.959971,0.350562,0.649438,0.0,0.177528,0.08764,0.730337,0.004494
std,0.834024,14.1543,1.195247,0.850489,52.079492,0.477683,0.477683,0.0,0.382545,0.28309,0.444284,0.066965
min,1.0,0.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,20.0,0.0,0.0,7.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,15.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,37.25,1.0,1.0,31.3875,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,3.0,80.0,8.0,5.0,512.3292,1.0,1.0,0.0,1.0,1.0,1.0,1.0


In [79]:
#欠損値の補完
#インピュータークラスのインスタンス化と（列平均）の学習
imp = SimpleImputer()
imp.fit(train_ohe)

#学習済みimputerの適用：各列欠損値の置換
train_ohe = pd.DataFrame(imp.transform(train_ohe) , columns = train_ohe.columns.values)

#結果表示
display(train_ohe.iloc[10:20,1])

10    38.000000
11    29.211583
12    29.211583
13    29.211583
14    29.211583
15    66.000000
16    29.211583
17    27.000000
18    29.211583
19     3.000000
Name: age, dtype: float64

# テストデータの前処理

In [80]:
#テストデータの前処理
print(test.shape)
test.head()

(446, 7)


Unnamed: 0_level_0,pclass,sex,age,sibsp,parch,fare,embarked
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
5,3,male,,0,0,8.4583,Q
6,1,male,54.0,0,0,51.8625,S


In [81]:
test.isnull().sum()

pclass       0
sex          0
age         92
sibsp        0
parch        0
fare         0
embarked     0
dtype: int64

In [82]:
#one_hot_encoding
test_ohe = pd.get_dummies(test,dummy_na=True)
test_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,22.0,1,0,7.25,0,1,0,0,0,1,0
1,1,38.0,1,0,71.2833,1,0,0,1,0,0,0
2,3,26.0,0,0,7.925,1,0,0,0,0,1,0
5,3,,0,0,8.4583,0,1,0,0,1,0,0
6,1,54.0,0,0,51.8625,0,1,0,0,0,1,0


In [83]:
#欠損値の補完
imp2 = SimpleImputer()
imp2.fit(test_ohe)

#学習ずみimputerの適用
test_ohe = pd.DataFrame(imp2.transform(test_ohe) , columns = test_ohe.columns.values)

#結果表示
display(test_ohe.iloc[10:20,1])

10    30.194915
11    31.000000
12    35.000000
13    34.000000
14    15.000000
15    19.000000
16    40.000000
17    30.194915
18    28.000000
19    42.000000
Name: age, dtype: float64

In [84]:
test_ohe.isnull().sum()

pclass          0
age             0
sibsp           0
parch           0
fare            0
sex_female      0
sex_male        0
sex_nan         0
embarked_C      0
embarked_Q      0
embarked_S      0
embarked_nan    0
dtype: int64

In [85]:
#columnsのデータ項目を確認
cols_train = set(train_ohe.columns.values)
cols_test = set(test_ohe.columns.values)

#trainにあってtestにないデータ項目
diff1 = cols_train -cols_test
print('trainのみ：%s' % diff1)

#testにあってtrainにないデータ項目
diff2 = cols_test - cols_train
print('testのみ：%s' % diff2)

trainのみ：set()
testのみ：set()


# モデル作成

In [110]:
#Holdout
train_x,test_x,train_y,test_y =train_test_split(train_ohe, y ,test_size=0.3,random_state=1)


#set pipelines for two different algorithms
pipelines = {
    'logistic':Pipeline([('scl',StandardScaler()),
                        ('est',LogisticRegression(random_state=1))]),
    
    'knn':Pipeline([('scl',StandardScaler()),
                   ('est',KNeighborsClassifier())]),
    
    'rsvc':Pipeline([('scl',StandardScaler()),
                    ('est',SVC(C=1.0,
                              kernel='rbf',
                              class_weight='balanced',
                              random_state=1))]),
}

# fit & evaluation
scores = {}
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(train_x, train_y)
    scores[(pipe_name,'train')] = accuracy_score(train_y, pipeline.predict(train_x))
    scores[(pipe_name,'test')] = accuracy_score(test_y, pipeline.predict(test_x))

pd.Series(scores).unstack()

Unnamed: 0,test,train
knn,0.776119,0.871383
logistic,0.776119,0.810289
rsvc,0.80597,0.839228


In [None]:
#提出用
#pred = pipelines['logistic'].predict_proba(test_ohe)[:,1]
#pred

In [None]:
#sample[1] = pred
#sample.head()

In [None]:
#sample.to_csv('submit/submit1_log.tsv',sep='\t', header=None)