In [41]:
import sys
import os
# データ加工・処理・分析ライブラリ
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

# 可視化ライブラリ
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 小数第3位まで表示
%precision 3


'%.3f'

In [42]:
train_csv = "./df/EDA_02_train.csv"
test_csv = "./df/EDA_02_test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

X = df_train.iloc[:,2:].values
y = df_train.iloc[:,1].values

X_test = df_test.iloc[:,1:].values

In [43]:
df_train.head()

Unnamed: 0,PassengerId,Perished,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,1,1,3,0,22.0,1,0,7.25,0,0,1
1,2,0,1,1,38.0,1,0,71.2833,1,0,0
2,3,0,3,1,26.0,0,0,7.925,0,0,1
3,4,0,1,1,35.0,1,0,53.1,0,0,1
4,5,1,3,0,35.0,0,0,8.05,0,0,1


In [44]:
#ロジステック回帰モデル

# 訓練データとテストデータに分ける
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.5, random_state=41)



# 標準化処理
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_valid_std = sc.transform(X_valid)
X_test_std = sc.transform(X_test)

# ロジスティック回帰クラスの初期化と学習
lr = LogisticRegression(random_state=42)
lr.fit(X_train_std, y_train)


# # 正解率の表示
print('正解率(Normal)(train):{:.3f}'.format(lr.score(X_train_std, y_train)))
print('正解率(Nomal)(test):{:.3f}'.format(lr.score(X_valid_std, y_valid)))

正解率(Normal)(train):0.813
正解率(Nomal)(test):0.798


In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': [3, 5, 7,9],
              'min_samples_leaf': [1, 2, 4,6],
              'n_estimators':[50,100,200,300]}
rfc_gs = GridSearchCV(RandomForestClassifier(n_estimators=200,n_jobs=1,random_state=42),param_grid,cv=5)
rfc_gs.fit(X_train_std,y_train)

print('Best Parameters: {}'.format(rfc_gs.best_params_))
print('CV Score: {}'.format(round(rfc_gs.best_score_, 3)))


In [46]:

rfc_fe = RandomForestClassifier(
    max_depth=9, min_samples_leaf=1, n_estimators=200, n_jobs=-1, random_state=42)
rfc_fe.fit(X_train_std, y_train)

print('Train Score: {}'.format(round(rfc_fe.score(X_train_std, y_train), 3)))
print(' Test Score: {}'.format(round(rfc_fe.score(X_valid_std, y_valid), 3)))


Train Score: 0.953
 Test Score: 0.825


In [47]:
from sklearn.neural_network import MLPClassifier
mlpc = MLPClassifier(hidden_layer_sizes=(100, 100, 10), random_state=0)
mlpc.fit(X_train_std, y_train)

print('Multilayer Perceptron \n')
print('Train Score: {}'.format(round(mlpc.score(X_train_std, y_train), 3)))
print(' Test Score: {}'.format(round(mlpc.score(X_valid_std, y_valid), 3)))


Multilayer Perceptron 

Train Score: 0.912
 Test Score: 0.778




In [48]:
#predict
lr_predict = lr.predict_proba(X_test_std)
rfc_predict = rfc_fe.predict_proba(X_test)
mlpc_predict = mlpc.predict_proba(X_test_std)
pred_proba = (lr_predict+rfc_predict+mlpc_predict)/3
predict = pred_proba.argmax(axis=1)


In [49]:
predict

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,

In [50]:

submission = pd.read_csv('gender_submission.csv')
submission['Perished'] = predict
submission.to_csv(os.path.join("sub", '02_submission.csv'), index=False)
submission.head()


Unnamed: 0,PassengerId,Perished
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1
