In [24]:
#ライブラリの読み込み
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
# NOTE(review): this installs a global "ignore" filter, so every warning raised
# by the cells below (deprecations, convergence notices, ...) is hidden.
warnings.filterwarnings('ignore')

In [25]:
# Read the training and test CSV files from the working directory.
df, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [26]:
# Impute missing Age and Fare values with the mean taken over the combined
# train + test columns.
age = pd.concat([df['Age'], df_test['Age']])
fare = pd.concat([df['Fare'], df_test['Fare']])

# `age` and `fare` are separate concatenated Series, so their means are not
# affected by the fills below; compute each once.
age_mean = age.mean()
fare_mean = fare.mean()

# Assign the filled column back to the frame. Calling
# `frame[col].fillna(..., inplace=True)` operates on a selected column (chained
# assignment) and does not update the frame when pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(age_mean)
df_test['Age'] = df_test['Age'].fillna(age_mean)

df['Fare'] = df['Fare'].fillna(fare_mean)
df_test['Fare'] = df_test['Fare'].fillna(fare_mean)

# Show the remaining number of missing values per training column.
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [27]:
# Cabin is mostly missing, so remove it from both the training and test frames.
for frame in (df, df_test):
    frame.drop(columns='Cabin', inplace=True)

# Re-check the per-column missing-value counts of the training frame.
print(df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64


In [28]:
# Fill missing Embarked values with 'S' (Southampton), the most common port.
# The filled column is assigned back to the frame: `frame[col].fillna(...,
# inplace=True)` is chained assignment and does not update the frame when
# pandas Copy-on-Write is active.
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

# Confirm that no missing values remain in the training frame.
print(df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [29]:
# Name and Ticket are not used as features in this notebook; drop both columns
# from the training and test frames.
for frame in (df, df_test):
    frame.drop(columns=['Name', 'Ticket'], inplace=True)

# Show the columns that remain in the training frame.
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked'],
      dtype='object')


In [30]:
# Encode Sex numerically: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}

# Cast explicitly to int64 after replacing: relying on `replace` to downcast
# the object column to integers is deprecated pandas behaviour, so without the
# cast the resulting dtype depends on the pandas version. The cast raises if a
# value outside `sex_codes` (or a missing value) is left in the column.
# Re-running the cell is harmless because already-encoded 0/1 values pass
# through `replace` unchanged.
df['Sex'] = df['Sex'].replace(sex_codes).astype('int64')
df_test['Sex'] = df_test['Sex'].replace(sex_codes).astype('int64')

# Preview the first rows of the training frame.
print(df.head())

   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked
0            1         0       3    0  22.0      1      0   7.2500        S
1            2         1       1    1  38.0      1      0  71.2833        C
2            3         1       3    1  26.0      0      0   7.9250        S
3            4         1       1    1  35.0      1      0  53.1000        S
4            5         0       3    0  35.0      0      0   8.0500        S


In [31]:
# One-hot encode Embarked. Train and test values are encoded together so both
# frames receive the same set of dummy columns.
embarked = pd.concat([df['Embarked'], df_test['Embarked']])

embarked_ohe = pd.get_dummies(embarked)

# Split the encoded rows back by position. The boundary is the number of
# training rows, taken from the frame itself rather than a hard-coded count, so
# the cell keeps working if the training set size changes.
n_train = len(df)
embarked_ohe_train = embarked_ohe.iloc[:n_train]
embarked_ohe_test = embarked_ohe.iloc[n_train:]

# Column-wise concat aligns on the index labels, which each slice keeps from
# the frame it was taken from.
df = pd.concat([df, embarked_ohe_train], axis=1)
df_test = pd.concat([df_test, embarked_ohe_test], axis=1)

# The original categorical column is no longer needed.
df.drop('Embarked', axis=1, inplace=True)
df_test.drop('Embarked', axis=1, inplace=True)

# Preview the first rows of the training frame.
print(df.head())

   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare      C  \
0            1         0       3    0  22.0      1      0   7.2500  False   
1            2         1       1    1  38.0      1      0  71.2833   True   
2            3         1       3    1  26.0      0      0   7.9250  False   
3            4         1       1    1  35.0      1      0  53.1000  False   
4            5         0       3    0  35.0      0      0   8.0500  False   

       Q      S  
0  False   True  
1  False  False  
2  False   True  
3  False   True  
4  False   True  


In [32]:
# Build the feature matrix and target vector by column position:
# column 1 is the target, columns 2 onward are the features.
feature_frame = df.iloc[:, 2:]
target_series = df.iloc[:, 1]
X, y = feature_frame.values, target_series.values

# The test frame has no target column, so its features start at column 1.
test_feature_frame = df_test.iloc[:, 1:]
X_test = test_feature_frame.values

In [33]:
# Split X and y 7:3 into a training part and a validation part (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [34]:
# Create and fit the baseline random forest classifier.
rfc_params = dict(
    max_depth=10,
    min_samples_leaf=1,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rfc = RandomForestClassifier(**rfc_params)
rfc.fit(X_train, y_train)
print(rfc)

RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)


In [35]:
# Accuracy on the training split (used to spot overfitting) and on the
# validation split (used to judge generalisation), rounded to 3 decimals.
train_accuracy = round(rfc.score(X_train, y_train), 3)
valid_accuracy = round(rfc.score(X_valid, y_valid), 3)

print('Train Score: {}'.format(train_accuracy))
print(' Test Score: {}'.format(valid_accuracy))

Train Score: 0.953
 Test Score: 0.791


In [36]:
# Hyperparameter grid for the random forest.
param_grid = {'max_depth': [3, 5, 7],
              'min_samples_leaf': [1, 2, 4]}

# Enumerate the 3 x 3 combinations, max_depth varying slowest.
grid_points = [(depth, leaf)
               for depth in param_grid['max_depth']
               for leaf in param_grid['min_samples_leaf']]

# Fit one forest per combination and report train / validation accuracy.
for max_depth, min_samples_leaf in grid_points:
    rfc_grid = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42,
                                      max_depth=max_depth,
                                      min_samples_leaf=min_samples_leaf)
    rfc_grid.fit(X_train, y_train)

    grid_train_accuracy = round(rfc_grid.score(X_train, y_train), 3)
    grid_valid_accuracy = round(rfc_grid.score(X_valid, y_valid), 3)

    print('max_depth: {}, min_samples_leaf: {}'.format(max_depth, min_samples_leaf))
    print('    Train Score: {}, Test Score: {}'.format(grid_train_accuracy,
                                                       grid_valid_accuracy))

max_depth: 3, min_samples_leaf: 1
    Train Score: 0.844, Test Score: 0.821
max_depth: 3, min_samples_leaf: 2
    Train Score: 0.844, Test Score: 0.821
max_depth: 3, min_samples_leaf: 4
    Train Score: 0.844, Test Score: 0.817
max_depth: 5, min_samples_leaf: 1
    Train Score: 0.88, Test Score: 0.821
max_depth: 5, min_samples_leaf: 2
    Train Score: 0.867, Test Score: 0.806
max_depth: 5, min_samples_leaf: 4
    Train Score: 0.864, Test Score: 0.802
max_depth: 7, min_samples_leaf: 1
    Train Score: 0.899, Test Score: 0.81
max_depth: 7, min_samples_leaf: 2
    Train Score: 0.889, Test Score: 0.787
max_depth: 7, min_samples_leaf: 4
    Train Score: 0.876, Test Score: 0.791


In [37]:
# Tune the random forest hyperparameters with a grid search using 5-fold
# cross-validation over the full training data (X, y).
base_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rfc_gs = GridSearchCV(base_forest, param_grid, cv=5)
rfc_gs.fit(X, y)

best_cv_score = round(rfc_gs.best_score_, 3)
print('Best Parameters: {}'.format(rfc_gs.best_params_))
print('CV Score: {}'.format(best_cv_score))

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 2}
CV Score: 0.824


In [38]:
# Add a 'Family' feature (SibSp + Parch) on copies of the train and test
# frames, leaving the original frames unchanged.
df_fe, df_fe_test = df.copy(), df_test.copy()

for frame in (df_fe, df_fe_test):
    frame['Family'] = frame['SibSp'] + frame['Parch']

# Display the first ten rows with the new column.
df_fe.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Family
0,1,0,3,0,22.0,1,0,7.25,False,False,True,1
1,2,1,1,1,38.0,1,0,71.2833,True,False,False,1
2,3,1,3,1,26.0,0,0,7.925,False,False,True,0
3,4,1,1,1,35.0,1,0,53.1,False,False,True,1
4,5,0,3,0,35.0,0,0,8.05,False,False,True,0
5,6,0,3,0,29.881138,0,0,8.4583,False,True,False,0
6,7,0,1,0,54.0,0,0,51.8625,False,False,True,0
7,8,0,3,0,2.0,3,1,21.075,False,False,True,4
8,9,1,3,1,27.0,0,2,11.1333,False,False,True,2
9,10,1,2,1,14.0,1,0,30.0708,True,False,False,1


In [39]:
# Train another random forest on the feature set that includes 'Family'.
# Column 1 is the target; columns 2 onward are the features.
X_fe, y_fe = df_fe.iloc[:, 2:].values, df_fe.iloc[:, 1].values

# The test frame has no target column, so its features start at column 1.
X_fe_test = df_fe_test.iloc[:, 1:].values

# Same 7:3 split and seed as the baseline split.
X_fe_train, X_fe_valid, y_fe_train, y_fe_valid = train_test_split(
    X_fe,
    y_fe,
    test_size=0.3,
    random_state=42,
)

rfc_fe = RandomForestClassifier(
    max_depth=7,
    min_samples_leaf=1,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rfc_fe.fit(X_fe_train, y_fe_train)

fe_train_accuracy = round(rfc_fe.score(X_fe_train, y_fe_train), 3)
fe_valid_accuracy = round(rfc_fe.score(X_fe_valid, y_fe_valid), 3)
print('Train Score: {}'.format(fe_train_accuracy))
print(' Test Score: {}'.format(fe_valid_accuracy))

Train Score: 0.905
 Test Score: 0.802


In [40]:
# Fit a logistic regression classifier and report its accuracy on the
# training and validation splits.
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

print('Logistic Regression \n')
for template, features, targets in (('Train Score: {}', X_train, y_train),
                                    (' Test Score: {}', X_valid, y_valid)):
    print(template.format(round(lr.score(features, targets), 3)))

Logistic Regression 

Train Score: 0.803
 Test Score: 0.81


In [41]:
# Fit a multilayer perceptron with three hidden layers (100, 100, 10 units)
# and report its accuracy on the training and validation splits.
hidden_layers = (100, 100, 10)
mlpc = MLPClassifier(hidden_layer_sizes=hidden_layers, random_state=0)
mlpc.fit(X_train, y_train)

mlpc_train_accuracy = round(mlpc.score(X_train, y_train), 3)
mlpc_valid_accuracy = round(mlpc.score(X_valid, y_valid), 3)

print('Multilayer Perceptron \n')
print('Train Score: {}'.format(mlpc_train_accuracy))
print(' Test Score: {}'.format(mlpc_valid_accuracy))

Multilayer Perceptron 

Train Score: 0.828
 Test Score: 0.802


In [42]:
# Soft-voting ensemble: collect the class probabilities predicted for the test
# set by the random forest, logistic regression and MLP models.
rfc_pred, lr_pred, mlpc_pred = (
    model.predict_proba(X_test) for model in (rfc, lr, mlpc)
)

# Average the three probability arrays and take, for each row, the index of
# the column with the highest mean probability as the final prediction.
# NOTE(review): the column index is used directly as the class label; this
# matches the 0/1 Survived labels as long as each model's classes_ is [0, 1].
pred_proba = (rfc_pred + lr_pred + mlpc_pred) / 3
pred = pred_proba.argmax(axis=1)

In [43]:
 # Load the sample submission file; adjust the path if the data is stored in another directory.
submission = pd.read_csv(filepath_or_buffer='gender_submission.csv')

# Show the loaded frame as the cell output.
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [44]:
# Check the shape of the ensemble predictions before building the submission.
pred.shape

(418,)

In [46]:
# Assemble the submission table: PassengerId taken unchanged from the test
# frame, Survived holding the predicted class (0 or 1).
submission_columns = {
    'PassengerId': df_test['PassengerId'],
    'Survived': pred,
}
submission = pd.DataFrame(submission_columns)

# Save as CSV without the index column.
submission.to_csv('submission.csv', index=False)
