In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#ライブラリのimport
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
#df作成
path = "/content/drive/MyDrive/GCI/titanic/"

df = pd.read_csv(path + 'data/train.csv')
df_test = pd.read_csv(path + 'data/test.csv')

In [4]:
#dfのsize確認
print(df.shape)

(891, 12)


In [5]:
#データ数、変数の種類を確認
print('訓練データのデータ数は{}、変数は{}種類です。'.format(df.shape[0], df.shape[1]))
print('テストデータのデータ数は{}、変数は{}種類です'.format(df_test.shape[0], df_test.shape[1]))
df.head(10)
print(df.columns)

訓練データのデータ数は891、変数は12種類です。
テストデータのデータ数は418、変数は11種類です
Index(['PassengerId', 'Perished', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')



Perished	死亡したかどうか	0 = No, 1 = Yes
Pclass	チケットのクラス	1 = 1st, 2 = 2nd, 3 = 3rd
Name	名前
Sex	性別
Age	年齢
SibSp	乗船していた兄弟姉妹・配偶者の数
Parch	乗船していた親・子供の数
Ticket	チケット番号
Fare	チケット料金
Cabin	キャビン番号
embarked	乗船した港	C = Cherbourg, Q = Queenstown, S = Southampton

以下からは、欠損値補完


In [6]:
df.isnull().sum()

PassengerId      0
Perished         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
#all_dfを作っておく
all_df = pd.concat([df, df_test],sort=False).reset_index(drop=True)

In [9]:
#fareの欠損値をPclassごとの平均で埋める
Fare_mean = all_df[["Pclass","Fare"]].groupby("Pclass").mean().reset_index()
Fare_mean.columns = ["Pclass","Fare_mean"]
Fare_mean

Unnamed: 0,Pclass,Fare_mean
0,1,87.508992
1,2,21.179196
2,3,13.302889


In [10]:
#dfとdf_testの両方に適応
df = pd.merge(df, Fare_mean, on="Pclass",how="left")
df_test = pd.merge(df_test, Fare_mean, on="Pclass",how="left")

df.loc[(df["Fare"].isnull()), "Fare"] = df["Fare_mean"]
df_test.loc[(df_test["Fare"].isnull()), "Fare"] = df_test["Fare_mean"]

df = df.drop("Fare_mean",axis=1)
df_test = df_test.drop("Fare_mean",axis=1)
df.isnull().sum()
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [11]:
#敬称で年齢の欠損値を埋める
name_df = all_df["Name"].str.split("[,.]",2,expand=True)
name_df.columns = ["family_name","honorific","name"]
name_df.head(10)

Unnamed: 0,family_name,honorific,name
0,Braund,Mr,Owen Harris
1,Cumings,Mrs,John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss,Laina
3,Futrelle,Mrs,Jacques Heath (Lily May Peel)
4,Allen,Mr,William Henry
5,Moran,Mr,James
6,McCarthy,Mr,Timothy J
7,Palsson,Master,Gosta Leonard
8,Johnson,Mrs,Oscar W (Elisabeth Vilhelmina Berg)
9,Nasser,Mrs,Nicholas (Adele Achem)


In [12]:
name_df["family_name"] =name_df["family_name"].str.strip()
name_df["honorific"] =name_df["honorific"].str.strip()
name_df["name"] =name_df["name"].str.strip()
name_df["honorific"].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
Lady              1
Sir               1
Mme               1
Don               1
Capt              1
the Countess      1
Jonkheer          1
Dona              1
Name: honorific, dtype: int64

In [13]:
all_df = pd.concat([all_df, name_df], axis=1)
column_names = all_df.columns.tolist()
print(column_names)
all_df[["Age","honorific"]].groupby("honorific").mean()

['PassengerId', 'Perished', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'family_name', 'honorific', 'name']


Unnamed: 0_level_0,Age
honorific,Unnamed: 1_level_1
Capt,70.0
Col,54.0
Don,40.0
Dona,39.0
Dr,43.571429
Jonkheer,38.0
Lady,48.0
Major,48.5
Master,5.482642
Miss,21.774238


In [14]:
df = pd.concat([df,name_df[0:len(df)].reset_index(drop=True)],axis=1)
df_test = pd.concat([df_test,name_df[0:len(df_test)].reset_index(drop=True)],axis=1)
print(len(df_test))
honorific_age_mean_train = df[["honorific","Age"]].groupby("honorific").mean().reset_index()
honorific_age_mean_test = df_test[["honorific","Age"]].groupby("honorific").mean().reset_index()
honorific_age_mean_train.columns = ["honorific","honorific_Age"]
honorific_age_mean_test.columns = ["honorific","honorific_Age"]

418


In [15]:
df = pd.merge(df, honorific_age_mean_train, on="honorific", how="left")
df_test = pd.merge(df_test, honorific_age_mean_test, on="honorific", how="left")

df.loc[(df["Age"].isnull()), "Age"] = df["honorific_Age"]
df_test.loc[(df_test["Age"].isnull()), "Age"] = df_test["honorific_Age"]

df = df.drop(["honorific_Age"],axis=1)
df_test = df_test.drop(["honorific_Age"],axis=1)

In [16]:
#変数aloneを追加
df["family_num"] = df["Parch"] + df["SibSp"]
df_test["family_num"] = df_test["Parch"] + df_test["SibSp"]

df["alone"] = df["family_num"].apply(lambda x: 0 if x > 0 else 1)
df_test["alone"] = df_test["family_num"].apply(lambda x: 0 if x > 0 else 1)
df["alone"].fillna(0, inplace=True)
df_test["alone"].fillna(0, inplace=True)

In [17]:
#Cabinの欠損値を補完
df.drop('Cabin', axis=1, inplace=True)
df_test.drop('Cabin', axis=1, inplace=True)

df.isnull().sum()

PassengerId    0
Perished       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
family_name    0
honorific      0
name           0
family_num     0
alone          0
dtype: int64

In [18]:
#Embarkedの欠損値を補完
df['Embarked'].fillna('S', inplace=True)
df_test['Embarked'].fillna('S', inplace=True)

df.isnull().sum()

PassengerId    0
Perished       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
family_name    0
honorific      0
name           0
family_num     0
alone          0
dtype: int64

PerishedをSurvivedに変更

In [19]:
df = df.rename(columns={'Perished': 'Survived'})
df_test = df_test.rename(columns={'Perished': 'Survived'})

カテゴリカルデータを数値化

In [20]:
#必要がなさそうなデータを削除
df.drop('Name', axis=1, inplace=True)
df_test.drop('Name', axis=1, inplace=True)

df.drop('Ticket', axis=1, inplace=True)
df_test.drop('Ticket', axis=1, inplace=True)

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'family_name', 'honorific', 'name', 'family_num',
       'alone'],
      dtype='object')

In [21]:
#sexを数値化
df.replace({'Sex': {'male': 0, 'female': 1}}, inplace=True)
df_test.replace({'Sex': {'male': 0, 'female': 1}}, inplace=True)

In [22]:
#embarkedをone-hot-encodingで数値化
#0,1,2で振り分けると、大小関係が影響しちゃうから
embarked = pd.concat([df['Embarked'], df_test['Embarked']])

embarked_ohe = pd.get_dummies(embarked)

embarked_ohe_train = embarked_ohe[:891]
embarked_ohe_test = embarked_ohe[891:]

df = pd.concat([df, embarked_ohe_train], axis=1)
df_test = pd.concat([df_test, embarked_ohe_test], axis=1)

df.drop('Embarked', axis=1, inplace=True)
df_test.drop('Embarked', axis=1, inplace=True)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,family_name,honorific,name,family_num,alone,C,Q,S
0,1,1,3,0,22.0,1,0,7.25,Braund,Mr,Owen Harris,1,0,0,0,1
1,2,0,1,1,38.0,1,0,71.2833,Cumings,Mrs,John Bradley (Florence Briggs Thayer),1,0,1,0,0
2,3,0,3,1,26.0,0,0,7.925,Heikkinen,Miss,Laina,0,1,0,0,1
3,4,0,1,1,35.0,1,0,53.1,Futrelle,Mrs,Jacques Heath (Lily May Peel),1,0,0,0,1
4,5,1,3,0,35.0,0,0,8.05,Allen,Mr,William Henry,0,1,0,0,1


不要な変数削除

In [23]:
df = df.drop(["family_name","name", "honorific"],axis=1)
df_test = df_test.drop(["family_name","name", "honorific"],axis=1)
df_test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
family_num     0
alone          0
C              0
Q              0
S              0
dtype: int64

モデルの構築


In [25]:
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values

X_test = df_test.iloc[:, 1:].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [31]:
param_grid = {'max_depth': [3, 5, 7],
              'min_samples_leaf': [1, 2, 4]}

rfc_gs = GridSearchCV(RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42), param_grid, cv=5)
rfc_gs.fit(X, y)

print('Best Parameters: {}'.format(rfc_gs.best_params_))
print('CV Score: {}'.format(round(rfc_gs.best_score_, 3)))

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 2}
CV Score: 0.822


In [34]:
rfc_pred = rfc_gs.predict_proba(X_test)
pred = rfc_pred.argmax(axis=1)
print(pred)

[1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1
 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 1 1
 0 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1
 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 0 1 1 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 0 0 1 1 0 1 0
 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0
 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1
 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1
 1 0 1 1 0 0 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1
 1 0 0 0 0 1 1 0 1 1 1]


In [37]:
path = "/content/drive/MyDrive/GCI/titanic/"

submission = pd.read_csv(path + 'submit_titanic.csv')
submission

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,1
414,1306,0
415,1307,1
416,1308,1


In [38]:
submission.to_csv('/content/drive/MyDrive/GCI/titanic/submit_titanic.csv', index=False)

submission.to_csv('submission.csv',index=False)

from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>