このnotebookではpandasとscikit-learnを用いたシンプルな解法のtutorialを行います．


基本的な流れはこんな感じです
- データの読み込み
- データの前処理
- 学習モデルの作成
- モデルの学習
- テストデータの予測

In [1]:
# 必要なライブラリのインポート
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the training set, the test set and the sample submission from the
# input directory, in that order.
inputs_dir = "../titanic"
train, test, sample_submit = (
    pd.read_csv(os.path.join(inputs_dir, csv_name))
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

In [3]:
# Show each loaded table, preceded by a label naming it.
labelled_tables = (
    ("train", train),
    ("test", test),
    ("gender submission", sample_submit),
)
for label, table in labelled_tables:
    print(label)
    display(table)

train


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


test


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


gender submission


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


## データの前処理

In [4]:
# Preview the first three rows of train.
display(train.iloc[:3])
# List train's column names, then report how many columns there are.
column_names = train.columns.values
print(column_names)
print(f"trainの属性数 : {len(train.columns)}")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
trainの属性数 : 12


Survivedは目的変数であるため，Survivedを除く11の属性で学習を行い，testデータの予測をする必要があります．Fareなどの数値属性はそのまま学習に使用できることが多いですが，Name属性のような文字列の場合は，何らかの前処理を行なって数値に変換する必要があります．

ですが今回はまず，数値属性のみを使用してRandomForest分類器を学習させてみます．

In [5]:
# Keep the target aside as a one-column DataFrame, then remove that column
# from train so only the feature columns remain.
target_column = "Survived"
y_train = train[[target_column]]
train = train.drop(columns=[target_column])

trainとtest両方に同じ処理をするため一度trainとtestを結合します

In [6]:
# Before concatenating, tag every row with its origin so the two sets can be
# separated again later: 1 for train rows, 0 for test rows.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame["train_or_test"] = origin_flag

In [7]:
# Stack train on top of test into a single DataFrame, df. The original row
# labels are kept, so index values repeat across the two parts.
df = pd.concat(objs=[train, test], axis="index")

In [8]:
# Display the combined DataFrame.
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train_or_test
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [9]:
# Drop PassengerId together with the non-numeric columns, leaving only the
# numeric features and the train_or_test flag.
drop_columns = ["PassengerId", "Name", "Sex", "Ticket", "Cabin", "Embarked"]
df = df.drop(columns=drop_columns)

In [10]:
# Display the first three rows of df after the columns were dropped.
df.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,train_or_test
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.925,1


In [11]:
# Check for missing values: info() prints the non-null count of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         1309 non-null   int64  
 1   Age            1046 non-null   float64
 2   SibSp          1309 non-null   int64  
 3   Parch          1309 non-null   int64  
 4   Fare           1308 non-null   float64
 5   train_or_test  1309 non-null   int64  
dtypes: float64(2), int64(4)
memory usage: 71.6 KB


Non-Null Countは欠損値でない数を示していて，今回はAgeとFareに欠損値があることが確認できます

このtutorialでは学習モデルにscikit-learnのRandomForestClassifierを使用しますが，欠損値がある場合は欠損値を何らかの数値に置き換えるか，欠損値を除去しないとエラーがでてしまいます

そこで今回は欠損値を平均値で置き換える処理を行ってみます

In [12]:
# Replace missing Age and Fare values with the mean of the respective column.
# The mean is taken over df, i.e. over train and test rows together.
for column in ("Age", "Fare"):
    df[column] = df[column].fillna(df[column].mean())

In [13]:
# Confirm that no missing values remain after the fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         1309 non-null   int64  
 1   Age            1309 non-null   float64
 2   SibSp          1309 non-null   int64  
 3   Parch          1309 non-null   int64  
 4   Fare           1309 non-null   float64
 5   train_or_test  1309 non-null   int64  
dtypes: float64(2), int64(4)
memory usage: 71.6 KB


欠損値がなくなったことが確認でき，学習に使用できるようになったため，dfをtrainとtestに再度分けて，trainで学習を行なっていきます

In [14]:
# Split df back into train and test rows using the train_or_test flag. The
# flag is not a feature, so it is removed from both halves.
origin_flag = df["train_or_test"]
features = df.drop(columns=["train_or_test"])
X_train = features[origin_flag == 1]
X_test = features[origin_flag == 0]

In [15]:
# Display the training features, the training target and the test features.
for table in (X_train, y_train, X_test):
    display(table)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.000000,1,0,7.2500
1,1,38.000000,1,0,71.2833
2,3,26.000000,0,0,7.9250
3,1,35.000000,1,0,53.1000
4,3,35.000000,0,0,8.0500
...,...,...,...,...,...
886,2,27.000000,0,0,13.0000
887,1,19.000000,0,0,30.0000
888,3,29.881138,1,2,23.4500
889,1,26.000000,0,0,30.0000


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,34.500000,0,0,7.8292
1,3,47.000000,1,0,7.0000
2,2,62.000000,0,0,9.6875
3,3,27.000000,0,0,8.6625
4,3,22.000000,1,1,12.2875
...,...,...,...,...,...
413,3,29.881138,0,0,8.0500
414,1,39.000000,0,0,108.9000
415,3,38.500000,0,0,7.2500
416,3,29.881138,0,0,8.0500


## 学習モデルの作成

これで無事前処理が終了したので，X_trainとy_trainで学習した後，X_testで目的変数の予測を行なっていきます．

先述の通り，今回はscikit-learnのRandomForestClassifierを使用して学習を行います

In [16]:
# Create the model. random_state is fixed so that re-running the notebook
# builds the same forest and therefore produces the same predictions.
model = RandomForestClassifier(random_state=0)

## モデルの学習

In [17]:
# Train the model. y_train is a one-column DataFrame, so it is flattened to a
# 1-D array of labels first; passing the column vector directly makes
# scikit-learn emit a DataConversionWarning.
model.fit(X_train, y_train.values.ravel())

RandomForestClassifier()

## テストデータの予測

学習ができたのでX_testを使用して予測を行い，提出用のcsvファイルを作っていきます

In [18]:
# Predict the target for every row of X_test with the fitted model.
predict = model.predict(X_test)

In [19]:
# Show the predicted labels.
predict

array([0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [20]:
# Show the sample submission as loaded from gender_submission.csv.
sample_submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [21]:
# Store the predictions in the Survived column of the submission DataFrame.
# NOTE(review): this presumably relies on the rows of X_test being in the same
# order as the PassengerId rows of sample_submit -- confirm.
sample_submit["Survived"] = predict

In [22]:
# Show the submission table with the predictions stored in Survived.
sample_submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [23]:
# Write the submission CSV, creating the output directory first if it does
# not exist yet. The DataFrame index is left out of the file.
output_dir = "../outputs"
os.makedirs(output_dir, exist_ok=True)
submit_path = os.path.join(output_dir, "first_submit.csv")
sample_submit.to_csv(submit_path, index=False)