In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()

In [2]:
# Read the three input CSV files into DataFrames, print their shapes, and
# preview the first rows of the training frame.
csv_paths = ['input/train.csv', 'input/test.csv', 'input/gender_submission.csv']
train, test, gender_submission = map(pd.read_csv, csv_paths)

print(*(frame.shape for frame in (train, test, gender_submission)))
train.head()

(891, 12) (418, 11) (418, 2)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 特徴量生成

In [3]:
# Feature engineering, applied in place to both the training and the test frame.

# Cabin deck letter -> integer code. 'U' is the first letter of the 'Unknown'
# placeholder used for missing cabins; deck 'T' shares code 0 with it.
cabin_deck_codes = {'U':0, 'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':0}

dataset = [train, test]
for frame in dataset:
    # Number of relatives travelling with the passenger.
    frame['Family'] = frame['SibSp'].add(frame['Parch'])

    # Disabled: flag pre-school children (Age < 7).
    # frame['So_Young'] = frame['Age'].apply(lambda x: x<7)
    # frame['So_Young'] = frame['So_Young'].map({True:1, False:0})

    # Disabled: number of *other* passengers sharing the same ticket number.
    # frame['Same_ticket_no'] = frame.Ticket.apply(lambda x: frame.Ticket.value_counts()[x] - 1)

    # Disabled: use the larger of the family size and the shared-ticket count
    # as the number of acquaintances on board.
    # frame['Relatives'] = frame.apply(lambda r: max([r['Family'], r['Relatives_tmp']]), axis=1)

    # Disabled: separate passengers with a zero fare (they look like crew).
    # frame['Is_free'] = frame['Fare'].apply(lambda x: x==0)
    # frame['Is_free'] = frame['Is_free'].map({True:1, False:0})

    # Reduce Cabin to its deck letter (missing -> 'Unknown' -> 'U') and encode it.
    deck_letter = frame['Cabin'].fillna('Unknown').str[0]
    frame['Cabin'] = deck_letter.map(cabin_deck_codes)

    # SibSp and Parch are now summarised by Family, so remove them.
    frame.drop(columns=['SibSp', 'Parch'], inplace=True)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,0,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,3,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,0,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,3,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,0,S,0


## 不要カラムの削除
- Embarkedは一旦残す

In [4]:
# Remove the columns that are not used as features from both frames.
del_columns = ['PassengerId', 'Name', 'Ticket', 'Fare']
train, test = (frame.drop(columns=del_columns) for frame in (train, test))
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Cabin,Embarked,Family
0,0,3,male,22.0,0,S,1
1,1,1,female,38.0,3,C,1
2,1,3,female,26.0,0,S,0
3,1,1,female,35.0,3,S,1
4,0,3,male,35.0,0,S,0


## カテゴリ変数を数値に置き換える

In [5]:
def encode_embarked(frame, ports):
    """One-hot encode the ``Embarked`` column against a fixed list of ports.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing an ``Embarked`` column.
    ports : list of str
        Ordered list of port codes. The first one is the baseline that is
        dropped; every other port yields one ``Embarked_<port>`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Embarked`` is replaced by 0/1 indicator columns.
        A missing (or unlisted) port is encoded as all zeros, i.e. it is
        indistinguishable from the dropped baseline port.
    """
    # Using a Categorical with explicit categories makes the resulting columns
    # depend only on `ports`, not on which values happen to occur in `frame`.
    with_categories = frame.assign(
        Embarked=pd.Categorical(frame['Embarked'], categories=ports)
    )
    # dtype is pinned so the indicators are 0/1 integers on every pandas version.
    return pd.get_dummies(
        with_categories, columns=['Embarked'], drop_first=True, dtype='uint8'
    )


dataset = [train, test]
for data in dataset:
    # Sex: male -> 0, female -> 1
    data['Sex'] = data['Sex'].map({'male': 0, 'female':1})

# Port of embarkation: learn the category list from the training data once and
# apply it to both frames, so train and test always get identical dummy columns
# (encoding each frame separately could drop a different baseline or omit a
# column when a port is absent from one of them).
embarked_ports = sorted(train['Embarked'].dropna().unique())
train = encode_embarked(train, embarked_ports)
test  = encode_embarked(test, embarked_ports)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Cabin,Family,Embarked_Q,Embarked_S
0,0,3,0,22.0,0,1,0,1
1,1,1,1,38.0,3,1,0,0
2,1,3,1,26.0,0,0,0,1
3,1,1,1,35.0,3,1,0,1
4,0,3,0,35.0,0,0,0,1


In [6]:
# Sanity check: apart from the 'Survived' target, the test frame must have
# exactly the same columns as the training frame, in the same order. Comparing
# the names (not just the count) also catches renamed or misaligned columns.
expected_test_columns = [name for name in train.columns if name != 'Survived']
if list(test.columns) != expected_test_columns:
    print('テストデータのカラムがおかしい')
    print('Test:', test.columns)
    print('Train:', train.columns)

## 欠損値を埋める

In [7]:
# Per-column count of missing values, for the training and the test frame.
tuple(frame.isna().sum() for frame in (train, test))

(Survived        0
 Pclass          0
 Sex             0
 Age           177
 Cabin           0
 Family          0
 Embarked_Q      0
 Embarked_S      0
 dtype: int64,
 Pclass         0
 Sex            0
 Age           86
 Cabin          0
 Family         0
 Embarked_Q     0
 Embarked_S     0
 dtype: int64)

In [8]:
# Impute missing ages with the median age of the training frame; the same
# training median is used for the test frame.
age_median = train['Age'].median()
print('Age Median:', age_median)
# Disabled: median imputation for Fare.
# fare_median = train['Fare'].median()
# print('Fare Median:', fare_median)

dataset = [train, test]
for frame in dataset:
    missing_age = frame['Age'].isna()
    frame['Age'] = frame['Age'].mask(missing_age, age_median)
    # frame['Fare'] = frame['Fare'].fillna(fare_median)

# Re-check the missing-value counts after imputation.
tuple(frame.isna().sum() for frame in dataset)

Age Median: 28.0


(Survived      0
 Pclass        0
 Sex           0
 Age           0
 Cabin         0
 Family        0
 Embarked_Q    0
 Embarked_S    0
 dtype: int64,
 Pclass        0
 Sex           0
 Age           0
 Cabin         0
 Family        0
 Embarked_Q    0
 Embarked_S    0
 dtype: int64)

## 訓練データとテストデータに分割してCSV保存

In [9]:
# Training data: separate the 'Survived' target from the feature columns.
y_train = train.loc[:, 'Survived']
X_train = train.drop(columns=['Survived'])
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Cabin,Family,Embarked_Q,Embarked_S
0,3,0,22.0,0,1,0,1
1,1,1,38.0,3,1,0,0
2,3,1,26.0,0,0,0,1
3,1,1,35.0,3,1,0,1
4,3,0,35.0,0,0,0,1


In [11]:
# Test data: the test frame has no target column, so it is used as-is
# (X_test refers to the same object as `test`, it is not a copy).
X_test = test

# Warn when the training and test feature matrices differ in column count.
if len(X_train.columns) != len(X_test.columns):
    print('訓練データとテストデータのカラム数が合っていない')
    

In [28]:
# Write the prepared feature matrices and the target to CSV (without the index).
# to_csv does not create missing directories, so make sure 'data' exists first;
# otherwise this cell fails on a fresh checkout.
os.makedirs('data', exist_ok=True)
X_train.to_csv('data/X_train.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
X_test.to_csv('data/X_test.csv', index=False)