# Kaggle-Titanic-02
参考：https://www.kaggle.com/kojitakahashi6/titanic-koji

In [1]:
import pandas as pd

In [2]:
# Load the Kaggle Titanic training and test sets from the input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


##  不必要と判断した列をdropする

In [4]:
# Remove the columns judged unnecessary for modelling.
# PassengerId is removed from the training frame only; the test frame keeps it.
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
test = test.drop(columns=['Name', 'Ticket', 'Cabin'])

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## train-Embarkedの欠損値を埋める

In [5]:
# Number of missing values in each column of the training data.
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
# Find the mode (most frequent value) of Embarked by counting each value.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# Fill the missing Embarked values with the column's mode (most frequent value).
# The mode is derived from the data instead of being hard-coded, so the cell
# stays correct if the input changes ('S' for this dataset, per the counts above).
embarked_mode = train['Embarked'].mode()[0]
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

## test-Fareの欠損値を埋める

In [8]:
# Number of missing values in each column of the test data.
test.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [9]:
# Compute the mean of Fare in the test data.
test['Fare'].mean()

35.6271884892086

In [10]:
# Fill the missing Fare value with the column mean.
# The mean is computed here rather than pasted in as a rounded literal, so the
# imputed value always matches the data that was actually loaded.
fare_mean = test['Fare'].mean()
test['Fare'] = test['Fare'].fillna(fare_mean)
test.isnull().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            0
Embarked        0
dtype: int64

## LabelEncoderを使ってEmbarkedとSexを数値に置き換える。

In [11]:
from sklearn.preprocessing import LabelEncoder

# Replace the categorical Embarked and Sex columns with integer codes.
LE = LabelEncoder()

labels = ['Embarked', 'Sex']
for label in labels:
    # Fit on the training column only and reuse that mapping for the test
    # column, so a given category gets the same integer in both frames even
    # when the two frames do not contain the same set of categories.
    # transform() raises ValueError if the test data holds a category that was
    # never seen in training, instead of silently assigning mismatched codes.
    LE.fit(train[label])
    train[label] = LE.transform(train[label])
    test[label] = LE.transform(test[label])
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


## RandomForestの予測結果でAgeの欠損値を埋めていく

In [12]:
from sklearn.ensemble import RandomForestRegressor

def fill_missing_age(df, random_state=None):
    """Impute missing ``Age`` values with a random-forest regression.

    A ``RandomForestRegressor`` is trained on the rows whose ``Age`` is known,
    using ``Embarked``, ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as
    features. Its predictions are written into the rows whose ``Age`` is
    missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age`` and the five feature columns listed above.
        The feature columns are passed to the regressor as a plain array, so
        they must already be numeric (e.g. label-encoded).
        ``df`` is modified in place.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to the regressor. ``None`` keeps the original
        non-deterministic behaviour; pass an integer for reproducible results.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the missing ``Age`` values filled.
    """
    feature_cols = ['Embarked', 'Fare', 'Parch', 'SibSp', 'Pclass']
    age_missing = df['Age'].isnull()

    # Nothing to impute: return early, because asking the regressor to predict
    # on an empty array raises an error.
    if not age_missing.any():
        return df

    # Split by whether Age is known. These are deliberately not called
    # train/test, to avoid shadowing the notebook-level frames of those names.
    known = df.loc[~age_missing]
    unknown = df.loc[age_missing]

    # Known ages are the regression target; the feature columns are the inputs.
    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1,
                                random_state=random_state)
    rtr.fit(known[feature_cols].values, known['Age'].values)

    # Write the predicted ages back into the rows that were missing them.
    df.loc[age_missing, 'Age'] = rtr.predict(unknown[feature_cols].values)

    return df

In [13]:
# Impute the missing ages in the training frame first, then in the test frame.
train, test = (fill_missing_age(frame) for frame in (train, test))

In [16]:
# Confirm that no missing values remain in the training data.
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [17]:
# Confirm that no missing values remain in the test data.
test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

## StandardScalerでAgeとFareを標準化する

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardize Age and Fare.
# The scaler is fitted on the training data only and the same fitted scaler is
# applied to the test data. Fitting a second scaler on the test set would put
# the test features on a different scale from the features a model is trained on.
std_scale = StandardScaler().fit(train[['Age', 'Fare']])
train[['Age', 'Fare']] = std_scale.transform(train[['Age', 'Fare']])
test[['Age', 'Fare']] = std_scale.transform(test[['Age', 'Fare']])