# Titanic 생존자 예측

### 데이터 불러오기, 전처리

In [1]:
import pandas as pd
import numpy as np
titanic = pd.read_csv('../00. data/titanic/train.csv')[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [2]:
titanic['sex'] = np.nan
for i in titanic.index:
    titanic['sex'][i] = 1 if titanic['Sex'][i] == 'male' else 0
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,sex
0,1,0,3,male,22.0,1,0,7.25,S,1.0
1,2,1,1,female,38.0,1,0,71.2833,C,0.0
2,3,1,3,female,26.0,0,0,7.925,S,0.0
3,4,1,1,female,35.0,1,0,53.1,S,0.0
4,5,0,3,male,35.0,0,0,8.05,S,1.0


In [3]:
for i in titanic.index:
    titanic['Age'][i] = titanic['Age'][i] * 100 if titanic['Age'][i] < 1 else titanic['Age'][i]
titanic['Age'] = titanic['Age'].fillna(round(titanic['Age'].mean(), 0))
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,sex
886,887,0,2,male,27.0,0,0,13.0,S,1.0
887,888,1,1,female,19.0,0,0,30.0,S,0.0
888,889,0,3,female,30.0,1,2,23.45,S,0.0
889,890,1,1,male,26.0,0,0,30.0,C,1.0
890,891,0,3,male,32.0,0,0,7.75,Q,1.0


In [4]:
titanic['Embarked'] = titanic['Embarked'].fillna('C')
titanic['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [5]:
for i in titanic.index:
    if titanic.Embarked[i] == 'S':
        titanic.Embarked[i] = 0
    elif titanic.Embarked[i] == 'C':
        titanic.Embarked[i] = 1
    else:
        titanic.Embarked[i] = 2

In [6]:
del titanic['PassengerId']
del titanic['Sex']
titanic.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,sex
0,0,3,22.0,1,0,7.25,0,1.0
1,1,1,38.0,1,0,71.2833,1,0.0
2,1,3,26.0,0,0,7.925,0,0.0
3,1,1,35.0,1,0,53.1,0,0.0
4,0,3,35.0,0,0,8.05,0,1.0


### One-Hot Encoding

In [7]:
t_df = pd.get_dummies(titanic)
t_df_label = titanic['Survived']
t_df_data = titanic.drop('Survived', axis=1)

### 학습/테스트 데이터 분리

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
X_train, X_test, y_train, y_test = train_test_split(t_df_data, t_df_label, test_size=0.2, random_state=121)
dtree = DecisionTreeClassifier()
parameters = {
    'max_depth': [2, 3, 4],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}

### 교차 검증과 최적 파라미터 탐색

In [9]:
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=5, scoring='accuracy', refit=True)
grid_dtree.fit(t_df_data, t_df_label)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4], 'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [10]:
print('GridSearchCV 최적 파라미터:', grid_dtree.best_params_)
print('GridSearchCV 최고 정확도:, {0:.4f}'.format(grid_dtree.best_score_))

GridSearchCV 최적 파라미터: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}
GridSearchCV 최고 정확도:, 0.8114


### 테스트 데이터 세트 정확도

In [11]:
estimator = grid_dtree.best_estimator_
pred = estimator.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7932960893854749