# 타이타닉 생존자 예측


In [45]:
import numpy as np
import pandas as pd
import seaborn as sns

In [46]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


1. 데이터 전처리

- Feature selection (필요한 데이터 고르기)

In [47]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [48]:
df = df[['survived', 'pclass', 'sex','age', 'sibsp', 'parch', 'fare', 'embarked', 'deck']]
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [49]:
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  891 non-null    int64   
 1   pclass    891 non-null    int64   
 2   sex       891 non-null    object  
 3   age       714 non-null    float64 
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   fare      891 non-null    float64 
 7   embarked  889 non-null    object  
 8   deck      203 non-null    category
dtypes: category(1), float64(2), int64(4), object(2)
memory usage: 57.0+ KB


In [51]:
df.age = df.age.fillna(df.age.mean())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  891 non-null    int64   
 1   pclass    891 non-null    int64   
 2   sex       891 non-null    object  
 3   age       891 non-null    float64 
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   fare      891 non-null    float64 
 7   embarked  889 non-null    object  
 8   deck      203 non-null    category
dtypes: category(1), float64(2), int64(4), object(2)
memory usage: 57.0+ KB


In [52]:
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [53]:
df.embarked = df.embarked.fillna('S')
df.embarked.isna().sum()

0

In [54]:
# deck - 열 삭제
df.drop(columns=['deck'], inplace=True)
df.isna().sum().sum()

0

- 카데고리 값(sex, embarked)을 숫자로 변환

In [55]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


In [56]:
df.sex = le.fit_transform(df.sex)

In [57]:
df.embarked = le.fit_transform(df.embarked)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


### 2. Train/Test dataset으로 분리


In [58]:
# X와 y를 넘파이 배열로
X = df.iloc[:, 1:].values
y = df.survived.values
X.shape, y.shape

((891, 7), (891,))

In [59]:
# y값의 분포
# df.survived.value_counts()
np.unique(y, return_counts=True)

(array([0, 1]), array([549, 342]))

In [60]:
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2022
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [61]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([439, 273]))

### 3.RandomForest로 학습

In [62]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2022)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [63]:
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.8324022346368715

### 5. 3,4 대신에 GridSearchCV 수행

In [64]:
params = {
    'max_depth' : [2, 4, 6, 8],
    'min_samples_split' : [2, 4, 6]
    # ,'n_estimators' : [10, 100, 1000, 10000] 다음에는 빼고하기 너무 오래걸림
}

In [66]:
from sklearn.model_selection import GridSearchCV
grid_rf = GridSearchCV(
    rfc, params, scoring='accuracy', cv=5
)
%time grid_rf.fit(X_train, y_train)
print(grid_rf.best_params_, grid_rf.best_score_)
best_rf = grid_rf.best_estimator_
best_rf.score(X_test, y_test)

{'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 10000} 0.8216093765389539


0.8324022346368715

In [67]:
# %time 앞에 붙이면 얼마나 걸리는지 보여줌

### 6. 테스트 데이터에 적용

In [68]:
X_test[25], y_test[25]

(array([ 3.  ,  1.  , 45.  ,  0.  ,  0.  ,  8.05,  2.  ]), 1)

In [None]:
best_rf.predict(X_test[25].reshape(1,-1))

### 7.엉터리 분류기

In [73]:
df.dtypes

survived      int64
pclass        int64
sex           int64
age         float64
sibsp         int64
parch         int64
fare        float64
embarked      int64
dtype: object

In [77]:
df.groupby(['sex', 'pclass'])[['survived']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,pclass,Unnamed: 2_level_1
0,1,0.968085
0,2,0.921053
0,3,0.5
1,1,0.368852
1,2,0.157407
1,3,0.135447


In [87]:
from sklearn.base import BaseEstimator

class Myclassifier(BaseEstimator):
    # fit(), predict() method만 재정의(Overiding)
    def fit(self, X, y): 
        pass
    # 무조건 self 붙여줘야 함

    def predict(self, X):
        pred = np.zeros(X.shape[0], int)      # X의 행의 개수
        for i in range(X.shape[0]) :
            if X[i, 1] == 0:             # 여성이면
                pred[i] = 1              # 여성이면 생존
        return pred
        

In [88]:
my_clf = Myclassifier()
my_clf.fit(X_train, y_train) 
pred_my = my_clf.predict(X_test)

In [82]:
X_test[8, 1], pred_my[6]

(0.0, 0.0)

In [84]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_my)

0.7877094972067039

- 모델의 성능을 평가할 때 무조건 적으로 정확도를 사용하는 것은 지양해야 함

In [89]:
pred_rf = best_rf.predict(X_test)
sdf = pd.DataFrame({'y_test' : y_test, 'RF' : pred_rf, 'MY' : pred_my})
sdf.head()

Unnamed: 0,y_test,RF,MY
0,1,0,0
1,0,0,0
2,1,0,1
3,0,0,0
4,0,0,0


In [91]:
from sklearn.metrics import confusion_matrix
# confusion_matrix 들어가는 순서 실제값 , 예측값 중요!
confusion_matrix(y_test, pred_rf)

array([[104,   6],
       [ 24,  45]])

In [92]:
confusion_matrix(y_test, pred_my)

array([[96, 14],
       [24, 45]])

In [93]:
from sklearn.metrics import precision_score, recall_score


In [99]:
#정밀도(TP / (FP + TP))
precision_score(y_test, pred_rf), precision_score(y_test, pred_my)

(0.8823529411764706, 0.7627118644067796)

In [101]:
# 재현율(TP / FN + TP)
recall_score(y_test, pred_rf), recall_score(y_test, pred_my)

(0.6521739130434783, 0.6521739130434783)

In [102]:
# F1 score(정밀도와 재현율의 조화 평균)
from sklearn.metrics import f1_score
f1_score(y_test, pred_rf), f1_score(y_test, pred_my)

(0.75, 0.703125)

In [103]:
# AUC Score
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred_rf), roc_auc_score(y_test, pred_my)

(0.7988142292490119, 0.7624505928853755)