In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold # K: 배수, Fold: 유효성 검증을 위해 하나를 접었다 의 의미
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # support vector machine
from sklearn.metrics import accuracy_score

In [2]:
kfold = KFold(n_splits=5)

In [3]:
titanic_df = pd.read_csv('./titanic_train.csv')
titanic_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5000,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0000,,S,,"Cornwall / Akron, OH"
2,2,1,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C,,"Barcelona, Spain / Havana, Cuba"
3,3,0,"Scanlan, Mr. James",male,,0,0,36209,7.7250,,Q,,
4,3,1,"Bradley, Miss. Bridget Delia",female,22.0,0,0,334914,7.7250,,Q,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,3,0,"Davies, Mr. Joseph",male,17.0,2,0,A/4 48873,8.0500,,S,,"West Bromwich, England Pontiac, MI"
894,3,0,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S,153.0,
895,1,0,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0000,C23 C25 C27,S,,"Winnipeg, MB"
896,3,1,"Dorking, Mr. Edward Arthur",male,19.0,0,0,A/5. 10482,8.0500,,S,,"England Oglesby, IL"


In [4]:
type(titanic_df)

pandas.core.frame.DataFrame

In [5]:
titanic_df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'body', 'home.dest'],
      dtype='object')

In [6]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     898 non-null    int64  
 1   survived   898 non-null    int64  
 2   name       898 non-null    object 
 3   sex        898 non-null    object 
 4   age        728 non-null    float64
 5   sibsp      898 non-null    int64  
 6   parch      898 non-null    int64  
 7   ticket     898 non-null    object 
 8   fare       898 non-null    float64
 9   cabin      211 non-null    object 
 10  embarked   896 non-null    object 
 11  body       84 non-null     float64
 12  home.dest  516 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 91.3+ KB


In [7]:
titanic_df['name'][:5]

0    Mellinger, Miss. Madeleine Violet
1                    Wells, Miss. Joan
2       Duran y More, Miss. Florentina
3                   Scanlan, Mr. James
4         Bradley, Miss. Bridget Delia
Name: name, dtype: object

In [8]:
# 함수(drop처리)
# name, ticket, body, home.dest
def drop_features(df):
    df.drop(columns=['name', 'ticket', 'body', 'home.dest'], inplace=True)
    return df

In [9]:
titanic_df = drop_features(titanic_df)
titanic_df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,female,13.0,0,1,19.5000,,S
1,2,1,female,4.0,1,1,23.0000,,S
2,2,1,female,30.0,1,0,13.8583,,C
3,3,0,male,,0,0,7.7250,,Q
4,3,1,female,22.0,0,0,7.7250,,Q
...,...,...,...,...,...,...,...,...,...
893,3,0,male,17.0,2,0,8.0500,,S
894,3,0,male,,0,0,7.8792,,S
895,1,0,male,64.0,1,4,263.0000,C23 C25 C27,S
896,3,1,male,19.0,0,0,8.0500,,S


In [10]:
# na처리
# age는 평균으로 채워주세요.
# cabin은 N으로 채워주세요
# embarked는 N으로 채워주세요

In [11]:
# 전체 na 개수
titanic_df.isna().sum()

pclass        0
survived      0
sex           0
age         170
sibsp         0
parch         0
fare          0
cabin       687
embarked      2
dtype: int64

In [12]:
def fillna(df):
    df['age'].fillna(titanic_df['age'].mean(), inplace=True)
    df['cabin'].fillna('N', inplace=True)
    df['embarked'].fillna('N', inplace=True)
    return df

In [13]:
titanic_df = fillna(titanic_df)
titanic_df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,female,13.000000,0,1,19.5000,N,S
1,2,1,female,4.000000,1,1,23.0000,N,S
2,2,1,female,30.000000,1,0,13.8583,N,C
3,3,0,male,30.196882,0,0,7.7250,N,Q
4,3,1,female,22.000000,0,0,7.7250,N,Q
...,...,...,...,...,...,...,...,...,...
893,3,0,male,17.000000,2,0,8.0500,N,S
894,3,0,male,30.196882,0,0,7.8792,N,S
895,1,0,male,64.000000,1,4,263.0000,C23 C25 C27,S
896,3,1,male,19.000000,0,0,8.0500,N,S


In [14]:
titanic_df['cabin'] = titanic_df['cabin'].str[:1]
titanic_df['cabin'][:20]

0     N
1     N
2     N
3     N
4     N
5     N
6     E
7     N
8     C
9     E
10    N
11    N
12    C
13    N
14    N
15    N
16    N
17    N
18    N
19    N
Name: cabin, dtype: object

In [15]:
def label_encoding(df):
    df['cabin'] = df['cabin'].str[:1]
    print(list(titanic_df['cabin'])[:10])
    le = LabelEncoder()
    features = ['sex', 'cabin', 'embarked']
    for f in features:
        le.fit(df[f])
        print('라벨링할 목록 => ', le.classes_)
        df[f] = le.transform(df[f])
    return df

In [16]:
titanic_df = label_encoding(titanic_df)
titanic_df

['N', 'N', 'N', 'N', 'N', 'N', 'E', 'N', 'C', 'E']
라벨링할 목록 =>  ['female' 'male']
라벨링할 목록 =>  ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'N' 'T']
라벨링할 목록 =>  ['C' 'N' 'Q' 'S']


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,0,13.000000,0,1,19.5000,7,3
1,2,1,0,4.000000,1,1,23.0000,7,3
2,2,1,0,30.000000,1,0,13.8583,7,0
3,3,0,1,30.196882,0,0,7.7250,7,2
4,3,1,0,22.000000,0,0,7.7250,7,2
...,...,...,...,...,...,...,...,...,...
893,3,0,1,17.000000,2,0,8.0500,7,3
894,3,0,1,30.196882,0,0,7.8792,7,3
895,1,0,1,64.000000,1,4,263.0000,2,3
896,3,1,1,19.000000,0,0,8.0500,7,3


In [17]:
titanic_df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,0,13.0,0,1,19.5,7,3
1,2,1,0,4.0,1,1,23.0,7,3
2,2,1,0,30.0,1,0,13.8583,7,0
3,3,0,1,30.196882,0,0,7.725,7,2
4,3,1,0,22.0,0,0,7.725,7,2


In [18]:
# 전처리 담당 함수 만들기
def preprocessing(df):
    df2 = fillna(df)
    df3 = drop_features(df2)
    df4 = label_encoding(df3)
    return df4

In [19]:
titanic_df2 = pd.read_csv('./titanic_train.csv')
titanic_df2

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5000,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0000,,S,,"Cornwall / Akron, OH"
2,2,1,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C,,"Barcelona, Spain / Havana, Cuba"
3,3,0,"Scanlan, Mr. James",male,,0,0,36209,7.7250,,Q,,
4,3,1,"Bradley, Miss. Bridget Delia",female,22.0,0,0,334914,7.7250,,Q,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,3,0,"Davies, Mr. Joseph",male,17.0,2,0,A/4 48873,8.0500,,S,,"West Bromwich, England Pontiac, MI"
894,3,0,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S,153.0,
895,1,0,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0000,C23 C25 C27,S,,"Winnipeg, MB"
896,3,1,"Dorking, Mr. Edward Arthur",male,19.0,0,0,A/5. 10482,8.0500,,S,,"England Oglesby, IL"


In [20]:
# 데이터 전처리 호출
titanic_df3 = preprocessing(titanic_df2)
titanic_df3

[7, 7, 7, 7, 7, 7, 4, 7, 2, 4]
라벨링할 목록 =>  ['female' 'male']
라벨링할 목록 =>  ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'N' 'T']
라벨링할 목록 =>  ['C' 'N' 'Q' 'S']


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,0,13.000000,0,1,19.5000,7,3
1,2,1,0,4.000000,1,1,23.0000,7,3
2,2,1,0,30.000000,1,0,13.8583,7,0
3,3,0,1,30.196882,0,0,7.7250,7,2
4,3,1,0,22.000000,0,0,7.7250,7,2
...,...,...,...,...,...,...,...,...,...
893,3,0,1,17.000000,2,0,8.0500,7,3
894,3,0,1,30.196882,0,0,7.8792,7,3
895,1,0,1,64.000000,1,4,263.0000,2,3
896,3,1,1,19.000000,0,0,8.0500,7,3


In [21]:
# train_test data split
data = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'cabin', 'embarked']
X_train, X_test, Y_train, Y_test = train_test_split(
            titanic_df3[data], # problem
            titanic_df3['survived'], # answer
            test_size = 0.2, # test-size rate
            random_state = 50 # seed value
)

In [22]:
titanic_df3.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'cabin',
       'embarked'],
      dtype='object')

In [23]:
X_train.shape

(718, 8)

In [24]:
X_test.shape

(180, 8)

In [25]:
Y_train.shape

(718,)

In [26]:
Y_test.shape

(180,)

In [27]:
# 8개의 변수로 1개의 target을 분류하는 문제
# y = ax1 + bx2 + cx3 + dx4 + ex5 + fx6 + gx7 + hx8
# ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'cabin', 'embarked']
# survived = (a * pclass) + (b * sex) + (c * age) + (d * sibsp) + (e * parch) + (f * fare) + (g * cabin) + (h * embarked)

In [28]:
df_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
svc_clf = SVC()
clf_list = [df_clf, rf_clf, svc_clf]

In [29]:
# train data 3가지 방법으로 훈련데이터 훈련시켜서 정확도 비교
for clf in clf_list:
    clf.fit(X_train, Y_train)
    pred = clf.predict(X_test)
    print("accuracy >> {0:4f}".format(accuracy_score(Y_test, pred)))

accuracy >> 0.805556
accuracy >> 0.827778
accuracy >> 0.683333


In [30]:
kfold = KFold(n_splits=5)

In [31]:
for i, (train_index, test_index) in enumerate(kfold.split(titanic_df3[data])):
    print(i, '-----------------')
    print('test_index >> ', test_index)
    print()
    print('train_index >> ', train_index)

0 -----------------
test_index >>  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179]

train_index >>  [180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 23

In [32]:
X_train, X_test, Y_train, Y_test

(     pclass  sex        age  sibsp  parch      fare  cabin  embarked
 717       3    1  21.000000      1      0    6.4958      7         3
 2         2    0  30.000000      1      0   13.8583      7         0
 293       3    0  30.196882      0      0    7.7500      7         2
 484       3    0  19.000000      1      0   16.1000      7         3
 795       1    0  18.000000      1      0  227.5250      2         0
 ..      ...  ...        ...    ...    ...       ...    ...       ...
 132       3    1  30.196882      0      0    7.7500      7         2
 289       3    0  16.000000      5      2   46.9000      7         3
 109       3    1  30.196882      0      0    9.5000      7         3
 480       3    1  28.000000      0      0   22.5250      7         3
 688       1    1  65.000000      0      1   61.9792      1         0
 
 [718 rows x 8 columns],
      pclass  sex        age  sibsp  parch     fare  cabin  embarked
 812       3    1  30.196882      0      0   7.7375      7      

In [33]:
titanic_df3[data].index

RangeIndex(start=0, stop=898, step=1)

In [34]:
titanic_df3[data].values[0]

array([ 2. ,  0. , 13. ,  0. ,  1. , 19.5,  7. ,  3. ])

In [35]:
titanic_df3[data].values[[0, 3]]

array([[ 2.        ,  0.        , 13.        ,  0.        ,  1.        ,
        19.5       ,  7.        ,  3.        ],
       [ 3.        ,  1.        , 30.19688187,  0.        ,  0.        ,
         7.725     ,  7.        ,  2.        ]])

In [36]:
titanic_df3[data].values[train_index]

array([[ 2.        ,  0.        , 13.        , ..., 19.5       ,
         7.        ,  3.        ],
       [ 2.        ,  0.        ,  4.        , ..., 23.        ,
         7.        ,  3.        ],
       [ 2.        ,  0.        , 30.        , ..., 13.8583    ,
         7.        ,  0.        ],
       ...,
       [ 2.        ,  1.        , 44.        , ..., 13.        ,
         7.        ,  3.        ],
       [ 3.        ,  1.        , 21.        , ...,  6.4958    ,
         7.        ,  3.        ],
       [ 3.        ,  1.        , 30.19688187, ...,  7.8958    ,
         7.        ,  3.        ]])

In [37]:
titanic_df3[data].values[test_index]

array([[  3.        ,   1.        ,  27.        , ...,   7.8958    ,
          7.        ,   3.        ],
       [  3.        ,   1.        ,  18.5       , ...,   7.2292    ,
          7.        ,   0.        ],
       [  1.        ,   0.        ,  30.19688187, ...,  79.2       ,
          7.        ,   0.        ],
       ...,
       [  1.        ,   1.        ,  64.        , ..., 263.        ,
          2.        ,   3.        ],
       [  3.        ,   1.        ,  19.        , ...,   8.05      ,
          7.        ,   3.        ],
       [  1.        ,   0.        ,  26.        , ..., 136.7792    ,
          2.        ,   0.        ]])

In [38]:
for x in [1, 2, 3]: ## for-each
    print(x + 1)

2
3
4


In [39]:
for x, y in [[1,2], [3,4], [5,6]]:
    print(x, ',' , y)

1 , 2
3 , 4
5 , 6


In [40]:
for i, (x, y) in enumerate([[1,2], [3,4], [5,6]]):
    print(i, ': ' , x, ',' , y)

0 :  1 , 2
1 :  3 , 4
2 :  5 , 6


In [41]:
for i, one in enumerate([1,2,3]): ## for-each + index
    print(one + 1)

2
3
4


In [42]:
for i, (x, y) in enumerate([[train_index, test_index], [train_index, test_index]]):
    print(i, ': ' , x, ',' , y)

0 :  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 2

In [43]:
def exec_kfold(clf, k):
    acc_list = []
    kfold = KFold(n_splits=k)
    for i, (train_index, test_index) in enumerate(kfold.split(titanic_df3[data])):
        X_train2, X_test2 = titanic_df3[data].values[train_index], titanic_df3[data].values[test_index]
        Y_train2, Y_test2 = titanic_df3['survived'].values[train_index], titanic_df3['survived'].values[test_index]
    
        clf.fit(X_train2, Y_train2)
        pred2 = clf.predict(X_test2)
        acc = accuracy_score(Y_test2, pred2)
        acc_list.append(acc)
        print(i, ':', "accuracy >> {0: .2f}".format(acc))
    print("average accuracy >> {0: .2f}".format(np.mean(acc_list)))

In [44]:
# decision tree model을 가지고 cross-validation(교차검증, k-fold)
exec_kfold(df_clf, 10)

0 : accuracy >>  0.82
1 : accuracy >>  0.73
2 : accuracy >>  0.70
3 : accuracy >>  0.78
4 : accuracy >>  0.81
5 : accuracy >>  0.74
6 : accuracy >>  0.74
7 : accuracy >>  0.82
8 : accuracy >>  0.82
9 : accuracy >>  0.76
average accuracy >>  0.77


In [45]:
# svm, rf 비교 (5, 10) => 평균!

In [46]:
# svm
exec_kfold(svc_clf, 5)

0 : accuracy >>  0.64
1 : accuracy >>  0.69
2 : accuracy >>  0.68
3 : accuracy >>  0.64
4 : accuracy >>  0.67
average accuracy >>  0.66


In [47]:
# rf
exec_kfold(rf_clf, 10)

0 : accuracy >>  0.81
1 : accuracy >>  0.81
2 : accuracy >>  0.80
3 : accuracy >>  0.74
4 : accuracy >>  0.82
5 : accuracy >>  0.74
6 : accuracy >>  0.77
7 : accuracy >>  0.81
8 : accuracy >>  0.83
9 : accuracy >>  0.79
average accuracy >>  0.79


In [48]:
# model_list로 교차검증이 한꺼번에 되게 코드를 변경해보고
# model_list 중 평균값의 교차검증 결과 중 최대인 모델의 평균값을 구해보자
# 평균값이 제일 높은 모델을 선정해보자
# -----------------------------------------------------------------------
# 교차검증 결과
# 제일 정확도가 높은 모델은 랜덤포레스트이고 정확도는 88입니다

In [49]:
def crossvar_kfold(k):
    clf_list = [df_clf, rf_clf, svc_clf]
    acc_list = []
    avg_acc = []
    for clf in clf_list:
        acc_list = []
        kfold = KFold(n_splits=k)
        for i, (train_index, test_index) in enumerate(kfold.split(titanic_df3[data])):
            X_train2, X_test2 = titanic_df3[data].values[train_index], titanic_df3[data].values[test_index]
            Y_train2, Y_test2 = titanic_df3['survived'].values[train_index], titanic_df3['survived'].values[test_index]

            clf.fit(X_train2, Y_train2)
            pred2 = clf.predict(X_test2)
            acc = accuracy_score(Y_test2, pred2)
            acc_list.append(acc)
        print("average accuracy >> {0: .2f}".format(np.mean(acc_list)))
        avg_acc.append(np.mean(acc_list))
    clf_list = ['의사결정나무', '랜덤포레스트', '서포트벡터머신']
    if avg_acc[0] > avg_acc[1] and avg_acc[0] > avg_acc[2]:
        print('제일 정확도가 높은 모델은', clf_list[0],'이고 정확도는', round(avg_acc[0] * 100), '입니다')
    elif avg_acc[1] > avg_acc[2]:
        print('제일 정확도가 높은 모델은', clf_list[1],'이고 정확도는', round(avg_acc[1] * 100), '입니다')
    else:
        print('제일 정확도가 높은 모델은', clf_list[2],'이고 정확도는', round(avg_acc[2] * 100), '입니다')

In [50]:
crossvar_kfold(10)

average accuracy >>  0.77
average accuracy >>  0.79
average accuracy >>  0.67
제일 정확도가 높은 모델은 랜덤포레스트 이고 정확도는 79 입니다
