!pip install xgboost
!pip install hyperopt

## 0. 필요한 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn import metrics

from hyperopt import hp, STATUS_OK, fmin, Trials


## 1. 데이터 불러오기

In [2]:
titanic = pd.read_csv("titanic/train.csv")

In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# 필요 없는 데이터 (PassengerId, Ticket) 지우기
ttn = titanic.drop( ["PassengerId", "Ticket"] , axis = 1 )
ttn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


## 2. 데이터 전처리 (결측치 제거/ 인코딩 / 표준화)

### 2-1 결측치

In [5]:
# age 결측치 채우기
age_mean = ttn['Age'].mean()
ttn["Age"] = ttn["Age"].fillna(age_mean)
ttn.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [6]:
# embarked 결측치 채우기
emb_mode = ttn["Embarked"].mode()
ttn["Embarked"]= ttn["Embarked"].fillna(emb_mode[0])
ttn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [7]:
# cabin 결측치는 너무 많으니 z(모름)로 입력
ttn["Cabin"]= ttn["Cabin"].fillna("Z")
ttn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Cabin     891 non-null    object 
 9   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


### 2-2 인코딩

In [8]:
ttn["Cabin"] = ttn["Cabin"].apply(lambda x : x[0])
ttn["Cabin"].value_counts()

Cabin
Z    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: count, dtype: int64

In [9]:
# name의 직함 꺼내기 name의 직함 꺼내기

ttn['title'] = ttn['Name'].str.extract("( [A-z]+)\.")

ttn_cnt = ttn['title'].value_counts()
ttn_if = ttn_cnt[ttn_cnt < 10]
ttn['title'] = ttn['title'].replace(ttn_if.index , 'other')

In [10]:
# title, sex, cavin, embarked 인코딩
def encoding(data, col):
    enco = LabelEncoder()
    enco.fit(data[col])
    data[col] = enco.transform(data[col])
    return data


In [11]:
encoding(ttn, "Sex")
ttn["Sex"].value_counts()

Sex
1    577
0    314
Name: count, dtype: int64

In [12]:
encoding(ttn, "Cabin")
ttn["Cabin"].value_counts()

Cabin
8    687
2     59
1     47
3     33
4     32
0     15
5     13
6      4
7      1
Name: count, dtype: int64

In [13]:
encoding(ttn, "Embarked")
ttn["Embarked"].value_counts()

Embarked
2    646
0    168
1     77
Name: count, dtype: int64

In [14]:
encoding(ttn, "title")
ttn["title"].value_counts()

title
2    517
1    182
3    125
0     40
4     27
Name: count, dtype: int64

In [15]:
# Name지우기
ttn = ttn.drop( ["Name"] , axis = 1 )
ttn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     891 non-null    int32  
 8   Embarked  891 non-null    int32  
 9   title     891 non-null    int32  
dtypes: float64(2), int32(4), int64(4)
memory usage: 55.8 KB


In [16]:
ttn['family'] = ttn['SibSp'] + ttn['Parch']

ttn.loc[ ttn['family'] >=6 ,'family'] = 6 

In [17]:
bin_Age = [0,10,20,30,40,50,60,70,80]
ttn["AgeGroup"] = pd.cut(ttn["Age"], bins=bin_Age)

encoding(ttn, "AgeGroup")
ttn["AgeGroup"].value_counts()

AgeGroup
2    407
3    155
1    115
4     86
0     64
5     42
6     17
7      5
Name: count, dtype: int64

In [18]:
bin_Fare = [0,10,20,30,40,600]
ttn["FareGroup"] = pd.cut(ttn["Fare"], bins=bin_Fare)

encoding(ttn, "FareGroup")
ttn["FareGroup"].value_counts()

FareGroup
0    321
1    179
4    176
2    142
3     58
5     15
Name: count, dtype: int64

In [19]:
# age, fare, sibsp, farch 지우기
ttn = ttn.drop( ["Age", "Fare", "SibSp", "Parch"] , axis = 1 )
ttn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Survived   891 non-null    int64
 1   Pclass     891 non-null    int64
 2   Sex        891 non-null    int32
 3   Cabin      891 non-null    int32
 4   Embarked   891 non-null    int32
 5   title      891 non-null    int32
 6   family     891 non-null    int64
 7   AgeGroup   891 non-null    int32
 8   FareGroup  891 non-null    int32
dtypes: int32(6), int64(3)
memory usage: 41.9 KB


### 2-3 데이터 나누기 (train, test)

In [20]:
x = ttn.loc[ : , "Pclass" :]
y = ttn["Survived"]

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x , y, test_size=0.3, random_state=10)

In [22]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 7 to 265
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Pclass     623 non-null    int64
 1   Sex        623 non-null    int32
 2   Cabin      623 non-null    int32
 3   Embarked   623 non-null    int32
 4   title      623 non-null    int32
 5   family     623 non-null    int64
 6   AgeGroup   623 non-null    int32
 7   FareGroup  623 non-null    int32
dtypes: int32(6), int64(2)
memory usage: 29.2 KB


In [23]:
y.value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

## 3. 모델링
1. 모델 생성
2. 모델 학습
3. 모델 평가

In [24]:
# 학습된 모델 점수 확인을 위한 함수

def model_score(model):
    pre_test = model.predict(x_test)
    pre_train = model.predict(x_train)
    score_test = metrics.f1_score(pre_test, y_test)
    score_train = metrics.f1_score(pre_train, y_train)
    print("test : ", score_test)
    print("train : ", score_train)
    print("test 평가지표")
    print(metrics.classification_report(pre_test, y_test))
    print("train 평가지표")
    print(metrics.classification_report(pre_train, y_train))
    # print(metrics.f1_score(pre_test, y_test))

### 3-1 knn

In [25]:
knn_model = KNeighborsClassifier()
knn_param = { "n_neighbors" : range(1,50)}
knn_grid = GridSearchCV(knn_model, param_grid=knn_param,cv = 5 )
knn_grid.fit(x_train, y_train)

In [26]:
knn_best = knn_grid.best_estimator_
print(knn_grid.best_params_)
model_score(knn_best)

{'n_neighbors': 3}
test :  0.6984126984126984
train :  0.7932489451476793
test 평가지표
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       173
           1       0.70      0.69      0.70        95

    accuracy                           0.79       268
   macro avg       0.77      0.77      0.77       268
weighted avg       0.79      0.79      0.79       268

train 평가지표
              precision    recall  f1-score   support

           0       0.90      0.85      0.87       397
           1       0.76      0.83      0.79       226

    accuracy                           0.84       623
   macro avg       0.83      0.84      0.83       623
weighted avg       0.85      0.84      0.84       623



### 3-2 Decsion Tree

In [27]:
dt_model = DecisionTreeClassifier(random_state=10)
dt_params = {"max_depth" : range(3,10,1), "min_samples_split" : range(10,101,10), "min_samples_leaf" : range(5,51,5)}
dt_grid = GridSearchCV(dt_model, param_grid=dt_params, cv = 5)
dt_grid.fit(x_train, y_train)

In [28]:
dt_best = dt_grid.best_estimator_
print(dt_grid.best_params_)
model_score(dt_best)

{'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 10}
test :  0.761904761904762
train :  0.7866108786610879
test 평가지표
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       173
           1       0.77      0.76      0.76        95

    accuracy                           0.83       268
   macro avg       0.82      0.82      0.82       268
weighted avg       0.83      0.83      0.83       268

train 평가지표
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       393
           1       0.76      0.82      0.79       230

    accuracy                           0.84       623
   macro avg       0.82      0.83      0.83       623
weighted avg       0.84      0.84      0.84       623



### 3-3 voting

In [29]:
vt_model = VotingClassifier(estimators=[ ("KNN", knn_best), ("DT", dt_best) ], voting="soft")
vt_model.fit(x_train, y_train)
model_score(vt_model)

test :  0.7513227513227513
train :  0.8118393234672305
test 평가지표
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       173
           1       0.76      0.75      0.75        95

    accuracy                           0.82       268
   macro avg       0.81      0.81      0.81       268
weighted avg       0.82      0.82      0.82       268

train 평가지표
              precision    recall  f1-score   support

           0       0.91      0.86      0.88       398
           1       0.77      0.85      0.81       225

    accuracy                           0.86       623
   macro avg       0.84      0.86      0.85       623
weighted avg       0.86      0.86      0.86       623



### 3-4 Random Forest

In [30]:
rf_search = {"max_depth" : hp.quniform("max_depth", 3,10,1),
             "min_samples_split" : hp.quniform("min_samples_split", 10,100,1),
             "min_samples_leaf" : hp.quniform("min_samples_leaf", 5,50,1),
             "n_estimators" : hp.quniform("n_estimators",100,500,1)
            }

def rf_obj(ss):
    rf_model = RandomForestClassifier(max_depth = int(ss["max_depth"]),
                                     min_samples_split = int(ss["min_samples_split"]),
                                      min_samples_leaf = int(ss["min_samples_leaf"]),
                                     n_estimators = int(ss["n_estimators"]),
                                     random_state = 10
                                     )
    score = cross_val_score(rf_model, x_train, y_train, scoring="f1", cv = 5)
    return {"loss" : -1 * np.mean(score), "status" : STATUS_OK}

In [31]:
import warnings
warnings.filterwarnings("ignore")

trial = Trials()

rf_hp = fmin(fn = rf_obj, space = rf_search, max_evals= 500, trials=trial, rstate=np.random.default_rng(seed = 10), verbose = 5)
print(rf_hp)

TPE is being used as the default algorithm.


100%|██████████| 500/500 [14:12<00:00,  1.71s/trial, best loss: -0.7629984501859236]
{'max_depth': 6.0, 'min_samples_leaf': 6.0, 'min_samples_split': 12.0, 'n_estimators': 100.0}


In [32]:
rf_best = RandomForestClassifier(max_depth=6, min_samples_leaf = 6, min_samples_split=12, n_estimators = 100, random_state=10)
rf_best.fit(x_train, y_train)
model_score(rf_best)


test :  0.7634408602150538
train :  0.7939914163090128
test 평가지표
              precision    recall  f1-score   support

           0       0.88      0.87      0.87       176
           1       0.76      0.77      0.76        92

    accuracy                           0.84       268
   macro avg       0.82      0.82      0.82       268
weighted avg       0.84      0.84      0.84       268

train 평가지표
              precision    recall  f1-score   support

           0       0.91      0.84      0.88       405
           1       0.75      0.85      0.79       218

    accuracy                           0.85       623
   macro avg       0.83      0.85      0.84       623
weighted avg       0.85      0.85      0.85       623



### 3-5 XGB

In [33]:
setattr(pd, "Int64Index", pd.Index)

In [34]:
xgb_search = {"max_depth" : hp.quniform("max_depth", 3,10,1),
             "learning_rate" : hp.uniform("learning_rate", 0.01, 0.2),
              "min_child_weight" : hp.quniform("min_child_weight", 5,50,1),
              "n_estimators" : hp.quniform("n_estimators",100,500,1)
            }

def xgb_obj(ss):
    xgb_model = XGBClassifier(max_depth = int(ss["max_depth"]),
                              n_estimators = int(ss["n_estimators"]),
                               min_child_weight = int(ss["min_child_weight"]),
                              learning_rate = ss["learning_rate"],
                              subsample = 0.6,
                              eval_metric = "logloss")
    
    score = cross_val_score(xgb_model, x_train, y_train, scoring="f1", cv = 5)
    return {"loss" : -1 * np.mean(score), "status" : STATUS_OK}

In [35]:

trial = Trials()

xgb_hp = fmin(fn = xgb_obj, space=xgb_search, max_evals=500, trials=trial, rstate=np.random.default_rng(seed = 5), verbose = 5)
print(xgb_hp)

TPE is being used as the default algorithm.


100%|██████████| 500/500 [03:28<00:00,  2.39trial/s, best loss: -0.7599728383831529]
{'learning_rate': 0.1077297488550448, 'max_depth': 9.0, 'min_child_weight': 5.0, 'n_estimators': 384.0}


In [36]:
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=5)
evals = [ (x_tr, y_tr), (x_val, y_val) ]

In [39]:
xgb_best = XGBClassifier(n_estimators = 110, learning_rate = 0.195, max_depth = 5,min_child_weight = 5, subsample = 0.6, eval_metric='logloss')
xgb_best.fit(x_tr, y_tr)
model_score(xgb_best)

test :  0.7624309392265194
train :  0.8060344827586207
test 평가지표
              precision    recall  f1-score   support

           0       0.90      0.86      0.88       181
           1       0.73      0.79      0.76        87

    accuracy                           0.84       268
   macro avg       0.82      0.83      0.82       268
weighted avg       0.84      0.84      0.84       268

train 평가지표
              precision    recall  f1-score   support

           0       0.92      0.85      0.88       407
           1       0.75      0.87      0.81       216

    accuracy                           0.86       623
   macro avg       0.84      0.86      0.85       623
weighted avg       0.86      0.86      0.86       623



## 4. test
- test 데이터 불어와서 train과 같은 전처리 과정을 수행한수 학습된 모델에 적용하여 점수 확인

In [41]:
test = pd.read_csv('titanic/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [42]:
t_test = test.drop( ["PassengerId", "Ticket"], axis = 1)

In [43]:
t_test["Age"] = t_test["Age"].fillna(age_mean)

In [44]:
t_test["Cabin"] = t_test["Cabin"].fillna("Z")

In [45]:
t_test["Cabin"] = t_test["Cabin"].apply(lambda x : x[0])

In [46]:
t_test['title'] = t_test['Name'].str.extract("( [A-z]+)\.")

test_cnt = t_test['title'].value_counts()
test_if = test_cnt[test_cnt < 10]
t_test['title'] = t_test['title'].replace(test_if.index , 'other')

In [47]:
bin_Age = [0,10,20,30,40,50,60,70,80]
t_test["AgeGroup"] = pd.cut(t_test["Age"], bins=bin_Age)
bin_Fare = [0,10,20,30,40,600]
t_test["FareGroup"] = pd.cut(t_test["Fare"], bins=bin_Fare)


In [48]:
t_test['family'] = t_test['SibSp'] + t_test['Parch']

t_test.loc[ t_test['family'] >=6 ,'family'] = 6 

In [49]:
t_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Pclass     418 non-null    int64   
 1   Name       418 non-null    object  
 2   Sex        418 non-null    object  
 3   Age        418 non-null    float64 
 4   SibSp      418 non-null    int64   
 5   Parch      418 non-null    int64   
 6   Fare       417 non-null    float64 
 7   Cabin      418 non-null    object  
 8   Embarked   418 non-null    object  
 9   title      418 non-null    object  
 10  AgeGroup   418 non-null    category
 11  FareGroup  415 non-null    category
 12  family     418 non-null    int64   
dtypes: category(2), float64(2), int64(4), object(5)
memory usage: 37.5+ KB


In [50]:
t_test = t_test.drop([ "Name", "Age", "Fare", "SibSp", "Parch"], axis = 1)

In [51]:
for col in ["Sex", "Cabin", "Embarked", "title", "AgeGroup", "FareGroup"]:
    encoding(t_test, col)

In [52]:
t_test.columns

Index(['Pclass', 'Sex', 'Cabin', 'Embarked', 'title', 'AgeGroup', 'FareGroup',
       'family'],
      dtype='object')

In [53]:
t_test = t_test[['Pclass', 'Sex', 'Cabin', 'Embarked', 'title', 'family', 'AgeGroup', 'FareGroup']]

In [54]:
pre = rf_best.predict(t_test)

In [55]:
real = pd.read_csv('titanic/gender_submission.csv')

In [56]:
real = real.drop( ["PassengerId"] , axis = 1)

In [57]:
print(metrics.classification_report(pre, real))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       257
           1       0.97      0.91      0.94       161

    accuracy                           0.95       418
   macro avg       0.96      0.95      0.95       418
weighted avg       0.95      0.95      0.95       418

