# Titanic dataset
다음 모델들의 테스트 성능을 비교해봅시다.
1. Logistic regression
2. k-nearest neighbor classifier
3. naive Bayes classifier
4. Decision tree
5. Random forest

#### 작성: 고우주 | kubwa 쿱와

# 0. Data preprocessing

In [1]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

In [2]:
# Load Titanic dataset
url = 'data/titanic.csv'
titanic = pd.read_csv(url, index_col='PassengerId')

In [3]:
titanic.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
titanic.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
titanic.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
titanic.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [8]:
titanic['Age'].isnull()

PassengerId
1      False
2      False
3      False
4      False
5      False
       ...  
887    False
888    False
889     True
890    False
891    False
Name: Age, Length: 891, dtype: bool

In [9]:
list = []
for i in range (0, len(titanic)):
    age = titanic.iloc[i].Age
    age = 'child' if age < 20 else 'adult' if age >= 20 else 'unknown'
    list.append(age)

In [11]:
titanic['Age_modified'] = list

In [12]:
titanic.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_modified
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult


In [13]:
Age_dummies = pd.get_dummies(titanic.Age_modified, prefix = 'Age')
Age_dummies.sample(n = 10)

Unnamed: 0_level_0,Age_adult,Age_child,Age_unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
868,1,0,0
737,1,0,0
333,1,0,0
364,1,0,0
819,1,0,0
502,1,0,0
106,1,0,0
585,0,0,1
313,1,0,0
11,0,1,0


In [14]:
Embarked_dummies = pd.get_dummies(titanic.Embarked, prefix = 'Embarked')
Embarked_dummies.sample(n = 10)

Unnamed: 0_level_0,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
243,0,0,1
489,0,0,1
662,1,0,0
401,0,0,1
70,0,0,1
9,0,0,1
716,0,0,1
735,0,0,1
430,0,0,1
483,0,0,1


In [15]:
Sex_dummies = pd.get_dummies(titanic.Sex, prefix = 'Sex')
Sex_dummies.sample (n = 10)

Unnamed: 0_level_0,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
451,0,1
560,1,0
623,0,1
17,0,1
597,1,0
310,1,0
416,1,0
665,0,1
190,0,1
653,0,1


In [16]:
data = pd.concat([titanic, Age_dummies, Embarked_dummies, Sex_dummies], axis = 1)

In [17]:
data.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_modified,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult,1,0,0,0,0,1,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult,1,0,0,1,0,0,1,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult,1,0,0,0,0,1,1,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult,1,0,0,0,0,1,1,0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult,1,0,0,0,0,1,0,1


In [18]:
data = data.drop(['Name', 'Sex', 'Age', 'Age_modified', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis = 1)

In [19]:
data.head(5)

Unnamed: 0_level_0,Survived,Pclass,SibSp,Parch,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,1,0,1,0,0,0,0,1,0,1
2,1,1,1,0,1,0,0,1,0,0,1,0
3,1,3,0,0,1,0,0,0,0,1,1,0
4,1,1,1,0,1,0,0,0,0,1,1,0
5,0,3,0,0,1,0,0,0,0,1,0,1


In [20]:
data.columns.values

array(['Survived', 'Pclass', 'SibSp', 'Parch', 'Age_adult', 'Age_child',
       'Age_unknown', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Sex_female', 'Sex_male'], dtype=object)

In [21]:
# 변수명 가져오기
col_names = data.columns.values

In [22]:
X = data[col_names[1:]]
Y = data[col_names[0]]

In [23]:
X.head(5)

Unnamed: 0_level_0,Pclass,SibSp,Parch,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,1,0,1,0,0,0,0,1,0,1
2,1,1,0,1,0,0,1,0,0,1,0
3,3,0,0,1,0,0,0,0,1,1,0
4,1,1,0,1,0,0,0,0,1,1,0
5,3,0,0,1,0,0,0,0,1,0,1


In [24]:
Y.head(5)

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

# 1. Split data into 3 sets
1. Training set (50%)
2. Validation set (30%)
3. Test set (20%)

In [25]:
def train_val_test_split(X, Y, val_size=0.3, test_size=0.2, random_state=123):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
    val_size_rev = val_size / (1 - test_size)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      test_size=val_size_rev,
                                                      random_state=random_state)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [26]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = train_val_test_split(X, Y,
                                                                      val_size=0.3,
                                                                      test_size=0.2,
                                                                      random_state=123)

In [27]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(445, 11)
(267, 11)
(179, 11)


# 2. Fit the model and compare validation AUCs
비교하고자 하는 classifiers들은 다음과 같음
1. Logistic regression
2. k-nearest neighbor classifier
3. naive Bayes classifier
4. Decision tree
5. Random forest

### 2.1. Logistic regression
Manual for `sklearn.linear_model.LogisticRegression`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. penalty
2. C

In [30]:
# C가 클수록 weak regularization
penalty_set = ['l2']
C_set = [0.1, 1, 10, 1e2, 1e3, 1e4, 1e5, 1e6]

In [31]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced')
        model = model.fit(X_train, Y_train)
#         Y_val_score = model.decision_function(X_val)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, penalty, C, auc(fpr, tpr)))

In [32]:
result

[(LogisticRegression(C=0.1, class_weight='balanced'),
  'l2',
  0.1,
  0.8333934583934584),
 (LogisticRegression(C=1, class_weight='balanced'),
  'l2',
  1,
  0.8332732082732083),
 (LogisticRegression(C=10, class_weight='balanced'),
  'l2',
  10,
  0.832972582972583),
 (LogisticRegression(C=100.0, class_weight='balanced'),
  'l2',
  100.0,
  0.832972582972583),
 (LogisticRegression(C=1000.0, class_weight='balanced'),
  'l2',
  1000.0,
  0.832972582972583),
 (LogisticRegression(C=10000.0, class_weight='balanced'),
  'l2',
  10000.0,
  0.832972582972583),
 (LogisticRegression(C=100000.0, class_weight='balanced'),
  'l2',
  100000.0,
  0.832972582972583),
 (LogisticRegression(C=1000000.0, class_weight='balanced'),
  'l2',
  1000000.0,
  0.832972582972583)]

In [33]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [34]:
logreg_result

[(LogisticRegression(C=0.1, class_weight='balanced'),
  'l2',
  0.1,
  0.8333934583934584),
 (LogisticRegression(C=1, class_weight='balanced'),
  'l2',
  1,
  0.8332732082732083),
 (LogisticRegression(C=10, class_weight='balanced'),
  'l2',
  10,
  0.832972582972583),
 (LogisticRegression(C=100.0, class_weight='balanced'),
  'l2',
  100.0,
  0.832972582972583),
 (LogisticRegression(C=1000.0, class_weight='balanced'),
  'l2',
  1000.0,
  0.832972582972583),
 (LogisticRegression(C=10000.0, class_weight='balanced'),
  'l2',
  10000.0,
  0.832972582972583),
 (LogisticRegression(C=100000.0, class_weight='balanced'),
  'l2',
  100000.0,
  0.832972582972583),
 (LogisticRegression(C=1000000.0, class_weight='balanced'),
  'l2',
  1000000.0,
  0.832972582972583)]

In [35]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

(LogisticRegression(C=0.1, class_weight='balanced'), 'l2', 0.1, 0.8333934583934584)


## 2.2. k-nearest neighbor classifier
Manual for `sklearn.neighbors.KNeighborsClassifier`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. n_neighbors
2. weights

In [36]:
weights_set = ['uniform', 'distance']
n_neighbors_set = [1, 3, 5, 7, 9, 11, 13, 15]

In [37]:
result = []
for weights in weights_set:
    for n_neighbors in n_neighbors_set:
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, weights, n_neighbors, auc(fpr, tpr)))        

In [38]:
knn_result = sorted(result, key=lambda x: x[3], reverse=True)

In [39]:
knn_result

[(KNeighborsClassifier(n_neighbors=7), 'uniform', 7, 0.8282227032227032),
 (KNeighborsClassifier(n_neighbors=9), 'uniform', 9, 0.8218795093795094),
 (KNeighborsClassifier(n_neighbors=15), 'uniform', 15, 0.8201659451659451),
 (KNeighborsClassifier(), 'uniform', 5, 0.8175204425204425),
 (KNeighborsClassifier(n_neighbors=13), 'uniform', 13, 0.8174302549302549),
 (KNeighborsClassifier(n_neighbors=11), 'uniform', 11, 0.8166486291486291),
 (KNeighborsClassifier(n_neighbors=3), 'uniform', 3, 0.7936808561808563),
 (KNeighborsClassifier(n_neighbors=7, weights='distance'),
  'distance',
  7,
  0.7915464165464166),
 (KNeighborsClassifier(weights='distance'), 'distance', 5, 0.7820767195767195),
 (KNeighborsClassifier(n_neighbors=9, weights='distance'),
  'distance',
  9,
  0.7797919672919673),
 (KNeighborsClassifier(n_neighbors=15, weights='distance'),
  'distance',
  15,
  0.771945646945647),
 (KNeighborsClassifier(n_neighbors=11, weights='distance'),
  'distance',
  11,
  0.7717051467051468),
 (

In [40]:
best_knn_result = knn_result[0]
print(best_knn_result)

(KNeighborsClassifier(n_neighbors=7), 'uniform', 7, 0.8282227032227032)


## 2.3. naive Bayes clasifier
Manual for `sklearn.naive_bayes.GaussianNB`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)

클래스에 대한 prior 정보를 조절하여 fitting

In [41]:
priors_set = [None, [0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2], [0.9, 0.1]]

In [42]:
result = []
for priors in priors_set:
    model = GaussianNB(priors=priors)
    model = model.fit(X_train, Y_train)
    Y_val_score = model.predict_proba(X_val)[:, 1]
    fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
    result.append((model, priors, auc(fpr, tpr)))      

In [43]:
nb_result = sorted(result, key=lambda x: x[2], reverse=True)

In [44]:
nb_result

[(GaussianNB(), None, 0.8207070707070708),
 (GaussianNB(priors=[0.5, 0.5]), [0.5, 0.5], 0.8207070707070708),
 (GaussianNB(priors=[0.6, 0.4]), [0.6, 0.4], 0.8207070707070708),
 (GaussianNB(priors=[0.7, 0.3]), [0.7, 0.3], 0.8207070707070708),
 (GaussianNB(priors=[0.8, 0.2]), [0.8, 0.2], 0.8207070707070708),
 (GaussianNB(priors=[0.9, 0.1]), [0.9, 0.1], 0.8207070707070708)]

In [45]:
best_nb_result = nb_result[0]
print(best_nb_result)

(GaussianNB(), None, 0.8207070707070708)


## 2.4. Decision tree
Manual for `sklearn.tree.DecisionTreeClassifier`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. max_depth
2. class_weight

In [46]:
class_weight_set = [None, 'balanced']
max_depth_set = [3, 4, 5, 6, 7]

In [47]:
result = []

for class_weight in class_weight_set:
    for max_depth in max_depth_set:
        model = DecisionTreeClassifier(class_weight=class_weight, max_depth=max_depth)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, class_weight, max_depth, auc(fpr, tpr)))  

In [48]:
dt_result = sorted(result, key=lambda x: x[3], reverse=True)

In [49]:
dt_result

[(DecisionTreeClassifier(max_depth=5), None, 5, 0.8323713323713325),
 (DecisionTreeClassifier(class_weight='balanced', max_depth=5),
  'balanced',
  5,
  0.8323713323713325),
 (DecisionTreeClassifier(max_depth=4), None, 4, 0.8212181337181338),
 (DecisionTreeClassifier(class_weight='balanced', max_depth=4),
  'balanced',
  4,
  0.8212181337181338),
 (DecisionTreeClassifier(max_depth=3), None, 3, 0.8206469456469456),
 (DecisionTreeClassifier(class_weight='balanced', max_depth=3),
  'balanced',
  3,
  0.8206469456469456),
 (DecisionTreeClassifier(class_weight='balanced', max_depth=6),
  'balanced',
  6,
  0.8192340067340067),
 (DecisionTreeClassifier(max_depth=6), None, 6, 0.8191438191438192),
 (DecisionTreeClassifier(class_weight='balanced', max_depth=7),
  'balanced',
  7,
  0.8134018759018758),
 (DecisionTreeClassifier(max_depth=7), None, 7, 0.8132215007215007)]

In [50]:
best_dt_result = dt_result[0]
print(best_dt_result)

(DecisionTreeClassifier(max_depth=5), None, 5, 0.8323713323713325)


## 2.5. Random forest
다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. n_estimators
2. max_features

In [51]:
n_estimators_set = [5, 10, 15, 20]
max_features_set = ['auto', 'sqrt', 'log2']

In [52]:
result = []
for n_estimators in n_estimators_set:
    for max_features in max_features_set:
        model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, n_estimators, max_features, auc(fpr, tpr)))         

In [53]:
rf_result = sorted(result, key=lambda x: x[3], reverse=True)

In [54]:
rf_result

[(RandomForestClassifier(max_features='log2', n_estimators=15),
  15,
  'log2',
  0.8166185666185667),
 (RandomForestClassifier(max_features='log2', n_estimators=20),
  20,
  'log2',
  0.8140632515632514),
 (RandomForestClassifier(max_features='sqrt', n_estimators=5),
  5,
  'sqrt',
  0.8132515632515633),
 (RandomForestClassifier(n_estimators=5), 5, 'auto', 0.8054653679653679),
 (RandomForestClassifier(max_features='log2', n_estimators=10),
  10,
  'log2',
  0.8052248677248677),
 (RandomForestClassifier(max_features='sqrt', n_estimators=20),
  20,
  'sqrt',
  0.8052248677248676),
 (RandomForestClassifier(max_features='log2', n_estimators=5),
  5,
  'log2',
  0.8042328042328042),
 (RandomForestClassifier(max_features='sqrt', n_estimators=15),
  15,
  'sqrt',
  0.8011063011063011),
 (RandomForestClassifier(max_features='sqrt', n_estimators=10),
  10,
  'sqrt',
  0.792989417989418),
 (RandomForestClassifier(n_estimators=15), 15, 'auto', 0.7928992303992304),
 (RandomForestClassifier(n_esti

In [55]:
best_rf_result = rf_result[0]
print(best_rf_result)

(RandomForestClassifier(max_features='log2', n_estimators=15), 15, 'log2', 0.8166185666185667)


# 3. Test the model

In [56]:
selected_models = []
selected_models.append(best_logreg_result[0])
selected_models.append(best_knn_result[0])
selected_models.append(best_nb_result[0])
selected_models.append(best_dt_result[0])
selected_models.append(best_rf_result[0])
pprint(selected_models)

[LogisticRegression(C=0.1, class_weight='balanced'),
 KNeighborsClassifier(n_neighbors=7),
 GaussianNB(),
 DecisionTreeClassifier(max_depth=5),
 RandomForestClassifier(max_features='log2', n_estimators=15)]


In [57]:
test_result = []

for model in selected_models:
    Y_test_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_test_score)
    test_result.append((model, auc(fpr, tpr)))      

In [58]:
test_result

[(LogisticRegression(C=0.1, class_weight='balanced'), 0.8686234817813765),
 (KNeighborsClassifier(n_neighbors=7), 0.8549932523616736),
 (GaussianNB(), 0.8360998650472335),
 (DecisionTreeClassifier(max_depth=5), 0.8587044534412956),
 (RandomForestClassifier(max_features='log2', n_estimators=15),
  0.8568151147098515)]

In [59]:
test_result = sorted(test_result, key=lambda x: x[1], reverse=True)

In [60]:
test_result

[(LogisticRegression(C=0.1, class_weight='balanced'), 0.8686234817813765),
 (DecisionTreeClassifier(max_depth=5), 0.8587044534412956),
 (RandomForestClassifier(max_features='log2', n_estimators=15),
  0.8568151147098515),
 (KNeighborsClassifier(n_neighbors=7), 0.8549932523616736),
 (GaussianNB(), 0.8360998650472335)]