# Titanic dataset
다음 모델들의 테스트 성능을 비교해봅시다.
1. Logistic regression
2. k-nearest neighbor classifier
3. naive Bayes classifier
4. Decision tree
5. Random forest

# 0. Data preprocessing

In [1]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

In [2]:
# Load Titanic dataset
url = 'data/titanic.csv'
titanic = pd.read_csv(url, index_col='PassengerId')

In [3]:
titanic.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
titanic.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
titanic.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
titanic.Age.isnull()

PassengerId
1      False
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18      True
19     False
20      True
21     False
22     False
23     False
24     False
25     False
26     False
27      True
28     False
29      True
30      True
       ...  
862    False
863    False
864     True
865    False
866    False
867    False
868    False
869     True
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878    False
879     True
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888    False
889     True
890    False
891    False
Name: Age, dtype: bool

In [8]:
list = []
for i in range (0, len(titanic)):
    age = titanic.iloc[i].Age
    age = 'child' if age < 20 else 'adult' if age >= 20 else 'unknown'
    list.append(age)

In [9]:
list

['adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'child',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'adult',
 'child',
 'adult',
 'child',
 'unknown',
 'adult',
 'unknown',
 'adult',
 'adult',
 'child',
 'adult',
 'child',
 'adult',
 'unknown',
 'child',
 'unknown',
 'unknown',
 'adult',
 'unknown',
 'unknown',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'unknown',
 'child',
 'child',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 'child',
 'child',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'adult',
 'child',
 'unknown',
 'unknown',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'child',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknown',
 'unknown',
 'child',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'child',
 'adult',
 'child',
 'unknown',
 'adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknow

In [10]:
titanic['Age_modified'] = list

In [11]:
titanic.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_modified
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult


In [12]:
Age_dummies = pd.get_dummies(titanic.Age_modified, prefix = 'Age')
Age_dummies.sample(n = 10)

Unnamed: 0_level_0,Age_adult,Age_child,Age_unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
144,0,1,0
103,1,0,0
403,1,0,0
733,0,0,1
309,1,0,0
53,1,0,0
440,1,0,0
48,0,0,1
414,0,0,1
187,0,0,1


In [13]:
Embarked_dummies = pd.get_dummies(titanic.Embarked, prefix = 'Embarked')
Embarked_dummies.sample(n = 10)

Unnamed: 0_level_0,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,0,0,1
174,0,0,1
507,0,0,1
518,0,1,0
888,0,0,1
890,1,0,0
787,0,0,1
246,0,1,0
464,0,0,1
85,0,0,1


In [14]:
Sex_dummies = pd.get_dummies(titanic.Sex, prefix = 'Sex')
Sex_dummies.sample (n = 10)

Unnamed: 0_level_0,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
743,1,0
336,0,1
601,1,0
781,1,0
194,0,1
858,0,1
747,0,1
871,0,1
727,1,0
740,0,1


In [15]:
data = pd.concat([titanic, Age_dummies, Embarked_dummies, Sex_dummies], axis = 1)

In [16]:
data.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_modified,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult,1,0,0,0,0,1,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult,1,0,0,1,0,0,1,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult,1,0,0,0,0,1,1,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult,1,0,0,0,0,1,1,0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult,1,0,0,0,0,1,0,1


In [17]:
data = data.drop(['Name', 'Sex', 'Age', 'Age_modified', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis = 1)

In [18]:
data.head(5)

Unnamed: 0_level_0,Survived,Pclass,SibSp,Parch,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,1,0,1,0,0,0,0,1,0,1
2,1,1,1,0,1,0,0,1,0,0,1,0
3,1,3,0,0,1,0,0,0,0,1,1,0
4,1,1,1,0,1,0,0,0,0,1,1,0
5,0,3,0,0,1,0,0,0,0,1,0,1


In [19]:
# 변수명 가져오기
col_names = data.columns.values

In [20]:
X = data[col_names[1:]]
Y = data[col_names[0]]

In [21]:
X.head(5)

Unnamed: 0_level_0,Pclass,SibSp,Parch,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,1,0,1,0,0,0,0,1,0,1
2,1,1,0,1,0,0,1,0,0,1,0
3,3,0,0,1,0,0,0,0,1,1,0
4,1,1,0,1,0,0,0,0,1,1,0
5,3,0,0,1,0,0,0,0,1,0,1


In [22]:
Y.head(5)

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

# 1. Split data into 3 sets
1. Training set (50%)
2. Validation set (30%)
3. Test set (20%)

In [23]:
def train_val_test_split(X, Y, val_size=0.3, test_size=0.2, random_state=123):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
    val_size_rev = val_size / (1 - test_size)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      test_size=val_size_rev,
                                                      random_state=random_state)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [24]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = train_val_test_split(X, Y,
                                                                      val_size=0.3,
                                                                      test_size=0.2,
                                                                      random_state=123)

In [25]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(445, 11)
(267, 11)
(179, 11)


# 2. Fit the model and compare validation AUCs
비교하고자 하는 classifiers들은 다음과 같음
1. Logistic regression
2. k-nearest neighbor classifier
3. naive Bayes classifier
4. Decision tree
5. Random forest

### 2.1. Logistic regression
Manual for `sklearn.linear_model.LogisticRegression`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. penalty
2. C

In [27]:
# C가 클수록 weak regularization
penalty_set = ['l1', 'l2']
C_set = [0.1, 1, 10, 1e2, 1e3, 1e4, 1e5, 1e6]

In [28]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced')
        model = model.fit(X_train, Y_train)
#         Y_val_score = model.decision_function(X_val)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, penalty, C, auc(fpr, tpr)))

In [29]:
result

[(LogisticRegression(C=0.1, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  'l1',
  0.1,
  0.80741943241943237),
 (LogisticRegression(C=1, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  'l1',
  1,
  0.8306878306878307),
 (LogisticRegression(C=10, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  'l1',
  10,
  0.83285233285233284),
 (LogisticRegression(C=100.0, class_weight='balanc

In [30]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [31]:
logreg_result

[(LogisticRegression(C=100.0, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  'l1',
  100.0,
  0.83297258297258314),
 (LogisticRegression(C=1000.0, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  'l1',
  1000.0,
  0.83297258297258314),
 (LogisticRegression(C=10000.0, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  'l1',
  10000.0,
  0.83297258297258314),
 (LogisticRegression(C=10

In [32]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

(LogisticRegression(C=100.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False), 'l1', 100.0, 0.83297258297258314)


## 2.2. k-nearest neighbor classifier
Manual for `sklearn.neighbors.KNeighborsClassifier`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. n_neighbors
2. weights

In [33]:
weights_set = ['uniform', 'distance']
n_neighbors_set = [1, 3, 5, 7, 9, 11, 13, 15]

In [34]:
result = []
for weights in weights_set:
    for n_neighbors in n_neighbors_set:
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, weights, n_neighbors, auc(fpr, tpr)))        

In [35]:
knn_result = sorted(result, key=lambda x: x[3], reverse=True)

In [36]:
knn_result

[(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=7, p=2,
             weights='uniform'), 'uniform', 7, 0.82822270322270319),
 (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=9, p=2,
             weights='uniform'), 'uniform', 9, 0.82187950937950938),
 (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=15, p=2,
             weights='uniform'), 'uniform', 15, 0.8201659451659451),
 (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=5, p=2,
             weights='uniform'), 'uniform', 5, 0.81752044252044254),
 (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=13, p=2,
             weights='uniform'),

In [37]:
best_knn_result = knn_result[0]
print(best_knn_result)

(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform'), 'uniform', 7, 0.82822270322270319)


## 2.3. naive Bayes clasifier
Manual for `sklearn.naive_bayes.GaussianNB`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)

클래스에 대한 prior 정보를 조절하여 fitting

In [38]:
priors_set = [None, [0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2], [0.9, 0.1]]

In [39]:
result = []
for priors in priors_set:
    model = GaussianNB(priors=priors)
    model = model.fit(X_train, Y_train)
    Y_val_score = model.predict_proba(X_val)[:, 1]
    fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
    result.append((model, priors, auc(fpr, tpr)))      

In [40]:
nb_result = sorted(result, key=lambda x: x[2], reverse=True)

In [41]:
nb_result

[(GaussianNB(priors=None), None, 0.82070707070707083),
 (GaussianNB(priors=[0.5, 0.5]), [0.5, 0.5], 0.82070707070707083),
 (GaussianNB(priors=[0.6, 0.4]), [0.6, 0.4], 0.82070707070707083),
 (GaussianNB(priors=[0.7, 0.3]), [0.7, 0.3], 0.82070707070707083),
 (GaussianNB(priors=[0.8, 0.2]), [0.8, 0.2], 0.82070707070707083),
 (GaussianNB(priors=[0.9, 0.1]), [0.9, 0.1], 0.82070707070707083)]

In [42]:
best_nb_result = nb_result[0]
print(best_nb_result)

(GaussianNB(priors=None), None, 0.82070707070707083)


## 2.4. Decision tree
Manual for `sklearn.tree.DecisionTreeClassifier`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. max_depth
2. class_weight

In [43]:
class_weight_set = [None, 'balanced']
max_depth_set = [3, 4, 5, 6, 7]

In [44]:
result = []

for class_weight in class_weight_set:
    for max_depth in max_depth_set:
        model = DecisionTreeClassifier(class_weight=class_weight, max_depth=max_depth)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, class_weight, max_depth, auc(fpr, tpr)))  

In [45]:
dt_result = sorted(result, key=lambda x: x[3], reverse=True)

In [46]:
dt_result

[(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
  'balanced',
  5,
  0.83237133237133243),
 (DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
  None,
  5,
  0.82996632996633013),
 (DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presor

In [47]:
best_dt_result = dt_result[0]
print(best_dt_result)

(DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'), 'balanced', 5, 0.83237133237133243)


## 2.5. Random forest
다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. n_estimators
2. max_features

In [48]:
n_estimators_set = [5, 10, 15, 20]
max_features_set = ['auto', 'sqrt', 'log2']

In [49]:
result = []
for n_estimators in n_estimators_set:
    for max_features in max_features_set:
        model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, n_estimators, max_features, auc(fpr, tpr)))         

In [50]:
rf_result = sorted(result, key=lambda x: x[3], reverse=True)

In [51]:
rf_result

[(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='sqrt', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=15, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False), 15, 'sqrt', 0.81854256854256846),
 (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='sqrt', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False), 20, 'sqrt', 0.8099747474747474),
 (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nod

In [52]:
best_rf_result = rf_result[0]
print(best_rf_result)

(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=15, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False), 15, 'sqrt', 0.81854256854256846)


# 3. Test the model

In [53]:
selected_models = []
selected_models.append(best_logreg_result[0])
selected_models.append(best_knn_result[0])
selected_models.append(best_nb_result[0])
selected_models.append(best_dt_result[0])
selected_models.append(best_rf_result[0])
pprint(selected_models)

[LogisticRegression(C=100.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform'),
 GaussianNB(priors=None),
 DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
           

In [54]:
test_result = []

for model in selected_models:
    Y_test_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_test_score)
    test_result.append((model, auc(fpr, tpr)))      

In [55]:
test_result

[(LogisticRegression(C=100.0, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  0.87442645074224024),
 (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=7, p=2,
             weights='uniform'), 0.85499325236167356),
 (GaussianNB(priors=None), 0.83609986504723344),
 (DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
  0.85870445344129565),
 (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=

In [56]:
test_result = sorted(test_result, key=lambda x: x[1], reverse=True)

In [57]:
test_result

[(LogisticRegression(C=100.0, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=100,
            multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
            solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  0.87442645074224024),
 (DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
  0.85870445344129565),
 (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=7, p=2,
             weights='uniform'), 0.85499325236167356),
 (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='sqrt', max_leaf_nodes=None,
 

# 4. Discussion