# Titanic dataset
다음 모델들의 테스트 성능을 비교해봅시다.
1. Logistic regression
2. k-nearest neighbor classifier
3. naive Bayes classifier
4. Decision tree
5. Random forest

# 0. Data preprocessing

In [None]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

In [None]:
# Load Titanic dataset
url = 'data/titanic.csv'
titanic = pd.read_csv(url, index_col='PassengerId')

In [None]:
titanic.head(10)

In [None]:
titanic.dtypes

In [None]:
titanic.info()

In [None]:
titanic.describe()

In [None]:
titanic.Age.isnull()

In [None]:
list = []
for i in range (0, len(titanic)):
    age = titanic.iloc[i].Age
    age = 'child' if age < 20 else 'adult' if age >= 20 else 'unknown'
    list.append(age)

In [None]:
list

In [None]:
titanic['Age_modified'] = list

In [None]:
titanic.head(3)

In [None]:
Age_dummies = pd.get_dummies(titanic.Age_modified, prefix = 'Age')
Age_dummies.sample(n = 10)

In [None]:
Embarked_dummies = pd.get_dummies(titanic.Embarked, prefix = 'Embarked')
Embarked_dummies.sample(n = 10)

In [None]:
Sex_dummies = pd.get_dummies(titanic.Sex, prefix = 'Sex')
Sex_dummies.sample (n = 10)

In [None]:
data = pd.concat([titanic, Age_dummies, Embarked_dummies, Sex_dummies], axis = 1)

In [None]:
data.head(5)

In [None]:
data = data.drop(['Name', 'Sex', 'Age', 'Age_modified', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis = 1)

In [None]:
data.head(5)

In [None]:
# 변수명 가져오기
col_names = data.columns.values

In [None]:
X = data[col_names[1:]]
Y = data[col_names[0]]

In [None]:
X.head(5)

In [None]:
Y.head(5)

# 1. Split data into 3 sets
1. Training set (50%)
2. Validation set (30%)
3. Test set (20%)

In [None]:
def train_val_test_split(X, Y, val_size=0.3, test_size=0.2, random_state=123):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
    val_size_rev = val_size / (1 - test_size)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      test_size=val_size_rev,
                                                      random_state=random_state)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [None]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = train_val_test_split(X, Y,
                                                                      val_size=0.3,
                                                                      test_size=0.2,
                                                                      random_state=123)

In [None]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

# 2. Fit the model and compare validation AUCs
비교하고자 하는 classifiers들은 다음과 같음
1. Logistic regression
2. k-nearest neighbor classifier
3. naive Bayes classifier
4. Decision tree

### 2.1. Logistic regression
Manual for `sklearn.linear_model.LogisticRegression`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. penalty
2. C

In [None]:
# C가 클수록 weak regularization
penalty_set = ['l1', 'l2']
C_set = [0.1, 1, 10, 1e2, 1e3, 1e4, 1e5, 1e6]

In [None]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty = 'l2', C = C, class_weight='balanced')
        model = model.fit(X_train, Y_train)
#         Y_val_score = model.decision_function(X_val)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, penalty, C, auc(fpr, tpr)))

In [None]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [None]:
logreg_result

In [None]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

## 2.2. k-nearest neighbor classifier
Manual for `sklearn.neighbors.KNeighborsClassifier`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. n_neighbors
2. weights

In [None]:
weights_set = ['uniform', 'distance']
n_neighbors_set = [1, 3, 5, 7, 9, 11, 13, 15]

In [None]:
result = []
for weights in weights_set:
    for n_neighbors in n_neighbors_set:
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, weights, n_neighbors, auc(fpr, tpr)))        

In [None]:
knn_result = sorted(result, key=lambda x: x[3], reverse=True)

In [None]:
knn_result

In [None]:
best_knn_result = knn_result[0]
print(best_knn_result)

## 2.3. naive Bayes clasifier
Manual for `sklearn.naive_bayes.GaussianNB`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)

클래스에 대한 prior 정보를 조절하여 fitting

In [None]:
priors_set = [None, [0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2], [0.9, 0.1]]

In [None]:
result = []
for priors in priors_set:
    model = GaussianNB(priors=priors)
    model = model.fit(X_train, Y_train)
    Y_val_score = model.predict_proba(X_val)[:, 1]
    fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
    result.append((model, priors, auc(fpr, tpr)))      

In [None]:
nb_result = sorted(result, key=lambda x: x[2], reverse=True)

In [None]:
nb_result

In [None]:
best_nb_result = nb_result[0]
print(best_nb_result)

## 2.4. Decision tree
Manual for `sklearn.tree.DecisionTreeClassifier`: [click](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. max_depth
2. class_weight

In [None]:
class_weight_set = [None, 'balanced']
max_depth_set = [3, 4, 5, 6, 7]

In [None]:
result = []

for class_weight in class_weight_set:
    for max_depth in max_depth_set:
        model = DecisionTreeClassifier(class_weight=class_weight, max_depth=max_depth)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, class_weight, max_depth, auc(fpr, tpr)))  

In [None]:
dt_result = sorted(result, key=lambda x: x[3], reverse=True)

In [None]:
dt_result

In [None]:
best_dt_result = dt_result[0]
print(best_dt_result)

## 2.5. Random forest
다음 parameter들에 대해 validation data에 대한 AUC값을 살펴볼 것
1. n_estimators
2. max_features

In [None]:
n_estimators_set = [5, 10, 15, 20]
max_features_set = ['auto', 'sqrt', 'log2']

In [None]:
result = []
for n_estimators in n_estimators_set:
    for max_features in max_features_set:
        model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
        model = model.fit(X_train, Y_train)
        Y_val_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(Y_val, Y_val_score)
        result.append((model, n_estimators, max_features, auc(fpr, tpr)))         

In [None]:
rf_result = sorted(result, key=lambda x: x[3], reverse=True)

In [None]:
rf_result

In [None]:
best_rf_result = rf_result[0]
print(best_rf_result)

# 3. Test the model

In [None]:
selected_models = []
selected_models.append(best_logreg_result[0])
selected_models.append(best_knn_result[0])
selected_models.append(best_nb_result[0])
selected_models.append(best_dt_result[0])
selected_models.append(best_rf_result[0])
pprint(selected_models)

In [None]:
test_result = []

for model in selected_models:
    Y_test_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_test_score)
    test_result.append((model, auc(fpr, tpr)))      

In [None]:
test_result

In [None]:
test_result = sorted(test_result, key=lambda x: x[1], reverse=True)

In [None]:
test_result

# 4. Discussion