In [None]:
# Author : Jijong Park, jjp9303kr@icloud.com
# Supervisor : Na, In Seop, ypencil@hanmail.net

# This python script is based on the following link.
# 이 파이썬 스크립트는 다음 링크의 글을 기반으로 작성되었습니다.
# https://www.kaggle.com/nadintamer/titanic-survival-predictions-beginner
# Thanks for the nice tutorial.
# 좋은 튜토리얼에 감사드립니다.
# 처음 시도해보는 캐글, 머신러닝 이기에 일단 무작정 따라해보기로 했습니다.

# 1. Import necessary libs

# data analysis libs
import numpy as np    # 선형 대수학 관련 연산을 쉽게 처리하기 위한 라이브러리
import pandas as pd   # 데이터 분석 처리를 위한 라이브러리

# visualization libs
import matplotlib.pyplot as plt   # 시각화 패키지
import seaborn as sns             # matplotlib를 기반으로 차트 등을 추가한 패키지
%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# 2. Read in and explore the data

# Load the training and test CSV files from the sibling "input" directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Summarise every column of the training data (include='all' keeps non-numeric columns too).
train.describe(include='all')

In [None]:
# 3. Data analysis

# List the column names (features) available in the training data.
print(train.columns)

In [None]:
# Show five random rows to get a feel for the variables.
# The seed keeps the preview identical on every Restart & Run All.
train.sample(5, random_state=0)

In [None]:
# Summary statistics for every column of the training dataset.
train.describe(include="all")

In [None]:
# Count the missing values in each column of the training data.
print(train.isnull().sum())

In [None]:
# 4. Data visualization

# Sex feature
# Bar plot of survival by sex.
sns.barplot(x="Sex", y="Survived", data=train)

# Print the percentage of females vs. males that survived.
female_outcomes = train.loc[train["Sex"] == 'female', "Survived"].value_counts(normalize=True)
male_outcomes = train.loc[train["Sex"] == 'male', "Survived"].value_counts(normalize=True)
print("Percentage of females who survived:", female_outcomes[1] * 100)
print("Percentage of males who survived:", male_outcomes[1] * 100)

In [None]:
# Pclass feature
# Bar plot of survival by Pclass.
sns.barplot(x="Pclass", y="Survived", data=train)

# Share of survivors (label 1 in the normalised value counts) for each class, in percent.
pclass_survival = {
    pclass: train.loc[train["Pclass"] == pclass, "Survived"].value_counts(normalize=True)[1] * 100
    for pclass in (1, 2, 3)
}
print("Percentage of Pclass = 1 who survived:", pclass_survival[1])
print("Percentage of Pclass = 2 who survived:", pclass_survival[2])
print("Percentage of Pclass = 3 who survived:", pclass_survival[3])

In [None]:
# SibSp feature
# Bar plot of SibSp vs. survival.
sns.barplot(x="SibSp", y="Survived", data=train)

# Share of survivors in percent; only SibSp values 0, 1 and 2 are printed.
sibsp_survival = {
    sibsp: train.loc[train["SibSp"] == sibsp, "Survived"].value_counts(normalize=True)[1] * 100
    for sibsp in (0, 1, 2)
}
print("Percentage of SibSp = 0 who survived:", sibsp_survival[0])
print("Percentage of SibSp = 1 who survived:", sibsp_survival[1])
print("Percentage of SibSp = 2 who survived:", sibsp_survival[2])

In [None]:
# Parch feature
# Bar plot of survival by Parch value.
sns.barplot(x="Parch", y="Survived", data=train)
plt.show()

In [None]:
# Age feature
# Sort the ages into labelled categories. Missing ages are filled with -0.5 so
# that they land in the first bin, (-1, 0], which is labelled 'Unknown'.
bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
for frame in (train, test):
    frame["Age"] = frame["Age"].fillna(-0.5)
    frame['AgeGroup'] = pd.cut(frame["Age"], bins, labels=labels)

# Bar plot of age group vs. survival.
sns.barplot(x="AgeGroup", y="Survived", data=train)
plt.show()

In [None]:
# Cabin feature
# CabinBool is 1 when a Cabin value is recorded and 0 when it is missing.
for frame in (train, test):
    frame["CabinBool"] = frame["Cabin"].notna().astype('int')

# Share of survivors (in percent) with and without a recorded cabin.
cabin_survival = {
    flag: train.loc[train["CabinBool"] == flag, "Survived"].value_counts(normalize=True)[1] * 100
    for flag in (1, 0)
}
print("Percentage of CabinBool = 1 who survived:", cabin_survival[1])
print("Percentage of CabinBool = 0 who survived:", cabin_survival[0])

# Bar plot of CabinBool vs. survival.
sns.barplot(x="CabinBool", y="Survived", data=train)
plt.show()

In [None]:
# 5. Cleaning data

# Summarise every column of the test data before cleaning it.
test.describe(include="all")

In [None]:
# Cabin feature
# Drop the raw Cabin column from both datasets.
train, test = (frame.drop(columns=['Cabin']) for frame in (train, test))

In [None]:
# Ticket feature
# It is not yet clear whether the ticket information is useful, so drop it.
train, test = (frame.drop(columns=['Ticket']) for frame in (train, test))

In [None]:
# Embarked feature
# The missing values in the Embarked column need to be filled in, so first count
# how many training passengers carry each port code.
# Fix: the C and Q counts used to be printed under a "Southampton" label.
print("Number of people embarking in Southampton (S):")
southampton = train[train["Embarked"] == "S"].shape[0]
print(southampton)

print("Number of people embarking in Cherbourg (C):")
cherbourg = train[train["Embarked"] == "C"].shape[0]
print(cherbourg)

print("Number of people embarking in Queenstown (Q):")
queenstown = train[train["Embarked"] == "Q"].shape[0]
print(queenstown)

In [None]:
# Most passengers boarded at Southampton, so it is reasonable to treat passengers
# whose port of embarkation is unknown as having boarded there ("S").
train = train.assign(Embarked=train["Embarked"].fillna("S"))

In [None]:
# Age feature
# Group both datasets so the same transformation can be applied to each.
combine = [train, test]

# Extract a title from each Name: the run of letters that follows a space and is
# terminated by a period (e.g. " Mr.").
# The pattern is a raw string so that `\.` reaches the regex engine untouched;
# in a normal string literal it is an invalid escape sequence.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate titles against sex for the training data.
pd.crosstab(train['Title'], train['Sex'])

In [None]:
# Replace the various titles with a small set of common groups.
# Fix: 'Lady' used to be listed under both groups. Because the 'Rare' replacement
# ran first, it could never reach 'Royal'. It now appears only in the 'Royal'
# list, next to 'Countess' and 'Sir'.
rare_titles = ['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona']
royal_titles = ['Countess', 'Lady', 'Sir']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(royal_titles, 'Royal')
    dataset['Title'] = dataset['Title'].replace(title_aliases)

# Mean survival for each title group in the training data.
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Encode each title group as a number from 1 to 6; titles outside the mapping become 0.
title_order = ["Mr", "Miss", "Mrs", "Master", "Royal", "Rare"]
title_mapping = {title: code for code, title in enumerate(title_order, start=1)}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

train.head()

In [None]:
# Fill 'Unknown' age groups with the typical age group for the passenger's title.
# The modes below are computed for inspection only; the values actually used are
# the hard-coded entries of age_title_mapping (noted in the trailing comments).
mr_age = train[train["Title"] == 1]["AgeGroup"].mode()     # Young Adult
miss_age = train[train["Title"] == 2]["AgeGroup"].mode()   # Student
mrs_age = train[train["Title"] == 3]["AgeGroup"].mode()    # Adult
master_age = train[train["Title"] == 4]["AgeGroup"].mode() # Baby
royal_age = train[train["Title"] == 5]["AgeGroup"].mode()  # Adult
rare_age = train[train["Title"] == 6]["AgeGroup"].mode()   # Adult

age_title_mapping = {1:"Young Adult", 2:"Student", 3:"Adult", 4:"Baby", 5:"Adult", 6:"Adult"}

# Fix: the previous row loop wrote through chained indexing
# (`frame["AgeGroup"][x] = ...`), which is not guaranteed to modify the frame and
# silently does nothing under pandas Copy-on-Write. It also looked rows up by
# `range(len(...))` labels, which only works with a default integer index.
# A boolean mask with .loc writes to the frame itself and works for any index.
for frame in (train, test):
    unknown = frame["AgeGroup"] == "Unknown"
    # Titles missing from the mapping (e.g. code 0) yield NaN here instead of a KeyError.
    inferred = frame.loc[unknown, "Title"].map(age_title_mapping)
    # Reuse the column's own categorical dtype so the assignment keeps AgeGroup categorical.
    frame.loc[unknown, "AgeGroup"] = pd.Categorical(inferred, dtype=frame["AgeGroup"].dtype)

In [None]:
# Encode each age group as a number (1 = Baby ... 7 = Senior).
age_mapping = {'Baby':1, 'Child':2, 'Teenager':3, 'Student':4, 'Young Adult':5, 'Adult':6, 'Senior':7}
train['AgeGroup'] = train['AgeGroup'].map(age_mapping)
test['AgeGroup'] = test['AgeGroup'].map(age_mapping)

# Dropping the raw Age feature for now; this might change.
train = train.drop(['Age'], axis = 1)
test = test.drop(['Age'], axis = 1)

# Fix: the preview used to sit in the middle of the cell, where a bare expression
# is never rendered. Only the last expression of a cell is displayed.
train.head()

In [None]:
# Name feature
# The Name column is no longer needed, so drop it.
train, test = (frame.drop(columns=['Name']) for frame in (train, test))

In [None]:
# Sex feature
# Encode each Sex value as a number.
sex_mapping = {"male":0, "female":1}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_mapping)

train.head()

In [None]:
# Embarked feature
# Encode each Embarked code as a number.
embarked_mapping = {"S":1, "C":2, "Q":3}
for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

train.head()

In [None]:
# Fare feature
# Fill missing Fare values in the test set with the mean training fare of the
# passenger's Pclass (rounded to 4 decimals, as before).
# Fix: the previous row loop assigned through chained indexing
# (`test["Fare"][x] = ...`), which is not guaranteed to modify `test` and silently
# does nothing under pandas Copy-on-Write.
class_mean_fare = train.groupby("Pclass")["Fare"].mean().round(4)
test["Fare"] = test["Fare"].fillna(test["Pclass"].map(class_mean_fare))

# Map Fare values into four numbered bands.
# Fix: the quartile edges are computed on the training data only and reused for
# the test data. Binning each set with its own quartiles gave the same band number
# different fare ranges in train and test.
fare_labels = [1, 2, 3, 4]
train['FareBand'], fare_edges = pd.qcut(train['Fare'], 4, labels=fare_labels, retbins=True)
# Open the outer edges so test fares below/above the training range still get a band.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
test['FareBand'] = pd.cut(test['Fare'], bins=fare_edges, labels=fare_labels)

# Drop the raw Fare value.
train = train.drop(['Fare'], axis=1)
test = test.drop(['Fare'], axis=1)

In [None]:
# Preview the first rows of the cleaned training data.
train.head()

In [None]:
# Preview the first rows of the cleaned test data.
test.head()

In [None]:
# 6. Choosing the best model

# Splitting the Training data

from sklearn.model_selection import train_test_split

# Hold out 22% of the training rows for validation; the seed fixes the split.
target = train['Survived']
predictors = train.drop(columns=['Survived', 'PassengerId'])
x_train, x_val, y_train, y_val = train_test_split(
    predictors, target, test_size=0.22, random_state=0
)

In [None]:
# Testing different models

# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit on the training split, then report validation accuracy as a percentage.
gaussian = GaussianNB().fit(x_train, y_train)
y_pred = gaussian.predict(x_val)
acc_gaussian = round(100 * accuracy_score(y_val, y_pred), 2)
print("Gaussian Naive Bayes:", acc_gaussian)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit on the training split, then report validation accuracy as a percentage.
logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_val)
acc_logreg = round(100 * accuracy_score(y_val, y_pred), 2)
print("Logistic Regression:", acc_logreg)

In [None]:
# Support Vector Machines
from sklearn.svm import SVC

# Fit on the training split, then report validation accuracy as a percentage.
svc = SVC().fit(x_train, y_train)
y_pred = svc.predict(x_val)
acc_svc = round(100 * accuracy_score(y_val, y_pred), 2)
print("Support Vector Machines:", acc_svc)

In [None]:
# Linear SVC
from sklearn.svm import LinearSVC

# Fix: seed the estimator (same seed as the train/validation split) so the score
# is reproducible on every run instead of varying with the solver's random state.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_val)
# accuracy_score takes (y_true, y_pred); the value is the same either way.
acc_linear_svc = round(accuracy_score(y_val, y_pred)*100, 2)
print("Linear SVC:", acc_linear_svc)

In [None]:
# Perceptron
from sklearn.linear_model import Perceptron

# Fit on the training split, then report validation accuracy as a percentage.
perceptron = Perceptron().fit(x_train, y_train)
y_pred = perceptron.predict(x_val)
acc_perceptron = round(100 * accuracy_score(y_val, y_pred), 2)
print("Perceptron:", acc_perceptron)

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Fix: seed the estimator (same seed as the train/validation split) so the fitted
# tree and its score are reproducible on every run.
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_val)
# accuracy_score takes (y_true, y_pred); the value is the same either way.
acc_decisiontree = round(accuracy_score(y_val, y_pred)*100, 2)
print("Decision Tree:", acc_decisiontree)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Fix: seed the estimator (same seed as the train/validation split) so the forest
# and its score are reproducible on every run.
randomforest = RandomForestClassifier(random_state=0)
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_val)
# accuracy_score takes (y_true, y_pred); the value is the same either way.
acc_randomforest = round(accuracy_score(y_val, y_pred)*100, 2)
print("Random Forest", acc_randomforest)

In [None]:
# K-Nearest Neighbors, KNN
from sklearn.neighbors import KNeighborsClassifier

# Fit on the training split, then report validation accuracy as a percentage.
knn = KNeighborsClassifier().fit(x_train, y_train)
y_pred = knn.predict(x_val)
acc_knn = round(100 * accuracy_score(y_val, y_pred), 2)
print("KNN:", acc_knn)

In [None]:
# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier

# Fix: seed the estimator (same seed as the train/validation split) so the score
# is reproducible on every run.
sgd = SGDClassifier(random_state=0)
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_val)
# accuracy_score takes (y_true, y_pred); the value is the same either way.
acc_sgd = round(accuracy_score(y_val, y_pred)*100, 2)
# Fix: the printed label was misspelled "Desent".
print("Stochastic Gradient Descent:", acc_sgd)

In [None]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# Fix: seed the estimator (same seed as the train/validation split) so the fitted
# model, its score, and any predictions made with it are reproducible on every run.
gbk = GradientBoostingClassifier(random_state=0)
gbk.fit(x_train, y_train)
y_pred = gbk.predict(x_val)
# accuracy_score takes (y_true, y_pred); the value is the same either way.
acc_gbk = round(accuracy_score(y_val, y_pred)*100, 2)
print("Gradient Boosting Classifier:", acc_gbk)

In [None]:
# Compare the validation accuracy of every model, best first.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_logreg),
    ('Random Forest', acc_randomforest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decisiontree),
    ('Stochastic Gradient Descent', acc_sgd),
    ('Gradient Boosting Classifier', acc_gbk),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)

In [None]:
# Use gbk (the gradient boosting model), which had the highest score.
# Keep the passenger ids and predict survival from the remaining test columns.
ids = test['PassengerId']
predictions = gbk.predict(test.drop('PassengerId', axis=1))

# Write the predictions to submission.csv.
# Fix: the id column was written as 'passengerId'; the header now matches the
# 'PassengerId' column name used in the source data.
output = pd.DataFrame({'PassengerId':ids, 'Survived':predictions})
output.to_csv('submission.csv', index=False)