In [56]:
# 필요한 라이브러리를 우선 불러온다.

## 데이터 분석 관리
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

## 데이터 visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid') # related to the style in matplotlib
## 그래프 출력에 필요한 IPython command
%matplotlib inline

## Scikit-Learn의 다양한 머신러닝 모듈을 불러옴
## I will use Logistic Regression, SVM, Random Forest and KNN, which are all classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [57]:
# First, load the training and test sets from the local input folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.csv", "./input/test.csv")
)

# Preview the first rows of the training set.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [58]:
# Print a schema summary (dtypes and non-null counts) for each frame,
# with a divider line between the two reports.
train_df.info()
print(20 * '-')
test_df.info()

# The training set has 891 rows and the test set has 418 rows.

## Points to consider
# 1. Are there missing values? If so, either drop the column or fill in a
#    default (e.g. Cabin, Age, Embarked).
# 2. Can each column be converted to float64? If not, can it be treated as
#    categorical data?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare        

In [59]:
## Remove the columns that will not be used as features.
## PassengerId stays in the test set because the submission file needs it.
train_df = train_df.drop(columns=['PassengerId', 'Name', 'Ticket'])
test_df = test_df.drop(columns=['Name', 'Ticket'])

In [60]:
## Go through the columns one at a time.
## 1. Pclass - ticket class (1st class, 2nd class, ...)
train_df.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [61]:
# Pclass is stored as the integers 1, 2 and 3, but ticket classes are
# categories rather than a continuous quantity, so one-hot encode them.
# prefix='Pclass' yields string column names (Pclass_1, Pclass_2, Pclass_3).
# Without it the dummy columns are named with the integers 1/2/3, and a frame
# that mixes integer and string column names is rejected by recent
# scikit-learn releases when it is passed to fit().
pclass_train_dummies = pd.get_dummies(train_df['Pclass'], prefix='Pclass')
pclass_test_dummies = pd.get_dummies(test_df['Pclass'], prefix='Pclass')

# Replace the original column with its encoded form (no in-place mutation).
train_df = train_df.drop(columns=['Pclass']).join(pclass_train_dummies)
test_df = test_df.drop(columns=['Pclass']).join(pclass_test_dummies)

In [62]:
## 2. Sex
# One-hot encode this column as well.
# Rename through an explicit mapping shared by both frames so the train and
# test feature names are guaranteed to be identical (the test frame used to
# get a misspelled 'Femail' column) and do not depend on column order.
sex_column_names = {'female': 'Female', 'male': 'Male'}
sex_train_dummies = pd.get_dummies(train_df['Sex']).rename(columns=sex_column_names)
sex_test_dummies = pd.get_dummies(test_df['Sex']).rename(columns=sex_column_names)

# Replace the original column with its encoded form (no in-place mutation).
train_df = train_df.drop(columns=['Sex']).join(sex_train_dummies)
test_df = test_df.drop(columns=['Sex']).join(sex_test_dummies)

In [63]:
## 3. Age
# Age is continuous, so it needs no encoding. Some values are NaN, though;
# the options are random values, the mean, the median, or dropping rows.
# Fill with the mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is a chained in-place operation, which pandas warns about
# and which has no effect on the frame under Copy-on-Write.
# Both sets are filled with the training mean so that no statistic is derived
# from the test data.
age_mean = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(age_mean)
test_df['Age'] = test_df['Age'].fillna(age_mean)

## 4. SibSp & Parch
# Siblings/spouses and parents/children both describe family aboard.
# They are left unchanged.

In [64]:
## 5. Fare - ticket price
# Treat a missing fare as a passenger who boarded without paying: fill with 0.
# Assign the result back instead of using a chained fillna(inplace=True),
# which has no effect on the frame under pandas Copy-on-Write.
test_df['Fare'] = test_df['Fare'].fillna(0)

In [65]:
## 6. Cabin
# Most Cabin values are NaN, so discard the column from both sets.
train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])

In [66]:
## 7. Embarked - port of embarkation
# Start by looking at how the values are distributed.
train_df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [67]:
# Same distribution check for the test set.
test_df.Embarked.value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [68]:
# S is by far the most common port and only a few entries are missing,
# so fill the gaps with 'S'.
# Assign the result back instead of using a chained fillna(inplace=True),
# which has no effect on the frame under pandas Copy-on-Write.
train_df['Embarked'] = train_df['Embarked'].fillna('S')
test_df['Embarked'] = test_df['Embarked'].fillna('S')

In [69]:
# One-hot encode the port of embarkation.
# get_dummies emits its columns in sorted category order (C, Q, S) and the
# labels are kept exactly as produced: overwriting them with ['S', 'C', 'Q']
# would attach the wrong port name to every column.
embarked_train_dummies = pd.get_dummies(train_df['Embarked'])
embarked_test_dummies = pd.get_dummies(test_df['Embarked'])

# Replace the original column with its encoded form (no in-place mutation).
train_df = train_df.drop(columns=['Embarked']).join(embarked_train_dummies)
test_df = test_df.drop(columns=['Embarked']).join(embarked_test_dummies)

In [70]:
## Split the data into model inputs and target.
# The Survived label is separated from the training features; the test
# features are everything in the test frame except PassengerId.
Y_train = train_df['Survived']
X_train = train_df.drop(columns=['Survived'])
X_test = test_df.drop(columns=['PassengerId']).copy()

In [71]:
# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Mean accuracy on the training data (shown as the cell output).
logreg.score(X_train, Y_train)



0.8058361391694725

In [72]:
# Support Vector Machine
# fit() returns the estimator itself, so construction and training are chained.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

# Mean accuracy on the training data (shown as the cell output).
svc.score(X_train, Y_train)



0.8888888888888888

In [73]:
# Random Forests
# A fixed random_state makes the fitted forest, and therefore the score
# below, reproducible across runs; the value 42 itself is arbitrary.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# Mean accuracy on the training data (shown as the cell output).
random_forest.score(X_train, Y_train)

0.9809203142536476

In [74]:
# KNN (3 nearest neighbours)
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# Mean accuracy on the training data (shown as the cell output).
knn.score(X_train, Y_train)

0.835016835016835

In [75]:
## REPORT
# Random Forest had the highest score of the four models tried, so it is used
# for the submission. Note that those scores are accuracy on the training
# data, not on held-out data.
# The model is refit here because Y_pred currently holds the KNN predictions.
# A fixed random_state makes the submission file reproducible across runs;
# the value 42 itself is arbitrary.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# Pair each test passenger with its predicted label and write the result,
# without the index column.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': Y_pred
})
submission.to_csv('titanic.csv', index=False)