In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from sklearn import ?
# from sklearn.metrics import ?

<br>

## 1. Preparing dataset 

In [4]:
data_df = pd.read_csv('titanic.csv')
data_df.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Data info

- **PassengerId** : Unique ID of passenger
- **Survived** : 0 = No, 1 = Yes
- **pclass** : Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- **sibsp** : # of siblings & spouses aboard the Titanic
- **parch** : # of parents / children aboard the Titanic
- **ticket** : Ticket number
- **cabin** : Cabin number
- **embarked** : Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [5]:
y_data = data_df['Survived']
y_data.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [6]:
del data_df['Survived']
x_data = data_df.copy()
x_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<br>

## 2. Feature engineering & Feature selection

#### 시도해볼 수 있는 전략들

- 불필요한 열이나 예측에 방해가 되는 열은 아예 지우기 (ex. PassengerId)
- 결측치 채우기 
- Text로 되어있는 Category(Factor)는 숫자로 바꿔주기 (ex. Male/Female -> 0/1)
- 실수 범위를 구간 범위로 바꿔주기 
- 필요한 경우 기존 열을 바탕으로 새로운 열을 계산해 추가하기

In [7]:
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [8]:
mean_age = x_data['Age'].median(skipna=True)
x_data['Age'] = x_data['Age'].fillna(mean_age) # or .fillna(int/float/str 값)

In [10]:
del x_data['PassengerId']
del x_data['Name']
del x_data['Ticket']
del x_data['Cabin']
del x_data['Fare']

In [12]:
x_data.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    2
dtype: int64

In [13]:
x_data['Sex'] = x_data['Sex'].replace(['male','female'],[1,2])
x_data['Embarked'] = x_data['Embarked'].replace(['S','C','Q'],[1,2,3])


In [15]:
x_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,1,22.0,1,0,1.0
1,1,2,38.0,1,0,2.0
2,3,2,26.0,0,0,1.0
3,1,2,35.0,1,0,1.0
4,3,1,35.0,0,0,1.0


In [16]:
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Embarked    889 non-null float64
dtypes: float64(2), int64(4)
memory usage: 41.8 KB


In [18]:
most_freq = x_data['Embarked'].value_counts(dropna=True).idxmax()
print(most_freq)
x_data['Embarked'].fillna(most_freq,inplace=True)

1.0


<br>

## 2. Train - Test split (비율 7:3, seed= 0)

In [19]:
from sklearn import model_selection

In [20]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_data, y_data, test_size=0.3, random_state =0)

In [21]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(623, 6)
(268, 6)
(623,)
(268,)


In [22]:
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
857,1,1,51.0,0,0,1.0
52,1,2,49.0,1,0,2.0
386,3,1,1.0,5,2,1.0
124,1,1,54.0,0,1,1.0
578,3,2,28.0,1,0,2.0
549,2,1,8.0,1,1,1.0
118,1,1,24.0,0,1,2.0
12,3,1,20.0,0,0,1.0
157,3,1,30.0,0,0,1.0
127,3,1,24.0,0,0,1.0


<br>

## 3. Create model instance variable

In [23]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=13)
tree_clf.fit(x_train, y_train)
print('Score: {}'.format(tree_clf.score(x_test, y_test)))

Score: 0.8208955223880597


In [24]:
from sklearn import neighbors, datasets
import random as rnd
from sklearn.svm import SVC, LinearSVC
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer

In [25]:
#model_KNN = neighbors.KNeighborsClassifier(6) # K-Neighbors & 클러스터 분류 (Classifier)
#model_KNN.fit(x_train, y_train)

#Y_pred = model_KNN.predict(x_test)
#acc_knn = round(knn.score(x_train, y_train) * 100, 2)
#acc_knn

# KNN
model_knn =  neighbors.KNeighborsClassifier(6)
model_knn.fit(x_train, y_train)
y_pred = model_knn.predict(x_test)
acc_knn = round(model_knn.score(x_test, y_test) * 100, 2)
acc_knn

75.0

In [26]:
model_svc = SVC()
model_svc.fit(x_train, y_train)
y_pred = model_svc.predict(x_test)
acc_svm = round(model_svc.score(x_test, y_test) * 100, 2)
acc_svm

80.97

In [27]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)
acc_decision_tree = round(decision_tree.score(x_test, y_test) * 100, 2)
acc_decision_tree

76.49

In [28]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
acc_random_forest = round(random_forest.score(x_test, y_test) * 100, 2)
acc_random_forest

77.99

In [29]:
gbc = GradientBoostingClassifier(random_state =0, max_depth=2)
gbc.fit(x_train, y_train)

#print("accuracy on training set: {:3f}".format(gbc.score(x_train,y_train)))
gbc.score(x_train,y_train)
#print("accuracy on training set: {:3f}".format(gbc.score(x_test,y_test)))
gbc.score(x_test,y_test)

0.8171641791044776

<br>

## 4. Check the result with metrics

In [30]:
models = pd.DataFrame({
    'Model': [ 'KNN','SVM','Random Forest','Decision Tree'],
    'Score': [ acc_knn, acc_svm,acc_random_forest,acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
1,SVM,80.97
2,Random Forest,78.73
3,Decision Tree,77.61
0,KNN,75.0
