In [58]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import linear_model
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


# Raw Data Loading
df = pd.read_csv('./data/Titanic/train.csv')
# display(df)

# 결측치 확인
# df.isnull().sum()
#Age            177
#Cabin          687
# Embarked         2

# 이상치 확인
# 성별 변경
df['Sex'] = np.array(df['Sex'] == 'female', dtype = np.int32) 
# sex_mapping = {'male' : 0 , 'female': 1}
# df['Sex'] = df['Sex'].map(sex_mapping) # python 의 map과 다른역할

# family 열 추가
df['Family'] = df['SibSp']|df['Parch']
# df['Family'] = df['SibSp']+df['Parch']

# Fare 는 그래도 사용

# Ticket 삭제
# P class 사용 삭제
# Suvived 종속변수

# Embarked 결측치 처리
df['Embarked'] = df['Embarked'].fillna('Q') # fillna 원하는 값으로 채움

# Age에 대한 결측치 처리
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Embarked 숫자로 처리
embarked_mapping = {'S' : 0 , 'C': 1, 'Q': 2}
df['Embarked'] = df['Embarked'].map(embarked_mapping)

# Age에 대해서 Binning 처리(Numerical value -> categorical value)
df.loc[df['Age'] < 8, 'Age'] = 0
df.loc[(df['Age'] >= 8) & (df['Age'] < 20), 'Age'] = 1
df.loc[(df['Age'] >= 20) & (df['Age'] < 65), 'Age'] = 2
df.loc[df['Age'] >= 65, 'Age'] = 4


df.drop(['PassengerId','Name','Ticket','Cabin','SibSp','Fare','Parch'],axis =1, inplace = True)
display(df)


# 데이터가 준비되었어요
# Tensorflow 를 이용해서 학습을 진행하고 Accuracy 를 구해보아요
# 891개의 데이터가 있어요 이걸 다 학습에 사용하면 안되요
# 7:3 비율로 데이터를 나누어서 학습과 정확도 측정에 사용해요!

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,2.0,0,1
1,1,1,1,2.0,1,1
2,1,3,1,2.0,0,0
3,1,1,1,2.0,0,1
4,0,3,0,2.0,0,0
...,...,...,...,...,...,...
886,0,2,0,2.0,0,0
887,1,1,1,1.0,0,0
888,0,3,1,2.0,0,3
889,1,1,0,2.0,1,0


In [63]:

# Tensorflow


zscore_threshold = 2.0

for col in df.columns:
    outlier = df[col][np.abs(stats.zscore(df[col])) > zscore_threshold]    
    df = df.loc[~df[col].isin(outlier)]
    

# Training Data Set
x_data = df.drop('Survived', axis=1, inplace=False).values
t_data = df['Survived'].values.reshape(-1,1)


# placeholder
X = tf.placeholder(shape=[None,5], dtype=tf.float32)  # 독립변수의 데이터를 받기위한 placeholder
T = tf.placeholder(shape=[None,1], dtype=tf.float32)  # 종속변수(label)의 데이터를 받기위한 placeholder

# Weight & bias
W = tf.Variable(tf.random.normal([5,1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')

# hypothesis
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# loss function
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(loss)


# session & 초기화
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# 학습
for step in range(30000):
    _, W_val, b_val, loss_val = sess.run([train, W, b, loss], 
                                         feed_dict={X:x_data, T:t_data})
    
    if step % 3000 == 0:
        print('W : {}, b : {}, loss : {}'.format(W_val, b_val, loss_val))

        
# 정확도(Accuracy)측정
predict = tf.cast(H >= 0.5, dtype=tf.float32)  # True -> 1.0, False -> 0
correct = tf.equal(predict,T)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

accuracy_val = sess.run(accuracy, feed_dict={X:x_data, T:t_data})
print('Accuracy : {}'.format(accuracy_val))
        
# prediction        
survice = np.array([[3, 0,2,0,1]])

result = sess.run(H,feed_dict={X:survice})
print('####### tensorflow 결과값 #########')
print('탑승정보 : {}, 생존확률 : {}'.format(survice,result))    





W : [[ 0.97022253]
 [-0.1561186 ]
 [-0.37181994]
 [-0.13353974]
 [ 2.5207736 ]], b : [-0.01763167], loss : 1.5345916748046875
W : [[ 0.10459678]
 [ 0.09064544]
 [-0.73501825]
 [-0.07116231]
 [ 2.3395123 ]], b : [-0.1992303], loss : 0.670664370059967
W : [[-0.0717603 ]
 [ 0.36181536]
 [-0.5967815 ]
 [ 0.01994945]
 [ 2.1987367 ]], b : [-0.13011219], loss : 0.6183058023452759
W : [[-0.21023765]
 [ 0.59753305]
 [-0.48651513]
 [ 0.09281714]
 [ 2.0545845 ]], b : [-0.07497886], loss : 0.5795398950576782
W : [[-0.3200015 ]
 [ 0.80302566]
 [-0.4000654 ]
 [ 0.15087008]
 [ 1.9133995 ]], b : [-0.03175393], loss : 0.5505033135414124
W : [[-0.4068254 ]
 [ 0.9830145 ]
 [-0.3322401 ]
 [ 0.19728553]
 [ 1.7785581 ]], b : [0.00215872], loss : 0.5284544825553894
W : [[-0.47559237]
 [ 1.1415187 ]
 [-0.27881593]
 [ 0.2347405 ]
 [ 1.6516892 ]], b : [0.02887095], loss : 0.5114535689353943
W : [[-0.5303106 ]
 [ 1.2818842 ]
 [-0.23646104]
 [ 0.2653849 ]
 [ 1.5333538 ]], b : [0.0500485], loss : 0.498140543699264