In [36]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


# Raw Data Loading
df = pd.read_csv('./data/Titanic/train.csv')
# display(df)

# 결측치 확인
# df.isnull().sum()
#Age            177
#Cabin          687
# Embarked         2

# 이상치 확인
# 성별 변경
df['Sex'] = np.array(df['Sex'] == 'female', dtype = np.int32) 
# sex_mapping = {'male' : 0 , 'female': 1}
# df['Sex'] = df['Sex'].map(sex_mapping) # python 의 map과 다른역할

# family 열 추가
df['Family'] = df['SibSp']|df['Parch']
# df['Family'] = df['SibSp']+df['Parch']

# Fare 는 그래도 사용

# Ticket 삭제
# P class 사용 삭제
# Suvived 종속변수

# Embarked 결측치 처리
df['Embarked'] = df['Embarked'].fillna('Q') # fillna 원하는 값으로 채움

# Age에 대한 결측치 처리
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Embarked 숫자로 처리
embarked_mapping = {'S' : 0 , 'C': 1, 'Q': 2}
df['Embarked'] = df['Embarked'].map(embarked_mapping)

# Age에 대해서 Binning 처리(Numerical value -> categorical value)
df.loc[df['Age'] < 8, 'Age'] = 0
df.loc[(df['Age'] >= 8) & (df['Age'] < 20), 'Age'] = 1
df.loc[(df['Age'] >= 20) & (df['Age'] < 65), 'Age'] = 2
df.loc[df['Age'] >= 65, 'Age'] = 4


df.drop(['PassengerId','Name','Ticket','Cabin','SibSp','Fare','Parch'],axis =1, inplace = True)
display(df)


# 데이터가 준비되었어요
# Tensorflow 를 이용해서 학습을 진행하고 Accuracy 를 구해보아요
# 891개의 데이터가 있어요 이걸 다 학습에 사용하면 안되요
# 7:3 비율로 데이터를 나누어서 학습과 정확도 측정에 사용해요!

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,2.0,0,1
1,1,1,1,2.0,1,1
2,1,3,1,2.0,0,0
3,1,1,1,2.0,0,1
4,0,3,0,2.0,0,0
...,...,...,...,...,...,...
886,0,2,0,2.0,0,0
887,1,1,1,1.0,0,0
888,0,3,1,2.0,0,3
889,1,1,0,2.0,1,0


In [37]:

# Tensorflow


zscore_threshold = 2.0

for col in df.columns:
    outlier = df[col][np.abs(stats.zscore(df[col])) > zscore_threshold]    
    df = df.loc[~df[col].isin(outlier)]
    

# Training Data Set
x_data = df.drop('Survived', axis=1, inplace=False).values
t_data = df['Survived'].values.reshape(-1,1)


# placeholder
X = tf.placeholder(shape=[None,5], dtype=tf.float32)  # 독립변수의 데이터를 받기위한 placeholder
T = tf.placeholder(shape=[None,1], dtype=tf.float32)  # 종속변수(label)의 데이터를 받기위한 placeholder

# Weight & bias
W = tf.Variable(tf.random.normal([5,1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')

# hypothesis
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# loss function
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(loss)


# session & 초기화
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# 학습
for step in range(30000):
    _, W_val, b_val, loss_val = sess.run([train, W, b, loss], 
                                         feed_dict={X:x_data, T:t_data})
    
    if step % 3000 == 0:
        print('W : {}, b : {}, loss : {}'.format(W_val, b_val, loss_val))

        
# 정확도(Accuracy)측정
predict = tf.cast(H >= 0.5, dtype=tf.float32)  # True -> 1.0, False -> 0
correct = tf.equal(predict,T)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

accuracy_val = sess.run(accuracy, feed_dict={X:x_data, T:t_data})
print('Accuracy : {}'.format(accuracy_val))
        
# prediction        
survice = np.array([[3, 0,2,0,1]])

result = sess.run(H,feed_dict={X:survice})
print('####### tensorflow 결과값 #########')
print('탑승정보 : {}, 생존확률 : {}'.format(survice,result))    





W : [[ 0.07977581]
 [-0.13718292]
 [ 0.35216364]
 [ 0.7703914 ]
 [ 0.00216539]], b : [0.06230799], loss : 0.9623813033103943
W : [[-0.45447838]
 [ 0.17982051]
 [ 0.10227881]
 [ 0.7457864 ]
 [ 0.11282317]], b : [-0.05562655], loss : 0.5714558959007263
W : [[-0.5576541 ]
 [ 0.47815284]
 [ 0.12889603]
 [ 0.740622  ]
 [ 0.20756787]], b : [-0.02770173], loss : 0.5344400405883789
W : [[-0.6270624 ]
 [ 0.72740215]
 [ 0.142702  ]
 [ 0.7297794 ]
 [ 0.24819261]], b : [-0.00754433], loss : 0.5112208724021912
W : [[-0.6756169 ]
 [ 0.94026405]
 [ 0.1489625 ]
 [ 0.71653336]
 [ 0.2591295 ]], b : [0.00779456], loss : 0.4950907230377197
W : [[-0.71080345]
 [ 1.1248231 ]
 [ 0.15074484]
 [ 0.70253766]
 [ 0.2538994 ]], b : [0.02005546], loss : 0.48317259550094604
W : [[-0.73721087]
 [ 1.2865345 ]
 [ 0.1498733 ]
 [ 0.68869555]
 [ 0.24008079]], b : [0.03028916], loss : 0.47404539585113525
W : [[-0.7577264 ]
 [ 1.4293116 ]
 [ 0.14748599]
 [ 0.67544943]
 [ 0.22199988]], b : [0.0391655], loss : 0.4669048488140

In [43]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import linear_model
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


# Raw Data Loading
df = pd.read_csv('./data/Titanic/test.csv')
# display(df)

# 결측치 확인
# df.isnull().sum()
#Age            177
#Cabin          687
# Embarked         2

# 이상치 확인
# 성별 변경
df['Sex'] = np.array(df['Sex'] == 'female', dtype = np.int32) 
# sex_mapping = {'male' : 0 , 'female': 1}
# df['Sex'] = df['Sex'].map(sex_mapping) # python 의 map과 다른역할

# family 열 추가
df['Family'] = df['SibSp']|df['Parch']
# df['Family'] = df['SibSp']+df['Parch']

# Fare 는 그래도 사용

# Ticket 삭제
# P class 사용 삭제
# Suvived 종속변수

# Embarked 결측치 처리
df['Embarked'] = df['Embarked'].fillna('Q') # fillna 원하는 값으로 채움

# Age에 대한 결측치 처리
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Embarked 숫자로 처리
embarked_mapping = {'S' : 0 , 'C': 1, 'Q': 2}
df['Embarked'] = df['Embarked'].map(embarked_mapping)

# Age에 대해서 Binning 처리(Numerical value -> categorical value)
df.loc[df['Age'] < 8, 'Age'] = 0
df.loc[(df['Age'] >= 8) & (df['Age'] < 20), 'Age'] = 1
df.loc[(df['Age'] >= 20) & (df['Age'] < 65), 'Age'] = 2
df.loc[df['Age'] >= 65, 'Age'] = 4


df.drop(['PassengerId','Name','Ticket','Cabin','SibSp','Fare','Parch'],axis =1, inplace = True)
display(df)
print(df.isnull().sum())


test_data = df.values
result = sess.run(H, feed_dict={X:test_data})

result = sess.run(H>= 0.5, feed_dict={X:test_data})


                
df_submission = pd.read_csv('./data/Titanic/gender_submission.csv')
df_submission['Survived'] = result.astype(np.int32)
df_submission.to_csv('./data/Titanic/gender_submission.csv', index = False)
display(df_submission)



# DataFrame.to_csv(path_or_buf=None, sep=',', na_rep='', float_format=None, 
#                  columns=None, header=True, index=True, index_label=None, mode='w', 
#                  encoding=None, compression='infer', quoting=None, quotechar='"', 
#                  line_terminator=None, chunksize=None, date_format=None, 
#                  doublequote=True, escapechar=None, decimal='.', errors='strict')[source]

Unnamed: 0,Pclass,Sex,Age,Embarked,Family
0,3,0,2.0,2,0
1,3,1,2.0,0,1
2,2,0,2.0,2,0
3,3,0,2.0,0,0
4,3,1,2.0,0,1
...,...,...,...,...,...
413,3,0,2.0,0,0
414,1,1,2.0,1,0
415,3,0,2.0,0,0
416,3,0,2.0,0,0


Pclass      0
Sex         0
Age         0
Embarked    0
Family      0
dtype: int64


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,1
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
