<a href="https://colab.research.google.com/github/Vaycold/tensorflow_tutorial/blob/main/%239.CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# tf.data.Dataset로 csv 데이터 로드 

In [2]:
import functools

import numpy as np
import tensorflow as tf

In [3]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file('train_csv',TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file('eval_csv',TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [5]:
np.set_printoptions(precision=3, suppress=True)

In [9]:
# Data load

! head {train_file_path}

survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


In [10]:
label_column = 'survived'
labels = [0,1]

# 데이터셋 작성

def get_datasets(file_path, **kwargs) :
    dataset = tf.data.experimental.make_csv_dataset(
        file_path, 
        batch_size = 5,
        label_name = label_column,
        na_value = '?',
        num_epochs = 1,
        ignore_errors = True,
        **kwargs
    )
    return dataset

raw_train_data = get_datasets(train_file_path)
raw_test_data = get_datasets(test_file_path)

In [13]:
def show_batch(dataset) :
    for batch, label in dataset.take(1) :
        for key, value in batch.items() :
            print('{:20s} : {} '.format(key,value.numpy()))

In [14]:
show_batch(raw_train_data)

sex                  : [b'male' b'male' b'female' b'male' b'male'] 
age                  : [19. 28. 28. 28. 28.] 
n_siblings_spouses   : [0 0 1 0 0] 
parch                : [0 0 0 0 0] 
fare                 : [ 8.158 30.696 16.1    7.25   7.05 ] 
class                : [b'Third' b'First' b'Third' b'Third' b'Third'] 
deck                 : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown'] 
embark_town          : [b'Southampton' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton'] 
alone                : [b'y' b'y' b'n' b'y' b'y'] 


In [15]:
# 만약 첫 번째 줄에 열 이름이 포함되어 있지 않은 경우 column_names 인수로 전달 *이 경우에는 해당 x
csv_columns = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']

temp_dataset = get_datasets(train_file_path, column_names = csv_columns)

show_batch(temp_dataset)

sex                  : [b'male' b'male' b'female' b'male' b'male'] 
age                  : [28. 24. 50. 23. 25.] 
n_siblings_spouses   : [0 2 0 0 1] 
parch                : [0 0 0 0 0] 
fare                 : [31.    73.5   28.712 10.5   26.   ] 
class                : [b'First' b'Second' b'First' b'Second' b'Second'] 
deck                 : [b'unknown' b'unknown' b'C' b'unknown' b'unknown'] 
embark_town          : [b'Southampton' b'Southampton' b'Cherbourg' b'Southampton' b'Southampton'] 
alone                : [b'y' b'n' b'y' b'y' b'n'] 


In [16]:
select_columns = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']

temp_dataset = get_datasets(train_file_path, select_columns = select_columns)
show_batch(temp_dataset)

age                  : [28. 28. 28. 33. 18.] 
n_siblings_spouses   : [0 0 0 1 0] 
class                : [b'Third' b'Third' b'Second' b'First' b'Second'] 
deck                 : [b'unknown' b'unknown' b'unknown' b'E' b'unknown'] 
alone                : [b'y' b'y' b'y' b'n' b'y'] 


In [18]:
# Data preprocessing

select_columns = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
default = [0, 0.0, 0.0, 0.0, 0.0]

temp_dataset = get_datasets(train_file_path,
                            select_columns = select_columns,
                            column_defaults = default)
show_batch( temp_dataset)

age                  : [29. 28. 56. 26. 30.] 
n_siblings_spouses   : [0. 0. 0. 1. 0.] 
parch                : [0. 0. 0. 1. 0.] 
fare                 : [ 9.5  13.   26.55 26.   13.  ] 


In [19]:
example_batch, labels_batch = next(iter(temp_dataset))

In [22]:
def pack(features, label) :
    return tf.stack(list(features.values()), axis=-1), label_column

packed_dataset =  temp_dataset.map(pack)

for features, labels in packed_dataset.take(1) :
    print(features.numpy())
    print()
    print(labels.numpy())

[[28.     0.     0.    12.35 ]
 [28.     0.     0.    13.   ]
 [40.     1.     1.    39.   ]
 [ 2.     0.     1.    12.288]
 [24.     0.     0.     7.142]]

b'survived'


In [23]:
show_batch(raw_train_data)

sex                  : [b'female' b'male' b'male' b'male' b'male'] 
age                  : [40. 28. 24. 25.  4.] 
n_siblings_spouses   : [0 0 0 0 1] 
parch                : [0 0 0 0 1] 
fare                 : [13.     7.25   7.142  0.    11.133] 
class                : [b'Second' b'Third' b'Third' b'Third' b'Third'] 
deck                 : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown'] 
embark_town          : [b'Southampton' b'Southampton' b'Southampton' b'Southampton'
 b'Southampton'] 
alone                : [b'y' b'y' b'y' b'y' b'n'] 


In [24]:
example_batch, labels_batch = next(iter(temp_dataset))

In [37]:
# 숫자 특성 목록을 선택하고 단일 열로 묶는 것보다 일반적인 전처리기를 정의함

class PackNumericFeatures(object) :
    def __init__(self, names) :
        self.names = names
    
    def __call__(self, features, labels) :
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis=-1)
        features['numeric'] = numeric_features

        return features, labels

In [38]:
NUMERIC_FEATURES = ['age','n_siblings_spouses','parch', 'fare']

packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

show_batch(packed_train_data)

sex                  : [b'male' b'male' b'male' b'male' b'male'] 
class                : [b'First' b'First' b'Third' b'Second' b'Third'] 
deck                 : [b'unknown' b'C' b'unknown' b'unknown' b'unknown'] 
embark_town          : [b'Cherbourg' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton'] 
alone                : [b'y' b'n' b'y' b'n' b'y'] 
numeric              : [[ 35.      0.      0.     26.55 ]
 [ 17.      0.      2.    110.883]
 [ 26.      0.      0.      7.896]
 [ 25.      1.      0.     26.   ]
 [ 28.      0.      0.      7.896]] 


In [39]:
example_batch, labels_batch = next(iter(packed_train_data))

In [42]:
# 데이터 정규화

import pandas as pd
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [43]:
MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

In [44]:
def normalize(data, mean, std) :
    return (data-mean) / std

In [45]:
normalizer = functools.partial(normalize, mean = MEAN, std = STD)

numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn = normalizer,
                                                  shape = [len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_columns


[NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize at 0x7fb8d0425cb0>, mean=array([29.631,  0.545,  0.38 , 34.385]), std=array([12.512,  1.151,  0.793, 54.598])))]

In [46]:
example_batch['numeric']

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[28.   ,  0.   ,  0.   ,  7.896],
       [24.   ,  0.   ,  3.   , 19.258],
       [49.   ,  1.   ,  0.   , 56.929],
       [53.   ,  2.   ,  0.   , 51.479],
       [18.   ,  0.   ,  0.   ,  7.775]], dtype=float32)>