In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from tensorflow import feature_column
from tensorflow import keras
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('datasets/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [6]:
df.shape

(303, 14)

In [8]:
train, test = train_test_split(df, test_size = 0.2)

In [9]:
train, val = train_test_split(train, test_size = 0.2)

In [10]:
print(len(train), len(val), len(test))

193 49 61


In [11]:
# 使用tf.data的函数完成特征构造

# 一种从pd dataframe 创建tf.data数据集的实用程序方法
def df_2_dataset(dataframe, label_name='label', shuffle=True, batch_size = 32):
    df = dataframe.copy()
    labels = df.pop(label_name)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds.shuffle(buffer_size = len(dataframe))
    
    return ds.batch(batch_size)

In [12]:
# 演示效果
train_ds = df_2_dataset(train, label_name='target', shuffle=False, batch_size = 5)
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch )

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
A batch of ages: tf.Tensor([41 62 37 51 63], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int32)


In [14]:
# 用于演示的数据：
example_batch = next(iter(train_ds))[0]

In [15]:
example_batch

{'age': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([41, 62, 37, 51, 63], dtype=int32)>,
 'sex': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 0, 1, 1, 0], dtype=int32)>,
 'cp': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([3, 3, 3, 3, 3], dtype=int32)>,
 'trestbps': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([112, 130, 130, 110, 135], dtype=int32)>,
 'chol': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([250, 263, 250, 175, 252], dtype=int32)>,
 'fbs': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0], dtype=int32)>,
 'restecg': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 2], dtype=int32)>,
 'thalach': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([179,  97, 187, 123, 172], dtype=int32)>,
 'exang': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0], dtype=int32)>,
 'oldpeak': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([0. , 1.2, 3.5, 0.6, 0. ])>,
 'slope': <tf.Tensor: shape=(5,), dtype=int32, numpy=arra

In [35]:
# 用于创建一个特征列，并转换一批次的数据的实用方法
def batch2features(feature_name):
    feature_layer = keras.layers.DenseFeatures(feature_name)
    features = feature_layer(example_batch).numpy()
    print(features)
    print(features.shape)

In [36]:
# 数值类

# 使用tf的feature_column做数值转换
batch2features(feature_column.numeric_column("age"))
feature_age = 'f_age'

W0120 21:56:58.855246 140736001074112 base_layer.py:1790] Layer dense_features_13 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



[[41.]
 [62.]
 [37.]
 [51.]
 [63.]]
(5, 1)


In [37]:
# 分桶类
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) # 首位各一个
batch2features(age_buckets)
feature_age_buk = ['f_age_buk_' + str(i) for i in range(11)] 

W0120 21:56:59.901951 140736001074112 base_layer.py:1790] Layer dense_features_14 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
(5, 11)


In [38]:
# 分类列
df['thal'].unique()

array(['fixed', 'normal', 'reversible', '1', '2'], dtype=object)

In [39]:
thal = feature_column.categorical_column_with_vocabulary_list(
'thal', df['thal'].unique())

batch2features(feature_column.indicator_column(thal) )
feature_thal =  ['f_thal_' + str(i) for i in range(5)] 

W0120 21:57:02.931308 140736001074112 base_layer.py:1790] Layer dense_features_15 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]
(5, 5)


In [40]:
# 嵌入列
# 假设我们不是只有几个可能的字符串，而是每个类别有数千（或更多）值。
batch2features(feature_column.embedding_column(thal, dimension=4))
feature_thal_embed = ['f_thal_embed_' + str(i) for i in range(4)] 

W0120 21:57:04.118335 140736001074112 base_layer.py:1790] Layer dense_features_16 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



[[ 0.7353011   0.6216138  -0.2674025   0.350217  ]
 [ 0.01296699 -0.20875396  0.09854725 -0.7592551 ]
 [ 0.7353011   0.6216138  -0.2674025   0.350217  ]
 [ 0.7353011   0.6216138  -0.2674025   0.350217  ]
 [ 0.7353011   0.6216138  -0.2674025   0.350217  ]]
(5, 4)


In [41]:
# 组合的特征列
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
batch2features(feature_column.indicator_column(crossed_feature))

W0120 21:57:06.212072 140736001074112 base_layer.py:1790] Layer dense_features_17 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(5, 1000)


In [42]:
feature_names = []
features = []

# 并未实际处理，类似于tf早期版本，先定义网络
# 数值列
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    features.append(feature_column.numeric_column(header))
    feature_names.append('f_' + header)

# 分桶列
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
features.append(age_buckets)
feature_names.append('f_' + 'age_buckets')

# 分类列
thal = feature_column.categorical_column_with_vocabulary_list('thal', df['thal'].unique())
thal_one_hot = feature_column.indicator_column(thal)
features.append(thal_one_hot)
feature_names.extend(['f_thal_' + str(i) for i in range(5)])

# 嵌入列
thal_embedding = feature_column.embedding_column(thal, dimension=8)
features.append(thal_embedding)
feature_names.extend(['f_thal_embed_' + str(i) for i in range(8)])

# 组合列
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
features.append(crossed_feature)
feature_names.extend(['f_cf_' + str(i) for i in range(1000)])

In [43]:
feature_names[:10]

['f_age',
 'f_trestbps',
 'f_chol',
 'f_thalach',
 'f_oldpeak',
 'f_slope',
 'f_ca',
 'f_age_buckets',
 'f_thal_0',
 'f_thal_1']

In [45]:
features

[NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='trestbps', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='chol', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='thalach', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='oldpeak', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='slope', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='ca', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='thal', vocabulary_list=('fixed', 'normal', 'reversi

In [46]:
batch_size = 32
train_ds = df_2_dataset(train, label_name='target', batch_size=batch_size)
val_ds = df_2_dataset(val, label_name='target', batch_size=batch_size)
test_ds = df_2_dataset(test, label_name='target', batch_size=batch_size)

In [47]:
def build_model():
    model = keras.Sequential()
    model.add(keras.layers.DenseFeatures(features))
    model.add(keras.layers.Dense(128, activation='relu'))
    model.add(keras.layers.Dense(128, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer = 'adam',
                 loss='binary_crossentropy',
                 metrics = ['accuracy'],
                 run_eagerly=True)
    
    return model

In [48]:
model = build_model()
model.fit(train_ds, validation_data=val_ds, epochs =5)

Train for 7 steps, validate for 2 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x134d5bb00>

In [49]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.78688526
