In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
# Upper bound, in megabytes, on the memory TensorFlow may allocate on the first GPU.
GPU_MEMORY_LIMIT_MB = 1500
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to allocate at most GPU_MEMORY_LIMIT_MB (about 1.5GB)
  # on the first GPU by replacing it with a single capped virtual device.
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(
            memory_limit=GPU_MEMORY_LIMIT_MB)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized;
    # report the problem instead of aborting the notebook.
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [2]:


import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
# Show which TensorFlow version this notebook was executed with.
print(tf.__version__)

2.0.0-beta1


In [3]:
# 1.数据集
# 我们将使用克利夫兰诊所心脏病基金会提供的一个小数据集。 CSV中有几百行。 每行描述一个患者，每列描述一个属性。
# 我们将使用此信息来预测患者是否患有心脏病，该疾病在该数据集中是二元分类任务。

In [4]:
# 2.准备数据
# 使用pandas读取数据

In [5]:
# The table is read from a local file named 'heart.csv'. The commented-out URL
# below is presumably where that copy was downloaded from (TODO confirm).
# URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
import pandas as pd
dataframe = pd.read_csv(filepath_or_buffer='heart.csv')
# Preview the first five rows as the cell output.
dataframe.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [6]:
# 划分训练集验证集和测试集

In [7]:
# Fixed seed so the train/validation/test partition is the same on every
# Restart & Run All; without it each run trains and evaluates on different rows.
SPLIT_SEED = 42
# Hold out 20% of all rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

193 train examples
49 validation examples
61 test examples


In [8]:
# 使用tf.data构造输入pipeline

In [9]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='target'):
    """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

    Args:
        dataframe: DataFrame holding the feature columns plus the label column.
            It is copied first, so the caller's frame is not modified.
        shuffle: When true, shuffle the examples using a buffer as large as
            the number of rows.
        batch_size: Number of examples per batch.
        label_column: Name of the column to use as the label. Defaults to
            'target', the value that used to be hard-coded.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features``
        is a dict keyed by column name.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    features = dataframe.copy()
    # pop() removes the label column so it is not also fed in as a feature.
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features))
    ds = ds.batch(batch_size)
    return ds

# Small batch size, presumably so the sample printed below stays short.
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)
# Validation and test pipelines are built the same way, but without shuffling.
val_ds, test_ds = [
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
]
# Inspect a single training batch: feature names, the ages, and the labels.
for feature_batch, label_batch in train_ds.take(1):
    feature_names = list(feature_batch.keys())
    age_values = feature_batch['age']
    print('Every feature:', feature_names)
    print('A batch of ages:', age_values)
    print('A batch of targets:', label_batch )

Every feature: ['slope', 'age', 'chol', 'trestbps', 'thal', 'oldpeak', 'sex', 'thalach', 'exang', 'cp', 'restecg', 'fbs', 'ca']
A batch of ages: tf.Tensor([51 56 52 67 56], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 0 0 1 0], shape=(5,), dtype=int32)


In [10]:
# 3. TensorFlow feature columns
# Keep the feature dict of one training batch as the default demo input.
example_batch = next(iter(train_ds))[0]
def demo(feature_column, batch=None):
    """Print the dense values that ``feature_column`` produces for a batch.

    Args:
        feature_column: Feature column handed to ``layers.DenseFeatures``.
            Inside this function the name shadows the imported
            ``feature_column`` module.
        batch: Dict of feature tensors to transform. Defaults to the
            module-level ``example_batch``.
    """
    if batch is None:
        batch = example_batch
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(batch).numpy())

In [11]:
# 数字列
# 特征列的输出成为模型的输入。 数字列是最简单的列类型。 它用于表示实数值（real-valued）特征。
# 使用此列时，模型将从数据框中接收未更改的列值。

In [12]:
# Numeric column over the raw 'age' feature; show what it yields for the example batch.
age = feature_column.numeric_column(key="age")
demo(feature_column=age)

[[51.]
 [56.]
 [52.]
 [67.]
 [56.]]


In [13]:
# Bucketized列（桶列）
# 通常，您不希望将数字直接输入模型，而是根据数值范围将其值分成不同的类别。 考虑代表一个人年龄的原始数据。 我们可以使用bucketized列将年龄分成几个桶，而不是将年龄表示为数字列。 
# 请注意，下面的one-hot描述了每行匹配的年龄范围。

In [14]:
# Bucket edges for age; 55, 60 and 65 were added to the list.
age_bucket_edges = [18, 25, 30, 35, 40, 50, 55, 60, 65]
age_buckets = feature_column.bucketized_column(age, boundaries=age_bucket_edges)
# Each printed row is the one-hot bucket membership of one example's age.
demo(age_buckets)

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


In [15]:
# 类别列
# 在该数据集中，thal表示为字符串（例如 'fixed'、'normal' 或 'reversible'）。
# 我们无法直接将字符串提供给模型。 相反，我们必须首先将它们映射到数值。
# 类别列提供了一种将字符串表示为独热（one-hot）向量的方法（就像上面用年龄分桶看到的那样）。
# 类别表可以使用categorical_column_with_vocabulary_list作为列表传递，
# 或者使用categorical_column_with_vocabulary_file从文件加载。

In [16]:
# Categorical column over the three known 'thal' strings, then its one-hot form.
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal', vocabulary_list=['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(categorical_column=thal)
demo(thal_one_hot)

W0715 01:28:12.479540 139803217413888 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:2655: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0715 01:28:12.486268 139803217413888 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4215: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0715 01:28:12.487715 139803217413888 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: VocabularyListCateg

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [17]:
# 嵌入列
# 假设我们不是只有几个可能的字符串，而是每个类别有数千（或更多）值。 
# 由于多种原因，随着类别数量的增加，使用单热编码训练神经网络变得不可行。
# 我们可以使用嵌入列来克服此限制。 
# 嵌入列不是将数据表示为高维的独热（one-hot）向量，而是将数据表示为低维稠密向量，
# 其中每个元素可以是任意数字，而不仅仅是0或1。嵌入的维度（下例中为8）是一个需要调优的超参数。

# 注：当分类列具有许多可能的值时，最好使用嵌入列。

In [18]:
# Represent 'thal' as a dense 8-dimensional embedding instead of a one-hot vector.
thal_embedding = feature_column.embedding_column(
    categorical_column=thal, dimension=8)
demo(thal_embedding)

[[ 0.33151653 -0.5251948  -0.03039126 -0.33556095 -0.33658487  0.06435142
  -0.10228992  0.12837255]
 [ 0.33151653 -0.5251948  -0.03039126 -0.33556095 -0.33658487  0.06435142
  -0.10228992  0.12837255]
 [-0.4867591  -0.35177463  0.52706    -0.08485367 -0.06492914  0.1762849
   0.22421625 -0.36759683]
 [ 0.33151653 -0.5251948  -0.03039126 -0.33556095 -0.33658487  0.06435142
  -0.10228992  0.12837255]
 [ 0.33151653 -0.5251948  -0.03039126 -0.33556095 -0.33658487  0.06435142
  -0.10228992  0.12837255]]


In [19]:
# 哈希特征列
# 表示具有大量值的分类列的另一种方法是使用categorical_column_with_hash_bucket。 
# 此特征列计算输入的哈希值，然后从hash_bucket_size个桶中选择一个来编码字符串。 使用此列时，您不需要提供词汇表，并且可以选择使hash_bucket_size远远小于实际类别的数量以节省空间。

# 注：该技术的一个重要缺点是可能存在冲突，其中不同的字符串被映射到同一个桶。

In [20]:
# Hash the 'thal' strings into 1000 buckets (no vocabulary required) and
# display the one-hot encoding of the resulting bucket ids.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    key='thal', hash_bucket_size=1000)
demo(feature_column.indicator_column(categorical_column=thal_hashed))

W0715 01:28:13.769973 139803217413888 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [21]:
# 交叉特征列
# 将多个特征组合成单个特征（即特征交叉），使模型能够为每个特征组合学习单独的权重。
# 在这里，我们将创建一个由age和thal交叉得到的新特征。
# 请注意，crossed_column不会构建所有可能组合的完整表（可能非常大）。
# 相反，它基于哈希列（hashed_column）实现，因此您可以选择表的大小。

In [22]:
# Cross the bucketized age with 'thal', hashing the combinations into 1000
# buckets, then display the one-hot encoding of the crossed column.
crossed_feature = feature_column.crossed_column(
    keys=[age_buckets, thal], hash_bucket_size=1000)
demo(feature_column.indicator_column(categorical_column=crossed_feature))

W0715 01:28:14.441413 139803217413888 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
# 4.选择使用feature column

In [24]:
# Collect every feature column that will feed the model.
feature_columns = []

# Numeric columns: one per raw header listed here.
numeric_headers = ('age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca')
feature_columns.extend(
    feature_column.numeric_column(name) for name in numeric_headers)

# Bucketized age, built on the `age` numeric column defined in an earlier cell.
# Note that this boundary list includes 45.
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# Categorical 'thal' column; reused for the one-hot, embedding and crossed columns.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
thal_embedding = feature_column.embedding_column(thal, dimension=8)

# Age-bucket x thal cross, hashed into 1000 buckets and wrapped as an indicator
# column so it can be consumed by a dense feature layer.
crossed_feature = feature_column.indicator_column(
    feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000))

# Same append order as before: bucketized, one-hot, embedding, crossed.
feature_columns += [age_buckets, thal_one_hot, thal_embedding, crossed_feature]

In [25]:
# 构建特征层

In [26]:
# Layer that applies all selected feature columns to a dict of raw features.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Rebuild the three input pipelines with the batch size used for training.
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(val, shuffle=False, batch_size=batch_size),
    df_to_dataset(test, shuffle=False, batch_size=batch_size),
)

In [27]:
# 5.构建模型并训练

In [29]:
# Binary classifier: feature layer -> two 128-unit ReLU layers -> one sigmoid unit.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

compile_options = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
    # Author's note: with run_eagerly=True the cell runs; without it, it does not.
    run_eagerly=True,
)
model.compile(**compile_options)
# Train for 5 epochs, reporting validation metrics after each one.
model.fit(train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f2550fcd898>

In [None]:
# 测试

In [30]:
# Score the trained model on the test dataset and report its accuracy.
test_metrics = model.evaluate(test_ds)
loss, accuracy = test_metrics
print("Accuracy", accuracy)

Accuracy 0.75409836
