## 11.基于神经网络MLP模型的心脏病结构化数据分类

数据集的具体描述:

>列| 描述| 
>------------|----------|
>Age | 年龄 | 
>Gender | 性别（1 = 男；0 = 女） | 
>CpType | 胸痛类型（0，1，2，3，4）|
>RestingBP | 静息血压（入院时，以mm Hg计） |
>SerumChol | 血清胆固醇（mg/dl） | 
>FBG |空腹血糖> 120 mg/dl（1 = true；0 = false）|
>RestECG | 静息心电图结果（0，1，2）|
>HRmax | 达到的最大心率 | 
>Angina | 运动诱发心绞痛（1 =是；0 =否）| 
>StDescent | 与休息时相比由运动引起的 ST 节段下降|
>StSlope | 在运动高峰 ST 段的斜率 | 
>GaNum | 荧光透视法染色的大血管动脉（0-3）的数量 |
>Thalassemia | 地中海贫血类型（fixed=固定缺陷、normal=正常、reversible=可逆缺陷）|
>Result | 心脏病诊断结果（1 = true；0 = false） | 

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import os
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

从csv文件中读取数据，存入dataframe对象

In [2]:
dataframe = pd.read_csv('data/data_introduce.csv')
dataframe

Unnamed: 0,Age,Gender,CpType,RestingBP,SerumChol,FBG,RestingECG,HRmax,Angina,StDescent,StSlope,GaNum,Thalassemia,Result
0,年龄,性别,胸痛类型,静息血压,血清胆固醇,空腹血糖,静息心电图结果,达到的最大心率,运动诱发心绞痛,运动引起ST节段下降,运动高峰ST段的斜率,大血管动脉数量,地中海贫血类型,心脏病诊断结果


In [3]:
dataframe = pd.read_csv('data/data_heart.csv')
dataframe.head()

Unnamed: 0,Age,Gender,CpType,RestingBP,SerumChol,FBG,RestingECG,HRmax,Angina,StDescent,StSlope,GaNum,Thalassemia,Result
0,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,normal,0.0
1,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,normal,0.0
2,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,normal,0.0
3,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,normal,1.0
4,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,normal,0.0


In [4]:
train_and_val, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train_and_val, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

192 train examples
48 validation examples
60 test examples


In [24]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframeCopy = dataframe.copy()
    labels = dataframeCopy.pop('Result') 
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframeCopy), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframeCopy))
    ds = ds.batch(batch_size)
    return ds

In [25]:
batch_size =5 
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [26]:
for feature_batch, label_batch in train_ds.take(1):   # 取第一个元素构建dataset
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of age:', feature_batch['Age'])
    print('A batch of result:', label_batch )

Every feature: ['Age', 'Gender', 'CpType', 'RestingBP', 'SerumChol', 'FBG', 'RestingECG', 'HRmax', 'Angina', 'StDescent', 'StSlope', 'GaNum', 'Thalassemia']
A batch of age: tf.Tensor([71. 43. 58. 41. 46.], shape=(5,), dtype=float64)
A batch of result: tf.Tensor([0. 1. 0. 0. 0.], shape=(5,), dtype=float64)


In [27]:
def demo(feature_column):
    example_batch = next(iter(train_ds))[0]
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

In [28]:
age = feature_column.numeric_column(key="Age")
demo(age)

[[57.]
 [62.]
 [59.]
 [67.]
 [58.]]


In [29]:
age_buckets = feature_column.bucketized_column(age, boundaries=[25, 35, 45,  55,  65,  75, 85])
demo(age_buckets)

[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]]


In [30]:
thal_class = feature_column.categorical_column_with_vocabulary_list(
      'Thalassemia', ['fixed', 'normal', 'reversible'])

thal_one_hot = feature_column.indicator_column(thal_class)
demo(thal_one_hot)

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [31]:
thal_embedding = feature_column.embedding_column(thal_class, dimension=8)
demo(thal_embedding)

[[-0.0520617  -0.29370883  0.4211001   0.2146015  -0.28931552 -0.40047628
  -0.28597975 -0.0896568 ]
 [-0.18680307 -0.05289834  0.19895566  0.33271798  0.10550185 -0.2915969
   0.03050715 -0.1833851 ]
 [-0.0520617  -0.29370883  0.4211001   0.2146015  -0.28931552 -0.40047628
  -0.28597975 -0.0896568 ]
 [-0.18680307 -0.05289834  0.19895566  0.33271798  0.10550185 -0.2915969
   0.03050715 -0.1833851 ]
 [-0.0520617  -0.29370883  0.4211001   0.2146015  -0.28931552 -0.40047628
  -0.28597975 -0.0896568 ]]


In [34]:
thal_hashed = feature_column.categorical_column_with_hash_bucket('Thalassemia', hash_bucket_size=10)
demo(feature_column.indicator_column(thal_hashed))

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


In [35]:
dataframe.describe() 

Unnamed: 0,Age,Gender,CpType,RestingBP,SerumChol,FBG,RestingECG,HRmax,Angina,StDescent,StSlope,GaNum,Result
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,54.483333,0.673333,3.11,131.686667,246.52,0.146667,0.98,149.396667,0.323333,1.047,1.583333,0.673333,0.273333
std,8.990689,0.469778,1.023748,17.732549,52.371674,0.354364,0.988022,23.136775,0.46853,1.16491,0.61453,0.928887,0.446415
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,0.0
25%,48.0,0.0,2.0,120.0,211.0,0.0,0.0,132.75,0.0,0.0,1.0,0.0,0.0
50%,56.0,1.0,3.0,130.0,241.5,0.0,1.0,152.5,0.0,0.8,2.0,0.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,1.0


In [36]:
feature_columns = []
for header in ['Gender','CpType','FBG','RestingECG','Angina','StSlope','GaNum']:
    feature_columns.append(feature_column.numeric_column(header))
age = feature_column.numeric_column("Age")
age_buckets = feature_column.bucketized_column(age, boundaries=[25, 35, 45,  55,  65,  75, 85])

feature_columns.append(age_buckets)

trestbps = feature_column.numeric_column("RestingBP")
trestbps_buckets = feature_column.bucketized_column(trestbps, boundaries=[90, 110, 130, 150, 170, 190, 210])
feature_columns.append(trestbps_buckets)

chol = feature_column.numeric_column("SerumChol")
chol_buckets = feature_column.bucketized_column(chol, boundaries=[100, 200, 300,  400,  500, 600])
feature_columns.append(chol_buckets)

thalach = feature_column.numeric_column("HRmax")
thalach_buckets = feature_column.bucketized_column(thalach, boundaries=[70, 90, 110, 130, 150, 170, 190, 210])
feature_columns.append(thalach_buckets)

oldpeak = feature_column.numeric_column("StDescent")
oldpeak_buckets = feature_column.bucketized_column(oldpeak, boundaries=[0, 1, 2, 3, 4, 5, 6, 7])
feature_columns.append(oldpeak_buckets)


thal = feature_column.categorical_column_with_vocabulary_list(
      'Thalassemia', ['fixed', 'normal', 'reversible'])

thal_one_hot = feature_column.indicator_column(thal)

feature_columns.append(thal_one_hot)

In [37]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [38]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [39]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(1, activation='sigmoid')
])

In [40]:
model.compile(
              optimizer='adam',  
              loss='binary_crossentropy',  
              metrics=['accuracy'])

In [41]:
model.fit(train_ds,
          validation_data=val_ds,
          epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2443bf21f88>

In [42]:
# 返回损失值和选定的指标值，即准确率accuracy
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.8333333134651184
