In [17]:
#加载飞桨、NumPy和相关类库
import paddle
from paddle.nn import Linear
import paddle.nn.functional as F
import numpy as np
import os
import random
import pandas as pd

In [18]:
SEED = 1107
random.seed(SEED)
paddle.seed(SEED)

<paddle.fluid.libpaddle.Generator at 0x7f1c9f891d30>

In [19]:
def change_df(df):
    # Replacing Data in Sex Column
    df["sex"].replace({"Male" : 1,
                    "Female" : 0}, inplace=True)

    # Replacing CP Column
    df["chest_pain_type"].replace({"Typical angina" : 0,
                                    "Atypical angina" : 1,
                                    "Non-anginal pain" : 2,
                                    "Asymptomatic" : 3}, inplace=True)

    # Replacing Data in FBS
    df["fasting_blood_sugar"].replace({"Greater than 120 mg/ml" : 1,
                                    "Lower than 120 mg/ml" : 0}, inplace=True)

    # Replacing Data in restecg
    df["rest_ecg"].replace({"Normal" : 0,
                        "ST-T wave abnormality" : 1,
                        "Left ventricular hypertrophy" : 2}, inplace=True)

    # Replacing Data in exang
    df["exercise_induced_angina"].replace({"Yes" : 1,
                                        "No" : 0}, inplace=True)

    # Replacing Data in slope
    df["slope"].replace({"Upsloping" : 0,
                        "Flat" : 1,
                        "Downsloping" : 3}, inplace=True)

    # Replacing Vessels_colored_by_flourosopy
    df["vessels_colored_by_flourosopy"].replace({"Zero" : 0,
                                                "One" : 1,
                                                "Two" : 2,
                                                "Three" : 3,
                                                "Four" : 4}, inplace=True)

    # Replacing Data in thal
    df["thalassemia"].replace({"Normal" : 1,
                            "Fixed Defect" : 2,
                            "Reversable Defect" : 3,
                            "No" : 0}, inplace=True)
    
    return df

In [20]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    df = change_df(df)

    # 转为numpy格式
    data = df.values.astype('float32')

    # 14个属性，其中13个是特征，最后1个是结果
    feature_names = [ 'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholestoral', 'fasting_blood_sugar', 'rest_ecg', \
                      'Max_heart_rate', 'exercise_induced_angina', 'oldpeak', 'slope', 'vessels_colored_by_flourosopy', 'thalassemia', 'target']
    feature_num = len(feature_names)

    ratio = 0.8
    offset = int(data.shape[0] * ratio)
    training_data = data[:offset]

    # 在预测时，需要将测试集中的数据使用相同的归一化参数进行归一化。因此，只有训练集的最大值和最小值需要被记录下来，而不是测试集的最大值和最小值。
    maximums, minimums = training_data.max(axis=0), training_data.min(axis=0)

    for i in range(feature_num):
        data[:, i] = (data[:, i] - minimums[i]) / (maximums[i] - minimums[i])

    training_data = data[:offset]
    test_data = data[offset:]

    return training_data, test_data

In [21]:
file_path = 'data/HeartDiseaseTrain-Test.csv'
train_data, test_data = load_data(file_path)     # 默认float64

test_x = test_data[:, :13]
test_y = test_data[:, -1]

In [22]:
# MLP适用于表格处理

class Mymodel(paddle.nn.Layer):
    def __init__(self):
        super(Mymodel, self).__init__()
        self.layer1 = Linear(13, 20)
        self.relu1 = paddle.nn.ReLU()
        self.dropout1 = paddle.nn.Dropout(0.2)
        self.layer2 = Linear(20, 25)
        self.relu2 = paddle.nn.ReLU()
        self.dropout2 = paddle.nn.Dropout(0.5)
        self.layer3 = Linear(25, 10)
        self.relu3 = paddle.nn.ReLU()
        self.output_layer = Linear(10, 2)
        self.sigmoid = paddle.nn.Sigmoid()
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.layer3(x)
        x = self.relu3(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x

In [23]:
model = Mymodel()

# use_gpu = True
# paddle.device.set_device('gpu:0') if use_gpu else paddle.device.set_device('cpu')
model.train()
opt = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters())

In [24]:
EPOCH_NUM = 500   # 设置外层循环次数
BATCH_SIZE = 32  # 设置batch大小

# 定义外层循环
for epoch_id in range(EPOCH_NUM):
    np.random.shuffle(train_data)
    mini_batches = [train_data[k:k+BATCH_SIZE] for k in range(0, len(train_data), BATCH_SIZE)]
    for iter_id, mini_batch in enumerate(mini_batches):
        x = np.array(mini_batch[:, :-1]) # 获得当前批次训练数据     [10,13]
        y = np.array(mini_batch[:, -1:]) # 获得当前批次训练标签      [10,1]
        x_feature = paddle.to_tensor(x)     # [10,13]
        target = paddle.to_tensor(y, dtype='int64')    # [10,1]
        
        predicts = model(x_feature)

        loss = F.cross_entropy(predicts, target)    # [10,1]
        avg_loss = paddle.mean(loss)
        if iter_id%20==0:
            print("epoch: {}, iter: {}, loss is: {}".format(epoch_id, iter_id, avg_loss.numpy()))
        
        avg_loss.backward()
        opt.step()
        opt.clear_grad()

epoch: 0, iter: 20, loss is: [0.6856398]
epoch: 1, iter: 0, loss is: [0.68212473]
epoch: 1, iter: 20, loss is: [0.6886301]
epoch: 2, iter: 0, loss is: [0.69188917]
epoch: 2, iter: 20, loss is: [0.66262186]
epoch: 3, iter: 0, loss is: [0.65689963]
epoch: 3, iter: 20, loss is: [0.63221085]
epoch: 4, iter: 0, loss is: [0.6142374]
epoch: 4, iter: 20, loss is: [0.6481808]
epoch: 5, iter: 0, loss is: [0.5964961]
epoch: 5, iter: 20, loss is: [0.59185255]
epoch: 6, iter: 0, loss is: [0.5286935]
epoch: 6, iter: 20, loss is: [0.6042106]
epoch: 7, iter: 0, loss is: [0.53220195]
epoch: 7, iter: 20, loss is: [0.4802614]
epoch: 8, iter: 0, loss is: [0.52087814]
epoch: 8, iter: 20, loss is: [0.5066894]
epoch: 9, iter: 0, loss is: [0.49544007]
epoch: 9, iter: 20, loss is: [0.5601007]
epoch: 10, iter: 0, loss is: [0.4381833]
epoch: 10, iter: 20, loss is: [0.52409256]
epoch: 11, iter: 0, loss is: [0.4838253]
epoch: 11, iter: 20, loss is: [0.48381025]
epoch: 12, iter: 0, loss is: [

In [25]:
# 保存模型参数，文件名为H_model.pdparams
paddle.save(model.state_dict(), 'H_model.pdparams')
print("模型保存成功，模型参数保存在H_model.pdparams中")

模型保存成功，模型参数保存在H_model.pdparams中


In [26]:
param_dict = paddle.load('H_model.pdparams')
model.load_dict(param_dict)
# 灌入数据
model.eval()
predictions=[]
for i,dt in enumerate(test_x):
    y_pred=model(paddle.to_tensor(dt))
    predictions.append(y_pred.argmax().item())


In [27]:
test_y

array([0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1.,
       1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1.,
       1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1.,
       1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1.,
       1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1.,
       0.], dtype=float32)

In [28]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(test_y,predictions)
print("The accuracy of model is ",accuracy*100,"%")

The accuracy of model is  87.8048780487805 %
