# 딥러닝 실습 - Cardiovascular Disease Dataset

---


In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset

!gdown --id 10y8ofUxjkI4uEeFGLv1c7tOJ1Z2_9Zhh --output dataset.csv
df = pd.read_csv('./dataset.csv',sep=";")

Downloading...
From: https://drive.google.com/uc?id=10y8ofUxjkI4uEeFGLv1c7tOJ1Z2_9Zhh
To: /content/dataset.csv
100% 2.94M/2.94M [00:00<00:00, 95.6MB/s]


In [3]:
df.shape

(70000, 13)

In [4]:
df.head(10)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


There are 3 types of input features:
*   Objective: factual information;
*   Examination: results of medical examination;
*   Subjective: information given by the patient.

Features:
*   Age | Objective Feature | age | int (days)
*   Height | Objective Feature | height | int (cm) |
*   Weight | Objective Feature | weight | float (kg) |
*   Gender | Objective Feature | gender | categorical code |
*   Systolic blood pressure | Examination Feature | ap_hi | int |
*   Diastolic blood pressure | Examination Feature | ap_lo | int |
*   Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
*   Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
*   Smoking | Subjective Feature | smoke | binary |
*   Alcohol intake | Subjective Feature | alco | binary |
*   Physical activity | Subjective Feature | active | binary |
*   Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

All of the dataset values were collected at the moment of medical examination.

In [5]:
data = np.array(df)
np.random.shuffle(data)

# Feature 분리
x_data = data[??,??]
y_data = data[??,??]

# Train / Validation / Test 데이터셋 분리
vol_train, vol_val, vol_test = 0.8, 0.1, 0.1
idx_train = ??
idx_val = ??

x_train = x_data[:idx_train]
x_val = x_data[idx_train:idx_val]
x_test = x_data[idx_val:]

y_train = y_data[:idx_train]
y_val = y_data[idx_train:idx_val]
y_test = y_data[idx_val:]

print('x_train :', x_train.shape, '\tx_val :', x_val.shape, '\tx_test :', x_test.shape)
print('y_train :', y_train.shape, '\ty_val :', y_val.shape, '\ty_test :', y_test.shape)

SyntaxError: ignored

In [None]:
# 데이터 정규화
mean = x_train.mean(??)
std = x_train.std(??)
x_train = (x_train - mean)/std
x_val = ??
x_test = ??

In [None]:
# 모델 구성
model = Sequential()

??

model.compile(optimizer=??, loss=??, metrics=??)

model.summary()


In [None]:
# 모델 훈련
history = model.fit(x_train, y_train, validation_data=(x_val,y_val),epochs=??, batch_size=??)

In [None]:
# 학습 확인
fig, ax = plt.subplots(1,2, figsize=(15,5))

ax[0].plot(history.history['loss'], 'y.-', label='train loss')
ax[0].plot(history.history['val_loss'], 'r.-', label='val loss')
ax[0].set_xlabel('epoch')
ax[0].set_ylabel('loss')
ax[0].legend(loc='upper right')

ax[1].plot(history.history['accuracy'], 'b.-', label='train acc')
ax[1].plot(history.history['val_accuracy'], 'g.-', label='val acc')
ax[1].set_xlabel('epoch')
ax[1].set_ylabel('accuracy')
ax[1].legend(loc='upper right')

plt.show()

In [None]:
# test data를 이용한 평가
y_eval = model.evaluate(x_test, y_test)
print(y_eval)

In [None]:
# 예측 결과 확인
y_hat = model.predict(x_test)

plt.figure(figsize = (5,5))
cm = confusion_matrix(y_test, np.round(y_hat))
sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'Reds')
 
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# 오차 행렬 확인
print(classification_report(y_test, np.round(y_hat)))