In [1]:
import tensorflow as tf

In [2]:
import numpy as np
import pandas as pd

In [3]:
# overview of the data
data = pd.read_csv("heart.csv")
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# get a summary of the number of discrete values in each str column
index = [1, 2, 6, 8, 10]
df = np.loadtxt('heart.csv', delimiter=',', skiprows=1, usecols = index, dtype = "str")

for i in range(5):
    print("col:", index[i])
    print(pd.Series(df[:, i]).value_counts(), end="\n\n")

    

col: 1
M    725
F    193
dtype: int64

col: 2
ASY    496
NAP    203
ATA    173
TA      46
dtype: int64

col: 6
Normal    552
LVH       188
ST        178
dtype: int64

col: 8
N    547
Y    371
dtype: int64

col: 10
Flat    460
Up      395
Down     63
dtype: int64



In [5]:
from util import load_data, add_theta_0
from sklearn.preprocessing import StandardScaler

# load data
x = np.loadtxt("heart.csv", usecols = range(11), skiprows = 1, dtype = "str", delimiter=',')
y = np.loadtxt("heart.csv", usecols = range(11, 12), skiprows = 1, dtype = "str", delimiter=',')

# convert str into int
m, n = x.shape
d0 = {"M": 1, "F": 0}
d1 = {"ASY": 1, "NAP": 2, "ATA": 3, "TA": 4}
d2 = {"Normal": 1, "ST": 2, "LVH": 3}
d3 = {"Y": 1, "N": 0}
d4 = {"Up": 1, "Flat": 2, "Down": 3}
for i in range(m):
    x[i, 1] = d0[x[i, 1]]
    x[i, 2] = d1[x[i, 2]]
    x[i, 6] = d2[x[i, 6]]
    x[i, 8] = d3[x[i, 8]]
    x[i, 10] = d4[x[i, 10]]
    
x = x.astype(float)
y = y.astype(float)

# feature scaling
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

# add intercept
x = add_theta_0(x)

print("x shape:", x.shape)
print(x)
print("y shape:", y.shape)

x shape: (918, 12)
[[ 1.         -1.4331398   0.51595242 ... -0.8235563  -0.83243239
  -1.05211381]
 [ 1.         -0.47848359 -1.93816322 ... -0.8235563   0.10566353
   0.59607813]
 [ 1.         -1.75135854  0.51595242 ... -0.8235563  -0.83243239
  -1.05211381]
 ...
 [ 1.          0.37009972  0.51595242 ...  1.21424608  0.29328271
   0.59607813]
 [ 1.          0.37009972 -1.93816322 ... -0.8235563  -0.83243239
   0.59607813]
 [ 1.         -1.64528563  0.51595242 ... -0.8235563  -0.83243239
  -1.05211381]]
y shape: (918,)


In [13]:
from sklearn.preprocessing import StandardScaler

# define variables for gradient descent
n_epoch = 100000
learning_rate = 0.1
m, n = x.shape

X = tf.constant(x, dtype = tf.float32, name = "Scaled Features")
y = y.reshape(-1, 1)
theta = tf.Variable((tf.random.uniform([12, 1], -1.0, 1.0)), name = "theta")


In [7]:
# print the the percentage of examples that the model correctly classifies (in decimal)
def correct_rate(theta, X, m, y, mse):
    pred = 1 / (1 + tf.math.exp(- X @ theta))
    pred = pred > 0.5
    sum = 0
    for i in range(m):
        sum += int(y[i][0] == int(pred[i][0]))
    print("Epoch: ", epoch, "correct rate: ", sum / m, " mse:", mse)

In [8]:
# perform gradient descent
for epoch in range(n_epoch):
    y_pred = 1 + tf.math.exp(- X @ theta)
    error = y_pred - y

    mse = tf.reduce_mean(tf.square(error), name = "mse")
    theta_old = theta
    
    # update paramter theta
    theta = theta + learning_rate /  (epoch / 1000 + 1) * tf.transpose(X) @ (y - 1 / (1 + tf.math.exp(- X @ theta))) / m
  
    # end loop if the updated theta is within 1e-5 of the old theta
    if (np.linalg.norm(theta_old - theta) < 0.00001):
        print(theta)
        correct_rate(theta, X, m, y, mse)

        break
        
    # print correct classification rate every 100 updates
    if epoch % 100 == 0:
        correct_rate(theta, X, m, y, mse)
        
        

Epoch:  0 correct rate:  0.28649237472766886  mse: tf.Tensor(692.3969, shape=(), dtype=float32)
Epoch:  100 correct rate:  0.8387799564270153  mse: tf.Tensor(109.093796, shape=(), dtype=float32)
Epoch:  200 correct rate:  0.8572984749455338  mse: tf.Tensor(155.92836, shape=(), dtype=float32)
Epoch:  300 correct rate:  0.8572984749455338  mse: tf.Tensor(183.23137, shape=(), dtype=float32)
Epoch:  400 correct rate:  0.8562091503267973  mse: tf.Tensor(203.11433, shape=(), dtype=float32)
Epoch:  500 correct rate:  0.8572984749455338  mse: tf.Tensor(217.45157, shape=(), dtype=float32)
Epoch:  600 correct rate:  0.8562091503267973  mse: tf.Tensor(227.58604, shape=(), dtype=float32)
Epoch:  700 correct rate:  0.8572984749455338  mse: tf.Tensor(234.70294, shape=(), dtype=float32)
Epoch:  800 correct rate:  0.8572984749455338  mse: tf.Tensor(239.71642, shape=(), dtype=float32)
Epoch:  900 correct rate:  0.8562091503267973  mse: tf.Tensor(243.28006, shape=(), dtype=float32)
Epoch:  1000 correct 

In [9]:
# alternatively, train a logistic regression model using sklearn library
from sklearn.linear_model import LogisticRegression

y = y.reshape(-1,)
log_reg = LogisticRegression()
log_reg.fit(x, y)
threshold = 0.5

In [10]:
# perform a 20-fold cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, x, y, cv=20)
print(scores)
print("mean:", np.mean(scores))

[0.86956522 0.76086957 0.91304348 0.86956522 0.95652174 0.86956522
 0.93478261 0.91304348 0.93478261 0.7173913  0.84782609 0.7826087
 0.82608696 0.95652174 0.82608696 0.76086957 0.7826087  0.73913043
 0.8        0.8       ]
mean: 0.8430434782608696


In [11]:
score = log_reg.score(x, y)
print("Correct rate on train set:", score)

Correct rate on train set: 0.8562091503267973


In [12]:
# perform hold-out cross validation with 70% of the data in the training set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
logReg = LogisticRegression()
logReg.fit(x_train, y_train)
logReg.score(x_test, y_test)

0.8478260869565217