<a href="https://colab.research.google.com/github/Stemanz/ml-datasets/blob/master/kaggle-cardio2/cardio_training2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/Stemanz/ml-datasets.git

Cloning into 'ml-datasets'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 40 (delta 12), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (40/40), done.


In [2]:
! ls

ml-datasets  sample_data


In [0]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import time
import seaborn as sns; sns.set()

class DatasetHandler():
    """Iterator that gives back the dataset in slices useful for
    training, testing and validation.
    
    Data should already have been pre-processed.
    
    input
    =====
    
    dataset | the .npz file where the dataset is stored. <dataset> is
            assumed to contain the arrays "inputs" and "targets"
              
    batch_size | defines the batch size for training. If unspecified,
            the dataset is loaded in a whole batch
    
    returns
    =======
    
    Sliced (in batches) <inputs> and <targets>
    """
    
    def __init__(self, dataset, batch_size=None, classes_num=2):
        
        npz = np.load(dataset)
        self.inputs  = npz["inputs"].astype(np.float32)
        self.targets = npz["targets"].astype(np.int8)
        self.classes_num = classes_num
        
        # Counts the batch number. If None, we are either validating
        # or testing (we are not training), so we take it all
        if batch_size is None:
            self.batch_size = self.inputs.shape[0] # n. of rows
        else:
            self.batch_size = batch_size
        
        self.curr_batch = 0
        self.batch_count = self.inputs.shape[0] // self.batch_size
        
    def __next__(self):
        
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration
        
        start = self.curr_batch * self.batch_size
        stop  = (self.curr_batch + 1) * self.batch_size
        batch_slice   = slice(start, stop)
        inputs_batch  = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1
        
        # one-hot encoding
        targets_one_hot = np.zeros((targets_batch.shape[0], self.classes_num))
        targets_one_hot[range(targets_batch.shape[0]), targets_batch] = 1
        
        return inputs_batch, targets_one_hot
        
    def __iter__(self):
        return self

**Pre-processing**

In [0]:
df = pd.read_csv("ml-datasets/kaggle-cardio2/heart.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [8]:
# checking if dataset is balanced
ratio = sum(df["target"]) / len(df["target"])
print(f"{round(ratio, 2)}%")

0.54%


In [0]:
target_col = "target"
targets = np.array(df[target_col].copy())
del df[target_col]

In [0]:
from sklearn.preprocessing import StandardScaler
unscaled_inputs = np.array(df) # avoids later Warnings
scaler = StandardScaler()
scaler.fit(unscaled_inputs)

scaled_inputs = scaler.transform(unscaled_inputs)
# putting back the thing into a DataFrame and * CREATING A CHECKPOINT *
df_processed_unscaled = df.copy()

In [12]:
unscaled_inputs.shape[0] # that's not too much -.-'

303

In [14]:
# shuffling and dividing the dataset
# np.array version

# parameters
# ==========
train_size = .8
validation_size = .1
dataset = scaled_inputs # requires: np.array


# determining sizes
index_range  = dataset.shape[0]
train_n      = int(index_range * train_size)
validation_n = int(index_range * validation_size)
test_n       = index_range - train_n - validation_n

# pulling random indexes
indices = list(range(index_range))
train_indices      = [random.choice(indices) for _ in range(train_n)]
validation_indices = [random.choice(indices) for _ in range(validation_n)]
test_indices       = [random.choice(indices) for _ in range(test_n)]
assert len(train_indices) + len(validation_indices) + len(test_indices) == index_range

# slicing dataset and targets
train_df = dataset[train_indices]
train_targets = targets[train_indices]
validation_df = dataset[validation_indices]
validation_targets = targets[validation_indices]
test_df = dataset[test_indices]
test_targets = targets[test_indices]

# are outputs binary? If so, printing some stats
if len(set(targets)) < 3:
    print(f"Ones in TOTAL: {round((sum(targets) / len(targets))*100, 2)}%")
    print(f"Ones in train: {round((sum(train_targets) / len(train_targets))*100, 2)}%")
    print(f"Ones in validation: {round((sum(validation_targets) / len(validation_targets))*100, 2)}%")
    print(f"Ones in test: {round((sum(test_targets) / len(test_targets))*100, 2)}%")

Ones in TOTAL: 54.46%
Ones in train: 54.55%
Ones in validation: 50.0%
Ones in test: 58.06%


In [0]:
# prepping and saving the datasets (automatically adds .npz)
#np.savez("train", inputs=np.array(train_df), targets=train_targets)
#np.savez("validation", inputs=np.array(validation_df), targets=validation_targets)
#np.savez("test", inputs=np.array(test_df), targets=test_targets)

# saved datasets features:
#Ones in TOTAL: 54.46%
#Ones in train: 54.55%
#Ones in validation: 50.0%
#Ones in test: 58.06%

In [16]:
print(f"Number of inputs for deep neural network: {np.array(train_df).shape[1]}")

Number of inputs for deep neural network: 13


In [18]:
try:
    sess.close()
except:
    pass

# Reset the default graph, so you can fiddle with the hyperparameters
# and then rerun the code.
tf.reset_default_graph()

# parameters
# ==========
train_npz = "train.npz"
validation_npz = "validation.npz"
test_npz = "test.npz"

# Input size depends on the number of input variables.
input_size = 13
# Output size is 2, as we one-hot encoded the targets.
output_size = 2
# Choose a hidden_layer_size
hidden_layer_size = 128
# Guess what?
learning_rate=0.0001
# Choose the batch size
batch_size = 5

# Set early stopping mechanisms
max_epochs = 300
prev_validation_loss = 9999999.

# ==========

# Create the placeholders
inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])

# Stacking the layers of the model
weights_1 = tf.get_variable("weights_1", [input_size, hidden_layer_size])
biases_1 = tf.get_variable("biases_1", [hidden_layer_size])
outputs_1 = tf.nn.tanh(tf.matmul(inputs, weights_1) + biases_1)

weights_2 = tf.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
biases_2 = tf.get_variable("biases_2", [hidden_layer_size])
outputs_2 = tf.nn.relu(tf.matmul(outputs_1, weights_2) + biases_2)

weights_final = tf.get_variable("weights_final", [hidden_layer_size, output_size])
biases_final = tf.get_variable("biases_final", [output_size])
HIDDEN_LAYERS = 2
# We will incorporate the softmax activation into the loss
outputs = tf.matmul(outputs_2, weights_final) + biases_final # ← change here

# Use the softmax cross entropy loss with logits
loss = tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=targets)
mean_loss = tf.reduce_mean(loss)

# Get a 0 or 1 for every input indicating whether it output the correct answer
out_equals_target = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))
accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))

# Optimize with Adam
optimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(mean_loss)

# Create a session
sess = tf.InteractiveSession()

# Initialize the variables
initializer = tf.global_variables_initializer()
sess.run(initializer)


# let's call our class
train_data = DatasetHandler(train_npz, batch_size)
validation_data = DatasetHandler(validation_npz)

print("Running the deep neural network model.")
print("======================================")
print(f"Batch size: {batch_size}\n\
Hidden layers: {HIDDEN_LAYERS}\n\
Neurons per layer: {hidden_layer_size}\n")

t0 = time.time()
# Create the loop for epochs 
for epoch_counter in range(max_epochs):
    
    print(f"Current epoch: {epoch_counter}", end="\r")    
    # initializing variables for current epoch
    curr_epoch_loss     = 0.
    
    # learning with train dataset
    for input_batch, target_batch in train_data:
        _, batch_loss = sess.run(
            [optimize, mean_loss], 
            feed_dict={inputs: input_batch, targets: target_batch}
        )
        curr_epoch_loss += batch_loss
    
    curr_epoch_loss /= train_data.batch_count #average for batch
    
    # forward propagating only the validation dataset
    for input_batch, target_batch in validation_data:
        validation_loss, validation_accuracy = sess.run(
            [mean_loss, accuracy],
            feed_dict={inputs: input_batch, targets: target_batch}
        )
    
    print(f"Epoch: {epoch_counter}", end=" ")
    print(f"Training loss: {round(curr_epoch_loss, 2)}", end=" ")
    print(f"Validation loss: {round(float(validation_loss), 2)}", end=" ")
    print(f"Validation accuracy: {round(validation_accuracy * 100, 2)}%", end="\n")
    
    # Trigger early stopping if validation loss begins increasing.
    if validation_loss > prev_validation_loss:
        break
        
    # Store this epoch's validation loss to be used as previous in the next iteration.
    prev_validation_loss = validation_loss
    
t1 = time.time()
print(f"End of training. Training took {round(t1 - t0, 2)} seconds.")

Running the deep neural network model.
Batch size: 5
Hidden layers: 2
Neurons per layer: 128

Epoch: 0 Training loss: 0.92 Validation loss: 0.73 Validation accuracy: 50.0%
Epoch: 1 Training loss: 0.67 Validation loss: 0.51 Validation accuracy: 66.67%
Epoch: 2 Training loss: 0.51 Validation loss: 0.38 Validation accuracy: 86.67%
Epoch: 3 Training loss: 0.42 Validation loss: 0.31 Validation accuracy: 90.0%
Epoch: 4 Training loss: 0.36 Validation loss: 0.26 Validation accuracy: 86.67%
Epoch: 5 Training loss: 0.33 Validation loss: 0.23 Validation accuracy: 93.33%
Epoch: 6 Training loss: 0.31 Validation loss: 0.21 Validation accuracy: 93.33%
Epoch: 7 Training loss: 0.29 Validation loss: 0.2 Validation accuracy: 93.33%
Epoch: 8 Training loss: 0.28 Validation loss: 0.19 Validation accuracy: 93.33%
Epoch: 9 Training loss: 0.27 Validation loss: 0.18 Validation accuracy: 93.33%
Epoch: 10 Training loss: 0.26 Validation loss: 0.17 Validation accuracy: 93.33%
Epoch: 11 Training loss: 0.26 Validatio

In [19]:
# forward propagating the test dataset
test_data = DatasetHandler(test_npz)

for input_batch, target_batch in test_data:
    test_accuracy = sess.run(
        accuracy,
        feed_dict={inputs: input_batch, targets: target_batch}
    )

test_accuracy_percent = round(test_accuracy * 100, 2)
print(f"Test accuracy: {test_accuracy_percent}%")

Test accuracy: 96.77%


In [0]:
#trying now other classifiers
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [0]:
reg = LogisticRegression(solver="liblinear")

In [0]:
npz = np.load("train.npz")
x_train = npz["inputs"].astype(np.float32)
y_train = npz["targets"].astype(np.int8)

In [23]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [0]:
# validation dataset
npz = np.load("validation.npz")
x_valid = npz["inputs"].astype(np.float32)
y_valid = npz["targets"].astype(np.int8)

In [26]:
reg.score(x_valid, y_valid)

0.9333333333333333

In [0]:
# test dataset
npz = np.load("test.npz")
x_test = npz["inputs"].astype(np.float32)
y_test = npz["targets"].astype(np.int8)

In [0]:
reg.score(x_test, y_test)

0.7262857142857143