# PP5: deep learning intro

<a href="https://colab.research.google.com/github/PauliusU/PP5-deep-learning-intro/blob/master/PP5_deep_learning_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Choosing and preparing the dataset

The project requires a dataset that is suitable for non-linearly separable classification.

The famous [Iris flower dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set) contains three classes. Setosa is linearly separable from the other two classes, whereas Versicolor and Virginica are not linearly separable from each other (see [source](https://www.researchgate.net/figure/Iris-data-set-There-are-three-classes-Setosa-class-is-linearly-separable-from-the-other_fig2_300643220)).

Therefore, the Iris dataset is suitable for the project.

In [1]:
# Load the dataset

from pprint import pprint
from sklearn.datasets import load_iris

# Load the Iris flower dataset that ships with scikit-learn
iris = load_iris()

# Pretty-print the returned object to get to know the structure of the dataset
pprint(iris)


{'DESCR': '.. _iris_dataset:\n'
          '\n'
          'Iris plants dataset\n'
          '--------------------\n'
          '\n'
          '**Data Set Characteristics:**\n'
          '\n'
          '    :Number of Instances: 150 (50 in each of three classes)\n'
          '    :Number of Attributes: 4 numeric, predictive attributes and the '
          'class\n'
          '    :Attribute Information:\n'
          '        - sepal length in cm\n'
          '        - sepal width in cm\n'
          '        - petal length in cm\n'
          '        - petal width in cm\n'
          '        - class:\n'
          '                - Iris-Setosa\n'
          '                - Iris-Versicolour\n'
          '                - Iris-Virginica\n'
          '                \n'
          '    :Summary Statistics:\n'
          '\n'
          '                    Min  Max   Mean    SD   Class Correlation\n'
          '    sepal length:   4.3  7.9   5.84   0.83    0.7826\n'
          '    sepal wid

In [1]:
# Data preparation and splitting the dataset

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split the dataset into the feature matrix X and the integer class labels y
X = iris['data']
y = iris['target']
names = iris['target_names']
feature_names = iris['feature_names']

# Verify X and y
print(X[:5])
print(y[:5])

# One hot encoding. The encoder expects a 2-D input, hence the extra axis;
# .toarray() turns the encoder output into a dense array.
enc = OneHotEncoder()
Y = enc.fit_transform(y[:, np.newaxis]).toarray()

# Use a StandardScaler to remove the mean and scale the features to unit variance.
# In other words, scale data to have mean 0 and variance 1, which is important
# for convergence of the neural network
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)
print(len(X_scaled))

# Split the data set into training and testing sets. Use 80% and 20% split.
# (test_size was previously 0.5, which produced a 50/50 split and contradicted
# both this comment and the expected shapes below.)
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, Y, test_size=0.2, random_state=2)

# Verify splitting results
print(X_train.shape)  # (120, 4)
print(X_test.shape)  # (30, 4)
print(Y_train.shape)  # (120, 3)
print(Y_test.shape)  # (30, 3)

# Network dimensions used when building the models
n_features = X.shape[1]
n_classes = Y.shape[1]

NameError: name 'iris' is not defined

## Visualization

In [None]:
# Visualize the dataset

import matplotlib.pyplot as plt

# Two side-by-side scatter plots, one per pair of feature columns:
# columns (0, 1) on the left and columns (2, 3) on the right.
# Every class gets its own marker series and legend entry.
plt.figure(figsize=(16, 6))

for panel, (x_col, y_col) in enumerate([(0, 1), (2, 3)], start=1):
    plt.subplot(1, 2, panel)
    for target, target_name in enumerate(names):
        class_rows = X[y == target]  # samples belonging to the current class
        plt.plot(class_rows[:, x_col], class_rows[:, y_col],
                 linestyle='none', marker='o', label=target_name)
    plt.xlabel(feature_names[x_col])
    plt.ylabel(feature_names[y_col])
    plt.axis('equal')
    plt.legend()

## Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense


def create_custom_model(input_dim, output_dim, nodes, n=1, name='model'):
    """Return a zero-argument factory that builds a compiled Keras classifier.

    The factory form (rather than a ready model) allows a fresh, untrained
    model to be created on every call, e.g. once per cross-validation fold.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output classes (size of the softmax layer).
        nodes: Number of units in each hidden layer.
        n: Number of hidden layers.
        name: Name given to the created model.

    Returns:
        A function without parameters that builds and compiles the model.
    """
    def create_model():
        # Create model
        model = Sequential(name=name)

        # Only the first layer added to the model needs the input size;
        # every following layer infers it from the layer before it.
        first_layer_kwargs = {'input_dim': input_dim}
        for _ in range(n):
            model.add(Dense(nodes, activation='relu', **first_layer_kwargs))
            first_layer_kwargs = {}
        # With n == 0 the output layer is the first layer and receives input_dim
        model.add(Dense(output_dim, activation='softmax', **first_layer_kwargs))

        # Compile model
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model
    return create_model


# Build one model factory per depth: 1, 2 and 3 hidden layers of 8 units each
models = []
for depth in range(1, 4):
    models.append(
        create_custom_model(n_features, n_classes, 8, depth, f'model_{depth}'))

# Instantiate each model once just to print its architecture
for build_model in models:
    build_model().summary()


In [None]:
# Training of the models with Keras

from keras.callbacks import TensorBoard

# Maps model name -> [training history, trained model]
history_dict = {}

# A single TensorBoard callback is shared by all training runs so the models
# and their outputs can be explored in detail afterwards
cb = TensorBoard()

for build_model in models:
    model = build_model()
    print('Model name:', model.name)

    fit_options = dict(batch_size=5,
                       epochs=50,
                       verbose=0,
                       validation_data=(X_test, Y_test),
                       callbacks=[cb])
    history_callback = model.fit(X_train, Y_train, **fit_options)

    # evaluate() returns the loss followed by the compiled metrics
    score = model.evaluate(X_test, Y_test, verbose=0)
    test_loss, test_accuracy = score[0], score[1]
    print(f'Test loss: {test_loss}')
    print(f'Test accuracy: {test_accuracy}')
    print('*' * 80)

    history_dict[model.name] = [history_callback, model]


In [None]:
# Visualize performance of the models created by Keras

# Upper panel: validation accuracy per epoch; lower panel: validation loss.
fig, (ax1, ax2) = plt.subplots(2, figsize=(8, 6))

for model_name, entry in history_dict.items():
    metrics = entry[0].history  # per-epoch metrics recorded during fit()
    ax1.plot(metrics['val_accuracy'], label=model_name)
    ax2.plot(metrics['val_loss'], label=model_name)

ax1.set_ylabel('validation accuracy')
ax1.legend()

ax2.set_ylabel('validation loss')
ax2.set_xlabel('epochs')
ax2.legend()


In [None]:
"""
Show Receiver Operating Characteristic (ROC) curve how well the models perform. 
The ROC plot compares the false positive rate with the true positive rate. 

Also compute for each model the Area under the curve (AUC). 
auc = 1  means perfect classification while auc = 0.5 is random guessing.
"""
from sklearn.metrics import roc_curve, auc

plt.figure(figsize=(10, 10))
# Diagonal reference line (equal false and true positive rates)
plt.plot([0, 1], [0, 1], 'k--')

for model_name in history_dict:
    model = history_dict[model_name][1]

    Y_pred = model.predict(X_test)
    # The one-hot test targets produced by train_test_split are named Y_test
    # (the previous `y_test` was undefined here and raised a NameError).
    # Targets and predictions are flattened so that all classes are pooled
    # into a single ROC curve per model.
    fpr, tpr, _thresholds = roc_curve(Y_test.ravel(), Y_pred.ravel())

    plt.plot(fpr, tpr, label='{}, AUC = {:.3f}'.format(
        model_name, auc(fpr, tpr)))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend()


In [None]:
"""
model_3 usually showed the best results. 
So let's measure performance with 10-fold cross validation for the model_3 by using the KerasClassifier.
KerasClassifier is a useful Wrapper when using Keras together with scikit-learn.
"""
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import KFold

# Factory for the 3-hidden-layer architecture; a fresh model is built per fold
create_model = create_custom_model(n_features, n_classes, 8, 3)

estimator = KerasClassifier(build_fn=create_model,
                            epochs=100, batch_size=5, verbose=0)

# Y is one-hot encoded, so a plain `cv=10` is not stratified and falls back
# to a KFold without shuffling. If the samples are stored grouped by class
# (NOTE(review): confirm with print(y)), every test fold would then contain a
# single class only. Shuffling before splitting mixes the classes in each fold.
cv = KFold(n_splits=10, shuffle=True, random_state=2)
scores = cross_val_score(estimator, X_scaled, Y, cv=cv)
print(f"Accuracy : {scores.mean():0.2f} (+/- {scores.std():0.2f})")


## PyTorch

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable

In [None]:
# Setup model

class Model(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Number of units in each of the two hidden layers.
        output_dim: Number of classes, i.e. size of the output layer.
    """

    def __init__(self, input_dim, hidden_dim=50, output_dim=3):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class probabilities for a batch of samples.

        Args:
            x: Float tensor of shape (batch, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) whose rows sum to 1.

        Note:
            The output is already softmax-normalised. Losses that apply a
            softmax themselves (such as ``nn.CrossEntropyLoss``) expect raw
            scores, so pair this model with a loss that works on
            probabilities or their logarithm.
        """
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # Normalise over the class dimension (dim=1), not over the batch
        return F.softmax(self.layer3(x), dim=1)


In [None]:
# Network sized to the number of input features, trained with Adam
model = Model(X_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def loss_fn(probabilities, target):
    """Negative log-likelihood of the true classes given class probabilities.

    The model's forward pass already ends in a softmax, so the loss must not
    apply another one (as ``nn.CrossEntropyLoss`` would). Instead the
    logarithm of the probabilities is fed to the NLL loss.

    Args:
        probabilities: Tensor of shape (batch, n_classes) with rows summing to 1.
        target: Long tensor of shape (batch,) holding the class indices.

    Returns:
        Scalar tensor with the mean loss over the batch.
    """
    # Clamp to avoid log(0) = -inf for probabilities that underflow to zero
    return F.nll_loss(torch.log(probabilities.clamp_min(1e-12)), target)


print(model)


In [None]:
# Model training
# tqdm is used here to track progress more efficiently

import tqdm

EPOCHS = 300

# Convert the NumPy splits to tensors. The train/test split only produced
# one-hot targets (Y_train / Y_test); the loss and the accuracy check below
# need class indices, which are recovered with argmax. (`y_train` / `y_test`
# were previously read before being defined, raising a NameError.)
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(Y_train.argmax(axis=1)).long()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(Y_test.argmax(axis=1)).long()

# Per-epoch training loss and test-set accuracy
loss_list = np.zeros((EPOCHS,))
accuracy_list = np.zeros((EPOCHS,))

for epoch in tqdm.trange(EPOCHS):
    # Forward pass and loss on the training set
    y_pred = model(X_train)
    loss = loss_fn(y_pred, y_train)
    loss_list[epoch] = loss.item()

    # Zero gradients, backpropagate and update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Measure accuracy on the held-out set without tracking gradients
    with torch.no_grad():
        y_pred = model(X_test)
        correct = (torch.argmax(y_pred, dim=1) == y_test).float()
        accuracy_list[epoch] = correct.mean().item()

In [None]:
# Plot Accuracy and Loss from training

fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 6), sharex=True)

# accuracy_list is measured on the held-out set after every epoch
ax1.plot(accuracy_list)
ax1.set_ylabel("validation accuracy")

# loss_list is computed on the training set, so it is labelled as training
# loss (it was previously mislabelled as validation loss)
ax2.plot(loss_list)
ax2.set_ylabel("training loss")
ax2.set_xlabel("epochs");

In [None]:
# Repeat plotting ROC and AUC as it was done with Keras

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import OneHotEncoder

plt.figure(figsize=(10, 10))
plt.plot([0, 1], [0, 1], 'k--')

# One hot encoding
# roc_curve() is fed flattened arrays, so the integer test labels are first
# turned into one-hot vectors with the OneHotEncoder and then flattened with
# numpy.ravel(), matching the flattened per-class model outputs.
enc = OneHotEncoder()
label_column = y_test[:, np.newaxis]
Y_onehot = enc.fit_transform(label_column).toarray()

# Inference only: no gradients are needed for the predictions
with torch.no_grad():
    y_pred = model(X_test).numpy()

fpr, tpr, threshold = roc_curve(Y_onehot.ravel(), y_pred.ravel())
area = auc(fpr, tpr)

plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(area))
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend();

## Which framework is preferable?

When choosing between `Keras` and `PyTorch`, I would prefer the `Keras` framework. This is based mostly on subjective experience when facing issues with both of them. I was able to find solutions and fixes for `Keras` more easily. `PyTorch` took me longer to get started and to deal with Python errors.

In addition, the syntax of `Keras` seems to be more readable and concise. Consequently, compared to `PyTorch`, `Keras` appeared to be more intuitive and beginner-friendly.