# Activation Functions in Neural Networks

In [None]:
import random
import torch
from torch import nn, optim
import math
from IPython import display

In [None]:
from res.plot_lib import plot_data, plot_data_np, plot_model, set_default

In [None]:
# Initialize default plotting parameters
set_default()

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU;
# every tensor and model below is moved to this device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
from sklearn.datasets import make_circles
from matplotlib import pyplot

In [None]:
# Generate the two-circles toy dataset: 1000 samples, noise level 0.1 and a
# fixed random_state so the same points are produced on every run
X, y = make_circles(n_samples=1000, noise=0.1, random_state=1)
# Plot the raw NumPy data (helper from res.plot_lib; presumably a scatter
# plot coloured by label -- confirm against the helper)
plot_data_np(X, y)

### Define a Linear Model

In [None]:
# Number of leading samples used for training; the remaining samples are
# held out as the test set
train_samples = 800

In [None]:
# Split the NumPy arrays by position: the first `train_samples` rows are the
# training set, the rest the test set
X_train, X_test = X[:train_samples, :], X[train_samples:, :]
y_train, y_test = y[:train_samples], y[train_samples:]

# from numpy to tensor: float inputs and long (int64) class labels,
# both moved to the selected device
X_train = torch.from_numpy(X_train).float().to(device)
y_train = torch.from_numpy(y_train).long().to(device)

X_test = torch.from_numpy(X_test).float().to(device)
y_test = torch.from_numpy(y_test).long().to(device)

In [None]:
# Seed the Python and PyTorch random number generators so that weight
# initialisation and shuffling are repeatable
seed = 12345
random.seed(seed)
torch.manual_seed(seed)

# Hyper-parameters shared by every experiment below
N = 500  # num epochs (read as a global inside train_model)
D = 2  # input dimensions
C = 2  # num classes
H = 10  # num_hidden_units

In [None]:
# nn package to create our linear model
# each Linear module has a weight and bias
# NOTE: with no activation between them, the two Linear layers compose into a
# single affine map, so this model can only learn a linear decision boundary
model = nn.Sequential(
    nn.Linear(D, H),
    nn.Linear(H, C)
)
model.to(device)  # move the parameters to the selected device (GPU or CPU)

# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()

# we use the optim package to apply
# stochastic gradient descent (with momentum) for our parameter updates
# alternative: optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum = 0.9)


In [None]:
# Training
def train_model(model, X_train, y_train, X_test, y_test, criterion,
                optimizer=None, num_epochs=None, batch_size=100, log_every=10):
    """Train ``model`` with shuffled mini-batches and record metrics periodically.

    Args:
        model: Callable mapping a batch of inputs to per-class logits.
        X_train: Training inputs, indexed along dimension 0.
        y_train: Training class labels; compared against the argmax of the
            logits to compute accuracy.
        X_test: Held-out inputs, passed to ``test_model`` at each logging step.
        y_test: Held-out labels, passed to ``test_model`` at each logging step.
        criterion: Loss callable taking ``(logits, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters. When ``None``
            the notebook-level global ``optimizer`` is used, which is what the
            original implementation always did.
        num_epochs: Number of passes over the training data. When ``None`` the
            notebook-level global ``N`` is used (original behaviour).
        batch_size: Mini-batch size; the final batch of an epoch may be smaller.
        log_every: Evaluate, print and record the metrics every this many epochs.

    Returns:
        A tuple ``(loss_values_train, acc_values_train, acc_values_test)`` of
        three lists of floats with one entry per logging step. The loss is the
        mean of the per-batch criterion values of that epoch; the training
        accuracy is the fraction of correctly classified training samples seen
        during that epoch.

    Raises:
        ValueError: If ``batch_size`` or ``log_every`` is not positive, or if
            ``X_train`` is empty.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if log_every <= 0:
        raise ValueError("log_every must be a positive integer")
    n_samples = len(X_train)
    if n_samples == 0:
        raise ValueError("X_train must contain at least one sample")

    # Backward compatibility: existing calls do not pass these two arguments
    # and rely on the values defined at notebook level.
    if optimizer is None:
        optimizer = globals()["optimizer"]
    if num_epochs is None:
        num_epochs = N

    loss_values_train = []
    acc_values_train = []
    acc_values_test = []
    for t in range(num_epochs):
        # shuffle training data (only the local names are rebound; the
        # caller's tensors are left untouched)
        perm = torch.randperm(n_samples)
        X_train = X_train[perm]
        y_train = y_train[perm]

        epoch_loss = 0.0
        epoch_correct = 0
        num_batches = 0
        # Mini-batch training
        for start in range(0, n_samples, batch_size):
            X_batch = X_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]

            # Feed forward to get the logits
            y_pred = model(X_batch)

            # Compute the loss and count the correct predictions
            loss = criterion(y_pred, y_batch)
            epoch_loss += loss.item()
            num_batches += 1
            predicted = y_pred.argmax(dim=1)
            epoch_correct += (predicted == y_batch).sum().item()

            # zero the gradients before running the backward pass,
            # otherwise they accumulate across batches
            optimizer.zero_grad()

            # Backward pass to compute the gradient
            # of loss w.r.t our learnable params.
            loss.backward()

            # Update params
            optimizer.step()

        # Average loss and acc over epoch
        if (t + 1) % log_every == 0:
            # Divide by the real number of batches / samples so the averages
            # stay correct when the data size is not a multiple of batch_size
            mean_loss = epoch_loss / num_batches
            acc = epoch_correct / n_samples
            # Test model; float() accepts a 0-dim tensor as well as a number
            acc_test = float(test_model(model, X_test, y_test))

            # Report the 1-based epoch count, matching the logging condition
            print("[EPOCH]: %i, [LOSS]: %.6f, [ACCURACY TRAIN]: %.3f, "
                  "[ACCURACY TEST]: %.3f" % (t + 1, mean_loss, acc, acc_test))

            # Save loss and acc values
            loss_values_train.append(mean_loss)
            acc_values_train.append(acc)
            acc_values_test.append(acc_test)
    return loss_values_train, acc_values_train, acc_values_test

@torch.no_grad()
def test_model(model, X, y):
    # Feed forward to get the logits
    y_pred = model(X)
    # Get accuracy
    score, predicted = torch.max(y_pred, 1)
    acc = (y == predicted).sum().float() / len(y) 
    return acc


In [None]:
# Train the linear baseline. train_model additionally reads the notebook
# globals `optimizer` and `N`, and returns the training loss, training
# accuracy and test accuracy recorded at every logging step
l_val_train, acc_val_train, acc_val_test = train_model(model = model, 
                                                       X_train = X_train, y_train = y_train, 
                                                       X_test = X_test, y_test = y_test,
                                                       criterion = criterion)

In [None]:
# Visualise the trained linear model on the test set (helper from
# res.plot_lib; presumably draws the decision regions -- confirm)
plot_model(X_test, y_test, model)

In [None]:
# Plot Loss and Acc
# Two side-by-side axes: training loss on the left, train/test accuracy on
# the right; each point is one value recorded by train_model
fig, (ax1, ax2) = pyplot.subplots(1, 2)
fig.set_size_inches(20, 10)
fig.suptitle('Loss and Acc')
ax1.plot(l_val_train, label='train_loss', color = 'red')
ax1.legend()
ax2.plot(acc_val_train, label='train_acc', color = 'blue')
ax2.plot(acc_val_test, label='test_acc', color = 'green')
ax2.legend()

### Add activation function

In [None]:
# One hidden layer followed by a Tanh non-linearity, so the model is no
# longer restricted to a linear decision boundary
model = nn.Sequential(
    nn.Linear(D, H),
    nn.Tanh(),
    nn.Linear(H, C)
)
model.to(device)

# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()

# we use the optim package to apply
# SGD for our parameter updates
# (a fresh optimizer is required because it must hold the new model's parameters)
# alternative: optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum = 0.9)

# Training
l_val_train, acc_val_train, acc_val_test = train_model(model = model, 
                                                       X_train = X_train, y_train = y_train, 
                                                       X_test = X_test, y_test = y_test,
                                                       criterion = criterion)

In [None]:
# Visualise the Tanh model on the test set (helper from res.plot_lib;
# presumably draws the decision regions -- confirm)
plot_model(X_test, y_test, model)

In [None]:
# Plot Loss and Acc
# Two side-by-side axes: training loss on the left, train/test accuracy on
# the right; each point is one value recorded by train_model
fig, (ax1, ax2) = pyplot.subplots(1, 2)
fig.set_size_inches(20, 10)
fig.suptitle('Loss and Acc')
ax1.plot(l_val_train, label='train_loss', color = 'red')
ax1.legend()
ax2.plot(acc_val_train, label='train_acc', color = 'blue')
ax2.plot(acc_val_test, label='test_acc', color = 'green')
ax2.legend()

In [None]:
# let's try a deeper model
# Two hidden layers of H units, each followed by Tanh
model = nn.Sequential(
    nn.Linear(D, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, C)
)
model.to(device)

# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()

# we use the optim package to apply
# SGD for our parameter updates
# (a fresh optimizer is required because it must hold the new model's parameters)
# alternative: optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum = 0.9)

# Training
l_val_train, acc_val_train, acc_val_test = train_model(model = model, 
                                                       X_train = X_train, y_train = y_train, 
                                                       X_test = X_test, y_test = y_test,
                                                       criterion = criterion)

In [None]:
# Visualise the two-hidden-layer model on the test set (helper from
# res.plot_lib; presumably draws the decision regions -- confirm)
plot_model(X_test, y_test, model)

In [None]:
# Plot Loss and Acc
# Two side-by-side axes: training loss on the left, train/test accuracy on
# the right; each point is one value recorded by train_model
fig, (ax1, ax2) = pyplot.subplots(1, 2)
fig.set_size_inches(20, 10)
fig.suptitle('Loss and Acc')
ax1.plot(l_val_train, label='train_loss', color = 'red')
ax1.legend()
ax2.plot(acc_val_train, label='train_acc', color = 'blue')
ax2.plot(acc_val_test, label='test_acc', color = 'green')
ax2.legend()

In [None]:
# let's try an even deeper model
# Nine hidden layers of H units, every one followed by Tanh.
# NOTE: the original listing had two consecutive nn.Linear(H, H) layers with
# no activation between them; without a non-linearity the pair collapses into
# a single affine map, so the missing nn.Tanh() is restored here.
model = nn.Sequential(
    nn.Linear(D, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, H),
    nn.Tanh(),
    nn.Linear(H, C)
)
model.to(device)

# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()

# we use the optim package to apply
# SGD for our parameter updates
# (a fresh optimizer is required because it must hold the new model's parameters)
# alternative: optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum = 0.9)

# Training
l_val_train, acc_val_train, acc_val_test = train_model(model = model,
                                                       X_train = X_train, y_train = y_train,
                                                       X_test = X_test, y_test = y_test,
                                                       criterion = criterion)

In [None]:
# Visualise the deep Tanh model on the test set (helper from res.plot_lib;
# presumably draws the decision regions -- confirm)
plot_model(X_test, y_test, model)

In [None]:
# Plot Loss and Acc
# Two side-by-side axes: training loss on the left, train/test accuracy on
# the right; each point is one value recorded by train_model
fig, (ax1, ax2) = pyplot.subplots(1, 2)
fig.set_size_inches(20, 10)
fig.suptitle('Loss and Acc')
ax1.plot(l_val_train, label='train_loss', color = 'red')
ax1.legend()
ax2.plot(acc_val_train, label='train_acc', color = 'blue')
ax2.plot(acc_val_test, label='test_acc', color = 'green')
ax2.legend()

In [None]:
# let's try ReLU instead of Tanh (same single-hidden-layer architecture)
model = nn.Sequential(
    nn.Linear(D, H),
    nn.ReLU(),
    nn.Linear(H, C)
)
model.to(device)

# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()

# we use the optim package to apply
# SGD for our parameter updates
# (a fresh optimizer is required because it must hold the new model's parameters)
# alternative: optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum = 0.9)

# Training
l_val_train, acc_val_train, acc_val_test = train_model(model = model, 
                                                       X_train = X_train, y_train = y_train, 
                                                       X_test = X_test, y_test = y_test,
                                                       criterion = criterion)

In [None]:
# Visualise the ReLU model on the test set (helper from res.plot_lib;
# presumably draws the decision regions -- confirm)
plot_model(X_test, y_test, model)

In [None]:
# Plot Loss and Acc
# Two side-by-side axes: training loss on the left, train/test accuracy on
# the right; each point is one value recorded by train_model
fig, (ax1, ax2) = pyplot.subplots(1, 2)
fig.set_size_inches(20, 10)
fig.suptitle('Loss and Acc')
ax1.plot(l_val_train, label='train_loss', color = 'red')
ax1.legend()
ax2.plot(acc_val_train, label='train_acc', color = 'blue')
ax2.plot(acc_val_test, label='test_acc', color = 'green')
ax2.legend()

In [None]:
# ex1: Experiment with different activation functions (such as LeakyReLU or Sigmoid) and layer dimensions. How do the results change?