<center><h1>1-ab: Introduction to Neural Networks</h1></center>

<center><h2><a href="https://rdfia.github.io/">Course link</a></h2></center>

# Warning :
# Do "File -> Save a copy in Drive" before you start modifying the notebook, otherwise your modifications will not be saved.


In [1]:
!wget https://github.com/rdfia/rdfia.github.io/raw/master/data/2-ab.zip
!unzip -j 2-ab.zip
!wget https://github.com/rdfia/rdfia.github.io/raw/master/code/2-ab/utils-data.py

--2023-11-21 15:42:53--  https://github.com/rdfia/rdfia.github.io/raw/master/data/2-ab.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rdfia/rdfia.github.io/master/data/2-ab.zip [following]
--2023-11-21 15:42:53--  https://raw.githubusercontent.com/rdfia/rdfia.github.io/master/data/2-ab.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13423991 (13M) [application/zip]
Saving to: ‘2-ab.zip’


2023-11-21 15:42:54 (115 MB/s) - ‘2-ab.zip’ saved [13423991/13423991]

Archive:  2-ab.zip
  inflating: ._2-ab                  
  inflating: circles.mat             
  inflating: ._circles.mat         

In [2]:
import math
import torch
from torch.autograd import Variable
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import random
import seaborn as sns
%run 'utils-data.py'

KeyboardInterrupt: ignored

# Part 1 : Forward and backward passes "by hands"

In [None]:
def init_params(nx, nh, ny):
    """
    Build the parameter dictionary of the one-hidden-layer network.

    nx, nh, ny: integers (input, hidden and output sizes)
    out params: dictionary with "Wh" (nx, nh), "Wy" (nh, ny), "bh" (nh,), "by" (ny,)
    """
    std = 0.3  # standard deviation of the zero-mean normal used for the weights

    # Weights are drawn in a fixed order (Wh, then Wy) from the global torch RNG
    hidden_weights = std * torch.randn(nx, nh)
    output_weights = std * torch.randn(nh, ny)

    return {
        "Wh": hidden_weights,
        "Wy": output_weights,
        # Biases start at zero
        "bh": torch.zeros(nh),
        "by": torch.zeros(ny),
    }

In [None]:
def forward(params, X):
    """
    Forward pass of the network: linear -> tanh -> linear -> softmax.

    params: dictionary with the keys "Wh", "bh", "Wy", "by"
    X: tensor of size (n_batch, nx)

    Returns (yhat, outputs): yhat holds the class probabilities, one row per
    sample, and outputs is a dictionary with every intermediate value
    ("X", "htilde", "h", "ytilde", "yhat") so a manual backward pass can reuse them.
    """
    outputs = {}

    # Just for convenience after
    outputs["X"] = X

    # Linear combination for hidden layer
    outputs["htilde"] = X @ params['Wh'] + params['bh']  # XWh + bh
    # Activation function for hidden layer
    outputs["h"] = torch.tanh(outputs['htilde'])

    # Linear combination for output layer
    outputs["ytilde"] = outputs["h"] @ params['Wy'] + params['by']  # hWy + by

    # Softmax over the class dimension (dim=1), so each row sums to 1.
    # Subtracting the row-wise maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing to inf (which would yield NaN after division).
    # The shift is detached: it is a constant offset and must not enter the gradient.
    row_max = outputs["ytilde"].max(dim=1, keepdim=True).values.detach()
    exp_shifted = torch.exp(outputs["ytilde"] - row_max)
    outputs["yhat"] = exp_shifted / torch.sum(exp_shifted, dim=1, keepdim=True)

    return outputs['yhat'], outputs

In [None]:
def loss_accuracy(Yhat, Y):
    """
    Mean cross-entropy loss and accuracy of a batch of predictions.

    Yhat: tensor of size (n_batch, ny) holding class probabilities
          (the softmax output of `forward`)
    Y: tensor of size (n_batch, ny), one-hot encoded labels

    Returns (L, acc): both are 0-dim tensors; acc is a fraction in [0, 1].
    """
    # Convert one-hot encoded labels to single-label format
    _, targets = torch.max(Y, 1)

    # Yhat already contains probabilities, so the cross-entropy is the mean negative
    # log-probability assigned to the true class. torch.nn.CrossEntropyLoss must not
    # be used here: it expects raw scores and would apply a second softmax.
    true_class_prob = Yhat.gather(1, targets.unsqueeze(1)).squeeze(1)
    # Clamp to the smallest positive normal number so log(0) cannot produce -inf
    smallest = torch.finfo(Yhat.dtype).tiny
    L = -torch.log(true_class_prob.clamp_min(smallest)).mean()

    # Same for predicted values
    _, indsYhat = torch.max(Yhat, 1)

    # Compute the accuracy
    acc = (indsYhat == targets).float().mean()

    return L, acc

In [None]:
def backward(params, outputs, Y):
    """
    Manual backward pass matching `forward`.

    params: dictionary; only "Wy" is read here
    outputs: dictionary produced by `forward` ("X", "h" and "yhat" are read)
    Y: tensor of size (n_batch, ny), one-hot labels

    Returns a dictionary of gradients for "Wy", "Wh", "by" and "bh".
    The contributions of the samples are summed over the batch (not averaged).
    """
    hidden = outputs["h"]

    # Error signal at the output pre-activation (softmax + cross-entropy: yhat - Y)
    delta_out = outputs["yhat"] - Y

    # Propagate through Wy, then through tanh (derivative 1 - h^2)
    delta_hidden = (delta_out @ params["Wy"].T) * (1 - hidden ** 2)

    return {
        "Wy": hidden.T @ delta_out,
        "Wh": outputs["X"].T @ delta_hidden,
        "by": delta_out.sum(axis=0),
        "bh": delta_hidden.sum(axis=0),
    }

In [None]:
def sgd(params, grads, eta):
    """
    One gradient-descent step on the four parameters.

    params: dictionary with the keys "Wh", "Wy", "bh", "by"
    grads: dictionary with the same keys, holding the gradients
    eta: learning rate

    The entries of `params` are rebound to new tensors; the same dictionary
    object is returned.
    """
    for name in ("Wh", "Wy", "bh", "by"):
        # theta <- theta - eta * dL/dtheta
        params[name] = params[name] - eta * grads[name]

    return params

## Global learning procedure "by hands"

Results according to *LR*

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
Nbatch = 10
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
eta = 0.03

# Parameters are initialized exactly once for this single (eta, Nbatch) run
params = init_params(nx, nh, ny)

# Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[],[], [], []]

# epoch
for iteration in range(150):
    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches (when N is not a multiple of Nbatch, the trailing samples are skipped)
    for j in range(N // Nbatch):
        indsBatch = range(j * Nbatch, (j+1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]
        # Forward
        Yhat, outputs = forward(params, X)
        # Backward: the manual gradients do not need the loss value,
        # so the per-batch loss is not evaluated here
        grads = backward(params, outputs, Y)
        # Updating parameters
        params = sgd(params, grads, eta)

    # End-of-epoch evaluation on the full train and test sets
    Yhat_train, _ = forward(params, data.Xtrain)
    Yhat_test, _ = forward(params, data.Xtest)
    Ltrain, acctrain = loss_accuracy(Yhat_train, data.Ytrain)
    Ltest, acctest = loss_accuracy(Yhat_test, data.Ytest)

    # Uncomment the three lines below to draw the decision map after each epoch
    # (accuracies are fractions in [0, 1], hence the factor 100 for the percentage)
    #Ygrid, _ = forward(params, data.Xgrid)
    #title = 'Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})'.format(iteration, 100 * acctrain, Ltrain, 100 * acctest, Ltest)
    #data.plot_data_with_grid(Ygrid, title)

    # Store plain Python floats so the curves can be plotted directly
    curves[0].append(acctrain.item())
    curves[1].append(acctest.item())
    curves[2].append(Ltrain.item())
    curves[3].append(Ltest.item())

plt.title(f"Learning rate :{eta}, batch size : {Nbatch}")
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.xlabel("epoch")
plt.legend()
plt.show()


In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
batch_sizes = [10, 20, 50, 100]
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
learning_rates = [0.01, 0.03, 0.05, 0.1]

# results[(Nbatch, eta)] -> final metrics of the run trained with that configuration
results = {}


for Nbatch in batch_sizes:
    for eta in learning_rates:
        # Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
        curves = [[],[], [], []]
        # Fresh parameters for every (batch size, learning rate) pair
        params = init_params(nx, nh, ny)
        # epoch
        for iteration in range(150):
            # permute
            perm = np.random.permutation(N)
            Xtrain = data.Xtrain[perm, :]
            Ytrain = data.Ytrain[perm, :]

            # batches
            for j in range(N // Nbatch):
                indsBatch = range(j * Nbatch, (j+1) * Nbatch)
                X = Xtrain[indsBatch, :]
                Y = Ytrain[indsBatch, :]
                # Forward
                Yhat, outputs = forward(params, X)
                # Backward (the manual gradients do not need the loss value)
                grads = backward(params, outputs, Y)
                # Updating parameters
                params = sgd(params, grads, eta)

            # End-of-epoch evaluation; the decision grid is not evaluated because
            # nothing is plotted during the search
            Yhat_train, _ = forward(params, data.Xtrain)
            Yhat_test, _ = forward(params, data.Xtest)
            Ltrain, acctrain = loss_accuracy(Yhat_train, data.Ytrain)
            Ltest, acctest = loss_accuracy(Yhat_test, data.Ytest)

            # Plain floats: they are later written into NumPy matrices
            curves[0].append(acctrain.item())
            curves[1].append(acctest.item())
            curves[2].append(Ltrain.item())
            curves[3].append(Ltest.item())

        # Store the results
        key = (Nbatch, eta)
        results[key] = {
            'acc_train': curves[0][-1],  # Last accuracy value for training
            'acc_test': curves[1][-1],   # Last accuracy value for testing
            'loss_train': curves[2][-1], # Last loss value for training
            'loss_test': curves[3][-1]   # Last loss value for testing
        }

In [None]:
# Gather the final metrics of every (batch size, learning rate) run into 2-D grids:
# rows follow batch_sizes, columns follow learning_rates
grid_shape = (len(batch_sizes), len(learning_rates))
loss_train_matrix = np.zeros(grid_shape)
loss_test_matrix = np.zeros(grid_shape)
acc_train_matrix = np.zeros(grid_shape)
acc_test_matrix = np.zeros(grid_shape)

for i, Nbatch in enumerate(batch_sizes):
    for j, eta in enumerate(learning_rates):
        key = (Nbatch, eta)
        run = results[key]
        loss_train_matrix[i, j] = run['loss_train']
        loss_test_matrix[i, j] = run['loss_test']
        acc_train_matrix[i, j] = run['acc_train']
        acc_test_matrix[i, j] = run['acc_test']

# One annotated heatmap per metric: losses on the top row, accuracies on the bottom row
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
panels = [
    (ax[0, 0], loss_train_matrix, 'Training Loss'),
    (ax[0, 1], loss_test_matrix, 'Testing Loss'),
    (ax[1, 0], acc_train_matrix, 'Training Accuracy'),
    (ax[1, 1], acc_test_matrix, 'Testing Accuracy'),
]
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, ax=panel_ax, annot=True, xticklabels=learning_rates, yticklabels=batch_sizes)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Learning Rate')
    panel_ax.set_ylabel('Batch Size')

plt.tight_layout()
plt.show()

# Part 2 : Simplification of the backward pass with `torch.autograd`



In [None]:
def init_params(nx, nh, ny):
    """
    Same initialization as in Part 1, but every tensor is tracked by autograd.

    nx, nh, ny: integers
    out params: dictionary of leaf tensors with requires_grad=True
    """
    # Weights: normal distribution of mean 0 and standard deviation 0.3.
    # The flag is set after the scaling so the scaled tensor itself is the leaf.
    Wh = (torch.randn(nx, nh) * 0.3).requires_grad_(True)
    Wy = (torch.randn(nh, ny) * 0.3).requires_grad_(True)

    # Biases: zeros
    bh = torch.zeros(nh, requires_grad=True)
    by = torch.zeros(ny, requires_grad=True)

    return {"Wh": Wh, "Wy": Wy, "bh": bh, "by": by}

The function `forward` remains unchanged from previous part.

The function `backward` is no longer used because of "autograd".

In [None]:
def sgd(params, eta):
    """
    In-place gradient-descent step using the gradients stored by autograd.

    params: dictionary with the keys "Wh", "Wy", "bh", "by"; each tensor must
            already hold a `.grad` (i.e. `backward()` was called beforehand)
    eta: learning rate

    Returns the same dictionary; the tensors are modified in place and their
    gradient accumulators are reset to zero.
    """
    names = ("Wh", "Wy", "bh", "by")

    # no_grad: the update itself must not be recorded in the computational graph
    with torch.no_grad():
        # Updating parameters
        for name in names:
            params[name] -= eta * params[name].grad

        # Reset the gradient accumulators (backward() adds to .grad otherwise)
        for name in names:
            params[name].grad.zero_()

    return params

## Global learning procedure with autograd

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
Nbatch = 10
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
eta = 0.1 # Modification of eta

params = init_params(nx, nh, ny)

# Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[],[], [], []]

# epoch
for iteration in range(150):

    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches
    for j in range(N // Nbatch):

        indsBatch = range(j * Nbatch, (j+1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]

        # Forward
        Yhat, _ = forward(params, X)
        L, _ = loss_accuracy(Yhat, Y)

        # Backward
        L.backward()

        # Updating parameters
        params = sgd(params, eta)

    # End-of-epoch evaluation. no_grad: these values are only used for metrics and
    # plots, so no computational graph needs to be built for them
    with torch.no_grad():
        Yhat_train, _ = forward(params, data.Xtrain)
        Yhat_test, _ = forward(params, data.Xtest)
        Ltrain, acctrain = loss_accuracy(Yhat_train, data.Ytrain)
        Ltest, acctest = loss_accuracy(Yhat_test, data.Ytest)
        Ygrid, _ = forward(params, data.Xgrid)

    # Plain Python floats for formatting and plotting
    acctrain, acctest = acctrain.item(), acctest.item()
    Ltrain, Ltest = Ltrain.item(), Ltest.item()

    # Accuracies are fractions in [0, 1]; scale them so the "%" in the title is correct
    title = 'Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})'.format(iteration, 100 * acctrain, Ltrain, 100 * acctest, Ltest)
    print(title)
    data.plot_data_with_grid(Ygrid, title)

    curves[0].append(acctrain)
    curves[1].append(acctest)
    curves[2].append(Ltrain)
    curves[3].append(Ltest)

fig = plt.figure()
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.legend()
plt.show()

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
batch_sizes = [10, 20, 50, 100]
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
learning_rates = [0.01, 0.03, 0.05, 0.1]

# results[(Nbatch, eta)] -> final metrics of the run trained with that configuration
results = {}


for Nbatch in batch_sizes:
    for eta in learning_rates:
        # Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
        curves = [[],[], [], []]
        # Fresh parameters for every (batch size, learning rate) pair
        params = init_params(nx, nh, ny)
        # epoch
        for iteration in range(150):
            # permute
            perm = np.random.permutation(N)
            Xtrain = data.Xtrain[perm, :]
            Ytrain = data.Ytrain[perm, :]

            # batches
            for j in range(N // Nbatch):
                indsBatch = range(j * Nbatch, (j+1) * Nbatch)
                X = Xtrain[indsBatch, :]
                Y = Ytrain[indsBatch, :]

                # Forward
                Yhat, _ = forward(params, X)
                L, _ = loss_accuracy(Yhat, Y)

                # Backward
                L.backward()

                # Updating parameters
                params = sgd(params, eta)

            # End-of-epoch evaluation without autograd: the metrics are only stored,
            # never back-propagated. The decision grid is not evaluated because
            # nothing is plotted during the search.
            with torch.no_grad():
                Yhat_train, _ = forward(params, data.Xtrain)
                Yhat_test, _ = forward(params, data.Xtest)
                Ltrain, acctrain = loss_accuracy(Yhat_train, data.Ytrain)
                Ltest, acctest = loss_accuracy(Yhat_test, data.Ytest)

            # Plain floats: tensors that require grad cannot be converted to NumPy
            # and would keep their computational graph alive
            curves[0].append(acctrain.item())
            curves[1].append(acctest.item())
            curves[2].append(Ltrain.item())
            curves[3].append(Ltest.item())

        # Store the results
        key = (Nbatch, eta)
        results[key] = {
            'acc_train': curves[0][-1],  # Last accuracy value for training
            'acc_test': curves[1][-1],   # Last accuracy value for testing
            'loss_train': curves[2][-1], # Last loss value for training
            'loss_test': curves[3][-1]   # Last loss value for testing
        }

In [None]:
# Gather the final metrics of every (batch size, learning rate) run into 2-D grids:
# rows follow batch_sizes, columns follow learning_rates
grid_shape = (len(batch_sizes), len(learning_rates))
loss_train_matrix = np.zeros(grid_shape)
loss_test_matrix = np.zeros(grid_shape)
acc_train_matrix = np.zeros(grid_shape)
acc_test_matrix = np.zeros(grid_shape)

for i, Nbatch in enumerate(batch_sizes):
    for j, eta in enumerate(learning_rates):
        key = (Nbatch, eta)
        run = results[key]
        loss_train_matrix[i, j] = run['loss_train']
        loss_test_matrix[i, j] = run['loss_test']
        acc_train_matrix[i, j] = run['acc_train']
        acc_test_matrix[i, j] = run['acc_test']

# One annotated heatmap per metric: losses on the top row, accuracies on the bottom row
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
panels = [
    (ax[0, 0], loss_train_matrix, 'Training Loss'),
    (ax[0, 1], loss_test_matrix, 'Testing Loss'),
    (ax[1, 0], acc_train_matrix, 'Training Accuracy'),
    (ax[1, 1], acc_test_matrix, 'Testing Accuracy'),
]
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, ax=panel_ax, annot=True, xticklabels=learning_rates, yticklabels=batch_sizes)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Learning Rate')
    panel_ax.set_ylabel('Batch Size')

plt.tight_layout()
plt.show()

# Part 3 : Simplification of the forward pass with `torch.nn`

`init_params` and `forward` are replaced by the `init_model` function which defines the network architecture and the loss.

In [None]:
def init_model(nx, nh, ny):
    """
    Build the network (Linear -> Tanh -> Linear) and its loss.

    nx, nh, ny: integers (input, hidden and output sizes)

    Returns (model, loss). The model outputs raw class scores (logits):
    torch.nn.CrossEntropyLoss applies the log-softmax itself, so adding a
    Softmax layer to the model would normalize the scores twice. Apply a
    softmax on the outputs explicitly when probabilities are needed
    (e.g. for plotting).
    """
    model = torch.nn.Sequential(
        torch.nn.Linear(nx, nh),
        torch.nn.Tanh(),
        torch.nn.Linear(nh, ny),
    )

    loss = torch.nn.CrossEntropyLoss()

    return model, loss

In [None]:
def loss_accuracy(loss, Yhat, Y):
    """
    Evaluate `loss` and the accuracy of a batch of predictions.

    loss: callable taking (predictions, class indices), e.g. the criterion
          returned by `init_model`
    Yhat: tensor of size (n_batch, ny), model outputs
    Y: tensor of size (n_batch, ny), one-hot encoded labels

    Returns (L, acc); acc is a fraction in [0, 1].
    """
    # One-hot labels -> class indices; the predicted class is the highest output
    targets = torch.max(Y, 1)[1]
    predictions = torch.max(Yhat, 1)[1]

    hits = (predictions == targets).float()

    return loss(Yhat, targets), torch.mean(hits)

In [None]:
def sgd(model, eta):
    """
    In-place gradient-descent step on every parameter of `model`.

    model: torch.nn.Module whose parameters already hold gradients
    eta: learning rate

    Returns the same model; its gradients are cleared afterwards.
    """
    # no_grad: the update must not be tracked by autograd
    with torch.no_grad():
        for weight in model.parameters():
            weight.sub_(eta * weight.grad)
        # Clear the gradients so the next backward() starts from scratch
        model.zero_grad()

    return model

## Global learning procedure with autograd and `torch.nn`

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
Nbatch = 10
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
eta = 0.1 # modification of eta

model, loss = init_model(nx, nh, ny)

# Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[],[], [], []]

# epoch
for iteration in range(150):

    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches
    for j in range(N // Nbatch):

        indsBatch = range(j * Nbatch, (j+1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]

        # Forward
        Yhat = model(X)
        L = loss(Yhat, Y)

        # Backward
        L.backward()
        model = sgd(model, eta)

    # End-of-epoch evaluation. no_grad: these values are only used for metrics and
    # plots, so no computational graph needs to be built for them
    with torch.no_grad():
        Yhat_train = model(data.Xtrain)
        Yhat_test = model(data.Xtest)
        Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
        Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)
        Ygrid = model(data.Xgrid)

    # Plain Python floats for formatting and plotting
    acctrain, acctest = acctrain.item(), acctest.item()
    Ltrain, Ltest = Ltrain.item(), Ltest.item()

    # Accuracies are fractions in [0, 1]; scale them so the "%" in the title is correct
    title = 'Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})'.format(iteration, 100 * acctrain, Ltrain, 100 * acctest, Ltest)
    print(title)
    # Softmax over the class dimension before plotting the grid predictions
    data.plot_data_with_grid(torch.nn.Softmax(dim=1)(Ygrid), title)

    curves[0].append(acctrain)
    curves[1].append(acctest)
    curves[2].append(Ltrain)
    curves[3].append(Ltest)

fig = plt.figure()
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.legend()
plt.show()

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
batch_sizes = [10, 20, 50, 100]
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
learning_rates = [0.01, 0.03, 0.05, 0.1]

# results[(Nbatch, eta)] -> final metrics of the run trained with that configuration
results = {}


for Nbatch in batch_sizes:
    for eta in learning_rates:
        # Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
        curves = [[],[], [], []]
        # Fresh network for every (batch size, learning rate) pair. Without this,
        # each configuration would keep training the model left over from the
        # previous one and the comparison would be meaningless.
        model, loss = init_model(nx, nh, ny)
        # epoch
        for iteration in range(150):
            # permute
            perm = np.random.permutation(N)
            Xtrain = data.Xtrain[perm, :]
            Ytrain = data.Ytrain[perm, :]

            # batches
            for j in range(N // Nbatch):
                indsBatch = range(j * Nbatch, (j+1) * Nbatch)
                X = Xtrain[indsBatch, :]
                Y = Ytrain[indsBatch, :]

                # Forward
                Yhat = model(X)
                L = loss(Yhat, Y)

                # Backward
                L.backward()
                model = sgd(model, eta)

            # End-of-epoch evaluation without autograd: the metrics are only stored.
            # The decision grid is not evaluated because nothing is plotted here.
            with torch.no_grad():
                Yhat_train = model(data.Xtrain)
                Yhat_test = model(data.Xtest)
                Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
                Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)

            # Plain floats: they are later written into NumPy matrices
            curves[0].append(acctrain.item())
            curves[1].append(acctest.item())
            curves[2].append(Ltrain.item())
            curves[3].append(Ltest.item())

        # Store the results
        key = (Nbatch, eta)
        results[key] = {
            'acc_train': curves[0][-1],  # Last accuracy value for training
            'acc_test': curves[1][-1],   # Last accuracy value for testing
            'loss_train': curves[2][-1], # Last loss value for training
            'loss_test': curves[3][-1]   # Last loss value for testing
        }

In [None]:
# Gather the final metrics of every (batch size, learning rate) run into 2-D grids:
# rows follow batch_sizes, columns follow learning_rates
grid_shape = (len(batch_sizes), len(learning_rates))
loss_train_matrix = np.zeros(grid_shape)
loss_test_matrix = np.zeros(grid_shape)
acc_train_matrix = np.zeros(grid_shape)
acc_test_matrix = np.zeros(grid_shape)

for i, Nbatch in enumerate(batch_sizes):
    for j, eta in enumerate(learning_rates):
        key = (Nbatch, eta)
        run = results[key]
        loss_train_matrix[i, j] = run['loss_train']
        loss_test_matrix[i, j] = run['loss_test']
        acc_train_matrix[i, j] = run['acc_train']
        acc_test_matrix[i, j] = run['acc_test']

# One annotated heatmap per metric: losses on the top row, accuracies on the bottom row
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
panels = [
    (ax[0, 0], loss_train_matrix, 'Training Loss'),
    (ax[0, 1], loss_test_matrix, 'Testing Loss'),
    (ax[1, 0], acc_train_matrix, 'Training Accuracy'),
    (ax[1, 1], acc_test_matrix, 'Testing Accuracy'),
]
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, ax=panel_ax, annot=True, xticklabels=learning_rates, yticklabels=batch_sizes)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Learning Rate')
    panel_ax.set_ylabel('Batch Size')

plt.tight_layout()
plt.show()

# Part 4 : Simplification of the SGD with `torch.optim`

In [None]:
def init_model(nx, nh, ny, eta):
    """
    Build the network (Linear -> Tanh -> Linear), its loss and its optimizer.

    nx, nh, ny: integers (input, hidden and output sizes)
    eta: learning rate given to the SGD optimizer

    Returns (model, loss, optim). The model outputs raw class scores (logits):
    torch.nn.CrossEntropyLoss applies the log-softmax itself, so adding a
    Softmax layer to the model would normalize the scores twice. Apply a
    softmax on the outputs explicitly when probabilities are needed
    (e.g. for plotting).
    """
    model = torch.nn.Sequential(
        torch.nn.Linear(nx, nh),
        torch.nn.Tanh(),
        torch.nn.Linear(nh, ny),
    )

    loss = torch.nn.CrossEntropyLoss()

    # The optimizer is bound to this model's parameters and to this learning rate
    optim = torch.optim.SGD(model.parameters(), lr=eta)

    return model, loss, optim

The `sgd` function is replaced by calling the `optim.zero_grad()` before the backward and `optim.step()` after.

## Global learning procedure with autograd, `torch.nn` layers and `torch.optim`

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
Nbatch = 10
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
eta = 0.1 # modification of eta

model, loss, optim = init_model(nx, nh, ny, eta)

# Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[],[], [], []]

# epoch
for iteration in range(150):

    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches
    for j in range(N // Nbatch):

        indsBatch = range(j * Nbatch, (j+1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]

        # Forward
        Yhat = model(X)
        L = loss(Yhat, Y)

        # Backward
        optim.zero_grad()
        L.backward()
        optim.step()

    # End-of-epoch evaluation. no_grad: these values are only used for metrics and
    # plots, so no computational graph needs to be built for them
    with torch.no_grad():
        Yhat_train = model(data.Xtrain)
        Yhat_test = model(data.Xtest)
        Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
        Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)
        Ygrid = model(data.Xgrid)

    # Plain Python floats for formatting and plotting
    acctrain, acctest = acctrain.item(), acctest.item()
    Ltrain, Ltest = Ltrain.item(), Ltest.item()

    # Accuracies are fractions in [0, 1]; scale them so the "%" in the title is correct
    title = 'Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})'.format(iteration, 100 * acctrain, Ltrain, 100 * acctest, Ltest)
    print(title)
    # Softmax over the class dimension before plotting the grid predictions
    data.plot_data_with_grid(torch.nn.Softmax(dim=1)(Ygrid), title)

    curves[0].append(acctrain)
    curves[1].append(acctest)
    curves[2].append(Ltrain)
    curves[3].append(Ltest)

fig = plt.figure()
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.legend()
plt.show()

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
batch_sizes = [10, 20, 50, 100]
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
learning_rates = [0.01, 0.03, 0.05, 0.1]

# results[(Nbatch, eta)] -> final metrics of the run trained with that configuration
results = {}


for Nbatch in batch_sizes:
    for eta in learning_rates:
        # Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
        curves = [[],[], [], []]
        # Fresh network AND optimizer for every (batch size, learning rate) pair.
        # The optimizer carries the learning rate, so it must be rebuilt for `eta`
        # to have any effect; the model must be rebuilt so runs do not share weights.
        model, loss, optim = init_model(nx, nh, ny, eta)
        # epoch
        for iteration in range(150):
            # permute
            perm = np.random.permutation(N)
            Xtrain = data.Xtrain[perm, :]
            Ytrain = data.Ytrain[perm, :]

            # batches
            for j in range(N // Nbatch):
                indsBatch = range(j * Nbatch, (j+1) * Nbatch)
                X = Xtrain[indsBatch, :]
                Y = Ytrain[indsBatch, :]

                # Forward
                Yhat = model(X)
                L = loss(Yhat, Y)

                # Backward
                optim.zero_grad()
                L.backward()
                optim.step()

            # End-of-epoch evaluation without autograd: the metrics are only stored.
            # The decision grid is not evaluated because nothing is plotted here.
            with torch.no_grad():
                Yhat_train = model(data.Xtrain)
                Yhat_test = model(data.Xtest)
                Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
                Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)

            # Plain floats: they are later written into NumPy matrices
            curves[0].append(acctrain.item())
            curves[1].append(acctest.item())
            curves[2].append(Ltrain.item())
            curves[3].append(Ltest.item())

        # Store the results
        key = (Nbatch, eta)
        results[key] = {
            'acc_train': curves[0][-1],  # Last accuracy value for training
            'acc_test': curves[1][-1],   # Last accuracy value for testing
            'loss_train': curves[2][-1], # Last loss value for training
            'loss_test': curves[3][-1]   # Last loss value for testing
        }

In [None]:
# Gather the final metrics of every (batch size, learning rate) run into 2-D grids:
# rows follow batch_sizes, columns follow learning_rates
grid_shape = (len(batch_sizes), len(learning_rates))
loss_train_matrix = np.zeros(grid_shape)
loss_test_matrix = np.zeros(grid_shape)
acc_train_matrix = np.zeros(grid_shape)
acc_test_matrix = np.zeros(grid_shape)

for i, Nbatch in enumerate(batch_sizes):
    for j, eta in enumerate(learning_rates):
        key = (Nbatch, eta)
        run = results[key]
        loss_train_matrix[i, j] = run['loss_train']
        loss_test_matrix[i, j] = run['loss_test']
        acc_train_matrix[i, j] = run['acc_train']
        acc_test_matrix[i, j] = run['acc_test']

# One annotated heatmap per metric: losses on the top row, accuracies on the bottom row
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
panels = [
    (ax[0, 0], loss_train_matrix, 'Training Loss'),
    (ax[0, 1], loss_test_matrix, 'Testing Loss'),
    (ax[1, 0], acc_train_matrix, 'Training Accuracy'),
    (ax[1, 1], acc_test_matrix, 'Testing Accuracy'),
]
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, ax=panel_ax, annot=True, xticklabels=learning_rates, yticklabels=batch_sizes)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Learning Rate')
    panel_ax.set_ylabel('Batch Size')

plt.tight_layout()
plt.show()

# Part 5 : MNIST

Apply the code from the previous part to the MNIST dataset.

In [None]:
# init
data = MNISTData()
N = data.Xtrain.shape[0]
Nbatch = 100
nx = data.Xtrain.shape[1]
nh = 100
ny = data.Ytrain.shape[1]
eta = 0.03

model, loss, optim = init_model(nx, nh, ny, eta)

# Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[],[], [], []]

# epoch
for iteration in range(150):

    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches
    for j in range(N // Nbatch):

        indsBatch = range(j * Nbatch, (j+1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]

        # Forward
        Yhat = model(X)
        L = loss(Yhat, Y)

        # Backward
        optim.zero_grad()
        L.backward()
        optim.step()

    # End-of-epoch evaluation on the whole train and test sets. no_grad avoids
    # building a computational graph for these large forward passes.
    with torch.no_grad():
        Yhat_train = model(data.Xtrain)
        Yhat_test = model(data.Xtest)
        Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
        Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)

    # Plain Python floats for formatting and plotting
    acctrain, acctest = acctrain.item(), acctest.item()
    Ltrain, Ltest = Ltrain.item(), Ltest.item()

    # Accuracies are fractions in [0, 1]; scale them so the "%" in the title is correct
    title = 'Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})'.format(iteration, 100 * acctrain, Ltrain, 100 * acctest, Ltest)
    print(title)
    # Pick one random test sample to monitor: its prediction, ground truth and image
    to_show = random.randint(0, data.Ytest.shape[0]-1)
    # The argmax is the predicted class (highest score) / the true class (one-hot label)
    print("Predicted value : {}, Ground truth : {}".format(torch.argmax(Yhat_test[to_show]), torch.argmax(data.Ytest[to_show])))
    plt.imshow(data.Xtest[to_show].reshape(28, 28)) # sample test
    plt.show()

    curves[0].append(acctrain)
    curves[1].append(acctest)
    curves[2].append(Ltrain)
    curves[3].append(Ltest)

fig = plt.figure()
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.legend()
plt.show()

In [None]:
# init
data = MNISTData()
N = data.Xtrain.shape[0]
batch_sizes = [10, 20, 50, 100]
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
learning_rates = [0.01, 0.03, 0.05, 0.1]

# results[(Nbatch, eta)] -> final metrics of the run trained with that configuration
results = {}


for Nbatch in batch_sizes:
    for eta in learning_rates:
        # Per-epoch history: [train accuracy, test accuracy, train loss, test loss]
        curves = [[],[], [], []]
        # Fresh network AND optimizer for every (batch size, learning rate) pair.
        # The optimizer carries the learning rate, so it must be rebuilt for `eta`
        # (and the model for `nh`) to have any effect; rebuilding also keeps the
        # runs from sharing weights.
        model, loss, optim = init_model(nx, nh, ny, eta)
        # epoch
        for iteration in range(150):
            # permute
            perm = np.random.permutation(N)
            Xtrain = data.Xtrain[perm, :]
            Ytrain = data.Ytrain[perm, :]

            # batches
            for j in range(N // Nbatch):
                indsBatch = range(j * Nbatch, (j+1) * Nbatch)
                X = Xtrain[indsBatch, :]
                Y = Ytrain[indsBatch, :]

                # Forward
                Yhat = model(X)
                L = loss(Yhat, Y)

                # Backward
                optim.zero_grad()
                L.backward()
                optim.step()

            # End-of-epoch evaluation without autograd: the metrics are only stored.
            # No grid is evaluated: the single-run MNIST cell does not use data.Xgrid
            # either (presumably MNISTData has no such attribute - TODO confirm).
            with torch.no_grad():
                Yhat_train = model(data.Xtrain)
                Yhat_test = model(data.Xtest)
                Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
                Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)

            # Plain floats: they are later written into NumPy matrices
            curves[0].append(acctrain.item())
            curves[1].append(acctest.item())
            curves[2].append(Ltrain.item())
            curves[3].append(Ltest.item())

        # Store the results
        key = (Nbatch, eta)
        results[key] = {
            'acc_train': curves[0][-1],  # Last accuracy value for training
            'acc_test': curves[1][-1],   # Last accuracy value for testing
            'loss_train': curves[2][-1], # Last loss value for training
            'loss_test': curves[3][-1]   # Last loss value for testing
        }

In [None]:
# Gather the final metrics of every (batch size, learning rate) run into 2-D grids:
# rows follow batch_sizes, columns follow learning_rates
grid_shape = (len(batch_sizes), len(learning_rates))
loss_train_matrix = np.zeros(grid_shape)
loss_test_matrix = np.zeros(grid_shape)
acc_train_matrix = np.zeros(grid_shape)
acc_test_matrix = np.zeros(grid_shape)

for i, Nbatch in enumerate(batch_sizes):
    for j, eta in enumerate(learning_rates):
        key = (Nbatch, eta)
        run = results[key]
        loss_train_matrix[i, j] = run['loss_train']
        loss_test_matrix[i, j] = run['loss_test']
        acc_train_matrix[i, j] = run['acc_train']
        acc_test_matrix[i, j] = run['acc_test']

# One annotated heatmap per metric: losses on the top row, accuracies on the bottom row
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
panels = [
    (ax[0, 0], loss_train_matrix, 'Training Loss'),
    (ax[0, 1], loss_test_matrix, 'Testing Loss'),
    (ax[1, 0], acc_train_matrix, 'Training Accuracy'),
    (ax[1, 1], acc_test_matrix, 'Testing Accuracy'),
]
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, ax=panel_ax, annot=True, xticklabels=learning_rates, yticklabels=batch_sizes)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Learning Rate')
    panel_ax.set_ylabel('Batch Size')

plt.tight_layout()
plt.show()

# Part 6: Bonus: SVM


Train a SVM model on the Circles dataset.

Ideas :
- First try a linear SVM (sklearn.svm.LinearSVC in scikit-learn). Does it work well? Why?
- Then try more complex kernels (sklearn.svm.SVC). Which one is the best ? why ?
- Does the parameter C of regularization have an impact? Why ?

In [None]:
from sklearn.svm import SVC, LinearSVC

In [None]:
# Load the Circles dataset and export its splits through .numpy() for scikit-learn.
data = CirclesData()

# Only column 0 of each label array is kept as the target vector
# (presumably the labels are one-hot encoded -- confirm against CirclesData).
Xtrain, Ytrain = data.Xtrain.numpy(), data.Ytrain[:, 0].numpy()

# Grid points, classified in later cells to draw the decision regions
Xgrid = data.Xgrid.numpy()

Xtest, Ytest = data.Xtest.numpy(), data.Ytest[:, 0].numpy()

def plot_svm_predictions(data, predictions):
    """
    Show grid predictions as an image with the train / test points drawn on top.

    data: object exposing _Xtrain, _Ytrain, _Xtest and _Ytest
          (the notebook passes a CirclesData instance)
    predictions: array-like holding 40 * 40 values, reshaped to a (40, 40) image
    """
    plt.figure(2)
    plt.clf()
    plt.imshow(np.reshape(predictions, (40, 40)))

    def to_pixels(coords):
        # Presumably maps data coordinates onto the 40x40 image grid -- confirm.
        return coords * 10 + 20

    # Row selectors: samples whose label column 0 (resp. column 1) equals 1
    train_col0 = data._Ytrain[:, 0] == 1
    train_col1 = data._Ytrain[:, 1] == 1
    test_col0 = data._Ytest[:, 0] == 1
    test_col1 = data._Ytest[:, 1] == 1

    # Training points as circles, test points as crosses;
    # blue for label column 0, red for label column 1.
    plt.plot(to_pixels(data._Xtrain[train_col0, 0]), to_pixels(data._Xtrain[train_col0, 1]), 'bo', label="Train")
    plt.plot(to_pixels(data._Xtrain[train_col1, 0]), to_pixels(data._Xtrain[train_col1, 1]), 'ro')
    plt.plot(to_pixels(data._Xtest[test_col0, 0]), to_pixels(data._Xtest[test_col0, 1]), 'b+', label="Test")
    plt.plot(to_pixels(data._Xtest[test_col1, 0]), to_pixels(data._Xtest[test_col1, 1]), 'r+')

    plt.xlim(0, 39)
    plt.ylim(0, 39)
    plt.clim(0.3, 0.7)
    plt.draw()
    plt.pause(1e-3)

## Linear SVC

In [None]:
# Build the linear SVM (seed fixed to 42) and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
l_svc = LinearSVC(random_state=42).fit(Xtrain, Ytrain)

# Leave the fitted estimator as the cell's displayed value
l_svc

In [None]:
# Score the linear SVM on the test split, then draw its predictions over the grid.

# Accuracy = fraction of test samples whose predicted label equals the true label
Ytest_pred = l_svc.predict(Xtest)
accuracy = np.mean(Ytest == Ytest_pred)
print(f"Accuracy : {100 * accuracy:.2f}")

# Classify every grid point and overlay the train / test samples
Ygrid_pred = l_svc.predict(Xgrid)
plot_svm_predictions(data, Ygrid_pred)

It does not work very well, because LinearSVC can only learn a linear decision boundary and is therefore suited to linearly separable binary classification. Here we face a non-linear problem: the 2D data cannot be separated by a straight line. This is easy to see, because the data are distributed in a circular shape.

## Classical SVC

In [None]:
# Build a kernel SVM with scikit-learn's default kernel (seed fixed to 42) and fit it
# on the training split. fit() returns the estimator itself, so the calls are chained.
svc = SVC(random_state=42).fit(Xtrain, Ytrain)

# Leave the fitted estimator as the cell's displayed value
svc

In [None]:
# Score the kernel SVM on the test split, then draw its predictions over the grid.

# Accuracy = fraction of test samples whose predicted label equals the true label
Ytest_pred = svc.predict(Xtest)
accuracy = np.mean(Ytest == Ytest_pred)
print(f"Accuracy : {100 * accuracy:.2f}")

# Classify every grid point and overlay the train / test samples
Ygrid_pred = svc.predict(Xgrid)
plot_svm_predictions(data, Ygrid_pred)

In [None]:
# Try two or three other SVC variants for comparison, then vary the regularization
# and explain the results, as asked in the assignment.

# Variant 1: polynomial kernel (fit() returns the estimator, so the calls are chained)
svc = SVC(kernel='poly', random_state=42).fit(Xtrain, Ytrain)

# Test accuracy = fraction of correctly predicted test labels
Ytest_pred = svc.predict(Xtest)
accuracy = np.mean(Ytest == Ytest_pred)
print(f"Accuracy : {100 * accuracy:.2f}")

# Predictions over the grid, with the samples overlaid
Ygrid_pred = svc.predict(Xgrid)
plot_svm_predictions(data, Ygrid_pred)

In [None]:
# Variant 2: sigmoid kernel (fit() returns the estimator, so the calls are chained)
svc = SVC(kernel='sigmoid', random_state=42).fit(Xtrain, Ytrain)

# Test accuracy = fraction of correctly predicted test labels
Ytest_pred = svc.predict(Xtest)
accuracy = np.mean(Ytest == Ytest_pred)
print(f"Accuracy : {100 * accuracy:.2f}")

# Predictions over the grid, with the samples overlaid
Ygrid_pred = svc.predict(Xgrid)
plot_svm_predictions(data, Ygrid_pred)

In [None]:
# Sweep the SVC parameter C over the integers 1..99 and record the test accuracy
# obtained for each value. In scikit-learn, C is inversely proportional to the
# regularization strength (larger C = weaker regularization).
reg_values = list(range(1, 100))

accs = []
for reg in reg_values:
    svc = SVC(C=reg, random_state=42)
    svc.fit(Xtrain, Ytrain)

    Ytest_pred = svc.predict(Xtest)
    accuracy = np.sum(Ytest == Ytest_pred) / len(Ytest)
    accs.append(accuracy)

# Plot accuracy against the actual C values. Previously the curve was drawn against
# the 0-based list index and the two axis labels were swapped.
plt.plot(reg_values, accs)
plt.xlabel('Regularization parameter (C)')
plt.ylabel("Accuracy")
plt.title('Test accuracy vs. SVC regularization parameter C')
plt.show()