In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import torch
import torch.utils.data as data_utils
from torchvision import datasets, transforms
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sys, time, random

In [None]:
# Number of hidden layers
N_layers = 3

# Number of neurons in each hidden layers
N_hidden_neurons = 500
N_input_neurons = 784
N_output_neurons = 10
# Training parameters
BETA = 1
epsilon = 0.5
n_iter1 = 100
n_iter2 = 6

alpha = np.zeros(N_layers)
a_xh1 = 0.4
a_h1h2 = 0.1
a_h2y = 0.01

MINI_BATCH_SIZE = 20
EPOCHS = 30

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
def glorot_bengio_init(n_in, n_out):
    std_dev = math.sqrt(2 / (n_in + n_out))
    return torch.tensor(np.random.normal(0, std_dev, (n_out, n_in)))

W_xh1 = glorot_bengio_init(N_input_neurons, N_hidden_neurons).repeat(MINI_BATCH_SIZE, 1, 1).to(device)

W_h1h2 = glorot_bengio_init(N_hidden_neurons, N_hidden_neurons).repeat(MINI_BATCH_SIZE, 1, 1).to(device)

W_h2y = glorot_bengio_init(N_hidden_neurons, N_output_neurons).repeat(MINI_BATCH_SIZE, 1, 1).to(device)

B_h1 = torch.zeros(N_hidden_neurons).repeat(MINI_BATCH_SIZE, 1).to(device)

B_h2 = torch.zeros(N_hidden_neurons).repeat(MINI_BATCH_SIZE, 1).to(device)

B_y = torch.zeros(N_output_neurons).repeat(MINI_BATCH_SIZE, 1).to(device)

input_layer = torch.zeros(MINI_BATCH_SIZE, N_input_neurons).to(device)

hlayer1 = torch.zeros(MINI_BATCH_SIZE, N_hidden_neurons).to(device)

hlayer2 = torch.zeros(MINI_BATCH_SIZE, N_hidden_neurons).to(device)

output_layer = torch.zeros(MINI_BATCH_SIZE, N_output_neurons).to(device)

In [None]:
input_layer.shape

torch.Size([20, 784])

In [None]:
train_dataset = datasets.MNIST('../datasets',
                               train=True,
                               download=True,
                               transform=transforms.Compose([
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.1307,), (0.3081,))]))

test_dataset = datasets.MNIST('../datasets',
                              train=False,
                              download=True,
                              transform=transforms.Compose([
                                  transforms.ToTensor(),
                                  transforms.Normalize((0.1307,), (0.3081,))]))

train_loader = data_utils.DataLoader(train_dataset, batch_size=MINI_BATCH_SIZE, shuffle=True)
test_loader = data_utils.DataLoader(test_dataset, batch_size=MINI_BATCH_SIZE, shuffle=False)

label_encoder = LabelEncoder()
numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
integer_encoded = label_encoder.fit_transform(numbers)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

x_train_batches = []
y_train_batches = []
for batch_idx, (data, target) in enumerate(train_loader):

    x_train = data.view(-1, 784)

    y_train = torch.tensor(onehot_encoded[target])

    x_train = x_train / 255.0

    x_train_batches.append(x_train)
    y_train_batches.append(y_train)

print(y_train_batches)

x_train_batches = torch.stack(x_train_batches, 0).to(device)
y_train_batches = torch.stack(y_train_batches, 0).to(device)

x_test_batches = []
y_test_batches = []
for batch_idx, (data, target) in enumerate(test_loader):

    x_test = data.view(-1, 784)

    y_test = torch.tensor(onehot_encoded[target])

    x_test = x_test / 255.0

    x_test_batches.append(x_test)
    y_test_batches.append(y_test)

x_test_batches = torch.stack(x_test_batches, 0).to(device)
y_test_batches = torch.stack(y_test_batches, 0).to(device)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
print(y_train_batches[0].shape)

torch.Size([20, 10])


In [None]:
def hard_sigmoid(x):
    a = torch.clamp(x, 0, 1).to(device)
    return a.double()

def del_hard_sigmoid(x):
    a = hard_sigmoid(x).to(device)
    slope = (x == a).double()
    return slope

In [None]:
t = torch.zeros(MINI_BATCH_SIZE, hlayer1.shape[1]).to(device)
print(t.shape)
# hard_sigmoid(t).shape

# x = torch.tensor([1])
print(del_hard_sigmoid(t).shape)

torch.Size([20, 500])
torch.Size([20, 500])


In [None]:
y_train_batches[0].shape

torch.Size([20, 10])

In [None]:
a = torch.tensor([[1, 2, 1], [1, 2, 1]])
b = torch.tensor([[1, 2, 1], [1, 1, 1]])
a * b

tensor([[1, 4, 1],
        [1, 2, 1]])

In [None]:
def ds_dt(W_prev, layer_prev, W_next, layer_next, N_layer, layer, B, y_hat, is_op, beta):

    dc_ds = torch.zeros(MINI_BATCH_SIZE, N_layer).to(device)

    de_ds = (torch.matmul(W_prev, hard_sigmoid(layer_prev).unsqueeze(-1)).squeeze(-1) + B).to(device)

    if is_op == 1:
        dc_ds = beta * (y_hat - layer)
    else:
        de_ds += torch.matmul(W_next.transpose(1, 2).double(), hard_sigmoid(layer_next).unsqueeze(-1)).squeeze(-1)

    de_ds *= del_hard_sigmoid(layer)
    de_ds -= layer

    return de_ds + dc_ds


In [None]:
B_h1.shape

torch.Size([20, 500])

In [None]:
ds_dt(W_xh1, input_layer, W_h1h2, hlayer2, hlayer1.shape[1], hlayer2, B_h1, y_train_batches[0], 0., BETA).shape

torch.Size([20, 500])

In [None]:
# print(np.max(x_train[100]))
print(x_train_batches.shape)
print(y_train_batches.shape)
print(x_test_batches.shape)
print(y_test_batches.shape)
print(len(x_train_batches))
# print(x_train[4])

torch.Size([3000, 20, 784])
torch.Size([3000, 20, 10])
torch.Size([500, 20, 784])
torch.Size([500, 20, 10])
3000


In [None]:
dw_xh1 = torch.zeros(W_xh1.shape).to(device)
dw_h1h2 = torch.zeros(W_h1h2.shape).to(device)
dw_h2y = torch.zeros(W_h2y.shape).to(device)

db_h1 = torch.zeros(B_h1.shape).to(device)
db_h2 = torch.zeros(B_h2.shape).to(device)
db_y = torch.zeros(B_y.shape).to(device)

dw_xh1.shape

torch.Size([20, 500, 784])

In [None]:
# SANITY CHECK
print(x_train_batches.device)
print(W_xh1.device)
print(hlayer1.device)
print(dw_xh1.device)

cuda:0
cuda:0
cuda:0
cuda:0


In [None]:
from tqdm.notebook import tqdm
import time
import random

total_iterations = len(x_train_batches)

In [None]:
torch.backends.cudnn.benchmark = True

In [None]:
start_total = time.time()

for epoch in range(EPOCHS):
    avg_time = 0

    # Training

    print("Currently running epoch ", epoch)

    for batch_index, batch in tqdm(enumerate(x_train_batches), desc="Progress", mininterval=2.0, leave=False):
        start = time.time()

        # print("Batch number ", batch_index)

        dw_xh1.zero_()
        dw_h1h2.zero_()
        dw_h2y.zero_()

        db_h1.zero_()
        db_h2.zero_()
        db_y.zero_()

        # Forward Pass

        input_layer = batch

        for iter in range(n_iter1):
            hlayer1 += epsilon * ds_dt(W_xh1, input_layer, W_h1h2, hlayer2, hlayer1.shape[1], hlayer1, B_h1, None, 0, 0)
            hlayer1 = torch.clamp(hlayer1, 0, 1).double()

            hlayer2 += epsilon * ds_dt(W_h1h2, hlayer1, W_h2y, output_layer, hlayer2.shape[1], hlayer2, B_h2, None, 0, 0)
            hlayer2 = torch.clamp(hlayer2, 0, 1).double()

            output_layer += epsilon * ds_dt(W_h2y, hlayer2, None, None, output_layer.shape[1], output_layer, B_y, y_train_batches[batch_index], 1, 0)
            output_layer = torch.clamp(output_layer, 0, 1).double()

        # Collect activations
        dw_xh1 -= torch.matmul(hlayer1.unsqueeze(-1), input_layer.unsqueeze(-1).transpose(1,2).double())

        dw_h2y -= torch.matmul(output_layer.unsqueeze(-1), hlayer2.unsqueeze(-1).transpose(1,2).double())

        dw_h1h2 -= torch.matmul(hlayer2.unsqueeze(-1), hlayer1.unsqueeze(-1).transpose(1,2).double())


        db_y -= output_layer

        db_h2 -= hlayer2

        db_h1 -= hlayer1

        # Backward Pass
        for iter in range(n_iter2):

            hlayer1 += epsilon * ds_dt(W_xh1, input_layer, W_h1h2, hlayer2, hlayer1.shape[1], hlayer1, B_h1, None, 0, BETA)
            hlayer1 = torch.clamp(hlayer1, 0, 1).double()

            hlayer2 += epsilon * ds_dt(W_h1h2, hlayer1, W_h2y, output_layer, hlayer2.shape[1], hlayer2, B_h2, None, 0, BETA)
            hlayer2 = torch.clamp(hlayer2, 0, 1).double()

            output_layer += epsilon * ds_dt(W_h2y, hlayer2, None, None, output_layer.shape[1], output_layer, B_y, y_train_batches[batch_index], 1, BETA)
            output_layer = torch.clamp(output_layer, 0, 1).double()

        # Collect activations

        dw_xh1 += torch.matmul(hlayer1.unsqueeze(-1), input_layer.unsqueeze(-1).transpose(1,2).double())

        dw_h2y += torch.matmul(output_layer.unsqueeze(-1), hlayer2.unsqueeze(-1).transpose(1,2).double())

        dw_h1h2 += torch.matmul(hlayer2.unsqueeze(-1), hlayer1.unsqueeze(-1).transpose(1,2).double())

        db_y += output_layer

        db_h2 += hlayer2

        db_h1 += hlayer1

        # Update weights only after each minibatch

        W_h2y += a_h2y * (dw_h2y / MINI_BATCH_SIZE)

        W_h1h2 += a_h1h2 * (dw_h1h2 / MINI_BATCH_SIZE)

        W_xh1 += a_xh1 * (dw_xh1 / MINI_BATCH_SIZE)

        B_y += a_h2y * (db_y / MINI_BATCH_SIZE)

        B_h2 += a_h1h2 * (db_h2 / MINI_BATCH_SIZE)

        B_h1 += a_xh1 * (db_h1 / MINI_BATCH_SIZE)

        end = time.time()

        avg_time += ((end - start - avg_time) / (batch_index + 1))

    print("Average training time per batch in epoch ", epoch, " = ", avg_time)

    print("Testing model on training data")

    # TRAINING ERROR

    c = 0
    for train_index, train_batch_y in enumerate(y_train_batches):
        # Forward Pass

        input_layer = x_train_batches[train_index]

        for iter in range(n_iter1):
            hlayer1 += epsilon * ds_dt(W_xh1, input_layer, W_h1h2, hlayer2, hlayer1.shape[1], hlayer1, B_h1, None, 0, 0)
            hlayer1 = torch.clamp(hlayer1, 0, 1).double()

            hlayer2 += epsilon * ds_dt(W_h1h2, hlayer1, W_h2y, output_layer, hlayer2.shape[1], hlayer2, B_h2, None, 0, 0)
            hlayer2 = torch.clamp(hlayer2, 0, 1).double()

            output_layer += epsilon * ds_dt(W_h2y, hlayer2, None, None, output_layer.shape[1], output_layer, B_y, y_train_batches[batch_index], 1, 0)
            output_layer = torch.clamp(output_layer, 0, 1).double()

        for idx, element in enumerate(train_batch_y):
            c += int((element.argmax() == output_layer[idx].argmax()))

    print("C", c)
    acc = c / (MINI_BATCH_SIZE * len(y_train_batches))
    print("Training accuracy after epoch ", epoch, " = ", acc)


total_time = time.time() - start_total
print("Total Time ",total_time)




Currently running epoch  0


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  0  =  0.13046517030398044
Testing model on training data
Training accuracy after epoch  0  =  0.10296666666666666
Currently running epoch  1


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  1  =  0.13038826235135414
Testing model on training data
Training accuracy after epoch  1  =  0.10468333333333334
Currently running epoch  2


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  2  =  0.13027116743723546
Testing model on training data
Training accuracy after epoch  2  =  0.10528333333333334
Currently running epoch  3


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  3  =  0.13034588146209716
Testing model on training data
Training accuracy after epoch  3  =  0.10528333333333334
Currently running epoch  4


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  4  =  0.13004761274655652
Testing model on training data
Training accuracy after epoch  4  =  0.10528333333333334
Currently running epoch  5


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  5  =  0.13115479397773722
Testing model on training data
Training accuracy after epoch  5  =  0.10528333333333334
Currently running epoch  6


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  6  =  0.13027957693735726
Testing model on training data
Training accuracy after epoch  6  =  0.10528333333333334
Currently running epoch  7


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  7  =  0.13021159124374382
Testing model on training data
Training accuracy after epoch  7  =  0.10528333333333334
Currently running epoch  8


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  8  =  0.1304385873476663
Testing model on training data
Training accuracy after epoch  8  =  0.10633333333333334
Currently running epoch  9


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  9  =  0.13000108949343356
Testing model on training data
Training accuracy after epoch  9  =  0.10633333333333334
Currently running epoch  10


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  10  =  0.13035178192456565
Testing model on training data
Training accuracy after epoch  10  =  0.10681666666666667
Currently running epoch  11


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  11  =  0.13177801187833124
Testing model on training data
Training accuracy after epoch  11  =  0.10746666666666667
Currently running epoch  12


Progress: 0it [00:00, ?it/s]

Average training time per batch in epoch  12  =  0.13029650855064376
Testing model on training data
Training accuracy after epoch  12  =  0.10746666666666667
Currently running epoch  13


Progress: 0it [00:00, ?it/s]

KeyboardInterrupt: ignored

# New Section