In [13]:
# Import statements
import torch
import torchvision
import torch.nn as nn


import numpy as np

In [4]:
# Tensors : Warm-Up : Numpy
# Implement the network using Numpy


# variables
batch_size, input_dimensions, hidden_dimensions, output_dimensions = 64, 1000, 100, 10
learning_rate = 1e-6
# Create random input and output data
random_input = np.random.randn(batch_size, input_dimensions)
random_output = np.random.randn(batch_size, output_dimensions)
# Randomly initialize weights
weight_one = np.random.randn(input_dimensions, hidden_dimensions)
weight_two = np.random.randn(hidden_dimensions, output_dimensions)

for idx in range(250):
    # Perform a forward pass to compute the predicted y
    h = random_input.dot(weight_one)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(weight_two)

    # Compute and print loss
    loss = np.square(y_pred - random_output).sum()
    print("Epoch Loss pair", idx, loss)
    print()

    # Perform a Backprop to compute gradients of weight_one and weight_two wrt loss
    grad_y_pred = 2.0 * (y_pred - random_output)
    grad_weight_two = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(weight_two.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_weight_one = random_input.T.dot(grad_h)

    # Update Weights
    weight_one -= learning_rate * grad_weight_one
    weight_two -= learning_rate * grad_weight_two


Epoch Loss pair 0 32570906.456289362

Epoch Loss pair 1 24919643.695544064

Epoch Loss pair 2 18871516.72931711

Epoch Loss pair 3 13384847.969570603

Epoch Loss pair 4 8978487.885150198

Epoch Loss pair 5 5886105.268144172

Epoch Loss pair 6 3927536.953626305

Epoch Loss pair 7 2733028.317743892

Epoch Loss pair 8 2001537.0453600758

Epoch Loss pair 9 1536565.8125415863

Epoch Loss pair 10 1223849.0550200008

Epoch Loss pair 11 1001821.2689651926

Epoch Loss pair 12 835898.3228042189

Epoch Loss pair 13 706961.5482111586

Epoch Loss pair 14 603899.5674323032

Epoch Loss pair 15 519861.3318611538

Epoch Loss pair 16 450271.5285312706

Epoch Loss pair 17 392101.4799038395

Epoch Loss pair 18 343087.00791163533

Epoch Loss pair 19 301343.01322013367

Epoch Loss pair 20 265594.5547427781

Epoch Loss pair 21 234848.9862650811

Epoch Loss pair 22 208259.86564860446

Epoch Loss pair 23 185178.412069844

Epoch Loss pair 24 165070.95301717674

Epoch Loss pair 25 147506.434906177

Epoch Loss pa

In [8]:
# Implement network with PyTorch

d_type = torch.float
gpu_device = torch.device("cuda:0")

# variables
batch_size, input_dimensions, hidden_dimensions, output_dimensions = 64, 1000, 100, 10
learning_rate = 1e-6
# Create random input and output data
random_input = torch.randn(batch_size, input_dimensions, device=gpu_device, dtype=d_type)
random_output = torch.randn(batch_size, output_dimensions, device=gpu_device, dtype=d_type)
# Randomly initialize weights
weight_one = torch.randn(input_dimensions, hidden_dimensions, device=gpu_device, dtype=d_type)
weight_two = torch.randn(hidden_dimensions, output_dimensions, device=gpu_device, dtype=d_type)

for idx in range(500):
    # Perform a forward pass to compute the predicted y
    h = random_input.mm(weight_one)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(weight_two)

    # Compute and print loss
    loss = (y_pred - random_output).pow(2).sum().item()
    if idx % 100 == 99:
        print("Epoch Loss pair", idx, loss)
        print()

    # Perform a Backprop to compute gradients of weight_one and weight_two wrt loss
    grad_y_pred = 2.0 * (y_pred - random_output)
    grad_weight_two = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(weight_two.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_weight_one = random_input.t().mm(grad_h)

    # Update Weights
    weight_one -= learning_rate * grad_weight_one
    weight_two -= learning_rate * grad_weight_two

Epoch Loss pair 99 378.1025390625

Epoch Loss pair 199 1.3936830759048462

Epoch Loss pair 299 0.008971026167273521

Epoch Loss pair 399 0.00021459953859448433

Epoch Loss pair 499 3.687627031467855e-05



In [11]:
# Implement Network with Tensors + AutoGrad

d_type = torch.float
gpu_device = torch.device("cuda:0")

# variables
batch_size, input_dimensions, hidden_dimensions, output_dimensions = 64, 1000, 100, 10
learning_rate = 1e-6
# Create random input and output data
random_input = torch.randn(batch_size, input_dimensions, device=gpu_device, dtype=d_type)
random_output = torch.randn(batch_size, output_dimensions, device=gpu_device, dtype=d_type)
# Randomly initialize weights
weight_one = torch.randn(input_dimensions, hidden_dimensions, device=gpu_device, dtype=d_type, requires_grad=True)
weight_two = torch.randn(hidden_dimensions, output_dimensions, device=gpu_device, dtype=d_type, requires_grad=True)

for idx in range(500):
    # Perform a forward pass to compute the predicted y
    # h = random_input.mm(weight_one)
    # h_relu = h.clamp(min=0)
    y_pred = random_input.mm(weight_one).clamp(min=0).mm(weight_two)

    # Compute and print loss
    loss = (y_pred - random_output).pow(2).sum()
    if idx % 100 == 99:
        print("Epoch Loss pair", idx, loss.item())
        print()

    # Perform a Backprop to compute gradients of weight_one and weight_two wrt loss using AutoGrad
    loss.backward()   

    # Update Weights
    with torch.no_grad():
        weight_one -= learning_rate * weight_one.grad
        weight_two -= learning_rate * weight_two.grad

        # Manually zero the gradients post weight updating
        weight_one.grad.zero_()
        weight_two.grad.zero_()

Epoch Loss pair 99 941.2940673828125

Epoch Loss pair 199 9.423967361450195

Epoch Loss pair 299 0.16894696652889252

Epoch Loss pair 399 0.004014036152511835

Epoch Loss pair 499 0.00028565828688442707



In [12]:
# Implement Network with Tensors + AutoGrad + AutoGrad Function

class CustomRelu(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input:torch.Tensor):
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_output:torch.Tensor):
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

d_type = torch.float
gpu_device = torch.device("cuda:0")

# variables
batch_size, input_dimensions, hidden_dimensions, output_dimensions = 64, 1000, 100, 10
learning_rate = 1e-6
# Create random input and output data
random_input = torch.randn(batch_size, input_dimensions, device=gpu_device, dtype=d_type)
random_output = torch.randn(batch_size, output_dimensions, device=gpu_device, dtype=d_type)
# Randomly initialize weights
weight_one = torch.randn(input_dimensions, hidden_dimensions, device=gpu_device, dtype=d_type, requires_grad=True)
weight_two = torch.randn(hidden_dimensions, output_dimensions, device=gpu_device, dtype=d_type, requires_grad=True)

for idx in range(500):
    # Perform a forward pass to compute the predicted y
    # h = random_input.mm(weight_one)
    # h_relu = h.clamp(min=0)
    relu = CustomRelu.apply
    y_pred = relu(random_input.mm(weight_one)).mm(weight_two)

    # Compute and print loss
    loss = (y_pred - random_output).pow(2).sum()
    if idx % 100 == 99:
        print("Epoch Loss pair", idx, loss.item())
        print()

    # Perform a Backprop to compute gradients of weight_one and weight_two wrt loss using AutoGrad
    loss.backward()   

    # Update Weights
    with torch.no_grad():
        weight_one -= learning_rate * weight_one.grad
        weight_two -= learning_rate * weight_two.grad

        # Manually zero the gradients post weight updating
        weight_one.grad.zero_()
        weight_two.grad.zero_()

Epoch Loss pair 99 536.0281372070312

Epoch Loss pair199 2.800607919692993

Epoch Loss pair 299 0.02065237984061241

Epoch Loss pair 399 0.0003756055375561118

Epoch Loss pair 499 5.1681501645362005e-05



In [16]:
# Implement Network with Tensors + AutoGrad + nn Module
d_type = torch.float
gpu_device = torch.device("cuda:0")

# variables
batch_size, input_dimensions, hidden_dimensions, output_dimensions = 64, 1000, 100, 10
learning_rate = 1e-6
# Create random input and output data
random_input = torch.randn(batch_size, input_dimensions, device=gpu_device, dtype=d_type)
random_output = torch.randn(batch_size, output_dimensions, device=gpu_device, dtype=d_type)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model = nn.Sequential(nn.Linear(in_features=input_dimensions, out_features=hidden_dimensions), nn.ReLU(), nn.Linear(in_features=hidden_dimensions, out_features=output_dimensions),).to(gpu_device)

loss_fxn = nn.MSELoss(reduction='sum')

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(random_input)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fxn(y_pred, random_output)
    if idx % 100 == 99:
        print("Epoch Loss pair", idx, loss.item())
        print()

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


Epoch Loss pair 499 592.2860717773438

Epoch Loss pair 499 591.8289184570312

Epoch Loss pair 499 591.3723754882812

Epoch Loss pair 499 590.9163208007812

Epoch Loss pair 499 590.460693359375

Epoch Loss pair 499 590.0059204101562

Epoch Loss pair 499 589.5518188476562

Epoch Loss pair 499 589.0983276367188

Epoch Loss pair 499 588.645263671875

Epoch Loss pair 499 588.1927490234375

Epoch Loss pair 499 587.74072265625

Epoch Loss pair 499 587.2892456054688

Epoch Loss pair 499 586.8381958007812

Epoch Loss pair 499 586.3876953125

Epoch Loss pair 499 585.9376831054688

Epoch Loss pair 499 585.4881591796875

Epoch Loss pair 499 585.0392456054688

Epoch Loss pair 499 584.5908813476562

Epoch Loss pair 499 584.1429443359375

Epoch Loss pair 499 583.6956176757812

Epoch Loss pair 499 583.2487182617188

Epoch Loss pair 499 582.8023071289062

Epoch Loss pair 499 582.3565063476562

Epoch Loss pair 499 581.9110717773438

Epoch Loss pair 499 581.46630859375

Epoch Loss pair 499 581.0222778320

In [17]:
# Implement Network with Tensors + AutoGrad + nn Module + optim
d_type = torch.float
gpu_device = torch.device("cuda:0")

# variables
batch_size, input_dimensions, hidden_dimensions, output_dimensions = 64, 1000, 100, 10
learning_rate = 1e-4
# Create random input and output data
random_input = torch.randn(batch_size, input_dimensions, device=gpu_device, dtype=d_type)
random_output = torch.randn(batch_size, output_dimensions, device=gpu_device, dtype=d_type)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model = nn.Sequential(nn.Linear(in_features=input_dimensions, out_features=hidden_dimensions), nn.ReLU(), nn.Linear(in_features=hidden_dimensions, out_features=output_dimensions),).to(gpu_device)

loss_fxn = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(random_input)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fxn(y_pred, random_output)
    if idx % 100 == 99:
        print("Epoch Loss pair", idx, loss.item())
        print()

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    optimizer.step()

och Loss pair 499 501.0843505859375

Epoch Loss pair 499 489.2422790527344

Epoch Loss pair 499 477.72760009765625

Epoch Loss pair 499 466.57550048828125

Epoch Loss pair 499 455.7320251464844

Epoch Loss pair 499 445.2145690917969

Epoch Loss pair 499 435.00799560546875

Epoch Loss pair 499 425.0719299316406

Epoch Loss pair 499 415.3626403808594

Epoch Loss pair 499 405.9352111816406

Epoch Loss pair 499 396.7318115234375

Epoch Loss pair 499 387.76458740234375

Epoch Loss pair 499 378.9987487792969

Epoch Loss pair 499 370.4552001953125

Epoch Loss pair 499 362.1180419921875

Epoch Loss pair 499 353.963623046875

Epoch Loss pair 499 346.0450439453125

Epoch Loss pair 499 338.3359680175781

Epoch Loss pair 499 330.781005859375

Epoch Loss pair 499 323.3877258300781

Epoch Loss pair 499 316.14581298828125

Epoch Loss pair 499 309.04388427734375

Epoch Loss pair 499 302.0611267089844

Epoch Loss pair 499 295.2125549316406

Epoch Loss pair 499 288.48486328125

Epoch Loss pair 499 281.8

In [18]:
# Implement Network with Tensors + AutoGrad + nn Module + optim

class TwoLayerNN(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(TwoLayerNN, self).__init__()
        self.linear_one = nn.Linear(in_features=input_dims, out_features=hidden_dims)
        self.linear_two = nn.Linear(in_features=hidden_dims, out_features=output_dims)
    
    def forward(self, input_tensor:torch.Tensor):
        h_relu = self.linear_one(input_tensor).clamp(min=0)
        y_pred = self.linear_two(h_relu)
        return y_pred


d_type = torch.float
gpu_device = torch.device("cuda:0")

# variables
batch_size, input_dimensions, hidden_dimensions, output_dimensions = 64, 1000, 100, 10
learning_rate = 1e-4
# Create random input and output data
random_input = torch.randn(batch_size, input_dimensions, device=gpu_device, dtype=d_type)
random_output = torch.randn(batch_size, output_dimensions, device=gpu_device, dtype=d_type)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model = TwoLayerNN(input_dims=input_dimensions, hidden_dims=hidden_dimensions, output_dims=output_dimensions).to(gpu_device)

loss_fxn = nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(random_input)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fxn(y_pred, random_output)
    if idx % 100 == 99:
        print("Epoch Loss pair", idx, loss.item())
        print()

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    optimizer.step()

499 162.05828857421875

Epoch Loss pair 499 152.57545471191406

Epoch Loss pair 499 143.57264709472656

Epoch Loss pair 499 135.05853271484375

Epoch Loss pair 499 127.01045989990234

Epoch Loss pair 499 119.43370056152344

Epoch Loss pair 499 112.29047393798828

Epoch Loss pair 499 105.57135009765625

Epoch Loss pair 499 99.2163314819336

Epoch Loss pair 499 93.23991394042969

Epoch Loss pair 499 87.62385559082031

Epoch Loss pair 499 82.3311996459961

Epoch Loss pair 499 77.35382843017578

Epoch Loss pair 499 72.6815414428711

Epoch Loss pair 499 68.29139709472656

Epoch Loss pair 499 64.17308044433594

Epoch Loss pair 499 60.30839538574219

Epoch Loss pair 499 56.68817138671875

Epoch Loss pair 499 53.291297912597656

Epoch Loss pair 499 50.074317932128906

Epoch Loss pair 499 47.05964279174805

Epoch Loss pair 499 44.232486724853516

Epoch Loss pair 499 41.57513427734375

Epoch Loss pair 499 39.08771514892578

Epoch Loss pair 499 36.75895309448242

Epoch Loss pair 499 34.5794830322