# BIG PYTORCH LOOKUP SESSION

In [2]:
import torch
import torch.nn as nn

import pprint
pp = pprint.PrettyPrinter()

# Every TensorOp

In [8]:
list_of_lists = [[1, 2, 3], [4, 5, 6]]
data = torch.tensor(list_of_lists)
print(data)

list_of_lists = [[1.111, 2, 3], [4, 5, 6]]
data2 = torch.tensor(list_of_lists, dtype=torch.float32)
print(data2)

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[1.1110, 2.0000, 3.0000],
        [4.0000, 5.0000, 6.0000]])


In [11]:
# ALL SPECIAL TENSORS YOU NEED TO KNOW
zeros = torch.zeros(2, 5)
print(zeros)
ones = torch.ones(2, 5)
print(ones)
rr = torch.arange(1, 10)     # arange always gives you a 1D TENSOR of the range (start_inc, end_exc)!!!
print(rr)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])
tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])
tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])


In [46]:
# ALL IMPORTANT OPS
# ELEM WISE OPS
print(rr + 2)
print(rr * 2)

# MATMUL OR @
print(zeros @ ones.T)
print(zeros.matmul(ones.T))

# VECTORIZED OPERATIONS
data = torch.arange(1, 36, dtype=torch.float32).reshape(5, 7)
print(data)

print(data.sum(dim=1))   # SUM OVER ROWS (dimi is the one we're eliminating)
print(data.sum(dim=0))   # SUM OVER COLS
print(data.std(dim=1))
print(data.sum())        # exception: sum() with no dimension is just total sum of the tensor


data = torch.tensor([[1,2.2, 9.6], [4, -7.2, 6.3]], dtype = torch.float32)
print(data)
print(data.mean(dim=0))
print(data.mean(dim=1))

tensor([ 3,  4,  5,  6,  7,  8,  9, 10, 11])
tensor([ 2,  4,  6,  8, 10, 12, 14, 16, 18])
tensor([[0., 0.],
        [0., 0.]])
tensor([[0., 0.],
        [0., 0.]])
tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19., 20., 21.],
        [22., 23., 24., 25., 26., 27., 28.],
        [29., 30., 31., 32., 33., 34., 35.]])
tensor([ 28.,  77., 126., 175., 224.])
tensor([ 75.,  80.,  85.,  90.,  95., 100., 105.])
tensor([2.1602, 2.1602, 2.1602, 2.1602, 2.1602])
tensor(630.)
tensor([[ 1.0000,  2.2000,  9.6000],
        [ 4.0000, -7.2000,  6.3000]])
tensor([ 2.5000, -2.5000,  7.9500])
tensor([4.2667, 1.0333])


In [34]:
# CHECKING SHAPE AND RESHAPING
print(ones.shape)

# RESHAPING!!!
long_line = torch.arange(1, 16)
new_line = long_line

new_line = new_line.view(5, 3)
new_line = new_line.reshape(5, 3)

print(new_line)

torch.Size([2, 5])
tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15]])


In [61]:
# CONVERSION
import numpy as np

np_array = np.array([1,0,5])
my_torch_tensor = torch.tensor(np_array)
print(my_torch_tensor)

my_torch_tensor_to_numpy = my_torch_tensor.numpy()
print(my_torch_tensor_to_numpy)

# can quickly convert to python value for debugging:
print(my_torch_tensor[0].item())

tensor([1, 0, 5])
[1 0 5]
1


# Indexing

In [59]:
# IT'S JUST A LIST OF LISTS!!!
data = torch.tensor([[[1,2.2, 9.6], [4, -7.2, 6.3]], [[1,2, 9], [4, -7, 8.3]]], dtype = torch.float32)
print(data)
print(data.shape)
print(data[0])
print(data[:, 0])  # the two outer elements, but the 0th thing in both of those outer elements
print(data[:, :, 0]) # just the 0th element in each of the 4 inner

print("\n\nindexing tensors:")
indexing_tensor = torch.tensor([0, 0, 1, 1])
j_idx_tensor = torch.tensor([0, 0, 0, 0])
print(data[indexing_tensor, j_idx_tensor])    # same as stacking [x[0], x[0], x[1], x[1]]


tensor([[[ 1.0000,  2.2000,  9.6000],
         [ 4.0000, -7.2000,  6.3000]],

        [[ 1.0000,  2.0000,  9.0000],
         [ 4.0000, -7.0000,  8.3000]]])
torch.Size([2, 2, 3])
tensor([[ 1.0000,  2.2000,  9.6000],
        [ 4.0000, -7.2000,  6.3000]])
tensor([[1.0000, 2.2000, 9.6000],
        [1.0000, 2.0000, 9.0000]])
tensor([[1., 4.],
        [1., 4.]])


indexing tensors:
tensor([[1.0000, 2.2000, 9.6000],
        [1.0000, 2.2000, 9.6000],
        [1.0000, 2.0000, 9.0000],
        [1.0000, 2.0000, 9.0000]])


In [67]:
# EXAMPLE GETTING A SINGLE ROW OR COL

data = torch.Tensor([[1,2.2, 9.6], [4, -7.2, 6.3]])
print(data)
print(data[:, 0])  # first col
print(data[0, :])  # first row

tensor([[ 1.0000,  2.2000,  9.6000],
        [ 4.0000, -7.2000,  6.3000]])
tensor([1., 4.])
tensor([1.0000, 2.2000, 9.6000])


# AUTOGRAD (MINI BACKPROP)

In [75]:
import autograd.numpy as jnp
from autograd import grad

def tanh(x):
    y = jnp.exp(-x)
    return (1.0 - y) / (1.0 + y)

my_gradient_computing_fn = grad(tanh)
print("COMPUTED DERIVATIVE/GRAD OF SOME FUNCTION:", my_gradient_computing_fn(1.0))   # GRADIENT OF tanh(1)

COMPUTED DERIVATIVE/GRAD OF SOME FUNCTION: 0.39322386648296376


# PYTORCH BACKPROP

In [82]:
# Basic function:
import torch

x = torch.ones(5)   # input
y = torch.zeros(3)  # expected output

w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
print(b)   # [1,1,1,1] vs [[1,1,1]] which you would get from shape (1,3) instead of (3,)

z = torch.matmul(x, w) + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
print(loss)

tensor([-0.5721, -0.8316, -0.2502], requires_grad=True)
tensor(0.5048, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [85]:
loss.backward()    # just call .backward(), and ANY PYTORCH OBJECT WITH 
print(w.grad)      # A GRAD ATTRIBUTE WILL GET AUTOMATICALLY COMPUTED
print(b.grad)

tensor([[0.1850, 0.0405, 0.1459],
        [0.1850, 0.0405, 0.1459],
        [0.1850, 0.0405, 0.1459],
        [0.1850, 0.0405, 0.1459],
        [0.1850, 0.0405, 0.1459]])
tensor([0.1850, 0.0405, 0.1459])


In [87]:
# DISABLE GRADIENT STORAGE ON PYTORCH TENSORS:

print(z.requires_grad)
with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)

False
False


# PYTORCH NN BACKPROP

Pytorch gives us all the modules needed to make neural nets

In [97]:
# LINEAR LAYER
import torch.nn as nn

my_in = torch.ones(2,3,4)
print(my_in.shape)


linear = nn.Linear(4, 2, bias=False)   # transforms (N, *, H_in) into (N, *, H_out)
linear_output = linear(my_in)
print(linear_output.shape)    # col_size 4 -> 2


print("params:", list(linear.parameters()))  # w, bias also there by default

torch.Size([2, 3, 4])
torch.Size([2, 3, 2])
params: [Parameter containing:
tensor([[-0.0933, -0.2155, -0.2164,  0.1380],
        [ 0.1377,  0.1489, -0.2211, -0.0227]], requires_grad=True)]


In [99]:
# ACTIVATION FUNCTION LAYER - JUST APPLY SOME NONLINEAR FUNCTION TO EACH ELEMENT!!!

sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
print(output)
print(output.shape)    # JUST APPLIES IT ELEMENTWISE!!

tensor([[[0.4044, 0.5107],
         [0.4044, 0.5107],
         [0.4044, 0.5107]],

        [[0.4044, 0.5107],
         [0.4044, 0.5107],
         [0.4044, 0.5107]]], grad_fn=<SigmoidBackward0>)
torch.Size([2, 3, 2])


In [101]:
# IMPORTANT: COMPOSING LAYERS TOGETHER
block_of_modules = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid()
)

input_0 = torch.ones(2,3,4)
output_0 = block_of_modules(input_0)
print(output_0)

tensor([[[0.6770, 0.5666],
         [0.6770, 0.5666],
         [0.6770, 0.5666]],

        [[0.6770, 0.5666],
         [0.6770, 0.5666],
         [0.6770, 0.5666]]], grad_fn=<SigmoidBackward0>)


In [105]:
# CREATING YOUR OWN MODULE (composed of a bunch of other modules that you can do backprop on)

class WillyMLP(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(WillyMLP, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.input_size),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        output = self.model(x)
        return output

# also don't have to use nn.Sequential: can also just run it through the blocks yourself:
class WillyMLP(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(WillyMLP, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # CAN MANUALLY DEFINE LAYERS AND RUN IT THROUGH
        self.linear = nn.Linear(self.input_size, self.hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(self.hidden_size, self.input_size)
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, x):
        linear = self.linear(x)
        relu = self.relu(linear)
        linear2 = self.linear2(relu)
        output = self.sigmoid(linear2)
        return output


In [115]:
# FORWARD PASS USING OUR CUSTOM NETWORK
myi = torch.randn(2, 5)
model = WillyMLP(5, 3)
print(model(myi))

print("\n=================\nBUNCH OF NAMED PARAMETERS OF OUR NETWORK\n=================\n")
print(list(model.named_parameters()))

tensor([[0.3667, 0.6227, 0.4619, 0.6093, 0.3967],
        [0.4430, 0.7104, 0.3968, 0.6004, 0.5214]], grad_fn=<SigmoidBackward0>)

BUNCH OF NAMED PARAMETERS OF OUR NETWORK

[('linear.weight', Parameter containing:
tensor([[-0.2330,  0.1034, -0.2053,  0.2449, -0.0176],
        [ 0.3257,  0.3162,  0.1809, -0.2025, -0.1827],
        [ 0.4167, -0.4469, -0.2990, -0.3230,  0.3591]], requires_grad=True)), ('linear.bias', Parameter containing:
tensor([ 0.2989, -0.4220, -0.1315], requires_grad=True)), ('linear2.weight', Parameter containing:
tensor([[-0.4283, -0.4458,  0.3641],
        [-0.0695, -0.2243,  0.3731],
        [ 0.3718,  0.5350, -0.3076],
        [-0.1127, -0.0146, -0.0142],
        [-0.3571,  0.1548,  0.5226]], requires_grad=True)), ('linear2.bias', Parameter containing:
tensor([-0.4286,  0.5203, -0.2549,  0.4755, -0.3212], requires_grad=True))]


# TRAINING NEURAL NET CODE

In [118]:
import torch.optim as optim

y = torch.ones(10, 5)
x = y + torch.randn_like(y)  # adding some noise to true y
x

tensor([[ 2.4363e+00,  8.6235e-01,  1.9732e+00,  6.9897e-01, -2.8719e-03],
        [ 2.5686e+00,  3.7412e+00,  1.7354e+00,  1.8184e+00,  1.7603e+00],
        [ 1.3305e+00,  2.7098e+00,  2.2795e+00,  3.3021e-01,  1.1954e+00],
        [ 8.8416e-01,  1.0124e+00,  2.4622e+00,  6.0831e-01,  7.4986e-01],
        [ 2.5592e+00,  7.0638e-01,  5.3073e-01,  3.0516e-01,  1.0083e+00],
        [ 1.3382e+00,  3.1100e+00,  1.2589e+00, -1.5555e-01,  1.8188e+00],
        [ 6.2724e-01,  1.5129e+00,  1.7279e+00,  2.2089e+00,  1.1439e+00],
        [ 1.8218e+00,  2.0479e+00,  1.6203e+00,  9.4049e-01,  2.0260e+00],
        [ 1.7789e+00,  4.5470e-01,  4.8896e-01,  1.5719e+00, -1.4228e+00],
        [ 3.1478e+00,  1.4750e+00,  1.2238e+00,  2.8399e-01,  5.0628e-01]])

In [127]:
# need your defined model
model = WillyMLP(5, 3)

# and the OPTIMIZER you are using (adam is fine), also kind of loss youre using
adam = optim.Adam(model.parameters(), lr=1e-1)
loss_function = nn.MSELoss()

y_preds = model(x)
# print("computed loss: ", loss_function(y_preds, y).item())   # COMPUTED LOSS

In [129]:
# FULL BACKPROP LOOP
n_epoch = 25

for e in range(n_epoch):
    adam.zero_grad()   # set all gradients to 0
    y_pred = model(x)
    
    loss = loss_function(y_pred, y)
    
    print(f"Epoch {e}: training loss is {loss}")
    
    loss.backward()  # first compute all the gradients in the model...
    adam.step()      # THEN take a step using those computed gradients


Epoch 0: training loss is 0.1451420933008194
Epoch 1: training loss is 0.06197305768728256
Epoch 2: training loss is 0.016157714650034904
Epoch 3: training loss is 0.003395551349967718
Epoch 4: training loss is 0.0007160273380577564
Epoch 5: training loss is 0.00015263627574313432
Epoch 6: training loss is 3.269046646892093e-05
Epoch 7: training loss is 7.167921467043925e-06
Epoch 8: training loss is 1.6430896039310028e-06
Epoch 9: training loss is 3.99641436388265e-07
Epoch 10: training loss is 1.0407149630964341e-07
Epoch 11: training loss is 2.914954144728199e-08
Epoch 12: training loss is 8.800101092276691e-09
Epoch 13: training loss is 2.863711490164178e-09
Epoch 14: training loss is 1.0029017616375313e-09
Epoch 15: training loss is 3.776816082101675e-10
Epoch 16: training loss is 1.523366710554086e-10
Epoch 17: training loss is 6.546344022417827e-11
Epoch 18: training loss is 3.0029808500975363e-11
Epoch 19: training loss is 1.4462955139071809e-11
Epoch 20: training loss is 7.457

In [130]:
# LETS SEE THE RESULTS
y_pred = model(x)
y_pred

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward0>)

# DATALOADERS AND BATCHING

In [131]:
# CODE OF TRAINING ON GPU WITH A DATALOADER!!!

import torch.nn.functional as F

# Define the neural network for FashionMNIST
class FashionMNISTNet(nn.Module):
    def __init__(self):
        super(FashionMNISTNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, 3)
        self.fc1 = nn.Linear(32 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 32 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# IMPORTANT CODE BELOW
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Create an instance of the neural network
net = FashionMNISTNet()
print(net)

# Move the model to the GPU if available
net.to(device)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

# Train the neural network using the FashionMNIST dataset
num_epochs = 5
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader, 0):

        # Move the inputs and labels to the GPU if available
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = net(inputs)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Update the running loss
        running_loss += loss.item()

    # Print the average loss for this epoch
    avg_loss = running_loss / (i + 1)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")
print("Training finished.")

In [None]:
# LOOKUP NUM PARAMS

total_params = sum(p.numel() for p in net.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(
    p.numel() for p in net.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')

In [None]:
# FREEZE OTHER LAYERS: IF YOU JUST WANT TO MODIFY THE VERY LAST LAYER
# Now that we have a trained model, if we want to adapt the model to 
# another dataset with only 5 classes, we can freeze earlier layers and 
# only train on the last fully-connected layer.

# Freeze earlier layers
for param in net.parameters():
    param.requires_grad = False

n_inputs = net.fc3.in_features
n_classes = 5
net.fc3 = nn.Linear(n_inputs, n_classes)
total_params = sum(p.numel() for p in net.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(
    p.numel() for p in net.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')