<h1 style="text-align: center;">PyTorch Tutorial</h1>

In [None]:
import os
import sys

## 1 PyTorch Introduction

The following tutorial is adapted from:

https://brsoff.github.io/tutorials/beginner/deep_learning_60min_blitz.html

https://github.com/yunjey/pytorch-tutorial/

**WHAT IS PYTORCH?**

>It’s a Python-based scientific computing package targeted at two sets of audiences:
> 1. A replacement for NumPy to use the power of GPUs
> 2. A deep learning research platform that provides maximum flexibility and speed

### **1.1 Tensors**

In [None]:
from __future__ import print_function
import torch
import numpy as np

In [None]:
# Task-01 Construct an uninitialized 5x4 matrix (its values are whatever happened to be in memory):

x = torch.empty(5, 4)
print(x)

In [None]:
# Task-02 Construct a randomly initialized matrix sampled from a uniform distribution U(0,1):

x = torch.rand(5,4)
print(x)

In [None]:
# Task-03 Construct a matrix filled with zeros and of dtype long:

x = torch.zeros(5, 4, dtype=torch.long)
print(x)

In [None]:
# Task-04 Construct a tensor directly from data:

x = torch.tensor([5.5, 3, 4., -1.])
print(x)

In [None]:
# Task-05 These methods will reuse properties of the 
# input tensor, e.g. dtype, unless new values are provided by user

x = x.new_ones(5, 4, dtype=torch.double)      # new_* methods take in sizes
print(x)
print(type(x))

# Creates a new tensor with the same size as x, filled with random numbers from a normal distribution (mean=0, std=1), and converts to float32 type
x = torch.randn_like(x, dtype=torch.float)    # override dtype!
print(x)
print(type(x))

### **1.2 Operations**

In [None]:
# Task-06 Get its size:

print(x.size())

In [None]:
# Task-07 Addition using +

y = torch.rand(5, 4)
print(x + y)

In [None]:
# Task-08 Addition using add() method

print(torch.add(x, y))

In [None]:
# Task-09 Addition: providing an output tensor as argument

result = torch.empty(5, 4)
torch.add(x, y, out=result)
print(result)

In [None]:
# Task-10 Addition: in-place
# Any operation that mutates a tensor in-place is post-fixed with an _. 
# For example: x.copy_(y), x.t_(), will change x.

# adds x to y
y.add_(x)
print(y)

In [None]:
# Task-11 You can use standard NumPy-like indexing with all bells and whistles!

print(x[:, 1])

In [None]:
# Task-12 Resizing: If you want to resize/reshape tensor, you can use torch.view or torch.reshape.

# torch.view:
# Reshapes the tensor without changing its data.
# The result shares storage with the original, which the data_ptr() check below confirms.
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
print(x.size(), y.size(), z.size())
print(f"is x, y, z share memory? {x.data_ptr() == y.data_ptr() == z.data_ptr()}")

# torch.reshape:
# Same call shape as view. For a freshly created tensor like `a` it also
# returns a view, so the data_ptr() check below reports shared memory too.
# The next cell shows the case where the two differ.
a = torch.randn(4, 4)
b = a.reshape(16)
c = a.reshape(-1, 8)  # the size -1 is inferred from other dimensions
print(a.size(), b.size(), c.size())
print(f"is a, b, c share memory? {a.data_ptr() == b.data_ptr() == c.data_ptr()}")


In [None]:
# Demonstrate the difference between reshape and view
x = torch.randn(4, 4)
x_t = x.t()  # Transpose operation makes memory non-contiguous

# Using view will raise an error
try:
    y = x_t.view(16)
except RuntimeError as e:
    print("view failed:", e)

# Using reshape succeeds
z = x_t.reshape(16)
print("\nreshape succeeded:")
print(z)

# Check memory contiguity
print("\nMemory contiguity check:")
print(f"Is x_t contiguous: {x_t.is_contiguous()}")
print(f"Is z contiguous: {z.is_contiguous()}")

In [None]:
# Task-13 Get numerical value: 
# If you have a one element tensor, use .item() 
# to get the value as a Python number

x = torch.randn(1)
print(x)
print(x.item())
print(y[0].item())

### **1.3 Converting a Torch Tensor to a NumPy Array**

In [None]:
# Task-14 Converting a Torch Tensor to a NumPy Array
a = torch.ones(5)
print(a)
b = a.numpy()
print(b)

In [None]:
# Task-15 See how the numpy array changed in value.

a.add_(1)
print(a)
print(b)

# They share the same memory

In [None]:
# Task-16 Converting NumPy Array to Torch Tensor
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)

In [None]:
# Task-17 Tensors can be moved onto any device using the .to method.

# let us run this cell only if CUDA is available
# We will use ``torch.device`` objects to move tensors in and out of GPU
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    print(f"x's device: {x.device}")
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    print(f"x's device: {x.device}")
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype together!

### **1.4 Autograd: automatic differentiation**

Central to all neural networks in PyTorch is the *autograd* package. Let’s first briefly visit this, and we will then go to training our first neural network.

The autograd package provides automatic differentiation for all operations on Tensors. It is a define-by-run framework, which means that your backprop is defined by how your code is run, and that every single iteration can be different.

In [None]:
# Task-18 Create a tensor and set requires_grad=True to track computation with it
x = torch.ones(2, 2, requires_grad=True)
print(x)
y = x + 2
print(y)
print(y.grad_fn)

In [None]:
# Task-19 do operations on y

z = y * y * 3
out = z.mean()
print(z, out)

a = torch.randn(2, 2)
a = ((a * 3) / (a - 1))
print(a.requires_grad)
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)

### **1.5 Gradients**

Let’s backprop now. Because `out` contains a single scalar,
`out.backward()` is equivalent to `out.backward(torch.tensor(1.))`.

In [None]:
# Task-20 do the backprop
out.backward()
print(x.grad)

In [None]:
# Task-21 The autograd computation can be more complicated

x = torch.randn(3, requires_grad=True)
y = x * 2
while y.data.norm() < 1000:
    y = y * 2
print(y)
gradients = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(gradients)
print(x.grad)

In [None]:
# Task-22 Use torch.no_grad()
# You can also stop autograd from tracking history on Tensors with 
# .requires_grad=True by wrapping the code block in with torch.no_grad():
print(x.requires_grad)
print((x ** 2).requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)

### **1.6 Basic autograd example**

In [None]:
# Task-23 create a neural network model

# `torch.nn` is imported here because this cell runs before the later cell
# that imports it; without this line the cell raises NameError on a fresh
# kernel (Restart & Run All).
import torch.nn as nn

# Create tensors of shape (10, 3) and (10, 2).
x = torch.randn(10, 3)
y = torch.randn(10, 2)

# Build a fully connected layer.
linear = nn.Linear(3, 2)
print ('w: ', linear.weight)
print ('b: ', linear.bias)

# Build loss function and optimizer.
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)

# Forward pass.
pred = linear(x)

# Compute loss.
loss = criterion(pred, y)
print('loss: ', loss.item())

# Backward pass.
loss.backward()

# Print out the gradients.
print ('dL/dw: ', linear.weight.grad) 
print ('dL/db: ', linear.bias.grad)

# 1-step gradient descent.
optimizer.step()

# You can also perform gradient descent at the low level.
# linear.weight.data.sub_(0.01 * linear.weight.grad.data)
# linear.bias.data.sub_(0.01 * linear.bias.grad.data)

# Print out the loss after 1-step gradient descent.
pred = linear(x)
loss = criterion(pred, y)
print('loss after 1 step optimization: ', loss.item())

A typical example of a PyTorch neural network training workflow:

1. Data Preparation
2. Model Definition
    - Create neural network layers (e.g., `nn.Linear` )
    - Initialize model parameters (weights and biases)
3. Loss Function & Optimizer Setup
    - Choose a loss function (e.g., `MSELoss` )
    - Select an optimizer (e.g., `SGD` )
    - Set hyperparameters (e.g., learning rate)
4. Training Loop
    - Forward Pass
    - Loss Computation
    - Backward Pass
        - Clear previous gradients  
        - Compute gradients using `backward()`
    - Parameter Update:
        - Update weights using optimizer's `step()`  
5. Evaluation
    - Switch to evaluation mode
    - Make predictions on validation set
    - Monitor validation metrics

### **1.7 Datasets**



In [None]:
# You should build your custom dataset as below.
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    """Template for a custom `Dataset` subclass.

    Every method is a placeholder: as written, `__getitem__` returns None and
    `__len__` reports a single item. Fill in the TODOs before training on it.
    """
    def __init__(self):
        """Set up whatever is needed to locate the samples (see TODO)."""
        # TODO
        # 1. Initialize file paths or a list of file names. 
        # self.data = ... # initialize a list of file names.
        pass
    def __getitem__(self, index):
        """Return the sample at position `index`; currently a stub that returns None."""
        # TODO
        # 1. Read one data from file (e.g. using numpy.fromfile, PIL.Image.open).
        # 2. Preprocess the data (e.g. torchvision.Transform).
        # 3. Return a data pair (e.g. image and label).
        # return self.data[index] # return a data pair
        pass
    def __len__(self):
        """Return the number of samples; hard-coded to 1 until the TODO is done."""
        # You should change 1 to the total size of your dataset.
        # return len(self.data) # return the total size of your dataset
        return 1

# You can then use the prebuilt data loader. 
custom_dataset = CustomDataset()
train_loader = torch.utils.data.DataLoader(dataset=custom_dataset, batch_size=64, shuffle=True)

### **1.8 Models**

In [None]:
# Create neural network model
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    """LeNet-style CNN: two conv + pool stages followed by three linear layers.

    With the layer sizes below, an input of shape (N, 3, 32, 32) is reduced to
    16 feature maps of 5x5 before the linear layers, and the output has shape
    (N, 10).
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 3 input channels -> 6 -> 16 feature maps,
        # each convolution using a 5x5 square kernel.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Classifier head: affine operations y = Wx + b.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on a batch and return the raw (un-normalized) scores."""
        # convolution -> ReLU -> max pooling over a 2x2 window, twice
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), kernel_size=2)
        # collapse everything except the batch dimension
        x = x.view(-1, self.num_flat_features(x))
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

    def num_flat_features(self, x):
        """Return the number of elements per sample (product of all non-batch dims)."""
        return x.size()[1:].numel()


net = Net()
print(net)

In [None]:
# Task-24 The learnable parameters of a model are returned by net.parameters()
params = list(net.parameters())
print(len(params))
print(params[0].size())  # size of conv1's .weight
print(net.conv1.weight, net.conv1.bias) # conv1's.weight

In [None]:
# Task-25 forward pass 
input = torch.randn(1, 3, 32, 32)
out = net(input)
print(out)

In [None]:
# Task-26 Zero the gradient buffers of all parameters and backprops with random gradients:
net.zero_grad()
out.backward(torch.randn(1, 10)) # backward with random gradients

### **1.9 Loss function**

In [None]:
# Task-27 Loss function 

output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

# Task-28 Backprop
# computational graph of backpropagation
# MSELoss -> Linear -> ReLU 
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU


### **1.10 Optimizer**

In [None]:
# Task-28 Backprop
net.zero_grad()     # zeroes the gradient buffers of all parameters
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

# Update the weights with one manual SGD step: p <- p - learning_rate * dL/dp.
# The update runs under torch.no_grad() so autograd does not record it; this
# replaces mutating the parameters through the legacy `.data` attribute.
learning_rate = 0.01
with torch.no_grad():
    for param in net.parameters():
        param.sub_(param.grad * learning_rate)

# create your optimizer
optimizer = optim.Adam(net.parameters(), lr=0.001)

### **1.11 Training code**

In [None]:
# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

In [None]:
x = torch.tensor(-2.0, requires_grad=True)
y = torch.tensor(5.0, requires_grad=True)
z = torch.tensor(-4.0, requires_grad=True)
f = (x+y)*z # Define the computation graph
f.backward() # PyTorch’s internal backward gradient computation
print('Gradients after backpropagation:', x.grad, y.grad, z.grad)

### **1.12 A Full Pipeline for Practical Neural Network Training**

In [None]:
import torch 
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
from tqdm import tqdm

In [None]:
# Download and construct CIFAR-10 dataset.
train_dataset = torchvision.datasets.CIFAR10(root='data/',
                                             train=True, 
                                             transform=transforms.ToTensor(),
                                             download=True)
test_dataset = torchvision.datasets.CIFAR10(root='data/',
                                            train=False,
                                            transform=transforms.ToTensor(),
                                            download=True)                                        
# Fetch one data pair (read data from disk).
image, label = train_dataset[0]
print (image.size())
print (label)

# Data loader (this provides queues and threads in a very simple way).
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=64, 
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=64,
                                          shuffle=False)               

In [None]:
# When iteration starts, queue and thread start to load data from files.
data_iter = iter(train_loader)

# Mini-batch images and labels.
images, labels = next(data_iter)

# Model.
net = Net()

# Loss and Optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

# Actual usage of the data loader is as below.
pbar = tqdm(train_loader, ncols=100, position=0, leave=True)
for batch_idx, (images, labels) in enumerate(pbar):
    # Forward pass
    outputs = net(images)
    
    # Calculate loss
    loss = criterion(outputs, labels)
    
    # Backward pass and optimize
    optimizer.zero_grad()  # Clear gradients
    loss.backward()       # Compute gradients
    optimizer.step()      # Update weights
    
    if batch_idx % 10 == 0:
        pbar.set_description(f'Training Loss: {loss.item():.4f}')   

In [None]:
# Test the model
correct = 0
total = 0
# Switch to evaluation mode before testing. Net has no dropout or batch-norm
# layers, so the numbers are unchanged here, but it keeps this cell correct
# if the model is swapped for one that does.
net.eval()
# Since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for images, labels in test_loader:
        outputs = net(images)
        # the predicted class is the index of the largest score in each row
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Accuracy of the network on the 10000 test images: {100 * correct / total} %') 

In [None]:
# Pretrained model 
# Download and load the pretrained ResNet-18.
# Passing a boolean to `weights` is deprecated in torchvision; the explicit
# enum below names the ImageNet weights that `weights=True` resolved to.
resnet = torchvision.models.resnet18(
    weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1)
# If you want to finetune only the top layer of the model, set as below.
for param in resnet.parameters():
    param.requires_grad = False

# Replace the top layer for finetuning.
# The new layer is created after the freeze above, so it is still trainable.
resnet.fc = nn.Linear(resnet.fc.in_features, 10)  # 10 is an example.

# Forward pass.
images = torch.randn(64, 3, 224, 224)
outputs = resnet(images)
print(outputs.size())     # (64, 10)


In [None]:
# Save and load the model
# Saving the whole module pickles the Python object. Loading it back needs
# weights_only=False, which unpickles arbitrary objects, so only load
# checkpoint files you trust this way.
torch.save(resnet, 'model.ckpt')
model = torch.load('model.ckpt', weights_only=False)

# Save and load only the model parameters (recommended).
# The state dict is loaded back into the existing `resnet` instance.
torch.save(resnet.state_dict(), 'params.ckpt')
resnet.load_state_dict(torch.load('params.ckpt'))

## 2 Micrograd Package
This section introduces the Micrograd package, which is a minimal implementation of an autograd engine (automatic differentiation).

### **2.1 `Value` class**

In [None]:
class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        """Wrap `data` as a node of the computation graph.

        data: the scalar payload.
        _children: the nodes this value was computed from (internal).
        _op: label of the operation that produced this node (internal).
        """
        self.data = data
        self.grad = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        """self + other; plain numbers are wrapped in a Value first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # d(a+b)/da = d(a+b)/db = 1, so the upstream gradient passes through
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        """self * other; plain numbers are wrapped in a Value first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # d(a*b)/da = b and d(a*b)/db = a
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        """self ** other for a constant int/float exponent (not a Value)."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            # power rule: d(a**n)/da = n * a**(n-1)
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        """Return max(0, self) as a new Value."""
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            # the gradient flows only where the output is strictly positive
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node, accumulating gradients into every node's .grad.

        The graph is ordered with an explicit stack instead of recursion, so
        long chains of operations do not hit Python's recursion limit. The
        resulting order is the same post-order a recursive walk would produce.
        Gradients are accumulated with +=; nothing is reset here.
        """

        # topological order all of the children in the graph
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    # descend into this child first; the parent's iterator
                    # resumes from here once the child has been emitted
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # all children are already emitted, so the node can follow them
                topo.append(node)
                stack.pop()

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"


### **2.2 `Module`, `Neuron`, `Layer` and `MLP` classes**

In [None]:
import random

# Common parent of every network building block.
# Supplies parameter listing and a way to reset all gradients.
class Module:
    """Base class: subclasses override `parameters()` to expose their Values."""

    def parameters(self):
        """Return the list of trainable parameters; the base class has none."""
        return list()

    def zero_grad(self):
        """Set the gradient of every parameter back to 0."""
        for param in self.parameters():
            param.grad = 0

# One neuron: a weighted sum of its inputs plus a bias, optionally passed through ReLU.
# Parameter handling comes from the Module base class.
class Neuron(Module):
    """A single neuron with `nin` weights drawn from U(-1, 1) and a zero bias."""

    def __init__(self, nin, nonlin=True):
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(0)
        self.nonlin = nonlin

    def __call__(self, x):
        """Return the activation for the input sequence `x`."""
        # start from the bias and add one weighted input at a time
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        if self.nonlin:
            return act.relu()
        return act

    def parameters(self):
        """Return the weights followed by the bias."""
        return [*self.w, self.b]

    def __repr__(self):
        return f"{'ReLU' if self.nonlin else 'Linear'}Neuron({len(self.w)})"

# A layer is a list of independent neurons that all read the same input.
# Parameter handling comes from the Module base class.
class Layer(Module):
    """`nout` neurons, each taking `nin` inputs; extra kwargs go to every Neuron."""

    def __init__(self, nin, nout, **kwargs):
        self.neurons = [Neuron(nin, **kwargs) for _ in range(nout)]

    def __call__(self, x):
        """Apply every neuron to `x`; a single-neuron layer returns a bare value, not a list."""
        outputs = [neuron(x) for neuron in self.neurons]
        if len(outputs) == 1:
            return outputs[0]
        return outputs

    def parameters(self):
        """Return the parameters of all neurons, in neuron order."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params

    def __repr__(self):
        return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"

# Multi-Layer Perceptron (MLP) class that implements a feedforward neural network
class MLP(Module):
    """Stack of Layers: `nin` inputs followed by one layer per entry of `nouts`.

    nin: number of input features.
    nouts: sizes of the successive layers; any iterable of ints is accepted.
    Every layer uses ReLU except the last one, which is linear.
    """
    def __init__(self, nin, nouts):
        # materialize once so tuples and other iterables work as well as lists
        nouts = list(nouts)
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1], nonlin=i!=len(nouts)-1) for i in range(len(nouts))]

    def __call__(self, x):
        """Feed `x` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        """Return the parameters of all layers, in layer order."""
        return [p for layer in self.layers for p in layer.parameters()]

    def __repr__(self):
        return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"


In [None]:
import torch

def test_sanity_check():
    """
    Smoke test: evaluate one small expression with the custom Value class and
    with PyTorch autograd, then require identical forward and backward results.
    """
    def expression(x):
        # Works for both engines: each provides .relu() and the arithmetic operators.
        z = 2 * x + 2 + x
        q = z.relu() + z * x
        h = (z * z).relu()
        return h + q + q * x

    # custom autograd engine
    xmg = Value(-4.0)
    ymg = expression(xmg)
    ymg.backward()

    # PyTorch reference
    xpt = torch.Tensor([-4.0]).double()
    xpt.requires_grad = True
    ypt = expression(xpt)
    ypt.backward()

    # forward pass went well
    assert ymg.data == ypt.data.item()
    # backward pass went well
    assert xmg.grad == xpt.grad.item()

def test_more_ops():
    """
    A more comprehensive test comparing custom autograd implementation with PyTorch.
    Tests various operations including addition, multiplication, power, ReLU,
    and division.

    Results are compared within an absolute tolerance rather than exactly.
    """
    # --- custom Value engine ---
    a = Value(-4.0)
    b = Value(2.0)
    c = a + b
    d = a * b + b**3
    # Value defines no in-place operators, so `+=` rebinds the name to a new node
    c += c + 1
    c += 1 + c + (-a)
    d += d * 2 + (b + a).relu()
    d += 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g += 10.0 / f
    g.backward()
    amg, bmg, gmg = a, b, g

    # --- PyTorch reference ---
    a = torch.Tensor([-4.0]).double()
    b = torch.Tensor([2.0]).double()
    a.requires_grad = True
    b.requires_grad = True
    c = a + b
    d = a * b + b**3
    # written out-of-place here; the additions are grouped slightly differently
    # from the `+=` forms above, which the tolerance below absorbs
    c = c + c + 1
    c = c + 1 + c + (-a)
    d = d + d * 2 + (b + a).relu()
    d = d + 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g = g + 10.0 / f
    g.backward()
    apt, bpt, gpt = a, b, g

    tol = 1e-6
    # forward pass went well
    assert abs(gmg.data - gpt.data.item()) < tol
    # backward pass went well
    assert abs(amg.grad - apt.grad.item()) < tol
    assert abs(bmg.grad - bpt.grad.item()) < tol

In [None]:
test_sanity_check()
test_more_ops()

In [None]:
a = Value(-4.0)
b = Value(2.0)
c = a + b
d = a * b + b**3
c += c + 1
c += 1 + c + (-a)
d += d * 2 + (b + a).relu()
d += 3 * d + (b - a).relu()
e = c - d
f = e**2
g = f / 2.0
g += 10.0 / f
print(f'{g.data:.4f}') # prints 24.7041, the outcome of this forward pass
g.backward()
print(f'{a.grad:.4f}') # prints 138.8338, i.e. the numerical value of dg/da
print(f'{b.grad:.4f}') # prints 645.5773, i.e. the numerical value of dg/db