In [1]:
import pandas as pd
import numpy as np
import torch


In [2]:
print(torch.__version__)

2.3.0+cpu


In [3]:
!nvidia-smi

Wed Jun 12 02:47:02 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.85                 Driver Version: 555.85         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce MX230         WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   74C    P0             N/A / ERR!  |       0MiB /   2048MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Creating Tensors

In [4]:
#scalar 
scalar = torch.tensor(7)
scalar

tensor(7)

In [5]:
scalar.ndim

0

In [6]:
scalar.item()

7

In [7]:
#vector

vector = torch.tensor([1,2,3,4,5])
vector
vector.ndim


1

In [8]:
vector.shape

torch.Size([5])

In [9]:
#MATRIX

MATRIX = torch.tensor([[7,8],[9,10]])
MATRIX

tensor([[ 7,  8],
        [ 9, 10]])

In [10]:
MATRIX.ndim

2

In [11]:
MATRIX[1]

tensor([ 9, 10])

In [12]:
MATRIX.shape

torch.Size([2, 2])

In [13]:
#TENSOR

TENSOR = torch.tensor([[[1,2,3],[4,5,6],[7,8,9]]])

TENSOR

tensor([[[1, 2, 3],
         [4, 5, 6],
         [7, 8, 9]]])

In [14]:
TENSOR.ndim

3

In [15]:
TENSOR.shape

torch.Size([1, 3, 3])

In [16]:
TENSOR[0]

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [17]:
## Random Tensors

random_tensor = torch.rand(3,4)
random_tensor


tensor([[0.5326, 0.4039, 0.4159, 0.9516],
        [0.1827, 0.4552, 0.3709, 0.9532],
        [0.9926, 0.6782, 0.6395, 0.4933]])

In [18]:
random_tensor.ndim

2

In [19]:
# create a random tensor with a similar shape to image tensor

image = torch.rand(size=(224, 224,3))

image.ndim


3

In [20]:
image.shape

torch.Size([224, 224, 3])

In [21]:
# Zeros & ones

zero=torch.zeros(3,4)
zero

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [22]:
ones=torch.ones(2,3,2)
ones

tensor([[[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]]])

In [23]:
ones.dtype, ones.ndim

(torch.float32, 3)

In [24]:
# create a range of tensors & tensors-like

one_to_ten= torch.arange(0,10)
one_to_ten

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [25]:
#Tensors-like

ten_zeros=torch.zeros_like(one_to_ten)
ten_zeros

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Tensors Datatypes

In [26]:
float_32 = torch.tensor([3.0,6.0,9.0],
                        dtype=None,device = None, 
                        requires_grad=False)

float_32



tensor([3., 6., 9.])

In [27]:
float_32.dtype

torch.float32

In [28]:
float_16 = float_32.type(torch.float16)
float_16

tensor([3., 6., 9.], dtype=torch.float16)

In [29]:
float_16*float_32

tensor([ 9., 36., 81.])

In [30]:
# Getting information from tensor

some_tensor = torch.rand(3,4)
some_tensor

tensor([[0.1555, 0.0211, 0.6508, 0.4131],
        [0.2660, 0.2450, 0.0761, 0.7361],
        [0.3545, 0.5815, 0.2543, 0.9427]])

In [31]:
print(some_tensor)
print(some_tensor.dtype)
print(some_tensor.size)
print(some_tensor.device)


tensor([[0.1555, 0.0211, 0.6508, 0.4131],
        [0.2660, 0.2450, 0.0761, 0.7361],
        [0.3545, 0.5815, 0.2543, 0.9427]])
torch.float32
<built-in method size of Tensor object at 0x00000227D982F9A0>
cpu


## Manipulating tensors

 Tensors operations include addition, subtractition,Multiplication,Division & matrix multiplication



In [32]:
tensor = torch.tensor([1,2,3])
scalar_tensor = torch.tensor(10)

tensor + 10

tensor([11, 12, 13])

In [33]:
tensor * 10

tensor([10, 20, 30])

In [34]:
tensor - 10

tensor([-9, -8, -7])

## Matrix Multiplication

Multiplication can be done in 2 ways.
Element wise multiplication & matrix multiplication.

In [35]:
print(tensor * tensor)

tensor([1, 4, 9])


In [36]:
#Matrix Multiplication

torch.matmul(tensor,tensor)

tensor(14)

condition to satisfy matrix multiplication :

Inner dimesnsion should be same.
(3,2) @ (3,2) wont work
(3,2) @ (2,3) will work

THE RESULTING MATRIX HAS THE SHAPE OF **OUTER DIMENSION**


In [37]:
# SHAPES for matrix multiplication

tensor_a = torch.tensor([[1,2],[3,4],[5,6]])

tensor_b = torch.tensor([[7,10],[8,11],[9,12]])



torch.mm(tensor_a,tensor_b.T)


# torch.mm(tensor_a,tensor_b) WILL NOT WORK


tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])

In [38]:
tensor_a.shape , tensor_b.shape

(torch.Size([3, 2]), torch.Size([3, 2]))

To fix our shape issues we can manipulate the shape of one of the tensors using **TRANSPOSE** 

In [39]:
tensor_b.T


tensor([[ 7,  8,  9],
        [10, 11, 12]])

In [40]:
tensor_b.T.shape

torch.Size([2, 3])

## Tensor Aggregation

In [41]:
x = torch.arange(0,100,10)
x

tensor([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90])

In [42]:
torch.min(x)

tensor(0)

In [43]:
torch.max(x)

tensor(90)

In [44]:
x.dtype

torch.int64

In [45]:
#torch.mean(x) wont work 

torch.mean(x.type(torch.float32)),

x.type(torch.float32).mean()

tensor(45.)

In [46]:
torch.min(x), torch.max(x)

(tensor(0), tensor(90))

In [47]:
#positional min & max

x.argmin(), x.argmax(), 

(tensor(0), tensor(9))

# Reshaping, stacking , squeezeing and unsqueezing

Reshaping- reshapes an input to a defined shape

stacking- combine multiple tensor on top of each other
hstack, vstack

squeezeing- removes the dimension of size 1

unsqueezing- adds the dimension of size 1





In [48]:
x = torch.arange(1.,11.)
x , x.shape

(tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]), torch.Size([10]))

In [49]:
y = x.reshape(5,2)
y

tensor([[ 1.,  2.],
        [ 3.,  4.],
        [ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.]])

In [50]:
#stack

x_stacked = torch.stack([x,x,x,x])

x_stacked

tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]])

In [51]:
x = torch.zeros(2, 1, 2, 1, 2)
x.size()


torch.Size([2, 1, 2, 1, 2])

In [52]:
y = torch.squeeze(x)
y.size()

torch.Size([2, 2, 2])

In [53]:
y = torch.squeeze(x, 0)
y.size()


torch.Size([2, 1, 2, 1, 2])

In [54]:
y = torch.squeeze(x, 1)
y.size()


torch.Size([2, 2, 1, 2])

In [55]:
# torch.permute (mostly used for images)

x = torch.randn(2, 3, 5)
x.size()
torch.permute(x, (2, 0, 1)).size()

torch.Size([5, 2, 3])

### INDEXING

In [56]:
import torch

x = torch.arange(1,10).reshape(1,3,3)

x, x.shape

(tensor([[[1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]]]),
 torch.Size([1, 3, 3]))

In [57]:
x[0]

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [58]:
x[0][0]

tensor([1, 2, 3])

In [59]:
x[0][0][0]

tensor(1)

In [60]:
x[:,:,1]

tensor([[2, 5, 8]])

In [61]:
x[:,1,1]

tensor([5])

In [62]:
x[0,0,:]

tensor([1, 2, 3])

In [63]:
x

tensor([[[1, 2, 3],
         [4, 5, 6],
         [7, 8, 9]]])

In [64]:
x[0,2,2]

tensor(9)

In [65]:
x[:,:,2]

tensor([[3, 6, 9]])

In [66]:
## Pytorch & Numpy

import torch
import numpy as np

array = np.arange(1.0,8.0)
tensor = torch.from_numpy(array)

array, tensor

(array([1., 2., 3., 4., 5., 6., 7.]),
 tensor([1., 2., 3., 4., 5., 6., 7.], dtype=torch.float64))

In [67]:
array = array = 1
array, tensor

(1, tensor([1., 2., 3., 4., 5., 6., 7.], dtype=torch.float64))

In [68]:
tensor = torch.ones(7)

numpy_tensor = tensor.numpy()

tensor, numpy_tensor

(tensor([1., 1., 1., 1., 1., 1., 1.]),
 array([1., 1., 1., 1., 1., 1., 1.], dtype=float32))

### Reproducibility 

Taking random out of random

In [69]:
torch.rand(3,3)

tensor([[0.0284, 0.0875, 0.7601],
        [0.2457, 0.8734, 0.6997],
        [0.4117, 0.3488, 0.2464]])

In [70]:
a = torch.rand(3,4)
b = torch.rand(3,4)

print(a)
print(b)
print(a==b)

tensor([[0.7111, 0.2458, 0.1603, 0.4806],
        [0.8274, 0.1481, 0.5549, 0.3050],
        [0.7543, 0.6009, 0.6157, 0.2063]])
tensor([[0.5384, 0.0328, 0.0697, 0.0903],
        [0.3568, 0.3417, 0.6903, 0.7235],
        [0.3028, 0.9785, 0.3268, 0.3551]])
tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])


In [71]:
torch.manual_seed(42)
a = torch.rand(3,4)

torch.manual_seed(42)
b = torch.rand(3,4)

print(a)
print(b)

print(a==b)



tensor([[0.8823, 0.9150, 0.3829, 0.9593],
        [0.3904, 0.6009, 0.2566, 0.7936],
        [0.9408, 0.1332, 0.9346, 0.5936]])
tensor([[0.8823, 0.9150, 0.3829, 0.9593],
        [0.3904, 0.6009, 0.2566, 0.7936],
        [0.9408, 0.1332, 0.9346, 0.5936]])
tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])


In [72]:
## GPUS using

import torch
print(torch.cuda.is_available())


False


# Py Torch Workflow


In [73]:
import torch
from torch import nn 
import matplotlib.pyplot as plt
import numpy as np

In [74]:
weight = 0.7
bias = 0.3

start = 0
end = 1
step = 0.02

X = torch.arange(start,end,step).unsqueeze(dim=1)
y = weight * X + bias
 

X[:10], y[:10] , len(X), len(y) 

(tensor([[0.0000],
         [0.0200],
         [0.0400],
         [0.0600],
         [0.0800],
         [0.1000],
         [0.1200],
         [0.1400],
         [0.1600],
         [0.1800]]),
 tensor([[0.3000],
         [0.3140],
         [0.3280],
         [0.3420],
         [0.3560],
         [0.3700],
         [0.3840],
         [0.3980],
         [0.4120],
         [0.4260]]),
 50,
 50)

In [75]:
#splitting data into train and test

train_split = int(0.8*len(X))

X_train, y_train = X[:train_split], y[:train_split]

X_test, y_test = X[train_split:], y[train_split:]

len(X_train), len(y_train), len(X_test), len(y_test)


(40, 40, 10, 10)

In [76]:
class LinearRegression(nn.Module):

    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(1,requires_grad=True, 
                                dtype=torch.float))
        
        self.bias = nn.Parameter(torch.randn(1,requires_grad=True, 
                                dtype=torch.float))
        
        #Forward method to define the computation in the moedel
    def forward(self,x: torch.Tensor) -> torch.Tensor:
        return self.weights * x + self.bias 
    

### PYTORCH MODEL BUILDING ESSENTIALS

torch.nn - Contains all of the buildings for computational graphs(a neural network can be considered a computational graph)

torch.optim - Contains all of the optimization algorithms

torch.nn.functional - Contains all of the non-computational layers

torch.utils - Contains utilities for working with data

torch.tensor - Creates a tensor representing            

torch.autograd - Contains all of the autograd functionality

torch.optim - Contains all of the optimization algorithms

torch.nn.module - The base class for all neural network modules, if subwrite it, should overwrite forward.



In [77]:
# checking pytorch model

torch.manual_seed(42)

model_0  = LinearRegression()

list(model_0.parameters())


[Parameter containing:
 tensor([0.3367], requires_grad=True),
 Parameter containing:
 tensor([0.1288], requires_grad=True)]

In [78]:
model_0.state_dict()

OrderedDict([('weights', tensor([0.3367])), ('bias', tensor([0.1288]))])

In [79]:
#Making prediction

with torch.inference_mode(): #for predictions
    
    y_pred = model_0(X_test)

y_pred

tensor([[0.3982],
        [0.4049],
        [0.4116],
        [0.4184],
        [0.4251],
        [0.4318],
        [0.4386],
        [0.4453],
        [0.4520],
        [0.4588]])

In [80]:
print(y_test)

y_test - y_pred

tensor([[0.8600],
        [0.8740],
        [0.8880],
        [0.9020],
        [0.9160],
        [0.9300],
        [0.9440],
        [0.9580],
        [0.9720],
        [0.9860]])


tensor([[0.4618],
        [0.4691],
        [0.4764],
        [0.4836],
        [0.4909],
        [0.4982],
        [0.5054],
        [0.5127],
        [0.5200],
        [0.5272]])

In [81]:
# Training the model

model_0.state_dict()

OrderedDict([('weights', tensor([0.3367])), ('bias', tensor([0.1288]))])

In [82]:
#setup loss function

loss_fn = nn.L1Loss()

#OPTIMIZER

optimizer = torch.optim.SGD(model_0.parameters(), lr=0.1)


In [83]:
epochs = 200

for epoch in range(epochs):

    model_0.train()

    #forward pass
    y_pred = model_0(X_train)

    #calculate the loss function
    loss = loss_fn(y_pred, y_train)
    

    #optimizer zero grad
    optimizer.zero_grad()

    #backward pass
    loss.backward()

    #optimizer step
    optimizer.step()


    model_0.eval() #turns off different setting in model that are not needed for model evaluation

    with torch.inference_mode(): #turns off gradients tracking
        test_pred = model_0(X_test)

        test_loss = loss_fn(test_pred, y_test)

    if epoch % 10 == 0:  
        
        print(f"Epoch: {epoch} | Test Loss: {test_loss} |Loss :{loss}")

Epoch: 0 | Test Loss: 0.35982614755630493 |Loss :0.31288138031959534
Epoch: 10 | Test Loss: 0.05427704378962517 |Loss :0.025432366877794266
Epoch: 20 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 30 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 40 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 50 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 60 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 70 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 80 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 90 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 100 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 110 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 120 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoch: 130 | Test Loss: 0.11934101581573486 |Loss :0.039773717522621155
Epoc

In [84]:
model_0.state_dict()

OrderedDict([('weights', tensor([0.6512])), ('bias', tensor([0.3588]))])

## Saving and loading model in pytorch

In [85]:
from pathlib import Path

# create the model directory

Model_Path = Path('models')
Model_Path.mkdir(parents=True, exist_ok=True)

# Create model save path

Model_name = '01_pytorch_model.pth'

model_save_path = Model_Path / Model_name

model_save_path

#save model

torch.save(model_0.state_dict(), model_save_path)

In [86]:
model_0.state_dict()

OrderedDict([('weights', tensor([0.6512])), ('bias', tensor([0.3588]))])

In [87]:
# to load in a saved state_dict we have to instantiate a new instance of the model

loaded_model_0 = LinearRegression()

In [88]:
loaded_model_0.load_state_dict(torch.load(f=model_save_path))

<All keys matched successfully>

In [89]:
loaded_model_0.state_dict()

OrderedDict([('weights', tensor([0.6512])), ('bias', tensor([0.3588]))])

In [90]:
#predicting with our loaded model

loaded_model_0.eval()

with torch.inference_mode():
    loaded_model_preds = loaded_model_0(X_test)

loaded_model_preds

tensor([[0.8798],
        [0.8928],
        [0.9058],
        [0.9188],
        [0.9319],
        [0.9449],
        [0.9579],
        [0.9709],
        [0.9840],
        [0.9970]])

## Classification with Pytorch

1. Make classification data and get it ready
Let's begin by making some data.

We'll use the make_circles() method from Scikit-Learn to generate two circles with different coloured dots.

In [94]:
import sklearn
from sklearn.datasets import make_circles


# Make 1000 samples 
n_samples = 1000

# Create circles
X, y = make_circles(n_samples,
                    noise=0.03, # a little bit of noise to the dots
                    random_state=42) # keep random state so we get the same values

In [95]:
print(f"First 5 X features:\n{X[:5]}")
print(f"\nFirst 5 y labels:\n{y[:5]}")


First 5 X features:
[[ 0.75424625  0.23148074]
 [-0.75615888  0.15325888]
 [-0.81539193  0.17328203]
 [-0.39373073  0.69288277]
 [ 0.44220765 -0.89672343]]

First 5 y labels:
[1 1 1 1 0]


In [96]:
# Make DataFrame of circle data
import pandas as pd
circles = pd.DataFrame({"X1": X[:, 0],
    "X2": X[:, 1],
    "label": y
})
circles.head(10)

Unnamed: 0,X1,X2,label
0,0.754246,0.231481,1
1,-0.756159,0.153259,1
2,-0.815392,0.173282,1
3,-0.393731,0.692883,1
4,0.442208,-0.896723,0
5,-0.479646,0.676435,1
6,-0.013648,0.803349,1
7,0.771513,0.14776,1
8,-0.169322,-0.793456,1
9,-0.121486,1.021509,0


In [97]:
# Check different labels
circles.label.value_counts()

label
1    500
0    500
Name: count, dtype: int64

In [98]:
# Check the shapes of our features and labels
X.shape, y.shape

((1000, 2), (1000,))

In [99]:
# View the first example of features and labels
X_sample = X[0]
y_sample = y[0]
print(f"Values for one sample of X: {X_sample} and the same for y: {y_sample}")
print(f"Shapes for one sample of X: {X_sample.shape} and the same for y: {y_sample.shape}")

Values for one sample of X: [0.75424625 0.23148074] and the same for y: 1
Shapes for one sample of X: (2,) and the same for y: ()


### Turn data into tensors and create train and test splits

In [102]:
X = torch.from_numpy(X).type(torch.float)

y  = torch.from_numpy(y).type(torch.float)
        
X[:5], y[:5] 

(tensor([[ 0.7542,  0.2315],
         [-0.7562,  0.1533],
         [-0.8154,  0.1733],
         [-0.3937,  0.6929],
         [ 0.4422, -0.8967]]),
 tensor([1., 1., 1., 1., 0.]))

In [103]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, # 20% test, 80% train
                                                    random_state=42) # make the random split reproducible

len(X_train), len(X_test), len(y_train), len(y_test)

(800, 200, 800, 200)

### Building the model

In [104]:
# Standard PyTorch imports
import torch
from torch import nn

# Make device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [105]:
# 1. Construct a model class that subclasses nn.Module
class CircleModelV0(nn.Module):
    def __init__(self):
        super().__init__()
        # 2. Create 2 nn.Linear layers capable of handling X and y input and output shapes
        self.layer_1 = nn.Linear(in_features=2, out_features=5) # takes in 2 features (X), produces 5 features
        self.layer_2 = nn.Linear(in_features=5, out_features=1) # takes in 5 features, produces 1 feature (y)
    
    # 3. Define a forward method containing the forward pass computation
    def forward(self, x):
        # Return the output of layer_2, a single feature, the same shape as y
        return self.layer_2(self.layer_1(x)) # computation goes through layer_1 first then the output of layer_1 goes through layer_2

# 4. Create an instance of the model and send it to target device
model_0 = CircleModelV0().to(device)
model_0

CircleModelV0(
  (layer_1): Linear(in_features=2, out_features=5, bias=True)
  (layer_2): Linear(in_features=5, out_features=1, bias=True)
)

In [106]:
# Replicate CircleModelV0 with nn.Sequential
model_0 = nn.Sequential(
    nn.Linear(in_features=2, out_features=5),
    nn.Linear(in_features=5, out_features=1)
).to(device)

model_0

Sequential(
  (0): Linear(in_features=2, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=1, bias=True)
)

that looks much simpler than subclassing nn.Module, why not just always use nn.Sequential?

nn.Sequential is fantastic for straight-forward computations, however, as the namespace says, it always runs in sequential order.

So if you'd something else to happen (rather than just straight-forward sequential computation) you'll want to define your own custom nn.Module subclass.

Now we've got a model, let's see what happens when we pass some data through it.

In [107]:
# Make predictions with the model
untrained_preds = model_0(X_test.to(device))
print(f"Length of predictions: {len(untrained_preds)}, Shape: {untrained_preds.shape}")
print(f"Length of test samples: {len(y_test)}, Shape: {y_test.shape}")
print(f"\nFirst 10 predictions:\n{untrained_preds[:10]}")
print(f"\nFirst 10 test labels:\n{y_test[:10]}")

Length of predictions: 200, Shape: torch.Size([200, 1])
Length of test samples: 200, Shape: torch.Size([200])

First 10 predictions:
tensor([[-0.2818],
        [-0.2232],
        [-0.3505],
        [-0.2627],
        [-0.2155],
        [-0.1808],
        [-0.1227],
        [-0.1124],
        [-0.3560],
        [-0.2178]], grad_fn=<SliceBackward0>)

First 10 test labels:
tensor([1., 0., 1., 0., 1., 1., 0., 0., 1., 0.])


### Setup loss function and optimizer

We've setup a loss (also called a criterion or cost function) and optimizer before in notebook 01.

But different problem types require different loss functions.

For example, for a regression problem (predicting a number) you might used mean absolute error (MAE) loss.

And for a binary classification problem (like ours), you'll often use binary cross entropy as the loss function.

However, the same optimizer function can often be used across different problem spaces.

For example, the stochastic gradient descent optimizer (SGD, torch.optim.SGD()) can be used for a range of problems, and the same applies to the Adam optimizer (torch.optim.Adam()). 

Since we're working with a binary classification problem, let's use a binary cross entropy loss function.


PyTorch has two binary cross entropy implementations:

1.  torch.nn.BCELoss() - Creates a loss function that measures the binary cross entropy between the target (label) and input (features).

2.  torch.nn.BCEWithLogitsLoss() - This is the same as above except it has a sigmoid layer (nn.Sigmoid) built-in (we'll see what this means soon).
Which one should you use?

The documentation for torch.nn.BCEWithLogitsLoss() states that it's more numerically stable than using torch.nn.BCELoss() after a nn.Sigmoid layer.

So generally, implementation 2 is a better option. However for advanced usage, you may want to separate the combination of nn.Sigmoid and torch.nn.BCELoss() but that is beyond the scope of this notebook.

Knowing this, let's create a loss function and an optimizer.

For the optimizer we'll use torch.optim.SGD() to optimize the model parameters with learning rate 0.1.

In [108]:
# Create a loss function
# loss_fn = nn.BCELoss() # BCELoss = no sigmoid built-in
loss_fn = nn.BCEWithLogitsLoss() # BCEWithLogitsLoss = sigmoid built-in

# Create an optimizer
optimizer = torch.optim.SGD(params=model_0.parameters(), 
                            lr=0.1)

In [109]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

### Train the model

In [110]:
# View the frist 5 outputs of the forward pass on the test data
y_logits = model_0(X_test.to(device))[:5]
y_logits

tensor([[-0.2818],
        [-0.2232],
        [-0.3505],
        [-0.2627],
        [-0.2155]], grad_fn=<SliceBackward0>)

In [111]:
# Use sigmoid on model logits
y_pred_probs = torch.sigmoid(y_logits)
y_pred_probs

tensor([[0.4300],
        [0.4444],
        [0.4133],
        [0.4347],
        [0.4463]], grad_fn=<SigmoidBackward0>)

In [112]:
# Find the predicted labels (round the prediction probabilities)
y_preds = torch.round(y_pred_probs)

# In full
y_pred_labels = torch.round(torch.sigmoid(model_0(X_test.to(device))[:5]))

# Check for equality
print(torch.eq(y_preds.squeeze(), y_pred_labels.squeeze()))

# Get rid of extra dimension
y_preds.squeeze()

tensor([True, True, True, True, True])


tensor([0., 0., 0., 0., 0.], grad_fn=<SqueezeBackward0>)

In [113]:
y_test[:5]

tensor([1., 0., 1., 0., 1.])

### Building a training and testing loop
Alright, we've discussed how to take our raw model outputs and convert them to prediction labels, now let's build a training loop.

Let's start by training for 100 epochs and outputing the model's progress every 10 epochs.

In [114]:
torch.manual_seed(42)

# Set the number of epochs
epochs = 100

# Put data to target device
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

# Build training and evaluation loop
for epoch in range(epochs):
    ### Training
    model_0.train()

    # 1. Forward pass (model outputs raw logits)
    y_logits = model_0(X_train).squeeze() # squeeze to remove extra `1` dimensions, this won't work unless model and data are on same device 
    y_pred = torch.round(torch.sigmoid(y_logits)) # turn logits -> pred probs -> pred labls
  
    # 2. Calculate loss/accuracy
    # loss = loss_fn(torch.sigmoid(y_logits), # Using nn.BCELoss you need torch.sigmoid()
    #                y_train) 
    loss = loss_fn(y_logits, # Using nn.BCEWithLogitsLoss works with raw logits
                   y_train) 
    acc = accuracy_fn(y_true=y_train, 
                      y_pred=y_pred) 

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model_0.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model_0(X_test).squeeze() 
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Caculate loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test)
        test_acc = accuracy_fn(y_true=y_test,
                               y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.70143, Accuracy: 50.00% | Test loss: 0.70301, Test acc: 50.00%
Epoch: 10 | Loss: 0.69693, Accuracy: 50.00% | Test loss: 0.69937, Test acc: 50.00%
Epoch: 20 | Loss: 0.69504, Accuracy: 45.50% | Test loss: 0.69796, Test acc: 42.50%
Epoch: 30 | Loss: 0.69420, Accuracy: 46.88% | Test loss: 0.69738, Test acc: 48.00%
Epoch: 40 | Loss: 0.69379, Accuracy: 48.50% | Test loss: 0.69712, Test acc: 48.50%
Epoch: 50 | Loss: 0.69357, Accuracy: 49.50% | Test loss: 0.69698, Test acc: 48.00%
Epoch: 60 | Loss: 0.69344, Accuracy: 50.12% | Test loss: 0.69688, Test acc: 47.00%
Epoch: 70 | Loss: 0.69335, Accuracy: 50.12% | Test loss: 0.69679, Test acc: 47.00%
Epoch: 80 | Loss: 0.69329, Accuracy: 50.25% | Test loss: 0.69671, Test acc: 46.50%
Epoch: 90 | Loss: 0.69324, Accuracy: 50.38% | Test loss: 0.69664, Test acc: 47.00%


### Make predictions and evaluate the model

In [117]:
import requests
from pathlib import Path 

# Download helper functions from Learn PyTorch repo (if not already downloaded)
if Path("helper_functions.py").is_file():
  print("helper_functions.py already exists, skipping download")
else:
  print("Downloading helper_functions.py")
  request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py")
  with open("helper_functions.py", "wb") as f:
    f.write(request.content)

from helper_functions import plot_predictions, plot_decision_boundary

Downloading helper_functions.py


### Improving a model (from a model perspective)

In [118]:
class CircleModelV1(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer_1 = nn.Linear(in_features=2, out_features=10)
        self.layer_2 = nn.Linear(in_features=10, out_features=10) # extra layer
        self.layer_3 = nn.Linear(in_features=10, out_features=1)
        
    def forward(self, x): # note: always make sure forward is spelt correctly!
        # Creating a model like this is the same as below, though below
        # generally benefits from speedups where possible.
        # z = self.layer_1(x)
        # z = self.layer_2(z)
        # z = self.layer_3(z)
        # return z
        return self.layer_3(self.layer_2(self.layer_1(x)))

model_1 = CircleModelV1().to(device)
model_1

CircleModelV1(
  (layer_1): Linear(in_features=2, out_features=10, bias=True)
  (layer_2): Linear(in_features=10, out_features=10, bias=True)
  (layer_3): Linear(in_features=10, out_features=1, bias=True)
)

In [119]:
# loss_fn = nn.BCELoss() # Requires sigmoid on input
loss_fn = nn.BCEWithLogitsLoss() # Does not require sigmoid on input
optimizer = torch.optim.SGD(model_1.parameters(), lr=0.1)

In [120]:
torch.manual_seed(42)

epochs = 1000 # Train for longer

# Put data to target device
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

for epoch in range(epochs):
    ### Training
    # 1. Forward pass
    y_logits = model_1(X_train).squeeze()
    y_pred = torch.round(torch.sigmoid(y_logits)) # logits -> predicition probabilities -> prediction labels

    # 2. Calculate loss/accuracy
    loss = loss_fn(y_logits, y_train)
    acc = accuracy_fn(y_true=y_train, 
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model_1.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model_1(X_test).squeeze() 
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Caculate loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test)
        test_acc = accuracy_fn(y_true=y_test,
                               y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 100 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.69396, Accuracy: 50.88% | Test loss: 0.69261, Test acc: 51.00%
Epoch: 100 | Loss: 0.69305, Accuracy: 50.38% | Test loss: 0.69379, Test acc: 48.00%
Epoch: 200 | Loss: 0.69299, Accuracy: 51.12% | Test loss: 0.69437, Test acc: 46.00%
Epoch: 300 | Loss: 0.69298, Accuracy: 51.62% | Test loss: 0.69458, Test acc: 45.00%
Epoch: 400 | Loss: 0.69298, Accuracy: 51.12% | Test loss: 0.69465, Test acc: 46.00%
Epoch: 500 | Loss: 0.69298, Accuracy: 51.00% | Test loss: 0.69467, Test acc: 46.00%
Epoch: 600 | Loss: 0.69298, Accuracy: 51.00% | Test loss: 0.69468, Test acc: 46.00%
Epoch: 700 | Loss: 0.69298, Accuracy: 51.00% | Test loss: 0.69468, Test acc: 46.00%
Epoch: 800 | Loss: 0.69298, Accuracy: 51.00% | Test loss: 0.69468, Test acc: 46.00%
Epoch: 900 | Loss: 0.69298, Accuracy: 51.00% | Test loss: 0.69468, Test acc: 46.00%


### Preparing data to see if our model can model a straight line
Let's create some linear data to see if our model's able to model it and we're not just using a model that can't learn anything.

In [121]:
# Create some data (same as notebook 01)
weight = 0.7
bias = 0.3
start = 0
end = 1
step = 0.01

# Create data
X_regression = torch.arange(start, end, step).unsqueeze(dim=1)
y_regression = weight * X_regression + bias # linear regression formula

# Check the data
print(len(X_regression))
X_regression[:5], y_regression[:5]

100


(tensor([[0.0000],
         [0.0100],
         [0.0200],
         [0.0300],
         [0.0400]]),
 tensor([[0.3000],
         [0.3070],
         [0.3140],
         [0.3210],
         [0.3280]]))

In [122]:
# Create train and test splits
train_split = int(0.8 * len(X_regression)) # 80% of data used for training set
X_train_regression, y_train_regression = X_regression[:train_split], y_regression[:train_split]
X_test_regression, y_test_regression = X_regression[train_split:], y_regression[train_split:]

# Check the lengths of each split
print(len(X_train_regression), 
    len(y_train_regression), 
    len(X_test_regression), 
    len(y_test_regression))

80 80 20 20


### Adjusting model_1 to fit a straight line

In [123]:
# Same architecture as model_1 (but using nn.Sequential)
model_2 = nn.Sequential(
    nn.Linear(in_features=1, out_features=10),
    nn.Linear(in_features=10, out_features=10),
    nn.Linear(in_features=10, out_features=1)
).to(device)

model_2

Sequential(
  (0): Linear(in_features=1, out_features=10, bias=True)
  (1): Linear(in_features=10, out_features=10, bias=True)
  (2): Linear(in_features=10, out_features=1, bias=True)
)

We'll setup the loss function to be nn.L1Loss() (the same as mean absolute error) and the optimizer to be torch.optim.SGD().

In [124]:
# Loss and optimizer
loss_fn = nn.L1Loss()
optimizer = torch.optim.SGD(model_2.parameters(), lr=0.1)

In [125]:
# Train the model
torch.manual_seed(42)

# Set the number of epochs
epochs = 1000

# Put data to target device
X_train_regression, y_train_regression = X_train_regression.to(device), y_train_regression.to(device)
X_test_regression, y_test_regression = X_test_regression.to(device), y_test_regression.to(device)

for epoch in range(epochs):
    ### Training 
    # 1. Forward pass
    y_pred = model_2(X_train_regression)
    
    # 2. Calculate loss (no accuracy since it's a regression problem, not classification)
    loss = loss_fn(y_pred, y_train_regression)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model_2.eval()
    with torch.inference_mode():
      # 1. Forward pass
      test_pred = model_2(X_test_regression)
      # 2. Calculate the loss 
      test_loss = loss_fn(test_pred, y_test_regression)

    # Print out what's happening
    if epoch % 100 == 0: 
        print(f"Epoch: {epoch} | Train loss: {loss:.5f}, Test loss: {test_loss:.5f}")

Epoch: 0 | Train loss: 0.75986, Test loss: 0.54143
Epoch: 100 | Train loss: 0.09309, Test loss: 0.02901
Epoch: 200 | Train loss: 0.07376, Test loss: 0.02850
Epoch: 300 | Train loss: 0.06745, Test loss: 0.00615
Epoch: 400 | Train loss: 0.06107, Test loss: 0.02004
Epoch: 500 | Train loss: 0.05698, Test loss: 0.01061
Epoch: 600 | Train loss: 0.04857, Test loss: 0.01326
Epoch: 700 | Train loss: 0.06109, Test loss: 0.02127
Epoch: 800 | Train loss: 0.05600, Test loss: 0.01425
Epoch: 900 | Train loss: 0.05571, Test loss: 0.00603
