# 1. Tensors:

In [1]:
import torch

In [2]:
# Build a 1-D tensor from a list of Python integers.
a = torch.tensor([1, 2, 3])

# Printed in order: the tensor, its dtype, its shape.
# - dtype: integer input is stored as torch.int64 (not float64).
# - shape: the size along each axis; this is not the same as the number of dimensions (rank).
#   "a" has shape [3] (three elements on one axis), so its rank is 1, i.e. it is a 1-D tensor.
print(a, a.dtype, a.shape, sep="\n")

tensor([1, 2, 3])
torch.int64
torch.Size([3])


In [3]:
# Floating-point input is stored as torch.float32 by default; this is the dtype
# normally preferred for work on GPUs.
a_floated = torch.tensor([1.0, 2.0, 3.0])
print(a_floated, a_floated.dtype, sep="\n")

tensor([1., 2., 3.])
torch.float32


In [4]:
# Convert the integer tensor to 32-bit floats; the result is bound to a new name,
# so `a` itself keeps its integer dtype.
b = a.to(dtype=torch.float32)
print(b, b.dtype, sep="\n")

tensor([1., 2., 3.])
torch.float32


In [5]:
# `.T` on a tensor whose rank is not 2 is deprecated (it emits a UserWarning and is
# slated to become an error), so the axes are reversed explicitly instead.
# `a` has a single axis, so "transposing" it leaves both values and shape unchanged.
a_t = a.permute(*reversed(range(a.ndim)))
print(a_t)
print(a_t.shape)

tensor([1, 2, 3])
torch.Size([3])


  a_t = a.T


In [6]:
# Tensor.matmul and the @ operator perform the same operation.

mat_1 = torch.tensor(
    [[1, 2, 3],
     [4, 5, 6]],
    dtype=torch.float32,
)
print("mat_1: \n", mat_1)
print(mat_1.shape, mat_1.dtype, "\n", sep="\n")

mat_2 = torch.tensor(
    [[7, 8, 9],
     [10, 11, 12]],
    dtype=torch.float32,
)
print("mat_2: \n", mat_2)
print(mat_2.dtype, mat_2.shape, "\n", sep="\n")

# `*` multiplies element by element; both operands here have the same (2, 3) shape.
mat_multpl_1 = mat_1 * mat_2
print("the * operation: \n", mat_multpl_1, end="\n\n\n")

# Linear-algebra matrix product: (2, 3) times the transposed operand (3, 2) gives (2, 2).
mat_multpl_2 = mat_1.matmul(mat_2.T)
print("the matmul operation: \n", mat_multpl_2, end="\n\n\n")

# Operator spelling of the same matrix product.
mat_multpl_3 = mat_1 @ mat_2.T
print("the @ operation: \n", mat_multpl_3)

mat_1: 
 tensor([[1., 2., 3.],
        [4., 5., 6.]])
torch.Size([2, 3])
torch.float32


mat_2: 
 tensor([[ 7.,  8.,  9.],
        [10., 11., 12.]])
torch.float32
torch.Size([2, 3])


the * operation: 
 tensor([[ 7., 16., 27.],
        [40., 55., 72.]])


the matmul operation: 
 tensor([[ 50.,  68.],
        [122., 167.]])


the @ operation: 
 tensor([[ 50.,  68.],
        [122., 167.]])


In [7]:
# Original 1-D tensor and its shape.
print("a: \n", a, "\n", a.shape, end="\n\n\n")

# Same three values laid out as one row with three columns.
a_reshaped_1 = a.reshape((1, 3))
print("reshaped a: \n", a_reshaped_1, "\n", a_reshaped_1.shape, end="\n\n\n")

# Same three values laid out as three rows with one column.
a_reshaped_2 = a.reshape((3, 1))
print("reshaped a: \n", a_reshaped_2, "\n", a_reshaped_2.shape)

a: 
 tensor([1, 2, 3]) 
 torch.Size([3])


reshaped a: 
 tensor([[1, 2, 3]]) 
 torch.Size([1, 3])


reshaped a: 
 tensor([[1],
        [2],
        [3]]) 
 torch.Size([3, 1])


# 2. Autograd: PyTorch's automatic differentiation engine ->
### 1. Manual: the "grad" function
### 2. Automatic: the "backward" function and the "grad" attribute

In [8]:
# Forward pass of a simple logistic regression classifier, differentiated with grad(): (MANUAL:)

import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])   # true label
x1 = torch.tensor([1.1])  # input feature

# Only the weight and the bias track gradients.
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = x1 * w1 + b   # net input
a = z.sigmoid()   # predicted probability

loss = F.binary_cross_entropy(input=a, target=y)

# retain_graph=True keeps the computation graph alive after each call, so the same
# loss can be differentiated more than once.
grad_L_w1 = grad(outputs=loss, inputs=w1, retain_graph=True)
grad_L_b = grad(outputs=loss, inputs=b, retain_graph=True)

# grad() returns a tuple with one gradient per requested input.
print(grad_L_w1, grad_L_b, sep="\n")

(tensor([-0.0898]),)
(tensor([-0.0817]),)


In [9]:
# (AUTOMATIC:) backward() computes the gradients of the loss and stores them on the
# .grad attribute of each tensor that was created with requires_grad=True.

loss.backward()
print(w1.grad, b.grad, sep="\n")

tensor([-0.0898])
tensor([-0.0817])


# 3. Multilayer neural networks: (no training yet)

In [10]:
class NeuralNetwork(torch.nn.Module):
  """Fully connected network with two ReLU hidden layers (30 and 20 units).

  Args:
    num_inputs: number of features in each input example.
    num_outputs: number of values produced per example by the output layer.
  """

  def __init__(self, num_inputs, num_outputs):
    super().__init__()

    nn = torch.nn
    # The layers are created in forward order (hidden 1, hidden 2, output) and end up
    # at positions 0-4 of the Sequential container.
    hidden_1 = [nn.Linear(num_inputs, 30), nn.ReLU()]
    hidden_2 = [nn.Linear(30, 20), nn.ReLU()]
    output = [nn.Linear(20, num_outputs)]
    self.layers = nn.Sequential(*hidden_1, *hidden_2, *output)

  def forward(self, x):
    """Pass `x` through all layers and return the output of the last linear layer (the logits)."""
    return self.layers(x)

In [11]:
torch.manual_seed(123)  # seed the random generator so the initial weights are reproducible
model = NeuralNetwork(num_inputs=50, num_outputs=3)

# The whole module first, then just its Sequential container.
print(model, model.layers, sep="\n")

# Weight matrix of the first Linear layer, followed by its shape.
print(model.layers[0].weight, model.layers[0].weight.shape, sep="\n")

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)
Sequential(
  (0): Linear(in_features=50, out_features=30, bias=True)
  (1): ReLU()
  (2): Linear(in_features=30, out_features=20, bias=True)
  (3): ReLU()
  (4): Linear(in_features=20, out_features=3, bias=True)
)
Parameter containing:
tensor([[-0.0577,  0.0047, -0.0702,  ...,  0.0222,  0.1260,  0.0865],
        [ 0.0502,  0.0307,  0.0333,  ...,  0.0951,  0.1134, -0.0297],
        [ 0.1077, -0.1108,  0.0122,  ...,  0.0108, -0.1049, -0.1063],
        ...,
        [-0.0787,  0.1259,  0.0803,  ...,  0.1218,  0.1303, -0.1351],
        [ 0.1359,  0.0175, -0.0673,  ...,  0.0674,  0.0676,  0.1058],
        [ 0.0790,  0.1343, -0.0293,  ...,  0.0344, -0.0971, -0.0509]],
       requires_grad=True)
torch.Size([30, 50])


In [12]:
# Add up the element counts of every parameter tensor that takes part in training.
num_params = 0
for param in model.parameters():
  if param.requires_grad:
    num_params += param.numel()

print("Total number of trainable parameters: ", num_params)

Total number of trainable parameters:  2213


In [13]:
# Feed one random example (1 row, 50 features) through the network:
torch.manual_seed(123)
X = torch.rand(1, 50)

out = model(X)
print(out)  # this result carries a grad_fn because the graph was recorded

# When no training/backpropagation follows (i.e. for inference/prediction), it is more
# efficient to skip gradient tracking altogether:
with torch.no_grad():
  out = model(X)

print(out)

tensor([[-0.1262,  0.1080, -0.1792]], grad_fn=<AddmmBackward0>)
tensor([[-0.1262,  0.1080, -0.1792]])


# 4. Dataset and Dataloader classes:

In [14]:
# Toy data: two features per example, two classes (0 and 1).

# Five training examples (one per row) and their class labels.
X_train = torch.tensor([[-1.2, 3.1], [-0.9, 2.9], [-0.5, 2.6], [2.3, -1.1], [2.7, -1.5]])
y_train = torch.tensor([0, 0, 0, 1, 1])

# Two test examples and their class labels.
x_test = torch.tensor([[-0.8, 2.8], [2.6, -1.6]])
y_test = torch.tensor([0, 1])

In [15]:
from torch.utils.data import Dataset, DataLoader

# NOTE: this class keeps the name `Dataset`, which rebinds the name imported from
# torch.utils.data. The base class is therefore spelled out in full, so that re-running
# this cell always subclasses PyTorch's Dataset rather than the previous copy of this class.
class Dataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs each row of a feature tensor with its label.

  Args:
    X: indexable collection of features, one entry per example.
    y: tensor of labels; its first dimension defines the dataset length.
  """

  def __init__(self, X, y):
    self.features = X
    self.labels = y

  def __getitem__(self, index):
    """Return the (features, label) pair stored at position `index`."""
    return self.features[index], self.labels[index]

  def __len__(self):
    """Return the number of examples as a plain int."""
    # shape[0] is a Python int, which is what len() requires.
    return self.labels.shape[0]

# Wrap the training and test tensors so that a DataLoader can index them.
train_ds = Dataset(X=X_train, y=y_train)
test_ds = Dataset(X=x_test, y=y_test)

In [16]:
torch.manual_seed(123)

# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    train_ds,
    batch_size=2,
    shuffle=True,
    drop_last=True,  # optional: drops a final batch smaller than batch_size, which could otherwise disturb convergence
    num_workers=0,   # number of background worker processes
)

# The test data does not need shuffling.
test_loader = DataLoader(test_ds, batch_size=2, shuffle=False, num_workers=0)

# One pass over the training loader to show what the batches look like.
for idx, (x, y) in enumerate(train_loader):
  print(f"Batch {idx+1}: ", x, y)

Batch 1:  tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2:  tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])


# 5. A typical training loop:


In [17]:
import torch.nn.functional as F #for loss

torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.5)

num_epochs = 3
for epoch in range(num_epochs):
  model.train()  # put the model into training mode at the start of every epoch

  for batch_idx, (features, labels) in enumerate(train_loader):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    logits = model(features)
    loss = F.cross_entropy(logits, labels)

    loss.backward()   # compute the gradients
    optimizer.step()  # apply them to the parameters

    # logging:
    print(
        f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
        f" | Batch: {batch_idx+1:03d}/{len(train_loader):03d}"
        f" | Train Loss: {loss:.2f}"
    )

Epoch: 001/003 | Batch: 001/002 | Train Loss: 0.75
Epoch: 001/003 | Batch: 002/002 | Train Loss: 0.65
Epoch: 002/003 | Batch: 001/002 | Train Loss: 0.44
Epoch: 002/003 | Batch: 002/002 | Train Loss: 0.13
Epoch: 003/003 | Batch: 001/002 | Train Loss: 0.03
Epoch: 003/003 | Batch: 002/002 | Train Loss: 0.00


In [18]:
model.eval()  # switch to evaluation mode
#insert optional model evaluation code.

# Predicting on the training data is only done here for practice; it is not a fair evaluation.
with torch.no_grad():
  outputs = model(X_train)

print(outputs)

# Index of the largest value in each row = predicted class.
output_argmax = outputs.argmax(dim=1)
print(output_argmax)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])
tensor([0, 0, 0, 1, 1])


In [19]:
# Turn off scientific notation (the "e to the power of ..." form) so the printed values are easier to read.
torch.set_printoptions(sci_mode=False)

# Normalise each row of outputs with softmax along dim=1.
probabs = outputs.softmax(dim=1)
print(probabs)

# The predicted class is the position of the largest probability in each row.
preds = probabs.argmax(dim=1)
print(preds)

tensor([[    0.9991,     0.0009],
        [    0.9982,     0.0018],
        [    0.9949,     0.0051],
        [    0.0491,     0.9509],
        [    0.0307,     0.9693]])
tensor([0, 0, 0, 1, 1])


In [20]:
# calculating prediction accuracy:

def compute_accuracy(model, dataloader):
  """Return the fraction of examples in `dataloader` that `model` classifies correctly.

  Args:
    model: a torch.nn.Module whose output has one score per class along dim=1.
    dataloader: iterable yielding (features, labels) batches.

  Returns:
    The accuracy as a Python float between 0.0 and 1.0.

  Raises:
    ZeroDivisionError: if `dataloader` yields no examples.

  Side effect: the model is switched to evaluation mode and left in it.
  """
  model = model.eval()

  # Evaluate on whichever device holds the model's weights, so the function also works
  # for a model that was moved to the GPU while the loader still yields CPU tensors.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else None

  correct = 0
  total_examples = 0

  # No gradients are needed for evaluation; one context covers the whole loop.
  with torch.no_grad():
    for features, labels in dataloader:
      if device is not None:
        features, labels = features.to(device), labels.to(device)

      predictions = torch.argmax(model(features), dim=1)
      compare = labels == predictions
      # .item() keeps the running count a plain Python int instead of a tensor.
      correct += compare.sum().item()
      total_examples += len(compare)

  return correct / total_examples


# Accuracy on the training loader first, then on the test loader.
for loader in (train_loader, test_loader):
  print(compute_accuracy(model, loader))

1.0
1.0


In [21]:
# Saving the model:
# .state_dict() returns a dictionary that maps each layer's parameter names to their tensors.
torch.save(model.state_dict(), "model.pth")

# Restoring the model from disk:
# A model instance must exist in memory to receive the saved parameters. This line is not
# strictly necessary when running in the same session in which the model was saved.
model = NeuralNetwork(2, 2)

# torch.load() reads the file and reconstructs the saved parameters.
# weights_only=True restricts unpickling to tensors and plain containers, so a tampered
# checkpoint file cannot execute arbitrary code while being loaded.
# .load_state_dict() then applies these parameters to the model.
model.load_state_dict(torch.load("model.pth", weights_only=True))

<All keys matched successfully>

# 6. Single-GPU training: (adding 3 lines of code to the model in section 5)

In [22]:
# Report whether PyTorch can use a CUDA GPU in this environment.
print(torch.cuda.is_available())

True


In [23]:
tensor_1 = torch.tensor([1., 2., 3.])
tensor_2 = torch.tensor([4., 5., 6.])

print(tensor_1 + tensor_2) #this runs on CPU by default.

# Move both operands to the GPU when one is available. Without a GPU they stay on the
# CPU, so the cell still runs instead of failing on .to("cuda").
# Both operands must live on the same device for the addition to work.
target_device = "cuda" if torch.cuda.is_available() else "cpu"
tensor_1 = tensor_1.to(target_device)
tensor_2 = tensor_2.to(target_device)
print(tensor_1 + tensor_2)

tensor([5., 7., 9.])
tensor([5., 7., 9.], device='cuda:0')


In [24]:
# The previous model, this time trained on the GPU when one is available:

torch.manual_seed(123)
# Use the GPU if PyTorch can see one; otherwise fall back to the CPU so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralNetwork(num_inputs=2, num_outputs=2)
model.to(device)  # 1) move the model's parameters to the chosen device

optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs=3
for epoch in range(num_epochs):
  model.train()

  for batch_idx, (features, labels) in enumerate(train_loader):
    # 2) every batch must be on the same device as the model
    features, labels = features.to(device), labels.to(device)
    logits = model(features)

    loss = F.cross_entropy(logits, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    #logging: (the value printed is the loss of the current training batch)
    print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}" f" | Batch: {batch_idx+1:03d}/{len(train_loader):03d}" f" | Train/Val Loss: {loss:.2f}")

  model.eval()
  #insert evaluation section here.

Epoch: 001/003 | Batch: 001/002 | Train/Val Loss: 0.75
Epoch: 001/003 | Batch: 002/002 | Train/Val Loss: 0.65
Epoch: 002/003 | Batch: 001/002 | Train/Val Loss: 0.44
Epoch: 002/003 | Batch: 002/002 | Train/Val Loss: 0.13
Epoch: 003/003 | Batch: 001/002 | Train/Val Loss: 0.03
Epoch: 003/003 | Batch: 002/002 | Train/Val Loss: 0.00


# 7. Multi-GPU training: -> Interactive platforms such as Jupyter notebooks don't handle multiprocessing the same way as Python scripts do. So, for this section, refer to the Python script on the book author's GitHub page:

https://github.com/rasbt/LLMs-from-scratch/tree/main/appendix-A/01_main-chapter-code