# Building an LLM from Scratch

### Uncomment line below if torch is not already installed in your environment

In [1]:
# Use the %pip magic (rather than !pip3) so the install targets the environment of the running kernel
# %pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128

In [2]:
import torch

### I want to test how a basic Neural Network functions, so I'm creating a basic Neural Network with only a few layers to experiment with

In [3]:
class NeuralNetwork(torch.nn.Module):
    """Small fully connected network (multilayer perceptron) that outputs logits.

    Every hidden layer is a ``Linear`` layer followed by a ``ReLU`` activation;
    the final ``Linear`` layer maps to ``num_outputs`` without an activation.

    Args:
        num_inputs: Number of input features per example.
        num_outputs: Number of output nodes (e.g. number of classes).
        hidden_sizes: Widths of the hidden layers, in order. The default
            ``(30, 20)`` reproduces the original two-hidden-layer network.
            An empty sequence yields a single linear layer.
    """

    def __init__(self, num_inputs, num_outputs, hidden_sizes=(30, 20)):
        super().__init__()

        modules = []
        in_features = num_inputs
        for out_features in hidden_sizes:
            # Linear layer takes the number of input and output nodes as arguments
            modules.append(torch.nn.Linear(in_features, out_features))
            # Nonlinear activation functions are placed between the hidden layers
            modules.append(torch.nn.ReLU())
            # The number of output nodes of one layer must match the number of input nodes of the next layer
            in_features = out_features

        # Output layer
        modules.append(torch.nn.Linear(in_features, num_outputs))

        # Sequential numbers its children 0, 1, 2, ... in the order given, so the
        # default configuration keeps the parameter names layers.0, layers.2, layers.4
        self.layers = torch.nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through all layers and return the raw outputs of the last layer."""
        logits = self.layers(x)
        return logits                        # The outputs of the last layer are called logits

In [4]:
model = NeuralNetwork(50, 3)
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


### Since a core part of training AI models is gradient descent through backpropagation, I want to analyze the structure of the parameters that will be adjusted

In [5]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable model parameters:", num_params)

Total number of trainable model parameters: 2213


In [6]:
print(model.layers[0].weight)
print(model.layers[0].weight.shape)

Parameter containing:
tensor([[-0.0334,  0.1269,  0.1080,  ..., -0.1135, -0.1335,  0.1115],
        [-0.0968, -0.0664,  0.0215,  ..., -0.0666, -0.0634, -0.0421],
        [ 0.1287,  0.0971, -0.0989,  ..., -0.0694,  0.0166, -0.0071],
        ...,
        [ 0.1235, -0.0045,  0.1107,  ...,  0.1009,  0.0674, -0.0080],
        [ 0.0961,  0.0807, -0.1186,  ...,  0.0144, -0.1199,  0.0450],
        [ 0.0370, -0.0577,  0.1146,  ..., -0.0860, -0.1162, -0.1088]],
       requires_grad=True)
torch.Size([30, 50])


In [7]:
torch.manual_seed(123)
model = NeuralNetwork(50, 3)
print(model.layers[0].weight)

Parameter containing:
tensor([[-0.0577,  0.0047, -0.0702,  ...,  0.0222,  0.1260,  0.0865],
        [ 0.0502,  0.0307,  0.0333,  ...,  0.0951,  0.1134, -0.0297],
        [ 0.1077, -0.1108,  0.0122,  ...,  0.0108, -0.1049, -0.1063],
        ...,
        [-0.0787,  0.1259,  0.0803,  ...,  0.1218,  0.1303, -0.1351],
        [ 0.1359,  0.0175, -0.0673,  ...,  0.0674,  0.0676,  0.1058],
        [ 0.0790,  0.1343, -0.0293,  ...,  0.0344, -0.0971, -0.0509]],
       requires_grad=True)


### When the model computes an output, PyTorch keeps track of each operation (the `grad_fn` shown below) so that the partial derivatives with respect to the parameters can be calculated later; `torch.no_grad()` turns this tracking off

In [8]:
torch.manual_seed(123)
X = torch.rand((1, 50))
out = model(X)
print(out)

tensor([[-0.1262,  0.1080, -0.1792]], grad_fn=<AddmmBackward0>)


In [9]:
with torch.no_grad():
    out = model(X)
print(out)

tensor([[-0.1262,  0.1080, -0.1792]])


### Initializing toy training and test data to experiment with

In [10]:
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])
y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])
y_test = torch.tensor([0, 1])

In [11]:
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    """Minimal map-style dataset pairing each feature row with its label.

    Args:
        X: Tensor of features whose first dimension indexes the examples.
        y: Tensor of labels whose first dimension indexes the examples.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of examples.
    """

    def __init__(self, X, y):
        # Fail early: __len__ is derived from the labels only, so a mismatch would
        # otherwise surface later as an index error or as silently ignored rows
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must contain the same number of examples, "
                f"got {X.shape[0]} and {y.shape[0]}"
            )
        self.features = X
        self.labels = y

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        one_x = self.features[index]
        one_y = self.labels[index]
        return one_x, one_y

    def __len__(self):
        """Return the number of examples in the dataset."""
        return self.labels.shape[0]

In [12]:
train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

In [13]:
print(len(train_ds))

5


### In order to make the process of training more streamlined, I'm creating a simple DataLoader, which will handle batching (among other data preprocessing steps) during training

In [14]:
from torch.utils.data import DataLoader

# Seed the random number generator so the shuffled batch order is reproducible
torch.manual_seed(123)

# Training loader: batches of 2 examples, reshuffled on every pass over the data
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0        # Number of worker subprocesses used to load batches (0 = load in the main process); more workers help with large datasets by preparing upcoming batches in the background
)

# Test loader: same batch size, but the order is kept fixed
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False,
    num_workers=0
)

In [15]:
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)

Batch 1: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 3: tensor([[ 2.7000, -1.5000]]) tensor([1])


In [16]:
# Having a substantially smaller batch as the last batch in a training epoch can disturb the convergence during training.
# To prevent this we will set drop_last=True.
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
)

In [17]:
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)

Batch 1: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 2: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])


### Creating the training loop and examining the process and loss outputs

In [18]:
import torch.nn.functional as F

# Train a fresh model on the toy dataset for a few epochs and log the loss of every batch
torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2) # The toy dataset has 2 features and 2 classes
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.5                     # The optimizer needs to know which parameters to optimize; lr is the learning rate
)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()                                 # Put the model into training mode

    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)

        loss = F.cross_entropy(logits, labels)    # Takes the raw logits and the integer class labels

        optimizer.zero_grad()                     # Sets the gradients from the previous round to 0 to prevent unintended gradient accumulation
        loss.backward()                           # Computes the gradients of the loss with respect to the model parameters
        optimizer.step()                          # The optimizer uses the gradients to update the model parameters

        ## LOGGING
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx+1:03d}/{len(train_loader):03d}"
              f" | Train Loss: {loss:.2f}")

    model.eval()                                  # Switch to evaluation mode at the end of each epoch
    # Insert optional model evaluation code

Epoch: 001/003 | Batch 001/002 | Train Loss: 0.75
Epoch: 001/003 | Batch 002/002 | Train Loss: 0.65
Epoch: 002/003 | Batch 001/002 | Train Loss: 0.44
Epoch: 002/003 | Batch 002/002 | Train Loss: 0.13
Epoch: 003/003 | Batch 001/002 | Train Loss: 0.03
Epoch: 003/003 | Batch 002/002 | Train Loss: 0.00


### Now that the model is trained I'm going to make some predictions with it and look at how it represents the data

In [19]:
model.eval()
with torch.no_grad():
    outputs = model(X_train)
print(outputs)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])


In [20]:
torch.set_printoptions(sci_mode=False)  # Make outputs more legible
probas = torch.softmax(outputs, dim=1)
print(probas)

tensor([[0.9991, 0.0009],
        [0.9982, 0.0018],
        [0.9949, 0.0051],
        [0.0491, 0.9509],
        [0.0307, 0.9693]])


In [21]:
predictions = torch.argmax(probas, dim=1)
print(predictions)

tensor([0, 0, 0, 1, 1])


In [22]:
# We can apply the argmax function to the logits (outputs) directly
predictions = torch.argmax(outputs, dim=1)
print(predictions)

tensor([0, 0, 0, 1, 1])


In [23]:
predictions == y_train

tensor([True, True, True, True, True])

In [24]:
torch.sum(predictions == y_train)

tensor(5)

### Creating a function to generalize the concept of model accuracy and account for arbitrarily large datasets by computing correctness in batches

In [25]:
def compute_accuracy(model, dataloader, device=None):
    """Compute the fraction of correctly classified examples, batch by batch.

    Args:
        model: Module mapping a batch of features to logits of shape
            (batch, num_classes). It is switched to evaluation mode and left
            in that mode when the function returns.
        dataloader: Iterable yielding ``(features, labels)`` batches.
        device: Device the batches are moved to before the forward pass. If
            ``None``, the device of the model's first parameter is used, so the
            function also works after the model has been moved to a GPU. A
            model without parameters leaves the batches where they are.

    Returns:
        The accuracy as a Python float between 0 and 1, or ``nan`` if the
        dataloader yields no examples.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    correct = 0
    total_examples = 0

    with torch.no_grad():                        # No gradients are needed for evaluation
        for features, labels in dataloader:
            if device is not None:
                # All tensors used in a computation must be on the same device
                features, labels = features.to(device), labels.to(device)

            logits = model(features)
            predictions = torch.argmax(logits, dim=1)
            compare = labels == predictions      # Tensor of True/False values depending on whether the labels match
            correct += compare.sum().item()      # Count the True values; .item() keeps the running total a plain Python int
            total_examples += len(compare)

    if total_examples == 0:
        # Accuracy is undefined without examples; avoid dividing by zero
        return float("nan")

    return correct / total_examples              # The fraction of correct predictions, a value between 0 and 1

In [26]:
print(compute_accuracy(model, train_loader))

1.0


In [27]:
print(compute_accuracy(model, test_loader))

1.0


### Seeing how to save a model's parameters to disk and load a saved model's parameters into a fresh model instance

In [28]:
torch.save(model.state_dict(), "model.pth") # "model.pth" can be any arbitrary name and file ending, however, .pth and .pt are convention

In [29]:
model = NeuralNetwork(2, 2)  # An instance of the model in memory is required to apply the saved parameters; its architecture must match the saved one
model.load_state_dict(
    # weights_only=True restricts unpickling to tensors and plain containers, which is the safe way to read a state_dict;
    # map_location="cpu" lets the file load even if it was saved from a GPU that is not available now
    torch.load("model.pth", map_location="cpu", weights_only=True)
)

<All keys matched successfully>

### Exploring how to optimize performance with GPUs

In [30]:
# Check whether the installed version of PyTorch has GPU (CUDA) compatibility and a GPU is available
print(torch.cuda.is_available())

True


In [31]:
tensor_1 = torch.tensor([1., 2., 3.])
tensor_2 = torch.tensor([4., 5., 6.])
print(tensor_1 + tensor_2)

tensor([5., 7., 9.])


In [33]:
tensor_1 = tensor_1.to("cuda")  # Moves tensor_1 to the GPU (this changes its device, not its datatype); a specific GPU can be selected through .to("cuda:0") or .to("cuda:1")
tensor_2 = tensor_2.to("cuda")
print(tensor_1 + tensor_2)      # Both operands are now on the GPU, so the result is as well

tensor([5., 7., 9.], device='cuda:0')


In [35]:
# NOTE: This cell intentionally triggers an error to show what happens when tensors are on different devices
tensor_1 = tensor_1.to("cpu")   # All tensors used in a computation must be on the same device, otherwise the operation fails
try:
    print(tensor_1 + tensor_2)
except RuntimeError as err:
    # Display the error instead of letting it propagate, so Restart & Run All is not interrupted by this demonstration
    print(f"{type(err).__name__}: {err}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

### Since the dataset I'm using to test my Neural Network is small, we won't notice an increase in performance because of the cost of transferring data from the CPU to the GPU.
### However, this will be useful when training deep neural networks, especially LLMs.

In [38]:
device = torch.device("cuda") # Defines a device variable that defaults to a GPU

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Make the device able to run on CPU if GPU not available

In [42]:
# The following code is for Macs with an Apple Silicon chip (like the M1, M2, M3, and newer models).
# It is kept inside a string literal so that it is not executed in this notebook.
"""
device = torch.device(
    "mps" if torch.backends.mps.is_available() else "cpu"
)
"""

'\ndevice = torch.device(\n    "mps" if torch.backends.mps.is_available() else "cpu"\n)\n'

In [43]:
# Same training loop as before, but with the model and every batch placed on the selected device
torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2) # The toy dataset has 2 features and 2 classes

model = model.to(device)                           # Transfers the model onto the selected device (the GPU if one is available)

optimizer = torch.optim.SGD(
    model.parameters(), lr=0.5                     # The optimizer needs to know which parameters to optimize; lr is the learning rate
)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()                                 # Put the model into training mode

    for batch_idx, (features, labels) in enumerate(train_loader):
        features, labels = features.to(device), labels.to(device) # Transfers the data onto the same device as the model
        logits = model(features)

        loss = F.cross_entropy(logits, labels)    # Takes the raw logits and the integer class labels

        optimizer.zero_grad()                     # Sets the gradients from the previous round to 0 to prevent unintended gradient accumulation
        loss.backward()                           # Computes the gradients of the loss with respect to the model parameters
        optimizer.step()                          # The optimizer uses the gradients to update the model parameters

        ## LOGGING
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx+1:03d}/{len(train_loader):03d}"
              f" | Train Loss: {loss:.2f}")

    model.eval()                                  # Switch to evaluation mode at the end of each epoch
    # Insert optional model evaluation code

Epoch: 001/003 | Batch 001/002 | Train Loss: 0.75
Epoch: 001/003 | Batch 002/002 | Train Loss: 0.65
Epoch: 002/003 | Batch 001/002 | Train Loss: 0.44
Epoch: 002/003 | Batch 002/002 | Train Loss: 0.13
Epoch: 003/003 | Batch 001/002 | Train Loss: 0.03
Epoch: 003/003 | Batch 002/002 | Train Loss: 0.00


### Training with multiple GPUs

I will be experimenting with a Distributed Data Parallel (DDP) strategy for utilizing multiple GPUs. However, DDP does not function properly within interactive Python environments like Jupyter notebooks, as they don't handle multiprocessing in the same way that a standalone Python script does.

Therefore, this part of my experimentation will be executed as a script, because DDP needs to spawn multiple processes, and each process should have its own Python interpreter instance.

The code will be located at the following link in my GitHub repo for this project: https://github.com/Plehndm/LLM_From_Scratch/blob/main/DDP_Multiprocessing_Test.py

As I am doing this on a Windows laptop (which is not best suited for DDP), other Windows users who attempt to run this script will likely run into issues as well.