In [2]:
import pandas as pd
import numpy as np
import glob
import re

from tokenizers import SentencePieceBPETokenizer, normalizers, decoders
from tokenizers.normalizers import Replace

from nltk.tokenize import word_tokenize

from transformers import AutoConfig, AutoModelForCausalLM
from transformers import PreTrainedTokenizerFast
import torch

Model

In [2]:
# Seed PyTorch's global RNG before building the model: from_config() creates
# freshly (randomly) initialised weights, so without a seed every kernel
# restart would start training from a different model.
torch.manual_seed(42)

# Start from the published GPT-2 configuration and override the size knobs.
config = AutoConfig.from_pretrained("openai-community/gpt2")

config.n_head = 8          # attention heads per block
config.n_layer = 8         # number of stacked transformer blocks
config.vocab_size = 32768  # rows of the token embedding / width of the LM head
config.n_embd = 768        # hidden (embedding) width

# from_config builds the architecture only; no pretrained weights are loaded.
model = AutoModelForCausalLM.from_config(config=config)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

Total parameters: 82656768


In [3]:
# Pick the best available accelerator instead of assuming Apple-Silicon MPS:
# a hard-coded 'mps' makes every later `.to(device)` fail on machines that
# lack it. The original choice ('mps') is still preferred when present.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the pre-built training tensors.
# - map_location="cpu": keep the full dataset in host memory regardless of the
#   device it was saved from; the training loop moves one batch at a time.
# - weights_only=True: restrict unpickling to tensors/primitive containers so a
#   tampered .pt file cannot execute arbitrary code on load.
X = torch.load("Archive 2/X_train.pt", map_location="cpu", weights_only=True)
Y = torch.load("Archive 2/Y_train.pt", map_location="cpu", weights_only=True)


In [5]:
# Move the model's parameters onto the selected device. Module.to() returns the
# module itself, so rebinding keeps the same object; the trailing expression
# displays its layer summary as the cell output.
model = model.to(device)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(32768, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=32768, bias=False)
)

In [8]:
# Show the shapes of the input and target tensors as a pair.
tuple(tensor.shape for tensor in (X, Y))

(torch.Size([44789637, 22]), torch.Size([44789637]))

In [None]:
from torch.optim import AdamW
import torch
import numpy as np
import os

# Train the model to predict the next token: for every row of X, the logits at
# the last sequence position are scored against the matching entry of Y.
# A checkpoint (model + optimizer state + mean loss) is written after each epoch.

# Define the path for saving model checkpoints
checkpoint_dir = './model_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# NOTE(review): lr=0.01 looks very high for AdamW on a transformer of this
# size and may diverge -- left unchanged here, please confirm it is intended.
optim = AdamW(model.parameters(), lr=0.01)
batch_size = 32
train_size = len(X)
logging_interval = 0.1  # fraction of an epoch between perplexity reports

# Express the logging cadence in optimizer steps. The previous check compared
# the *sample offset* against int(train_size * logging_interval); because the
# offset is always a multiple of batch_size, the two rarely (here: never)
# coincide and no perplexity was ever printed.
steps_per_epoch = (train_size + batch_size - 1) // batch_size
log_every_steps = max(1, int(steps_per_epoch * logging_interval))

for epoch in range(5):
    model.train()  # Ensure the model is in training mode

    # Running sum / count instead of np.append, which re-copies the whole
    # history on every step (quadratic over an epoch of ~millions of steps).
    total_loss = 0.0
    steps_done = 0

    for start in range(0, train_size, batch_size):
        x = X[start: start + batch_size].to(device)
        # reshape(-1) keeps the targets 1-D even when the final batch holds a
        # single sample (squeeze() would collapse it to a 0-d tensor).
        y = Y[start: start + batch_size].to(device).reshape(-1)

        # Forward pass: only the prediction at the last position is trained on
        logits = model(x).logits[:, -1, :]
        loss = torch.nn.functional.cross_entropy(logits, y)

        # Zero out gradients, backpropagate, and update weights
        optim.zero_grad()
        loss.backward()
        optim.step()

        # One host sync per step; reused for both logging and the epoch mean
        total_loss += loss.item()
        steps_done += 1

        # Perplexity of the running mean loss since the start of this epoch
        if steps_done % log_every_steps == 0:
            avg_loss = total_loss / steps_done
            perplexity = torch.exp(torch.tensor(avg_loss))
            print(f"Perplexity at epoch {epoch + 1}, step {start}: {perplexity.item()}")

    # Mean loss over the epoch (NaN if there was no data, as before)
    epoch_mean_loss = total_loss / steps_done if steps_done else float("nan")

    # Save checkpoint after each epoch
    checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch + 1}.pth')
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optim.state_dict(),
        'loss': epoch_mean_loss
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

    print(f"Loss of epoch {epoch + 1}: {epoch_mean_loss}")


In [1]:
# Display the model summary. This cell only works after the cell that builds
# `model` has been run in the current kernel; on a fresh kernel it raises
# NameError (as the recorded output below shows).
model

NameError: name 'model' is not defined