In [1]:
!nvidia-smi

Fri Jan  5 10:43:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8              13W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Importing Libraries

In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from datasets import load_dataset

### Checking GPU Availability
It would be best to run this on a GPU as running it on a CPU will take a long time and may melt your CPU :)

In [4]:
torch.cuda.is_available()

True

In [5]:
torch.cuda.get_device_name()

'Tesla T4'

### Hyperparameters from Training

In [6]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 300 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 1024
n_head = 20
n_layer = 10
dropout = 0.05
# ------------

torch.manual_seed(1337)

<torch._C.Generator at 0x7dd8fafeb9f0>

### Loading our Sitcom Screenplay dataset

In [None]:
with open('sitcom_pre_training_dataset.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()

### Loading BERT Tokenizer

In [7]:
from transformers import BertTokenizer

In [8]:
enc = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
vocab_size = len(enc.vocab)

In [None]:
tokens = enc.encode("Scene: An empty apartment in New York City")

In [None]:
tokens

[101, 3496, 1024, 2019, 4064, 4545, 1999, 2047, 2259, 2103, 102]

In [None]:
print(f"Vocabulary Size: {vocab_size}")
print(f"Raw Text: Scene: An empty apartment in New York City, Tokens: {tokens}")


Vocabulary Size: 30522
Raw Text: Scene: An empty apartment in New York City, Tokens: [101, 3496, 1024, 2019, 4064, 4545, 1999, 2047, 2259, 2103, 102]


### Train-Test Split

In [None]:
# Train and test splits
data = torch.tensor(enc.encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
n = len(data) - n
random_part = len(data) // 2
train_data1 = data[:random_part]
val_data = data[random_part: random_part + n]
train_data2 = data[random_part + n:]
train_data = torch.hstack([train_data1, train_data2])
print(train_data.shape, train_data1.shape, train_data2.shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (967501 > 512). Running this sequence through the model will result in indexing errors


torch.Size([870750]) torch.Size([483750]) torch.Size([387000])


In [None]:
print(train_data.shape, val_data.shape)

torch.Size([870750]) torch.Size([96751])


In [None]:
train_data[-10:]

tensor([7324, 1007, 6864, 1012, 2097, 2017, 5914, 2033, 1029,  102])

### Custom Functions for generating a Batch of Dataset and Computing Losses

In [None]:
# data loading
def get_batch(split='train'):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else train_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
X, y = get_batch()

In [None]:
X.shape, y.shape

(torch.Size([16, 300]), torch.Size([16, 300]))

### Language Model's Head Class

In [10]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

### Multi-Headed Attention Class

In [11]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

### Feed Forward Neural Network Class

In [12]:
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


### Transformer Block Class

In [13]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

### Final Language Model Class

In [14]:
class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


## Instantiating the Model
Use Pre-trained Weights otherwise start with Random Weights

In [15]:
model = GPTLanguageModel()

# Model path will be different for our run, as you will be saving it to your
# local drive or Google Drive or any other storage solution
# Comment out the load_state_dict is weights not present
model_path = './drive/MyDrive/sitcom_20_head_10_layers_1024_nembd_20000_steps.pt'
print(f"Loading weights from {model_path}")
model.load_state_dict(torch.load(model_path))
model = model.to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

Loading weights from ./drive/MyDrive/sitcom_20_head_10_layers_1024_nembd_20000_steps.pt
188.616506 M parameters


## Model Training Loop

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # every 500 iterations, we evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


188.616506 M parameters
step 0: train loss 0.1535, val loss 0.1555
step 500: train loss 0.1533, val loss 0.1556
step 1000: train loss 0.1485, val loss 0.1483
step 1500: train loss 0.1438, val loss 0.1433
step 2000: train loss 0.1377, val loss 0.1399
step 2500: train loss 0.1323, val loss 0.1337
step 3000: train loss 0.1298, val loss 0.1291
step 3500: train loss 0.1257, val loss 0.1245
step 4000: train loss 0.1224, val loss 0.1223
step 4500: train loss 0.1189, val loss 0.1193
step 5000: train loss 0.1147, val loss 0.1166
step 5500: train loss 0.1137, val loss 0.1142
step 6000: train loss 0.1124, val loss 0.1113
step 6500: train loss 0.1088, val loss 0.1085
step 7000: train loss 0.1073, val loss 0.1073
step 7500: train loss 0.1057, val loss 0.1055
step 8000: train loss 0.1025, val loss 0.1027
step 8500: train loss 0.1013, val loss 0.1014
step 9000: train loss 0.1001, val loss 0.0998
step 9500: train loss 0.0986, val loss 0.0992
step 9999: train loss 0.0986, val loss 0.0982


## Generating Text from our Trained Model

In [None]:
# generate from the model
input_tokens = enc.encode("Scene: Sheldon and Penny arguing about Star Wars")
context = torch.tensor([input_tokens],
                       device=device)
out = enc.decode(model.generate(context, max_new_tokens=500)[0].tolist())
# open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))

In [None]:
import re
pattern = r'(\w+)(\s+):'
modified_text = re.sub(pattern, r'\n\1:', text)
modified_text

"[CLS] scene : sheldon and penny arguing about star wars [SEP] in a dream. sheldon : please, i don't start taking advantage of them. howard : what was i thinking? sheldon : sack of rome a sack's metaphorical kind, but i'm sure that even more of a more likely to develop my birthday at all. penny : oh, come on, i'll put that on fire. hello, howard. howard : raj, you're not gonna run away. sheldon : i couldn't take the compliment. penny : there are so many people here tonight doesn't make us less sales. sheldon : hey, fellas, and she wasn't the only reason i'd like to have a glass of water. penny : yup, i was thinking about that. leonard : i would have maybe head on the krmit to hate you. penny : hey, you don't have to be access to the university who will have to support you? leonard : oh, well, cool. penny : mm - hmm. leonard : at least i know the girlfriend you and i were thinking about investing in stuart's head out thinking about how electric bill? penny : oh, please sit down. sheldon

In [None]:
# Your model save path will be different based on the storage solution you are using
model_save_path = './drive/MyDrive/sitcom_20_head_10_layers_1024_nembd_20000_steps.pt'
torch.save(model.state_dict(), model_save_path)

# Fine Tuning

In [17]:
# Fine Tuning hyperparameters
max_iters = 1000
eval_interval = 50
learning_rate = 1e-4
eval_iters = 50

In [19]:
from datasets import load_dataset

def get_batch_fine_tune(split='train'):
    # generate a small batch of data of inputs x and targets y
    block_size = 300
    tbbt_dataset = load_dataset('./tbbt_scenes/', split='train')
    output = []
    for sample in tbbt_dataset:
        text = sample['text']
        enc_text = enc.encode(text)
        if len(enc_text) <= 300:
            enc_text = enc_text + [0] * (301 - len(enc_text))
            r = 1
        else:
            r = len(enc_text) - 300
        data = torch.tensor(enc_text, dtype=torch.long)
        x = torch.stack([data[i:i+block_size] for i in range(r)])
        y = torch.stack([data[i+1:i+block_size+1] for i in range(r)])
        output.append((x, y))
    return output

@torch.no_grad()
def estimate_loss_fine_tune():
    out = {}
    model.eval()
    losses = []
    for X, Y in get_batch_fine_tune():
        X, Y = X.to(device), Y.to(device)
        logits, loss = model(X, Y)
        losses.append(loss.item())
    out['ft'] = sum(losses) / len(losses)
    model.train()
    return out

In [20]:
import gc
torch.cuda.empty_cache()
gc.collect()

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # every 500 iterations, we evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss_fine_tune()
        print(f"step {iter}: Fine Tune loss {losses['ft']:.4f}")

    torch.cuda.empty_cache()
    gc.collect()
    # sample a batch of data
    for X, Y in get_batch_fine_tune():
      # evaluate the loss
      X, Y = X.to(device), Y.to(device)
      logits, loss = model(X, Y)
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()
      torch.cuda.empty_cache()
      gc.collect()


Resolving data files:   0%|          | 0/208 [00:00<?, ?it/s]

step 0: Fine Tune loss 13.3444


Resolving data files:   0%|          | 0/208 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 9.06 MiB is free. Process 270361 has 14.74 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 366.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF