In [None]:
# # !pip install datasets
# from datasets import load_dataset
# dataset = load_dataset("Skylion007/openwebtext")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
# import argparse

# parser = argparse.ArgumentParser(description="this is demo program")
# parser.add_argument('-batch_size', type=str, required=True, help="please provide a batch size")

# args = parser.parse_args()


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# --- data / optimisation settings ---
batch_size = 64        # sequences per batch (args.batch_size)
block_size = 128       # length of each training sequence (context window)
max_iters = 3000       # number of optimiser steps in the training loop
learning_rate = 3e-4   # candidates tried: 3e-3, 3e-4, 1e-3, 1e-4
eval_iters = 500       # batches averaged per loss estimate; also the reporting interval

# --- model settings ---
n_embd = 384           # embedding width
n_head = 8             # attention heads per transformer block
n_layers = 8           # number of stacked transformer blocks
dropout = 0.2          # dropout probability used throughout the model

cuda


In [3]:
# Colab only: mount Google Drive at /content/drive so the vocabulary, data
# splits and model checkpoint under /content/drive/MyDrive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Build the character vocabulary: every distinct character found in vocab.txt,
# in sorted order so that the character <-> id mapping is stable between runs.
chars = ""
with open('/content/drive/MyDrive/create_gpt/openwebtext/vocab.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))

# Number of distinct characters, i.e. the size of the model's output layer.
vocab_size = len(chars)


In [5]:
# Very simple character-level tokenizer: each character in `chars` is mapped to
# its position in that list, and back again.
string_to_int = {}
int_to_string = {}
for idx, ch in enumerate(chars):
    string_to_int[ch] = idx
    int_to_string[idx] = ch


def encode(s):
    """Turn a string into a list of integer ids (KeyError on an unknown character)."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Turn an iterable of integer ids back into a string."""
    return ''.join(int_to_string[i] for i in l)

In [6]:
def get_random_chunk(split):
    """Read a random contiguous chunk of one data split and encode it.

    The split file is memory-mapped so that only the requested bytes are read.

    Args:
        split: 'train' selects the training file; any other value selects the
            validation file.

    Returns:
        A 1-D ``torch.long`` tensor of character ids. It can be shorter than
        ``block_size * batch_size - 1`` because invalid byte sequences and
        ``'\\r'`` characters are dropped after reading.

    Raises:
        KeyError: if the chunk contains a character missing from the vocabulary.
    """
    filename = "/content/drive/MyDrive/create_gpt/openwebtext/train_split.txt" if split == 'train' else "/content/drive/MyDrive/create_gpt/openwebtext/val_split.txt"
    chunk_bytes = block_size * batch_size

    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)

            # Clamp the upper bound: for a file shorter than one chunk the
            # unclamped bound is negative and random.randint raises an opaque
            # "empty range" ValueError. With the clamp we read from offset 0.
            start_pos = random.randint(0, max(0, file_size - chunk_bytes))

            # Seek to the random position and read the block of raw bytes.
            mm.seek(start_pos)
            block = mm.read(chunk_bytes - 1)

    # The random offset can land inside a multi-byte UTF-8 character, so
    # undecodable bytes are ignored; carriage returns are stripped as well.
    decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')

    # Map characters to vocabulary ids.
    data = torch.tensor(encode(decoded_block), dtype=torch.long)

    return data

def get_batch(split):
    """Sample one batch of (input, target) sequences from the given split.

    A fresh random chunk is loaded, then `batch_size` random windows of
    `block_size` ids are cut from it. Each target window is its input window
    shifted one position to the right.

    Returns:
        (x, y): two tensors of stacked windows, both moved to `device`.
    """
    data = get_random_chunk(split)
    starts = torch.randint(len(data) - block_size, (batch_size,))

    inputs = []
    targets = []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# Draw one training batch. x and y are not referenced again in the visible
# cells, so this is presumably a smoke test of the data pipeline.
x, y =get_batch('train')

In [7]:

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    The model is put in eval mode, `eval_iters` random batches are scored per
    split without gradient tracking, and the model is put back in train mode.

    Returns:
        dict mapping 'train' and 'val' to a scalar tensor holding the mean loss.
    """
    model.eval()
    averages = {}
    for split_name in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split_name] = per_batch.mean()
    model.train()
    return averages

In [8]:
class Head(nn.Module):
    """One head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones, registered as a buffer (not a
        # parameter); positions where it is 0 are masked out in forward().
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Input:  (batch, time-step, channels)
        Output: (batch, time-step, head size)
        """
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)
        values = self.value(x)    # (B, T, hs)

        # Attention scores ("affinities"), scaled by 1/sqrt(head size).
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, hs) @ (B, hs, T) -> (B, T, T)

        # Hide future positions so each step only attends to itself and the past.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))  # (B, T, T)

        weights = self.dropout(F.softmax(scores, dim=-1))      # (B, T, T)

        # Weighted aggregation of the values.
        return weights @ values  # (B, T, T) @ (B, T, hs) -> (B, T, hs)



class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then projected back to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        attention_heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(attention_heads)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, apply dropout."""
        head_outputs = [head(x) for head in self.heads]
        merged = torch.cat(head_outputs, dim=-1)  # (B, T, num_heads * head_size)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run the MLP on x; the last dimension stays n_embd wide."""
        return self.net(x)


class Block(nn.Module):
    """Transformer block: self-attention then feed-forward.

    Each sub-layer is wrapped in a residual connection, and layer norm is
    applied after the residual sum.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is divided between the heads.
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers, each as norm(x + sublayer(x))."""
        x = self.ln1(x + self.sa(x))
        x = self.ln2(x + self.ffwd(x))
        return x

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token and learned position embeddings are summed, passed through
    `n_layers` transformer blocks and a final layer norm, then projected to
    one logit per vocabulary entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.block = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            index: (B, T) tensor of token ids. T must not exceed `block_size`,
                because the position table only has `block_size` rows.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = index.shape

        # index and targets are both (B, T) tensors of integers
        tok_emb = self.token_embedding_table(index)  # (B, T, C)
        # Build the positions on the input's own device rather than the global
        # `device`, so the model works wherever its inputs and weights live.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.block(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively after `index`.

        Args:
            index: (B, T) tensor of prompt token ids, with T >= 1 whenever
                max_new_tokens > 0.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the full prompt followed by the
            sampled tokens.

        Raises:
            ValueError: if tokens are requested from an empty prompt.
        """
        if max_new_tokens > 0 and index.shape[1] == 0:
            raise ValueError("generate() needs a prompt with at least one token")

        # Sample with dropout disabled, then restore whichever mode the caller had.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The model only sees the last block_size tokens...
                index_cond = index[:, -block_size:]
                logits, _ = self(index_cond)
                # ...and only the final time step predicts the next token.
                logits = logits[:, -1, :]  # (B, vocab_size)
                probs = F.softmax(logits, dim=-1)
                index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # Append to the FULL running sequence, not the cropped window,
                # so earlier tokens are kept once the length passes block_size.
                index = torch.cat((index, index_next), dim=1)
        finally:
            self.train(was_training)
        return index

# Start from freshly initialised weights, then replace them with the saved
# checkpoint when one exists. On a first run there is no checkpoint yet, so the
# fresh model is kept instead of the cell failing with FileNotFoundError.
model = GPTLanguageModel(vocab_size)
print("loading model parameters.......")
try:
    # SECURITY: pickle.load executes arbitrary code stored in the file. Only
    # load checkpoints that you created yourself.
    with open('/content/drive/MyDrive/create_gpt/model_state.pkl', 'rb') as f:
        model = pickle.load(f)
    print("loaded successfully!")
except FileNotFoundError:
    print("no saved model found, starting from freshly initialised weights")
# `m` is an alias for the same module object after it is moved to `device`.
m = model.to(device)


loading model parameters.......
loaded successfully!


In [13]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`: a module-level `iter` would shadow the builtin
# for every cell that runs afterwards.
for step in range(max_iters):
    # Every eval_iters steps, report the averaged train/val loss.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss; call the module itself (not .forward) so that any
    # registered hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# loss of the final training batch only (noisier than the averaged estimates)
print(loss.item())


# Persist the whole module object with pickle; it is read back by the loading
# cell. NOTE: a pickle file must only be loaded from a trusted source.
with open('/content/drive/MyDrive/create_gpt/model_state.pkl', 'wb') as f:
    pickle.dump(model, f)
    print("model saved")

step: 0, train loss: 1.4057, val loss: 1.4085
step: 500, train loss: 1.4016, val loss: 1.3820
step: 1000, train loss: 1.3734, val loss: 1.4076
step: 1500, train loss: 1.3707, val loss: 1.3417
step: 2000, train loss: 1.3407, val loss: 1.3615
step: 2500, train loss: 1.3639, val loss: 1.3226
1.4215856790542603
model saved


In [15]:
# Ask for a prompt, encode it to ids, sample 100 more tokens from the model and
# print the decoded first row of what generate() returns.
prompt = input("prompt:\n")
context = torch.tensor(encode(prompt), dtype = torch.long, device=device)
batched_context = context.unsqueeze(0)  # add a leading batch dimension of size 1
generated_ids = m.generate(batched_context, max_new_tokens = 100)
generated_chars = decode(generated_ids[0].tolist())
print(f"Completion:\n{generated_chars}")

prompt:
hello my name is rivki
Completion:


In [11]:
@torch.no_grad()
def evaluate_metrics(split='val'):
    """Compute token-weighted loss, next-token accuracy and perplexity.

    `eval_iters` random batches from `split` are scored in eval mode without
    gradient tracking. The model's previous train/eval mode is restored before
    returning, so calling this between training runs does not leave dropout
    switched off.

    Args:
        split: name of the split passed to get_batch ('val' by default).

    Returns:
        dict with float entries 'loss' (mean cross-entropy per token),
        'accuracy' (fraction of positions whose argmax equals the target)
        and 'perplexity' (exp of the mean loss).
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    total_correct = 0

    try:
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            n_tokens = Y.numel()
            # loss is a per-token mean, so weight it by the token count.
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

            # logits come back flattened to (B*T, vocab); reshape the argmax
            # predictions to match Y before comparing.
            preds = torch.argmax(logits, dim=-1).view(Y.shape)
            total_correct += (preds == Y).sum().item()
    finally:
        # Restore whichever mode the caller had (train or eval).
        model.train(was_training)

    avg_loss = total_loss / total_tokens
    accuracy = total_correct / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))

    return {
        'loss': avg_loss,
        'accuracy': accuracy,
        'perplexity': perplexity.item()
    }

In [14]:
# Evaluate on the validation split and print loss, accuracy and perplexity,
# each formatted to four decimal places.
metrics = evaluate_metrics('val')
print(f"val loss: {metrics['loss']:.4f}, accuracy: {metrics['accuracy']:.4f}, perplexity: {metrics['perplexity']:.4f}")


val loss: 1.3524, accuracy: 0.6087, perplexity: 3.8665
