In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import mmap
import random
import pickle
import regex as re

from colorama import Fore, Style

import os
# Debugging switches for CUDA errors.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is documented to make kernel launches
# synchronous (clearer error locations, slower training) — consider removing
# it for real training runs.
# NOTE(review): TORCH_USE_CUDA_DSA is presumably a build-time option of
# PyTorch; confirm that setting it as an environment variable has any effect.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

from tokenizers import Tokenizer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

# Dataset locations: training split, validation split and vocabulary list.
file_path = 'C:/Code_Projects/llm/datasets/'
train_file_path = f'{file_path}sw_train_split.txt'
val_file_path = f'{file_path}sw_val_split.txt'
vocab_file_path = f'{file_path}sw_vocab.txt'

# Hyperparameters shared by the data pipeline, the model and the optimizer.
batch_size = 32        # sequences per batch
block_size = 64        # tokens per training sequence / positional table size
learning_rate = 3e-4   # AdamW step size
n_embd = 384           # embedding width
n_head = 4             # attention heads per transformer block
n_layer = 4            # number of transformer blocks
dropout = 0.2          # dropout probability used throughout the model

cuda


In [2]:
def Convert(string):
    """Split ``string`` on newline characters and return the pieces as a list.

    Entries keep any other whitespace they contain (only '\\n' is a separator),
    and an empty string yields a single empty entry.
    """
    return string.split('\n')

# Read the entire vocabulary file (UTF-8) into memory.
with open(vocab_file_path, mode='r', encoding='utf-8') as f:
    text = f.read()

# Show the first 100 characters as a sanity check, then split into one entry per line.
print('text: ' + text[:100])
sub_word = Convert(text)

text: distresses 
impos 
Inglourious 
Healy 
Republicans 
MLB 
Taoist 
sash 
Flick 
whirled 
FitzChivalry 


In [3]:
#huggingface BPE tokenizer
# Loaded from "tokenizer.json"; the path is relative, so it presumably resolves
# against the notebook's working directory (unlike the absolute dataset paths).
tokenizer = Tokenizer.from_file("tokenizer.json")
# Size of the tokenizer vocabulary; later passed to GPTLanguageModel.
vocab_size = tokenizer.get_vocab_size()
# Bare expression so the notebook displays the value as the cell output.
vocab_size

100000

In [4]:
def encode(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and return its token ids."""
    encoding = tokenizer.encode(text)
    return encoding.ids


def decode(ids):
    """Convert a sequence of token ids back to text with the module-level ``tokenizer``."""
    decoded_text = tokenizer.decode(ids)
    return decoded_text

def get_random_chunk(split):
    """Tokenize a randomly positioned slice of the requested split file.

    ``split == 'train'`` selects ``train_file_path``; any other value selects
    ``val_file_path``. The slice is ``block_size * batch_size`` *bytes* long
    (fewer if the file is shorter), so the number of tokens returned varies
    with the content of the slice.

    Returns:
        1-D ``torch.long`` tensor of token ids for the decoded slice.
    """
    if split == 'train':
        source_path = train_file_path
    else:
        source_path = val_file_path

    chunk_bytes = block_size * batch_size
    with open(source_path, 'rb') as handle, mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
        # Latest offset that still leaves a full chunk before the end of the file.
        last_start = max(0, len(mapped) - chunk_bytes)
        offset = random.randint(0, last_start)
        raw = mapped[offset:offset + chunk_bytes]

    # The slice may begin or end inside a multi-byte character; drop such
    # fragments, and strip carriage returns before tokenizing.
    chunk_text = raw.decode('utf-8', errors='ignore').replace('\r', '')
    return torch.tensor(encode(chunk_text), dtype=torch.long)

def get_batch(split, max_retries=10):
    """Build one batch of (input, target) token sequences for ``split``.

    A random chunk is tokenized via ``get_random_chunk``; ``batch_size`` windows
    of ``block_size`` tokens are then sampled from it. Targets are the inputs
    shifted one token to the right.

    Args:
        split: 'train' for the training file, anything else for validation.
        max_retries: how many additional chunks to draw when a chunk tokenizes
            to fewer than ``block_size + 1`` tokens.

    Returns:
        Tuple ``(x, y)`` of ``(batch_size, block_size)`` long tensors on ``device``.

    Raises:
        ValueError: if no sufficiently long chunk was found within ``max_retries``.
    """
    # One extra token is needed so the last target position exists.
    min_tokens = block_size + 1

    data = get_random_chunk(split)
    retries = 0
    # Keep drawing until the chunk is long enough (the original retried only
    # once and never re-checked the replacement chunk).
    while len(data) < min_tokens and retries < max_retries:
        # raise ValueError
        print(f"Data length ({len(data)}) is too short for the specified block size ({block_size})")
        data = get_random_chunk(split)
        retries += 1

    # Largest valid window start; 0 is valid when the chunk has exactly
    # block_size + 1 tokens, so only negative values are an error.
    max_index = len(data) - block_size - 1
    if max_index < 0:
        raise ValueError(f"Max index ({max_index}) must be non-negative. Adjust block size or ensure sufficient data length.")

    ix = torch.randint(0, max_index + 1, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    x, y = x.to(device), y.to(device)
    return x, y

In [5]:
# Draw one training batch and show that inputs and targets have the same shape.
x, y = get_batch('train')
print('inputs:', x.shape, sep='\n')
print('targets:', y.shape, sep='\n')

inputs:
torch.Size([32, 64])
targets:
torch.Size([32, 64])



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Code_Projects\llm\cuda\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Code_Projects\llm\cuda\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Code_Projects\llm\cuda\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Code_Projects\llm\cuda\Lib\site-

In [6]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Runs ``eval_iters`` batches per split through the module-level ``model`` in
    eval mode with gradients disabled.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Always return to training mode, even if a batch could not be built,
        # so an interrupted evaluation does not leave dropout disabled.
        model.train()
    return out

In [7]:
class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input to keys, queries and values of width ``head_size`` and
    lets every position attend only to itself and earlier positions.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones matrix; zeros mark the future positions to hide.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)

        # Scaled dot-product affinities between every pair of positions: (B, T, T)
        scale = keys.shape[-1] ** -0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) * scale

        # Hide future positions, normalise over the remaining ones, then regularise.
        is_future = self.tril[:seq_len, :seq_len] == 0
        weights = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)
        weights = self.dropout(weights)

        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        values = self.value(x)
        return torch.matmul(weights, values)

# [1, 0, 0]
# [1, 0.6, 0]
# [1, 0.6, 0.4]
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, followed by an output projection."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x``, concatenate along the feature axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        # (B, T, num_heads * head_size): head outputs laid out one after another
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
    

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the output has the same shape as the input."""
        transformed = self.net(x)
        return transformed
    
class Block(nn.Module):
    """Transformer block: self-attention, then a feed-forward MLP.

    Each sub-layer's output is added to its input and the sum is layer-normalised.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding width; n_head: number of attention heads,
        # each of width n_embd // n_head.
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers, each with residual + LayerNorm."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))
    
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over token ids.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final LayerNorm, and projected to
    vocabulary logits.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute logits, and the cross-entropy loss when ``targets`` is given.

        Args:
            index: (B, T) long tensor of token ids, with T no larger than the
                positional table.
            targets: optional (B, T) long tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is a scalar tensor.

        Raises:
            ValueError: if T exceeds the number of learned positions.
        """
        B, T = index.shape
        max_context = self.position_embedding_table.num_embeddings
        if T > max_context:
            raise ValueError(
                f"Sequence length ({T}) exceeds the model's maximum context ({max_context})"
            )

        tok_emb = self.token_embedding_table(index) # (B,T,C)
        # Build positions on the input's own device so the model does not
        # depend on the notebook-level `device` variable.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``index``.

        Sampling runs in eval mode (dropout off) without gradient tracking; the
        module's previous train/eval mode is restored afterwards.

        Args:
            index: (B, T) long tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) long tensor: the context plus sampled tokens.
        """
        max_context = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop the context to the last positions the model can embed
                index_cond = index[:, -max_context:]
                # call the module (not .forward) so registered hooks still run
                logits, _ = self(index_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                index = torch.cat((index, index_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return index

In [14]:
# Where checkpoints are written, and the file-name prefix they share.
model_path = 'C:/Code_Projects/llm/trained_models/'
model_name = 'SW_V'
# Checkpoint to resume from when use_trained_model is set.
saved_model = f'{model_path}SW_V_3230.pkl'
use_trained_model = True
# Number of optimizer steps in the training loop.
max_iters = 50_000
# Batches per split in estimate_loss; also the evaluation/checkpoint interval.
eval_iters = 100

In [15]:
# Either resume from a pickled checkpoint or start from a freshly initialised
# model. The fresh model is only built when it is actually needed, instead of
# allocating and initialising one that is immediately discarded.
if use_trained_model:
    print('loading model parameters...')
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load checkpoints that were produced by this notebook.
    with open(saved_model, 'rb') as f:
        model = pickle.load(f)
    print('loaded successfully!')
else:
    model = GPTLanguageModel(vocab_size)


# `m` is the alias the generation cells use.
m = model.to(device)

loading model parameters...
loaded successfully!


In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


def _save_model(path):
    """Pickle the whole module-level ``model`` object to ``path``."""
    # NOTE(review): pickling the full module ties the checkpoint to this exact
    # class definition; saving model.state_dict() would be more robust but
    # changes the checkpoint format, so it is left as-is here.
    with open(path, "wb") as f:
        pickle.dump(model, f)


# The counter is called `step` (not `iter`) so the builtin iter() is not
# shadowed for the rest of the notebook session.
for step in range(max_iters):

    # eval_iters doubles as the evaluation / checkpoint interval.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(Fore.CYAN + f'##step: {step}, train loss: {losses["train"]:.4f}, eval loss: {losses["val"]:.4f}##')

        #checkpoint (file suffix is step // 10, matching existing checkpoint names)
        mn = model_path + model_name + f'_{step // 10}' + '.pkl'
        _save_model(mn)
        print(f'{mn} Saved')

        print(Style.RESET_ALL)

    #sample a batch of data
    xb, yb = get_batch('train')

    #evaluate the loss; call the module itself so registered hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

mn = model_name + '_Final' + '.pkl'
_save_model(model_path + mn)
print(f'{mn} Saved')

Data length (2) is too short for the specified block size (64)
[36m##step: 0, train loss: 6.1205, eval loss: 6.0733##
C:/Code_Projects/llm/trained_models/SW_V_0.pkl Saved
[0m
Data length (2) is too short for the specified block size (64)
[36m##step: 100, train loss: 6.0357, eval loss: 5.9722##
C:/Code_Projects/llm/trained_models/SW_V_10.pkl Saved
[0m
Data length (2) is too short for the specified block size (64)
Data length (2) is too short for the specified block size (64)
Data length (2) is too short for the specified block size (64)
[36m##step: 200, train loss: 6.0009, eval loss: 6.0808##
C:/Code_Projects/llm/trained_models/SW_V_20.pkl Saved
[0m
Data length (2) is too short for the specified block size (64)
Data length (2) is too short for the specified block size (64)
Data length (29) is too short for the specified block size (64)
[36m##step: 300, train loss: 6.0128, eval loss: 6.1614##
C:/Code_Projects/llm/trained_models/SW_V_30.pkl Saved
[0m
Data length (2) is too short f

KeyboardInterrupt: 

In [16]:
# Continue a fixed prompt; switch to the input() line for interactive use.
# prompt = input("Prompt:\n")
prompt = 'this is a corpus'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
# Add a batch dimension, sample 150 new tokens, and decode the single sequence.
generated_chars = decode(
    m.generate(context.view(1, -1), max_new_tokens=150)[0].tolist()
)
print('Completion:\n' + generated_chars)

Completion:
this is a corpus designed to encode his demise and the Ministry of Economics near the Supreme Church in central Pour da ia Nation . At the Council border , the French colon isation of the Syrian candidate , expressed a point migration deficit of 19 - foot - cave homes and 4 - 1 Weiss of Texas ’ s universities class than he served in a war era hurt Muslims who saw off of the wall - the air in Fort ress and walk out . Walmart , 35 , PP sentencing , 62 , was child contact at $ 78 , 800 , rone sites . But the source for both parties also has a pipeline div vy settlement centre in 2000 , with away - too long , a little slower about Georgia explained out their team minister than the party . My wife is an exit move toward an end


In [19]:
# Start from a single token with id 0 and let the model write 150 tokens.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=150).tolist()[0])
print(generated_chars)

you want . road output ?” but almost such a reality , Gor man . But she continues regularly from sic , ever , to walk . An anonymous news with her friend , enforcement observers have also raised 22 , 000 miss a Sheriff ’ s Department of Defence . And because Trump was feeling strongly with her own place .” telling Trump given up about bias all mid - excesses “ if a man should revoke ched the attack around the border . The army was heavily gestures that were asked to do the winds who were painted and attacked by her pro - Reagan homemade jeans on his hand . “ Feinstein Adi is scared of your receivers . When you look at this story , all domestic beings Seab ye notes , there are those that cannot day walk ,” Christ said . “ ce8 .
