In [1]:
%load_ext autoreload
%autoreload 2
import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [2]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip3 install bitsandbytes transformers livelossplot matplotlib tqdm scikit-learn
# !pip3 install mixture-of-experts pydantic

In [3]:
import sys
sys.path.append('/mnt/c/Users/Arush Bansal/OneDrive - IIT Delhi/Desktop/arushGPT')


In [4]:
# from layers.RegexTokenizer import RegexTokenizer
import torch
from torch import nn
from torch.nn import functional as F
from torch.amp import autocast, GradScaler
import bitsandbytes as bnb

import time
from transformers import AutoTokenizer


from livelossplot import PlotLosses

from utils import ModelSpecs, TrainingData
# from layers.Block import Block
from layers.MLA_Block import Block
# from layers.WGQA_Block import Block
torch.manual_seed(1337)

<torch._C.Generator at 0x74b706ccf850>

In [5]:
torch.__version__

'2.7.1+cu118'

In [6]:
# hyperparameters
modelSpecs = ModelSpecs.create('target')
# BATCH_SIZE = 38 # how many independent sequences will we process in parallel?
# BATCH_SIZE = 28 # how many independent sequences will we process in parallel?
BATCH_SIZE = 1 # how many independent sequences will we process in parallel?
MAX_ITERS = 2000
LEARNING_RATE = 3e-4
EVAL_INTERVALS = 100
# EVAL_ITERS = 200
EVAL_ITERS = 50
TRAIN_TEST_SPLIT = 0.9
# device = 'cpu'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Running on {device.capitalize()} device.")


Running on Cuda device.


In [7]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3.5-mini-instruct")
# tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
stories = TrainingData.TinyStories()

In [8]:
i = 8
text = stories[int(10**6 * 0.730612 * i):int(10**6 * 0.730612 *(i+1) )]
# text = TrainingData.TinyStories()[:int(10**6 * 0.730612 * 20 * 5)]
print(len(text)/10**6,"M")


0.730612 M


In [9]:

data = torch.tensor(tokenizer.encode(text))

n = int(TRAIN_TEST_SPLIT * len(data))
print(len(data)/10**6,"M")

train_data = data[:n]
val_data = data[n:]


Token indices sequence length is longer than the specified maximum sequence length for this model (184703 > 1024). Running this sequence through the model will result in indexing errors


0.184703 M


In [10]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - modelSpecs.BLOCK_SIZE, (BATCH_SIZE,))
    x = torch.stack([data[i:i+modelSpecs.BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i+1:i+modelSpecs.BLOCK_SIZE+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [11]:
class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(modelSpecs.VOCAB_SIZE, modelSpecs.N_EMBD)
        self.position_embedding_table = nn.Embedding(modelSpecs.BLOCK_SIZE, modelSpecs.N_EMBD)
        self.blocks = nn.Sequential(*[Block(modelSpecs) for _ in range(modelSpecs.N_LAYER)])
        self.ln_f = nn.LayerNorm(modelSpecs.N_EMBD) # final layer norm
        self.lm_head = nn.Linear(modelSpecs.N_EMBD, modelSpecs.VOCAB_SIZE)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -modelSpecs.BLOCK_SIZE:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [12]:
@torch.no_grad()
def estimate_loss(model : GPTLanguageModel):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            X, Y = get_batch(split)
            with autocast(device_type='cuda', dtype=torch.float16):
                _, loss = model(X, Y)
            # _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [13]:
m = GPTLanguageModel()
m = m.to(device)
# m = torch.compile(m, mode='default') # TODO use this when ready to use linux, and ready to train a 100M model
# m = torch.compile(m, backend='aot_eager')
# m = torch.compile(m, mode='reduce-overhead', backend='inductor') 
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
# optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)
optimizer = bnb.optim.AdamW8bit(m.parameters(), lr=LEARNING_RATE)
scaler = GradScaler()

93.941061 M parameters


In [14]:
startTime = time.time()
m.train()

# torch.backends.cudnn.benchmark = True
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.allow_tf32 = True

# temporary line
MAX_ITERS = 800 // BATCH_SIZE
torch.cuda.empty_cache()
iter99Str = ""
iter99time = None
ACCUMILATION = 4
for iter in range(0, MAX_ITERS, ACCUMILATION):
    print(f"iter #{iter}")
    is_target_iter = (iter == ACCUMILATION)
    # torch.cuda.empty_cache()
    
    # every once in a while evaluate the loss on train and val sets
    # if iter % EVAL_INTERVALS == 0 or iter == MAX_ITERS - 1:
    #     losses = estimate_loss(model)
        # print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, time {(time.time() - startTime)/60:.2f} minutes")
        # print (f"step {iter}:  time {(time.time() - startTime)/60:.2f} minutes")
    
    # torch.cuda.empty_cache()
    # sample a batch of data
    if is_target_iter : iter99time = time.time()

    # evaluate the loss
    optimizer.zero_grad(set_to_none=True)    

    for i in range(ACCUMILATION):
        # torch.compiler.cudagraph_mark_step_begin()
        with autocast(device_type='cuda', dtype=torch.float16):
            xb, yb = get_batch('train') 
            logits, loss = m(xb, yb)
        scaler.scale(loss/ ACCUMILATION).backward()

    if is_target_iter :
        print(f"forwardbackwardPass {time.time() - iter99time:.1f}s ", end="")
        iter99time = time.time()
    scaler.step(optimizer)
    if is_target_iter :
        print(f"backwardPassStep {time.time() - iter99time:.1f}s ", end="")
        iter99time = time.time()

    scaler.update()

    if is_target_iter :
        print(f"backwardPassUpdate {time.time() - iter99time:.1f}s")
        iter99time = time.time()


print(f"{torch.cuda.memory_allocated() / 1e6:.1f}MB {torch.cuda.max_memory_allocated() / 1e6:.1f}MB(peak) allocated, {torch.cuda.memory_reserved() / 1e6:.1f}MB reserved")


endTime = time.time()
print(f"Total Training Time : {(endTime - startTime)/60:.2f} minutes")

torch.cuda.empty_cache()

iter #0
iter #4
forwardbackwardPass 0.5s backwardPassStep 0.1s backwardPassUpdate 0.0s
iter #8
iter #12
iter #16
iter #20
iter #24
iter #28
iter #32
iter #36
iter #40
iter #44
iter #48
iter #52
iter #56
iter #60
iter #64
iter #68
iter #72
iter #76
iter #80
iter #84
iter #88
iter #92
iter #96
iter #100
iter #104
iter #108
iter #112
iter #116
iter #120
iter #124
iter #128
iter #132
iter #136
iter #140
iter #144
iter #148
iter #152
iter #156
iter #160
iter #164
iter #168
iter #172
iter #176
iter #180
iter #184
iter #188
iter #192
iter #196
iter #200
iter #204
iter #208
iter #212
iter #216
iter #220
iter #224
iter #228
iter #232
iter #236
iter #240
iter #244
iter #248
iter #252
iter #256
iter #260
iter #264
iter #268
iter #272
iter #276
iter #280
iter #284
iter #288
iter #292
iter #296
iter #300
iter #304
iter #308
iter #312
iter #316
iter #320
iter #324
iter #328
iter #332
iter #336
iter #340
iter #344
iter #348
iter #352
iter #356
iter #360
iter #364
iter #368
iter #372
iter #376
iter #3

In [15]:
# 100M BATCH 8 BLOCK 512 16.77min 56.86days-for-2B-tokens
# 100M BATCH 8 BLOCK 512 11.82min 40.07days-for-2B-tokens 1654MB 7756MB (peak) allocated 8512MB reserved

# 22.2s
#10.2s

In [16]:
print(torch.backends.cuda.flash_sdp_enabled())       # should be True
print(torch.backends.cuda.mem_efficient_sdp_enabled()) # True
print(torch.backends.cuda.math_sdp_enabled())          # True

True
True
True


In [17]:
# WGQA-8 16 head 16 layer - perplexity implimentation (95M param)
# 1412.382208 MB allocated
# 8919.369216 MB peak allocated
# 9753.853952 MB reserved
# Total Training Time : 12.06 minutes

# WGQA-8 16 head 18 layer - perplexity implimentation (95M param)
# 1412.382208 MB allocated
# 8919.369216 MB peak allocated
# 9753.853952 MB reserved
# Total Training Time : 12.06 minutes

# WGQA-8 16 head 18 layer - perplexity implimentation (103M param)
# 1470.321152 MB allocated
# 10227.390976 MB peak allocated
# 10525.605888 MB reserved
# Total Training Time : 15.58 minutes

# MLA 16 head 8 layer - 103M param (76M param)
# 1696.742912 MB allocated
# 6963.769856 MB peak allocated
# 7746.879488 MB reserved
# Total Training Time : 4.31 minutes



In [None]:
# MHA 516Embed 12Heads 16Layers :aot_eager 
# 1702.7MB 7804.9MB(peak) allocated 8638.1MB reserved
# Total Training Time : 13.01 minutes

# MHA 516Embed 12Heads 16Layers :default 
# 1695.1MB 5558.3MB(peak) allocated, 5893.0MB reserved
# Total Training Time : 16.79 minutes

# MHA 516Embed 12Heads 16Layers :reduce-overhead:inductor 
# forwardPass 0.0s backwardPassScale 0.0s backwardPassStep 6.2s backwardPassUpdate 0.0s
# 832.5MB 5885.0MB(peak) allocated, 6341.8MB reserved
# Total Training Time : 15.12 minutes

# MHA 516Embed 12Heads 16Layers :gradient-accumilation
# BATCH 16 ITER 100 ACCUMILATION 1 : ~16min
# BATCH 16 ITER 100 ACCUMILATION 2 : OOM Error
# BATCH 08 ITER 200 ACCUMILATION 2 : 6.54min 1423.5MB 5743.7MB(peak) allocated, 6157.2MB reserved
# BATCH 08 ITER 200 ACCUMILATION 4 : 6.89min 1423.5MB 5743.7MB(peak) allocated, 6157.2MB reserved
# BATCH 08 ITER 200 ACCUMILATION 8 : ~7min forwardbackwardPass 16.8s backwardPassStep 0.3s backwardPassUpdate 0.0s

# iter #99
# 802.5MB 5817.1MB(peak) allocated, 6178.2MB reserved
# Total Training Time : 9.90 minutes

# MHA 512Embed 12Heads 16Layers :gradient-accumilation :
# no-sqda 5.27min forwardbackwardPass 2.8s backwardPassStep 0.1s backwardPassUpdate 0.0s 1448.4MB 4803.8MB(peak) allocated, 5207.2MB reserved
# sqda 6.19 minutes 1448.4MB 4845.2MB(peak) allocated, 5677.0MB reserved] 

# 1345.5MB 3120.2MB(peak) allocated, 3741.3MB reserved
# Total Training Time : 6.47 minutes

# Batch size 2 accumilation 2 mla block size 512 embed 516
# 1306.3MB 3028.1MB(peak) allocated, 3231.7MB reserved
# Total Training Time : 1.24 minutes


# MLA batch 4 accumilation 2 mla block size 512 embed 516
# forwardbackwardPass 2.0s backwardPassStep 0.1s backwardPassUpdate 0.0s
# BATCH_SIZE = 4

# MLA batch 2 accumlation mla block size 512 embed 516
# forwardbackwardPass 0.7s backwardPassStep 0.1s backwardPassUpdate 0.0s
# 1301.0MB 3025.9MB(peak) allocated, 3439.3MB reserved
# Total Training Time : 1.12 minutes

# MLA batch 1 accumlation 8 mla block size 512 embed 516
# forwardbackwardPass 1.0s backwardPassStep 0.1s backwardPassUpdate 0.0s
# 1159.5MB 2099.8MB(peak) allocated, 2300.6MB reserved
# Total Training Time : 1.62 minutes

# MLA batch 1 accumlation 4 mla block size 512 embed 516
# forwardbackwardPass 0.5s backwardPassStep 0.1s backwardPassUpdate 0.0s
# 1159.2MB 2094.4MB(peak) allocated, 2300.6MB reserved
# Total Training Time : 1.69 minutes

In [19]:
# input_tokens = torch.tensor(tokenizer.encode("there was once a forest with 100s of bushes")).unsqueeze(0).cuda()
input_tokens = torch.tensor(tokenizer.encode("Once")).unsqueeze(0).cuda()
# print(input_tokens)

m.eval()

output_tokens = m.generate(input_tokens, max_new_tokens=64)[0]
# print(output_tokens)

m.train()

output : str = tokenizer.decode(output_tokens)


print(output)

Once makes with not for their defect,,,,,,,,Who h,,,,,,,,,,,,, hurt called,,,Her head,, Mia,, noise,, Sparkle named,,, Canad,,,,,,,,,,,,,,


In [20]:
print(int(len(stories)//(10**6 )))

10


In [21]:
ITERS = 100
startTime = time.time()
step = BATCH_SIZE * modelSpecs.BLOCK_SIZE * ITERS * 8
liveloss = PlotLosses()

# for i in range(step * 0, len(stories), step):
for i in range(989593600, len(stories), step):
    print("ITER:", i // step, "::::", " STRING INDEX:", i)
    text = stories[i : i + step]
    data = torch.tensor(tokenizer.encode(text))
    n = int(TRAIN_TEST_SPLIT * len(data))
    print("tokens", len(data)/10**6,"M")

    train_data = data[:n]
    val_data = data[n:]





    for iter in range(ITERS):
        print(f"iter #{iter}")
        torch.cuda.empty_cache()
        # every once in a while evaluate the loss on train and val sets
        if iter % EVAL_INTERVALS == 0 or iter == ITERS - 1:
            losses = estimate_loss(m)
            liveloss.update({ 'loss': losses['train'], 'val_loss': losses['val']})
            liveloss.send()
            print("ITER:", i // step, "::::", " STRING INDEX:", i)
            print("tokens", len(data)/10**6,"M")
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, time {int((time.time() - startTime)//60)} minutes")
            
        torch.cuda.empty_cache()
        # sample a batch of data
        xb, yb = get_batch('train')

        # evaluate the loss
        logits, loss = m(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    endTime = time.time()
    print(f"Total Training Time : {int((endTime - startTime)//60)} minutes")

In [22]:
655,360,000
989,593,600

(989, 593, 600)

In [23]:
input_tokens = torch.tensor(tokenizer.encode(" ")).unsqueeze(0).cuda()
# print(input_tokens)


output_tokens = m.generate(input_tokens, max_new_tokens=100)[0]
# print(output_tokens)

output : str = tokenizer.decode(output_tokens)



print(output)


 







.


"..






















Sam. continween it said. plays.


"!"

Bene all.












!" also looks building green away off them is Ben thencase go tower cars's important us back from it now nice world both helped it looksite


In [24]:

def convert_optimizer_state_to_float32(optimizer : torch.optim.AdamW):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor) and v.dtype == torch.float64:
                state[k] = v.float()


# Saving the model
def save_model(model : nn.Module, optimizer : torch.optim.AdamW, filepath):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, filepath)
    print(f"Model saved to {filepath}")

# Loading the model
def load_model(model : nn.Module, optimizer : torch.optim.AdamW, filepath, device):
    checkpoint = torch.load(filepath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    convert_optimizer_state_to_float32(optimizer)

    print(f"Model loaded from {filepath}")
    model.to(device)
    return model, optimizer





In [25]:
# Save after training
save_model(m, optimizer, 'weights/gpt_model_50M.pth') # TODO improve this naming convention

# Later or for inference
# model = GPTLanguageModel()
# optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# model, optimizer = load_model(model, optimizer, 'gpt_model_checkpoint.pth', device)


RuntimeError: Parent directory weights does not exist.

In [None]:
pre_model = GPTLanguageModel()
pre_optimizer = torch.optim.AdamW(pre_model.parameters(), lr=LEARNING_RATE)

loaded_model, loaded_optimizer = load_model(pre_model, pre_optimizer, './weights/gpt_model_50M.pth', device)


RuntimeError: Error(s) in loading state_dict for GPTLanguageModel:
	Missing key(s) in state_dict: "token_embedding_table.weight", "position_embedding_table.weight", "blocks.0.sa.heads.0.tril", "blocks.0.sa.heads.0.key.weight", "blocks.0.sa.heads.0.query.weight", "blocks.0.sa.heads.0.value.weight", "blocks.0.sa.heads.1.tril", "blocks.0.sa.heads.1.key.weight", "blocks.0.sa.heads.1.query.weight", "blocks.0.sa.heads.1.value.weight", "blocks.0.sa.heads.2.tril", "blocks.0.sa.heads.2.key.weight", "blocks.0.sa.heads.2.query.weight", "blocks.0.sa.heads.2.value.weight", "blocks.0.sa.heads.3.tril", "blocks.0.sa.heads.3.key.weight", "blocks.0.sa.heads.3.query.weight", "blocks.0.sa.heads.3.value.weight", "blocks.0.sa.heads.4.tril", "blocks.0.sa.heads.4.key.weight", "blocks.0.sa.heads.4.query.weight", "blocks.0.sa.heads.4.value.weight", "blocks.0.sa.heads.5.tril", "blocks.0.sa.heads.5.key.weight", "blocks.0.sa.heads.5.query.weight", "blocks.0.sa.heads.5.value.weight", "blocks.0.sa.heads.6.tril", "blocks.0.sa.heads.6.key.weight", "blocks.0.sa.heads.6.query.weight", "blocks.0.sa.heads.6.value.weight", "blocks.0.sa.heads.7.tril", "blocks.0.sa.heads.7.key.weight", "blocks.0.sa.heads.7.query.weight", "blocks.0.sa.heads.7.value.weight", "blocks.0.sa.heads.8.tril", "blocks.0.sa.heads.8.key.weight", "blocks.0.sa.heads.8.query.weight", "blocks.0.sa.heads.8.value.weight", "blocks.0.sa.heads.9.tril", "blocks.0.sa.heads.9.key.weight", "blocks.0.sa.heads.9.query.weight", "blocks.0.sa.heads.9.value.weight", "blocks.0.sa.heads.10.tril", "blocks.0.sa.heads.10.key.weight", "blocks.0.sa.heads.10.query.weight", "blocks.0.sa.heads.10.value.weight", "blocks.0.sa.heads.11.tril", "blocks.0.sa.heads.11.key.weight", "blocks.0.sa.heads.11.query.weight", "blocks.0.sa.heads.11.value.weight", "blocks.0.sa.proj.weight", "blocks.0.sa.proj.bias", "blocks.0.ffwd.net.0.weight", "blocks.0.ffwd.net.0.bias", "blocks.0.ffwd.net.2.weight", "blocks.0.ffwd.net.2.bias", "blocks.0.ln1.weight", "blocks.0.ln1.bias", "blocks.0.ln2.weight", "blocks.0.ln2.bias", "blocks.1.sa.heads.0.tril", "blocks.1.sa.heads.0.key.weight", "blocks.1.sa.heads.0.query.weight", "blocks.1.sa.heads.0.value.weight", "blocks.1.sa.heads.1.tril", "blocks.1.sa.heads.1.key.weight", "blocks.1.sa.heads.1.query.weight", "blocks.1.sa.heads.1.value.weight", "blocks.1.sa.heads.2.tril", "blocks.1.sa.heads.2.key.weight", "blocks.1.sa.heads.2.query.weight", "blocks.1.sa.heads.2.value.weight", "blocks.1.sa.heads.3.tril", "blocks.1.sa.heads.3.key.weight", "blocks.1.sa.heads.3.query.weight", "blocks.1.sa.heads.3.value.weight", "blocks.1.sa.heads.4.tril", "blocks.1.sa.heads.4.key.weight", "blocks.1.sa.heads.4.query.weight", "blocks.1.sa.heads.4.value.weight", "blocks.1.sa.heads.5.tril", "blocks.1.sa.heads.5.key.weight", "blocks.1.sa.heads.5.query.weight", "blocks.1.sa.heads.5.value.weight", "blocks.1.sa.heads.6.tril", "blocks.1.sa.heads.6.key.weight", "blocks.1.sa.heads.6.query.weight", "blocks.1.sa.heads.6.value.weight", "blocks.1.sa.heads.7.tril", "blocks.1.sa.heads.7.key.weight", "blocks.1.sa.heads.7.query.weight", "blocks.1.sa.heads.7.value.weight", "blocks.1.sa.heads.8.tril", "blocks.1.sa.heads.8.key.weight", "blocks.1.sa.heads.8.query.weight", "blocks.1.sa.heads.8.value.weight", "blocks.1.sa.heads.9.tril", "blocks.1.sa.heads.9.key.weight", "blocks.1.sa.heads.9.query.weight", "blocks.1.sa.heads.9.value.weight", "blocks.1.sa.heads.10.tril", "blocks.1.sa.heads.10.key.weight", "blocks.1.sa.heads.10.query.weight", "blocks.1.sa.heads.10.value.weight", "blocks.1.sa.heads.11.tril", "blocks.1.sa.heads.11.key.weight", "blocks.1.sa.heads.11.query.weight", "blocks.1.sa.heads.11.value.weight", "blocks.1.sa.proj.weight", "blocks.1.sa.proj.bias", "blocks.1.ffwd.net.0.weight", "blocks.1.ffwd.net.0.bias", "blocks.1.ffwd.net.2.weight", "blocks.1.ffwd.net.2.bias", "blocks.1.ln1.weight", "blocks.1.ln1.bias", "blocks.1.ln2.weight", "blocks.1.ln2.bias", "blocks.2.sa.heads.0.tril", "blocks.2.sa.heads.0.key.weight", "blocks.2.sa.heads.0.query.weight", "blocks.2.sa.heads.0.value.weight", "blocks.2.sa.heads.1.tril", "blocks.2.sa.heads.1.key.weight", "blocks.2.sa.heads.1.query.weight", "blocks.2.sa.heads.1.value.weight", "blocks.2.sa.heads.2.tril", "blocks.2.sa.heads.2.key.weight", "blocks.2.sa.heads.2.query.weight", "blocks.2.sa.heads.2.value.weight", "blocks.2.sa.heads.3.tril", "blocks.2.sa.heads.3.key.weight", "blocks.2.sa.heads.3.query.weight", "blocks.2.sa.heads.3.value.weight", "blocks.2.sa.heads.4.tril", "blocks.2.sa.heads.4.key.weight", "blocks.2.sa.heads.4.query.weight", "blocks.2.sa.heads.4.value.weight", "blocks.2.sa.heads.5.tril", "blocks.2.sa.heads.5.key.weight", "blocks.2.sa.heads.5.query.weight", "blocks.2.sa.heads.5.value.weight", "blocks.2.sa.heads.6.tril", "blocks.2.sa.heads.6.key.weight", "blocks.2.sa.heads.6.query.weight", "blocks.2.sa.heads.6.value.weight", "blocks.2.sa.heads.7.tril", "blocks.2.sa.heads.7.key.weight", "blocks.2.sa.heads.7.query.weight", "blocks.2.sa.heads.7.value.weight", "blocks.2.sa.heads.8.tril", "blocks.2.sa.heads.8.key.weight", "blocks.2.sa.heads.8.query.weight", "blocks.2.sa.heads.8.value.weight", "blocks.2.sa.heads.9.tril", "blocks.2.sa.heads.9.key.weight", "blocks.2.sa.heads.9.query.weight", "blocks.2.sa.heads.9.value.weight", "blocks.2.sa.heads.10.tril", "blocks.2.sa.heads.10.key.weight", "blocks.2.sa.heads.10.query.weight", "blocks.2.sa.heads.10.value.weight", "blocks.2.sa.heads.11.tril", "blocks.2.sa.heads.11.key.weight", "blocks.2.sa.heads.11.query.weight", "blocks.2.sa.heads.11.value.weight", "blocks.2.sa.proj.weight", "blocks.2.sa.proj.bias", "blocks.2.ffwd.net.0.weight", "blocks.2.ffwd.net.0.bias", "blocks.2.ffwd.net.2.weight", "blocks.2.ffwd.net.2.bias", "blocks.2.ln1.weight", "blocks.2.ln1.bias", "blocks.2.ln2.weight", "blocks.2.ln2.bias", "blocks.3.sa.heads.0.tril", "blocks.3.sa.heads.0.key.weight", "blocks.3.sa.heads.0.query.weight", "blocks.3.sa.heads.0.value.weight", "blocks.3.sa.heads.1.tril", "blocks.3.sa.heads.1.key.weight", "blocks.3.sa.heads.1.query.weight", "blocks.3.sa.heads.1.value.weight", "blocks.3.sa.heads.2.tril", "blocks.3.sa.heads.2.key.weight", "blocks.3.sa.heads.2.query.weight", "blocks.3.sa.heads.2.value.weight", "blocks.3.sa.heads.3.tril", "blocks.3.sa.heads.3.key.weight", "blocks.3.sa.heads.3.query.weight", "blocks.3.sa.heads.3.value.weight", "blocks.3.sa.heads.4.tril", "blocks.3.sa.heads.4.key.weight", "blocks.3.sa.heads.4.query.weight", "blocks.3.sa.heads.4.value.weight", "blocks.3.sa.heads.5.tril", "blocks.3.sa.heads.5.key.weight", "blocks.3.sa.heads.5.query.weight", "blocks.3.sa.heads.5.value.weight", "blocks.3.sa.heads.6.tril", "blocks.3.sa.heads.6.key.weight", "blocks.3.sa.heads.6.query.weight", "blocks.3.sa.heads.6.value.weight", "blocks.3.sa.heads.7.tril", "blocks.3.sa.heads.7.key.weight", "blocks.3.sa.heads.7.query.weight", "blocks.3.sa.heads.7.value.weight", "blocks.3.sa.heads.8.tril", "blocks.3.sa.heads.8.key.weight", "blocks.3.sa.heads.8.query.weight", "blocks.3.sa.heads.8.value.weight", "blocks.3.sa.heads.9.tril", "blocks.3.sa.heads.9.key.weight", "blocks.3.sa.heads.9.query.weight", "blocks.3.sa.heads.9.value.weight", "blocks.3.sa.heads.10.tril", "blocks.3.sa.heads.10.key.weight", "blocks.3.sa.heads.10.query.weight", "blocks.3.sa.heads.10.value.weight", "blocks.3.sa.heads.11.tril", "blocks.3.sa.heads.11.key.weight", "blocks.3.sa.heads.11.query.weight", "blocks.3.sa.heads.11.value.weight", "blocks.3.sa.proj.weight", "blocks.3.sa.proj.bias", "blocks.3.ffwd.net.0.weight", "blocks.3.ffwd.net.0.bias", "blocks.3.ffwd.net.2.weight", "blocks.3.ffwd.net.2.bias", "blocks.3.ln1.weight", "blocks.3.ln1.bias", "blocks.3.ln2.weight", "blocks.3.ln2.bias", "blocks.4.sa.heads.0.tril", "blocks.4.sa.heads.0.key.weight", "blocks.4.sa.heads.0.query.weight", "blocks.4.sa.heads.0.value.weight", "blocks.4.sa.heads.1.tril", "blocks.4.sa.heads.1.key.weight", "blocks.4.sa.heads.1.query.weight", "blocks.4.sa.heads.1.value.weight", "blocks.4.sa.heads.2.tril", "blocks.4.sa.heads.2.key.weight", "blocks.4.sa.heads.2.query.weight", "blocks.4.sa.heads.2.value.weight", "blocks.4.sa.heads.3.tril", "blocks.4.sa.heads.3.key.weight", "blocks.4.sa.heads.3.query.weight", "blocks.4.sa.heads.3.value.weight", "blocks.4.sa.heads.4.tril", "blocks.4.sa.heads.4.key.weight", "blocks.4.sa.heads.4.query.weight", "blocks.4.sa.heads.4.value.weight", "blocks.4.sa.heads.5.tril", "blocks.4.sa.heads.5.key.weight", "blocks.4.sa.heads.5.query.weight", "blocks.4.sa.heads.5.value.weight", "blocks.4.sa.heads.6.tril", "blocks.4.sa.heads.6.key.weight", "blocks.4.sa.heads.6.query.weight", "blocks.4.sa.heads.6.value.weight", "blocks.4.sa.heads.7.tril", "blocks.4.sa.heads.7.key.weight", "blocks.4.sa.heads.7.query.weight", "blocks.4.sa.heads.7.value.weight", "blocks.4.sa.heads.8.tril", "blocks.4.sa.heads.8.key.weight", "blocks.4.sa.heads.8.query.weight", "blocks.4.sa.heads.8.value.weight", "blocks.4.sa.heads.9.tril", "blocks.4.sa.heads.9.key.weight", "blocks.4.sa.heads.9.query.weight", "blocks.4.sa.heads.9.value.weight", "blocks.4.sa.heads.10.tril", "blocks.4.sa.heads.10.key.weight", "blocks.4.sa.heads.10.query.weight", "blocks.4.sa.heads.10.value.weight", "blocks.4.sa.heads.11.tril", "blocks.4.sa.heads.11.key.weight", "blocks.4.sa.heads.11.query.weight", "blocks.4.sa.heads.11.value.weight", "blocks.4.sa.proj.weight", "blocks.4.sa.proj.bias", "blocks.4.ffwd.net.0.weight", "blocks.4.ffwd.net.0.bias", "blocks.4.ffwd.net.2.weight", "blocks.4.ffwd.net.2.bias", "blocks.4.ln1.weight", "blocks.4.ln1.bias", "blocks.4.ln2.weight", "blocks.4.ln2.bias", "blocks.5.sa.heads.0.tril", "blocks.5.sa.heads.0.key.weight", "blocks.5.sa.heads.0.query.weight", "blocks.5.sa.heads.0.value.weight", "blocks.5.sa.heads.1.tril", "blocks.5.sa.heads.1.key.weight", "blocks.5.sa.heads.1.query.weight", "blocks.5.sa.heads.1.value.weight", "blocks.5.sa.heads.2.tril", "blocks.5.sa.heads.2.key.weight", "blocks.5.sa.heads.2.query.weight", "blocks.5.sa.heads.2.value.weight", "blocks.5.sa.heads.3.tril", "blocks.5.sa.heads.3.key.weight", "blocks.5.sa.heads.3.query.weight", "blocks.5.sa.heads.3.value.weight", "blocks.5.sa.heads.4.tril", "blocks.5.sa.heads.4.key.weight", "blocks.5.sa.heads.4.query.weight", "blocks.5.sa.heads.4.value.weight", "blocks.5.sa.heads.5.tril", "blocks.5.sa.heads.5.key.weight", "blocks.5.sa.heads.5.query.weight", "blocks.5.sa.heads.5.value.weight", "blocks.5.sa.heads.6.tril", "blocks.5.sa.heads.6.key.weight", "blocks.5.sa.heads.6.query.weight", "blocks.5.sa.heads.6.value.weight", "blocks.5.sa.heads.7.tril", "blocks.5.sa.heads.7.key.weight", "blocks.5.sa.heads.7.query.weight", "blocks.5.sa.heads.7.value.weight", "blocks.5.sa.heads.8.tril", "blocks.5.sa.heads.8.key.weight", "blocks.5.sa.heads.8.query.weight", "blocks.5.sa.heads.8.value.weight", "blocks.5.sa.heads.9.tril", "blocks.5.sa.heads.9.key.weight", "blocks.5.sa.heads.9.query.weight", "blocks.5.sa.heads.9.value.weight", "blocks.5.sa.heads.10.tril", "blocks.5.sa.heads.10.key.weight", "blocks.5.sa.heads.10.query.weight", "blocks.5.sa.heads.10.value.weight", "blocks.5.sa.heads.11.tril", "blocks.5.sa.heads.11.key.weight", "blocks.5.sa.heads.11.query.weight", "blocks.5.sa.heads.11.value.weight", "blocks.5.sa.proj.weight", "blocks.5.sa.proj.bias", "blocks.5.ffwd.net.0.weight", "blocks.5.ffwd.net.0.bias", "blocks.5.ffwd.net.2.weight", "blocks.5.ffwd.net.2.bias", "blocks.5.ln1.weight", "blocks.5.ln1.bias", "blocks.5.ln2.weight", "blocks.5.ln2.bias", "blocks.6.sa.heads.0.tril", "blocks.6.sa.heads.0.key.weight", "blocks.6.sa.heads.0.query.weight", "blocks.6.sa.heads.0.value.weight", "blocks.6.sa.heads.1.tril", "blocks.6.sa.heads.1.key.weight", "blocks.6.sa.heads.1.query.weight", "blocks.6.sa.heads.1.value.weight", "blocks.6.sa.heads.2.tril", "blocks.6.sa.heads.2.key.weight", "blocks.6.sa.heads.2.query.weight", "blocks.6.sa.heads.2.value.weight", "blocks.6.sa.heads.3.tril", "blocks.6.sa.heads.3.key.weight", "blocks.6.sa.heads.3.query.weight", "blocks.6.sa.heads.3.value.weight", "blocks.6.sa.heads.4.tril", "blocks.6.sa.heads.4.key.weight", "blocks.6.sa.heads.4.query.weight", "blocks.6.sa.heads.4.value.weight", "blocks.6.sa.heads.5.tril", "blocks.6.sa.heads.5.key.weight", "blocks.6.sa.heads.5.query.weight", "blocks.6.sa.heads.5.value.weight", "blocks.6.sa.heads.6.tril", "blocks.6.sa.heads.6.key.weight", "blocks.6.sa.heads.6.query.weight", "blocks.6.sa.heads.6.value.weight", "blocks.6.sa.heads.7.tril", "blocks.6.sa.heads.7.key.weight", "blocks.6.sa.heads.7.query.weight", "blocks.6.sa.heads.7.value.weight", "blocks.6.sa.heads.8.tril", "blocks.6.sa.heads.8.key.weight", "blocks.6.sa.heads.8.query.weight", "blocks.6.sa.heads.8.value.weight", "blocks.6.sa.heads.9.tril", "blocks.6.sa.heads.9.key.weight", "blocks.6.sa.heads.9.query.weight", "blocks.6.sa.heads.9.value.weight", "blocks.6.sa.heads.10.tril", "blocks.6.sa.heads.10.key.weight", "blocks.6.sa.heads.10.query.weight", "blocks.6.sa.heads.10.value.weight", "blocks.6.sa.heads.11.tril", "blocks.6.sa.heads.11.key.weight", "blocks.6.sa.heads.11.query.weight", "blocks.6.sa.heads.11.value.weight", "blocks.6.sa.proj.weight", "blocks.6.sa.proj.bias", "blocks.6.ffwd.net.0.weight", "blocks.6.ffwd.net.0.bias", "blocks.6.ffwd.net.2.weight", "blocks.6.ffwd.net.2.bias", "blocks.6.ln1.weight", "blocks.6.ln1.bias", "blocks.6.ln2.weight", "blocks.6.ln2.bias", "blocks.7.sa.heads.0.tril", "blocks.7.sa.heads.0.key.weight", "blocks.7.sa.heads.0.query.weight", "blocks.7.sa.heads.0.value.weight", "blocks.7.sa.heads.1.tril", "blocks.7.sa.heads.1.key.weight", "blocks.7.sa.heads.1.query.weight", "blocks.7.sa.heads.1.value.weight", "blocks.7.sa.heads.2.tril", "blocks.7.sa.heads.2.key.weight", "blocks.7.sa.heads.2.query.weight", "blocks.7.sa.heads.2.value.weight", "blocks.7.sa.heads.3.tril", "blocks.7.sa.heads.3.key.weight", "blocks.7.sa.heads.3.query.weight", "blocks.7.sa.heads.3.value.weight", "blocks.7.sa.heads.4.tril", "blocks.7.sa.heads.4.key.weight", "blocks.7.sa.heads.4.query.weight", "blocks.7.sa.heads.4.value.weight", "blocks.7.sa.heads.5.tril", "blocks.7.sa.heads.5.key.weight", "blocks.7.sa.heads.5.query.weight", "blocks.7.sa.heads.5.value.weight", "blocks.7.sa.heads.6.tril", "blocks.7.sa.heads.6.key.weight", "blocks.7.sa.heads.6.query.weight", "blocks.7.sa.heads.6.value.weight", "blocks.7.sa.heads.7.tril", "blocks.7.sa.heads.7.key.weight", "blocks.7.sa.heads.7.query.weight", "blocks.7.sa.heads.7.value.weight", "blocks.7.sa.heads.8.tril", "blocks.7.sa.heads.8.key.weight", "blocks.7.sa.heads.8.query.weight", "blocks.7.sa.heads.8.value.weight", "blocks.7.sa.heads.9.tril", "blocks.7.sa.heads.9.key.weight", "blocks.7.sa.heads.9.query.weight", "blocks.7.sa.heads.9.value.weight", "blocks.7.sa.heads.10.tril", "blocks.7.sa.heads.10.key.weight", "blocks.7.sa.heads.10.query.weight", "blocks.7.sa.heads.10.value.weight", "blocks.7.sa.heads.11.tril", "blocks.7.sa.heads.11.key.weight", "blocks.7.sa.heads.11.query.weight", "blocks.7.sa.heads.11.value.weight", "blocks.7.sa.proj.weight", "blocks.7.sa.proj.bias", "blocks.7.ffwd.net.0.weight", "blocks.7.ffwd.net.0.bias", "blocks.7.ffwd.net.2.weight", "blocks.7.ffwd.net.2.bias", "blocks.7.ln1.weight", "blocks.7.ln1.bias", "blocks.7.ln2.weight", "blocks.7.ln2.bias", "blocks.8.sa.heads.0.tril", "blocks.8.sa.heads.0.key.weight", "blocks.8.sa.heads.0.query.weight", "blocks.8.sa.heads.0.value.weight", "blocks.8.sa.heads.1.tril", "blocks.8.sa.heads.1.key.weight", "blocks.8.sa.heads.1.query.weight", "blocks.8.sa.heads.1.value.weight", "blocks.8.sa.heads.2.tril", "blocks.8.sa.heads.2.key.weight", "blocks.8.sa.heads.2.query.weight", "blocks.8.sa.heads.2.value.weight", "blocks.8.sa.heads.3.tril", "blocks.8.sa.heads.3.key.weight", "blocks.8.sa.heads.3.query.weight", "blocks.8.sa.heads.3.value.weight", "blocks.8.sa.heads.4.tril", "blocks.8.sa.heads.4.key.weight", "blocks.8.sa.heads.4.query.weight", "blocks.8.sa.heads.4.value.weight", "blocks.8.sa.heads.5.tril", "blocks.8.sa.heads.5.key.weight", "blocks.8.sa.heads.5.query.weight", "blocks.8.sa.heads.5.value.weight", "blocks.8.sa.heads.6.tril", "blocks.8.sa.heads.6.key.weight", "blocks.8.sa.heads.6.query.weight", "blocks.8.sa.heads.6.value.weight", "blocks.8.sa.heads.7.tril", "blocks.8.sa.heads.7.key.weight", "blocks.8.sa.heads.7.query.weight", "blocks.8.sa.heads.7.value.weight", "blocks.8.sa.heads.8.tril", "blocks.8.sa.heads.8.key.weight", "blocks.8.sa.heads.8.query.weight", "blocks.8.sa.heads.8.value.weight", "blocks.8.sa.heads.9.tril", "blocks.8.sa.heads.9.key.weight", "blocks.8.sa.heads.9.query.weight", "blocks.8.sa.heads.9.value.weight", "blocks.8.sa.heads.10.tril", "blocks.8.sa.heads.10.key.weight", "blocks.8.sa.heads.10.query.weight", "blocks.8.sa.heads.10.value.weight", "blocks.8.sa.heads.11.tril", "blocks.8.sa.heads.11.key.weight", "blocks.8.sa.heads.11.query.weight", "blocks.8.sa.heads.11.value.weight", "blocks.8.sa.proj.weight", "blocks.8.sa.proj.bias", "blocks.8.ffwd.net.0.weight", "blocks.8.ffwd.net.0.bias", "blocks.8.ffwd.net.2.weight", "blocks.8.ffwd.net.2.bias", "blocks.8.ln1.weight", "blocks.8.ln1.bias", "blocks.8.ln2.weight", "blocks.8.ln2.bias", "blocks.9.sa.heads.0.tril", "blocks.9.sa.heads.0.key.weight", "blocks.9.sa.heads.0.query.weight", "blocks.9.sa.heads.0.value.weight", "blocks.9.sa.heads.1.tril", "blocks.9.sa.heads.1.key.weight", "blocks.9.sa.heads.1.query.weight", "blocks.9.sa.heads.1.value.weight", "blocks.9.sa.heads.2.tril", "blocks.9.sa.heads.2.key.weight", "blocks.9.sa.heads.2.query.weight", "blocks.9.sa.heads.2.value.weight", "blocks.9.sa.heads.3.tril", "blocks.9.sa.heads.3.key.weight", "blocks.9.sa.heads.3.query.weight", "blocks.9.sa.heads.3.value.weight", "blocks.9.sa.heads.4.tril", "blocks.9.sa.heads.4.key.weight", "blocks.9.sa.heads.4.query.weight", "blocks.9.sa.heads.4.value.weight", "blocks.9.sa.heads.5.tril", "blocks.9.sa.heads.5.key.weight", "blocks.9.sa.heads.5.query.weight", "blocks.9.sa.heads.5.value.weight", "blocks.9.sa.heads.6.tril", "blocks.9.sa.heads.6.key.weight", "blocks.9.sa.heads.6.query.weight", "blocks.9.sa.heads.6.value.weight", "blocks.9.sa.heads.7.tril", "blocks.9.sa.heads.7.key.weight", "blocks.9.sa.heads.7.query.weight", "blocks.9.sa.heads.7.value.weight", "blocks.9.sa.heads.8.tril", "blocks.9.sa.heads.8.key.weight", "blocks.9.sa.heads.8.query.weight", "blocks.9.sa.heads.8.value.weight", "blocks.9.sa.heads.9.tril", "blocks.9.sa.heads.9.key.weight", "blocks.9.sa.heads.9.query.weight", "blocks.9.sa.heads.9.value.weight", "blocks.9.sa.heads.10.tril", "blocks.9.sa.heads.10.key.weight", "blocks.9.sa.heads.10.query.weight", "blocks.9.sa.heads.10.value.weight", "blocks.9.sa.heads.11.tril", "blocks.9.sa.heads.11.key.weight", "blocks.9.sa.heads.11.query.weight", "blocks.9.sa.heads.11.value.weight", "blocks.9.sa.proj.weight", "blocks.9.sa.proj.bias", "blocks.9.ffwd.net.0.weight", "blocks.9.ffwd.net.0.bias", "blocks.9.ffwd.net.2.weight", "blocks.9.ffwd.net.2.bias", "blocks.9.ln1.weight", "blocks.9.ln1.bias", "blocks.9.ln2.weight", "blocks.9.ln2.bias", "blocks.10.sa.heads.0.tril", "blocks.10.sa.heads.0.key.weight", "blocks.10.sa.heads.0.query.weight", "blocks.10.sa.heads.0.value.weight", "blocks.10.sa.heads.1.tril", "blocks.10.sa.heads.1.key.weight", "blocks.10.sa.heads.1.query.weight", "blocks.10.sa.heads.1.value.weight", "blocks.10.sa.heads.2.tril", "blocks.10.sa.heads.2.key.weight", "blocks.10.sa.heads.2.query.weight", "blocks.10.sa.heads.2.value.weight", "blocks.10.sa.heads.3.tril", "blocks.10.sa.heads.3.key.weight", "blocks.10.sa.heads.3.query.weight", "blocks.10.sa.heads.3.value.weight", "blocks.10.sa.heads.4.tril", "blocks.10.sa.heads.4.key.weight", "blocks.10.sa.heads.4.query.weight", "blocks.10.sa.heads.4.value.weight", "blocks.10.sa.heads.5.tril", "blocks.10.sa.heads.5.key.weight", "blocks.10.sa.heads.5.query.weight", "blocks.10.sa.heads.5.value.weight", "blocks.10.sa.heads.6.tril", "blocks.10.sa.heads.6.key.weight", "blocks.10.sa.heads.6.query.weight", "blocks.10.sa.heads.6.value.weight", "blocks.10.sa.heads.7.tril", "blocks.10.sa.heads.7.key.weight", "blocks.10.sa.heads.7.query.weight", "blocks.10.sa.heads.7.value.weight", "blocks.10.sa.heads.8.tril", "blocks.10.sa.heads.8.key.weight", "blocks.10.sa.heads.8.query.weight", "blocks.10.sa.heads.8.value.weight", "blocks.10.sa.heads.9.tril", "blocks.10.sa.heads.9.key.weight", "blocks.10.sa.heads.9.query.weight", "blocks.10.sa.heads.9.value.weight", "blocks.10.sa.heads.10.tril", "blocks.10.sa.heads.10.key.weight", "blocks.10.sa.heads.10.query.weight", "blocks.10.sa.heads.10.value.weight", "blocks.10.sa.heads.11.tril", "blocks.10.sa.heads.11.key.weight", "blocks.10.sa.heads.11.query.weight", "blocks.10.sa.heads.11.value.weight", "blocks.10.sa.proj.weight", "blocks.10.sa.proj.bias", "blocks.10.ffwd.net.0.weight", "blocks.10.ffwd.net.0.bias", "blocks.10.ffwd.net.2.weight", "blocks.10.ffwd.net.2.bias", "blocks.10.ln1.weight", "blocks.10.ln1.bias", "blocks.10.ln2.weight", "blocks.10.ln2.bias", "blocks.11.sa.heads.0.tril", "blocks.11.sa.heads.0.key.weight", "blocks.11.sa.heads.0.query.weight", "blocks.11.sa.heads.0.value.weight", "blocks.11.sa.heads.1.tril", "blocks.11.sa.heads.1.key.weight", "blocks.11.sa.heads.1.query.weight", "blocks.11.sa.heads.1.value.weight", "blocks.11.sa.heads.2.tril", "blocks.11.sa.heads.2.key.weight", "blocks.11.sa.heads.2.query.weight", "blocks.11.sa.heads.2.value.weight", "blocks.11.sa.heads.3.tril", "blocks.11.sa.heads.3.key.weight", "blocks.11.sa.heads.3.query.weight", "blocks.11.sa.heads.3.value.weight", "blocks.11.sa.heads.4.tril", "blocks.11.sa.heads.4.key.weight", "blocks.11.sa.heads.4.query.weight", "blocks.11.sa.heads.4.value.weight", "blocks.11.sa.heads.5.tril", "blocks.11.sa.heads.5.key.weight", "blocks.11.sa.heads.5.query.weight", "blocks.11.sa.heads.5.value.weight", "blocks.11.sa.heads.6.tril", "blocks.11.sa.heads.6.key.weight", "blocks.11.sa.heads.6.query.weight", "blocks.11.sa.heads.6.value.weight", "blocks.11.sa.heads.7.tril", "blocks.11.sa.heads.7.key.weight", "blocks.11.sa.heads.7.query.weight", "blocks.11.sa.heads.7.value.weight", "blocks.11.sa.heads.8.tril", "blocks.11.sa.heads.8.key.weight", "blocks.11.sa.heads.8.query.weight", "blocks.11.sa.heads.8.value.weight", "blocks.11.sa.heads.9.tril", "blocks.11.sa.heads.9.key.weight", "blocks.11.sa.heads.9.query.weight", "blocks.11.sa.heads.9.value.weight", "blocks.11.sa.heads.10.tril", "blocks.11.sa.heads.10.key.weight", "blocks.11.sa.heads.10.query.weight", "blocks.11.sa.heads.10.value.weight", "blocks.11.sa.heads.11.tril", "blocks.11.sa.heads.11.key.weight", "blocks.11.sa.heads.11.query.weight", "blocks.11.sa.heads.11.value.weight", "blocks.11.sa.proj.weight", "blocks.11.sa.proj.bias", "blocks.11.ffwd.net.0.weight", "blocks.11.ffwd.net.0.bias", "blocks.11.ffwd.net.2.weight", "blocks.11.ffwd.net.2.bias", "blocks.11.ln1.weight", "blocks.11.ln1.bias", "blocks.11.ln2.weight", "blocks.11.ln2.bias", "blocks.12.sa.heads.0.tril", "blocks.12.sa.heads.0.key.weight", "blocks.12.sa.heads.0.query.weight", "blocks.12.sa.heads.0.value.weight", "blocks.12.sa.heads.1.tril", "blocks.12.sa.heads.1.key.weight", "blocks.12.sa.heads.1.query.weight", "blocks.12.sa.heads.1.value.weight", "blocks.12.sa.heads.2.tril", "blocks.12.sa.heads.2.key.weight", "blocks.12.sa.heads.2.query.weight", "blocks.12.sa.heads.2.value.weight", "blocks.12.sa.heads.3.tril", "blocks.12.sa.heads.3.key.weight", "blocks.12.sa.heads.3.query.weight", "blocks.12.sa.heads.3.value.weight", "blocks.12.sa.heads.4.tril", "blocks.12.sa.heads.4.key.weight", "blocks.12.sa.heads.4.query.weight", "blocks.12.sa.heads.4.value.weight", "blocks.12.sa.heads.5.tril", "blocks.12.sa.heads.5.key.weight", "blocks.12.sa.heads.5.query.weight", "blocks.12.sa.heads.5.value.weight", "blocks.12.sa.heads.6.tril", "blocks.12.sa.heads.6.key.weight", "blocks.12.sa.heads.6.query.weight", "blocks.12.sa.heads.6.value.weight", "blocks.12.sa.heads.7.tril", "blocks.12.sa.heads.7.key.weight", "blocks.12.sa.heads.7.query.weight", "blocks.12.sa.heads.7.value.weight", "blocks.12.sa.heads.8.tril", "blocks.12.sa.heads.8.key.weight", "blocks.12.sa.heads.8.query.weight", "blocks.12.sa.heads.8.value.weight", "blocks.12.sa.heads.9.tril", "blocks.12.sa.heads.9.key.weight", "blocks.12.sa.heads.9.query.weight", "blocks.12.sa.heads.9.value.weight", "blocks.12.sa.heads.10.tril", "blocks.12.sa.heads.10.key.weight", "blocks.12.sa.heads.10.query.weight", "blocks.12.sa.heads.10.value.weight", "blocks.12.sa.heads.11.tril", "blocks.12.sa.heads.11.key.weight", "blocks.12.sa.heads.11.query.weight", "blocks.12.sa.heads.11.value.weight", "blocks.12.sa.proj.weight", "blocks.12.sa.proj.bias", "blocks.12.ffwd.net.0.weight", "blocks.12.ffwd.net.0.bias", "blocks.12.ffwd.net.2.weight", "blocks.12.ffwd.net.2.bias", "blocks.12.ln1.weight", "blocks.12.ln1.bias", "blocks.12.ln2.weight", "blocks.12.ln2.bias", "blocks.13.sa.heads.0.tril", "blocks.13.sa.heads.0.key.weight", "blocks.13.sa.heads.0.query.weight", "blocks.13.sa.heads.0.value.weight", "blocks.13.sa.heads.1.tril", "blocks.13.sa.heads.1.key.weight", "blocks.13.sa.heads.1.query.weight", "blocks.13.sa.heads.1.value.weight", "blocks.13.sa.heads.2.tril", "blocks.13.sa.heads.2.key.weight", "blocks.13.sa.heads.2.query.weight", "blocks.13.sa.heads.2.value.weight", "blocks.13.sa.heads.3.tril", "blocks.13.sa.heads.3.key.weight", "blocks.13.sa.heads.3.query.weight", "blocks.13.sa.heads.3.value.weight", "blocks.13.sa.heads.4.tril", "blocks.13.sa.heads.4.key.weight", "blocks.13.sa.heads.4.query.weight", "blocks.13.sa.heads.4.value.weight", "blocks.13.sa.heads.5.tril", "blocks.13.sa.heads.5.key.weight", "blocks.13.sa.heads.5.query.weight", "blocks.13.sa.heads.5.value.weight", "blocks.13.sa.heads.6.tril", "blocks.13.sa.heads.6.key.weight", "blocks.13.sa.heads.6.query.weight", "blocks.13.sa.heads.6.value.weight", "blocks.13.sa.heads.7.tril", "blocks.13.sa.heads.7.key.weight", "blocks.13.sa.heads.7.query.weight", "blocks.13.sa.heads.7.value.weight", "blocks.13.sa.heads.8.tril", "blocks.13.sa.heads.8.key.weight", "blocks.13.sa.heads.8.query.weight", "blocks.13.sa.heads.8.value.weight", "blocks.13.sa.heads.9.tril", "blocks.13.sa.heads.9.key.weight", "blocks.13.sa.heads.9.query.weight", "blocks.13.sa.heads.9.value.weight", "blocks.13.sa.heads.10.tril", "blocks.13.sa.heads.10.key.weight", "blocks.13.sa.heads.10.query.weight", "blocks.13.sa.heads.10.value.weight", "blocks.13.sa.heads.11.tril", "blocks.13.sa.heads.11.key.weight", "blocks.13.sa.heads.11.query.weight", "blocks.13.sa.heads.11.value.weight", "blocks.13.sa.proj.weight", "blocks.13.sa.proj.bias", "blocks.13.ffwd.net.0.weight", "blocks.13.ffwd.net.0.bias", "blocks.13.ffwd.net.2.weight", "blocks.13.ffwd.net.2.bias", "blocks.13.ln1.weight", "blocks.13.ln1.bias", "blocks.13.ln2.weight", "blocks.13.ln2.bias", "blocks.14.sa.heads.0.tril", "blocks.14.sa.heads.0.key.weight", "blocks.14.sa.heads.0.query.weight", "blocks.14.sa.heads.0.value.weight", "blocks.14.sa.heads.1.tril", "blocks.14.sa.heads.1.key.weight", "blocks.14.sa.heads.1.query.weight", "blocks.14.sa.heads.1.value.weight", "blocks.14.sa.heads.2.tril", "blocks.14.sa.heads.2.key.weight", "blocks.14.sa.heads.2.query.weight", "blocks.14.sa.heads.2.value.weight", "blocks.14.sa.heads.3.tril", "blocks.14.sa.heads.3.key.weight", "blocks.14.sa.heads.3.query.weight", "blocks.14.sa.heads.3.value.weight", "blocks.14.sa.heads.4.tril", "blocks.14.sa.heads.4.key.weight", "blocks.14.sa.heads.4.query.weight", "blocks.14.sa.heads.4.value.weight", "blocks.14.sa.heads.5.tril", "blocks.14.sa.heads.5.key.weight", "blocks.14.sa.heads.5.query.weight", "blocks.14.sa.heads.5.value.weight", "blocks.14.sa.heads.6.tril", "blocks.14.sa.heads.6.key.weight", "blocks.14.sa.heads.6.query.weight", "blocks.14.sa.heads.6.value.weight", "blocks.14.sa.heads.7.tril", "blocks.14.sa.heads.7.key.weight", "blocks.14.sa.heads.7.query.weight", "blocks.14.sa.heads.7.value.weight", "blocks.14.sa.heads.8.tril", "blocks.14.sa.heads.8.key.weight", "blocks.14.sa.heads.8.query.weight", "blocks.14.sa.heads.8.value.weight", "blocks.14.sa.heads.9.tril", "blocks.14.sa.heads.9.key.weight", "blocks.14.sa.heads.9.query.weight", "blocks.14.sa.heads.9.value.weight", "blocks.14.sa.heads.10.tril", "blocks.14.sa.heads.10.key.weight", "blocks.14.sa.heads.10.query.weight", "blocks.14.sa.heads.10.value.weight", "blocks.14.sa.heads.11.tril", "blocks.14.sa.heads.11.key.weight", "blocks.14.sa.heads.11.query.weight", "blocks.14.sa.heads.11.value.weight", "blocks.14.sa.proj.weight", "blocks.14.sa.proj.bias", "blocks.14.ffwd.net.0.weight", "blocks.14.ffwd.net.0.bias", "blocks.14.ffwd.net.2.weight", "blocks.14.ffwd.net.2.bias", "blocks.14.ln1.weight", "blocks.14.ln1.bias", "blocks.14.ln2.weight", "blocks.14.ln2.bias", "blocks.15.sa.heads.0.tril", "blocks.15.sa.heads.0.key.weight", "blocks.15.sa.heads.0.query.weight", "blocks.15.sa.heads.0.value.weight", "blocks.15.sa.heads.1.tril", "blocks.15.sa.heads.1.key.weight", "blocks.15.sa.heads.1.query.weight", "blocks.15.sa.heads.1.value.weight", "blocks.15.sa.heads.2.tril", "blocks.15.sa.heads.2.key.weight", "blocks.15.sa.heads.2.query.weight", "blocks.15.sa.heads.2.value.weight", "blocks.15.sa.heads.3.tril", "blocks.15.sa.heads.3.key.weight", "blocks.15.sa.heads.3.query.weight", "blocks.15.sa.heads.3.value.weight", "blocks.15.sa.heads.4.tril", "blocks.15.sa.heads.4.key.weight", "blocks.15.sa.heads.4.query.weight", "blocks.15.sa.heads.4.value.weight", "blocks.15.sa.heads.5.tril", "blocks.15.sa.heads.5.key.weight", "blocks.15.sa.heads.5.query.weight", "blocks.15.sa.heads.5.value.weight", "blocks.15.sa.heads.6.tril", "blocks.15.sa.heads.6.key.weight", "blocks.15.sa.heads.6.query.weight", "blocks.15.sa.heads.6.value.weight", "blocks.15.sa.heads.7.tril", "blocks.15.sa.heads.7.key.weight", "blocks.15.sa.heads.7.query.weight", "blocks.15.sa.heads.7.value.weight", "blocks.15.sa.heads.8.tril", "blocks.15.sa.heads.8.key.weight", "blocks.15.sa.heads.8.query.weight", "blocks.15.sa.heads.8.value.weight", "blocks.15.sa.heads.9.tril", "blocks.15.sa.heads.9.key.weight", "blocks.15.sa.heads.9.query.weight", "blocks.15.sa.heads.9.value.weight", "blocks.15.sa.heads.10.tril", "blocks.15.sa.heads.10.key.weight", "blocks.15.sa.heads.10.query.weight", "blocks.15.sa.heads.10.value.weight", "blocks.15.sa.heads.11.tril", "blocks.15.sa.heads.11.key.weight", "blocks.15.sa.heads.11.query.weight", "blocks.15.sa.heads.11.value.weight", "blocks.15.sa.proj.weight", "blocks.15.sa.proj.bias", "blocks.15.ffwd.net.0.weight", "blocks.15.ffwd.net.0.bias", "blocks.15.ffwd.net.2.weight", "blocks.15.ffwd.net.2.bias", "blocks.15.ln1.weight", "blocks.15.ln1.bias", "blocks.15.ln2.weight", "blocks.15.ln2.bias", "ln_f.weight", "ln_f.bias", "lm_head.weight", "lm_head.bias". 
	Unexpected key(s) in state_dict: "_orig_mod.token_embedding_table.weight", "_orig_mod.position_embedding_table.weight", "_orig_mod.blocks.0.sa.heads.0.tril", "_orig_mod.blocks.0.sa.heads.0.key.weight", "_orig_mod.blocks.0.sa.heads.0.query.weight", "_orig_mod.blocks.0.sa.heads.0.value.weight", "_orig_mod.blocks.0.sa.heads.1.tril", "_orig_mod.blocks.0.sa.heads.1.key.weight", "_orig_mod.blocks.0.sa.heads.1.query.weight", "_orig_mod.blocks.0.sa.heads.1.value.weight", "_orig_mod.blocks.0.sa.heads.2.tril", "_orig_mod.blocks.0.sa.heads.2.key.weight", "_orig_mod.blocks.0.sa.heads.2.query.weight", "_orig_mod.blocks.0.sa.heads.2.value.weight", "_orig_mod.blocks.0.sa.heads.3.tril", "_orig_mod.blocks.0.sa.heads.3.key.weight", "_orig_mod.blocks.0.sa.heads.3.query.weight", "_orig_mod.blocks.0.sa.heads.3.value.weight", "_orig_mod.blocks.0.sa.heads.4.tril", "_orig_mod.blocks.0.sa.heads.4.key.weight", "_orig_mod.blocks.0.sa.heads.4.query.weight", "_orig_mod.blocks.0.sa.heads.4.value.weight", "_orig_mod.blocks.0.sa.heads.5.tril", "_orig_mod.blocks.0.sa.heads.5.key.weight", "_orig_mod.blocks.0.sa.heads.5.query.weight", "_orig_mod.blocks.0.sa.heads.5.value.weight", "_orig_mod.blocks.0.sa.heads.6.tril", "_orig_mod.blocks.0.sa.heads.6.key.weight", "_orig_mod.blocks.0.sa.heads.6.query.weight", "_orig_mod.blocks.0.sa.heads.6.value.weight", "_orig_mod.blocks.0.sa.heads.7.tril", "_orig_mod.blocks.0.sa.heads.7.key.weight", "_orig_mod.blocks.0.sa.heads.7.query.weight", "_orig_mod.blocks.0.sa.heads.7.value.weight", "_orig_mod.blocks.0.sa.heads.8.tril", "_orig_mod.blocks.0.sa.heads.8.key.weight", "_orig_mod.blocks.0.sa.heads.8.query.weight", "_orig_mod.blocks.0.sa.heads.8.value.weight", "_orig_mod.blocks.0.sa.heads.9.tril", "_orig_mod.blocks.0.sa.heads.9.key.weight", "_orig_mod.blocks.0.sa.heads.9.query.weight", "_orig_mod.blocks.0.sa.heads.9.value.weight", "_orig_mod.blocks.0.sa.heads.10.tril", "_orig_mod.blocks.0.sa.heads.10.key.weight", "_orig_mod.blocks.0.sa.heads.10.query.weight", "_orig_mod.blocks.0.sa.heads.10.value.weight", "_orig_mod.blocks.0.sa.heads.11.tril", "_orig_mod.blocks.0.sa.heads.11.key.weight", "_orig_mod.blocks.0.sa.heads.11.query.weight", "_orig_mod.blocks.0.sa.heads.11.value.weight", "_orig_mod.blocks.0.sa.proj.weight", "_orig_mod.blocks.0.sa.proj.bias", "_orig_mod.blocks.0.ffwd.net.0.weight", "_orig_mod.blocks.0.ffwd.net.0.bias", "_orig_mod.blocks.0.ffwd.net.2.weight", "_orig_mod.blocks.0.ffwd.net.2.bias", "_orig_mod.blocks.0.ln1.weight", "_orig_mod.blocks.0.ln1.bias", "_orig_mod.blocks.0.ln2.weight", "_orig_mod.blocks.0.ln2.bias", "_orig_mod.blocks.1.sa.heads.0.tril", "_orig_mod.blocks.1.sa.heads.0.key.weight", "_orig_mod.blocks.1.sa.heads.0.query.weight", "_orig_mod.blocks.1.sa.heads.0.value.weight", "_orig_mod.blocks.1.sa.heads.1.tril", "_orig_mod.blocks.1.sa.heads.1.key.weight", "_orig_mod.blocks.1.sa.heads.1.query.weight", "_orig_mod.blocks.1.sa.heads.1.value.weight", "_orig_mod.blocks.1.sa.heads.2.tril", "_orig_mod.blocks.1.sa.heads.2.key.weight", "_orig_mod.blocks.1.sa.heads.2.query.weight", "_orig_mod.blocks.1.sa.heads.2.value.weight", "_orig_mod.blocks.1.sa.heads.3.tril", "_orig_mod.blocks.1.sa.heads.3.key.weight", "_orig_mod.blocks.1.sa.heads.3.query.weight", "_orig_mod.blocks.1.sa.heads.3.value.weight", "_orig_mod.blocks.1.sa.heads.4.tril", "_orig_mod.blocks.1.sa.heads.4.key.weight", "_orig_mod.blocks.1.sa.heads.4.query.weight", "_orig_mod.blocks.1.sa.heads.4.value.weight", "_orig_mod.blocks.1.sa.heads.5.tril", "_orig_mod.blocks.1.sa.heads.5.key.weight", "_orig_mod.blocks.1.sa.heads.5.query.weight", "_orig_mod.blocks.1.sa.heads.5.value.weight", "_orig_mod.blocks.1.sa.heads.6.tril", "_orig_mod.blocks.1.sa.heads.6.key.weight", "_orig_mod.blocks.1.sa.heads.6.query.weight", "_orig_mod.blocks.1.sa.heads.6.value.weight", "_orig_mod.blocks.1.sa.heads.7.tril", "_orig_mod.blocks.1.sa.heads.7.key.weight", "_orig_mod.blocks.1.sa.heads.7.query.weight", "_orig_mod.blocks.1.sa.heads.7.value.weight", "_orig_mod.blocks.1.sa.heads.8.tril", "_orig_mod.blocks.1.sa.heads.8.key.weight", "_orig_mod.blocks.1.sa.heads.8.query.weight", "_orig_mod.blocks.1.sa.heads.8.value.weight", "_orig_mod.blocks.1.sa.heads.9.tril", "_orig_mod.blocks.1.sa.heads.9.key.weight", "_orig_mod.blocks.1.sa.heads.9.query.weight", "_orig_mod.blocks.1.sa.heads.9.value.weight", "_orig_mod.blocks.1.sa.heads.10.tril", "_orig_mod.blocks.1.sa.heads.10.key.weight", "_orig_mod.blocks.1.sa.heads.10.query.weight", "_orig_mod.blocks.1.sa.heads.10.value.weight", "_orig_mod.blocks.1.sa.heads.11.tril", "_orig_mod.blocks.1.sa.heads.11.key.weight", "_orig_mod.blocks.1.sa.heads.11.query.weight", "_orig_mod.blocks.1.sa.heads.11.value.weight", "_orig_mod.blocks.1.sa.proj.weight", "_orig_mod.blocks.1.sa.proj.bias", "_orig_mod.blocks.1.ffwd.net.0.weight", "_orig_mod.blocks.1.ffwd.net.0.bias", "_orig_mod.blocks.1.ffwd.net.2.weight", "_orig_mod.blocks.1.ffwd.net.2.bias", "_orig_mod.blocks.1.ln1.weight", "_orig_mod.blocks.1.ln1.bias", "_orig_mod.blocks.1.ln2.weight", "_orig_mod.blocks.1.ln2.bias", "_orig_mod.blocks.2.sa.heads.0.tril", "_orig_mod.blocks.2.sa.heads.0.key.weight", "_orig_mod.blocks.2.sa.heads.0.query.weight", "_orig_mod.blocks.2.sa.heads.0.value.weight", "_orig_mod.blocks.2.sa.heads.1.tril", "_orig_mod.blocks.2.sa.heads.1.key.weight", "_orig_mod.blocks.2.sa.heads.1.query.weight", "_orig_mod.blocks.2.sa.heads.1.value.weight", "_orig_mod.blocks.2.sa.heads.2.tril", "_orig_mod.blocks.2.sa.heads.2.key.weight", "_orig_mod.blocks.2.sa.heads.2.query.weight", "_orig_mod.blocks.2.sa.heads.2.value.weight", "_orig_mod.blocks.2.sa.heads.3.tril", "_orig_mod.blocks.2.sa.heads.3.key.weight", "_orig_mod.blocks.2.sa.heads.3.query.weight", "_orig_mod.blocks.2.sa.heads.3.value.weight", "_orig_mod.blocks.2.sa.heads.4.tril", "_orig_mod.blocks.2.sa.heads.4.key.weight", "_orig_mod.blocks.2.sa.heads.4.query.weight", "_orig_mod.blocks.2.sa.heads.4.value.weight", "_orig_mod.blocks.2.sa.heads.5.tril", "_orig_mod.blocks.2.sa.heads.5.key.weight", "_orig_mod.blocks.2.sa.heads.5.query.weight", "_orig_mod.blocks.2.sa.heads.5.value.weight", "_orig_mod.blocks.2.sa.heads.6.tril", "_orig_mod.blocks.2.sa.heads.6.key.weight", "_orig_mod.blocks.2.sa.heads.6.query.weight", "_orig_mod.blocks.2.sa.heads.6.value.weight", "_orig_mod.blocks.2.sa.heads.7.tril", "_orig_mod.blocks.2.sa.heads.7.key.weight", "_orig_mod.blocks.2.sa.heads.7.query.weight", "_orig_mod.blocks.2.sa.heads.7.value.weight", "_orig_mod.blocks.2.sa.heads.8.tril", "_orig_mod.blocks.2.sa.heads.8.key.weight", "_orig_mod.blocks.2.sa.heads.8.query.weight", "_orig_mod.blocks.2.sa.heads.8.value.weight", "_orig_mod.blocks.2.sa.heads.9.tril", "_orig_mod.blocks.2.sa.heads.9.key.weight", "_orig_mod.blocks.2.sa.heads.9.query.weight", "_orig_mod.blocks.2.sa.heads.9.value.weight", "_orig_mod.blocks.2.sa.heads.10.tril", "_orig_mod.blocks.2.sa.heads.10.key.weight", "_orig_mod.blocks.2.sa.heads.10.query.weight", "_orig_mod.blocks.2.sa.heads.10.value.weight", "_orig_mod.blocks.2.sa.heads.11.tril", "_orig_mod.blocks.2.sa.heads.11.key.weight", "_orig_mod.blocks.2.sa.heads.11.query.weight", "_orig_mod.blocks.2.sa.heads.11.value.weight", "_orig_mod.blocks.2.sa.proj.weight", "_orig_mod.blocks.2.sa.proj.bias", "_orig_mod.blocks.2.ffwd.net.0.weight", "_orig_mod.blocks.2.ffwd.net.0.bias", "_orig_mod.blocks.2.ffwd.net.2.weight", "_orig_mod.blocks.2.ffwd.net.2.bias", "_orig_mod.blocks.2.ln1.weight", "_orig_mod.blocks.2.ln1.bias", "_orig_mod.blocks.2.ln2.weight", "_orig_mod.blocks.2.ln2.bias", "_orig_mod.blocks.3.sa.heads.0.tril", "_orig_mod.blocks.3.sa.heads.0.key.weight", "_orig_mod.blocks.3.sa.heads.0.query.weight", "_orig_mod.blocks.3.sa.heads.0.value.weight", "_orig_mod.blocks.3.sa.heads.1.tril", "_orig_mod.blocks.3.sa.heads.1.key.weight", "_orig_mod.blocks.3.sa.heads.1.query.weight", "_orig_mod.blocks.3.sa.heads.1.value.weight", "_orig_mod.blocks.3.sa.heads.2.tril", "_orig_mod.blocks.3.sa.heads.2.key.weight", "_orig_mod.blocks.3.sa.heads.2.query.weight", "_orig_mod.blocks.3.sa.heads.2.value.weight", "_orig_mod.blocks.3.sa.heads.3.tril", "_orig_mod.blocks.3.sa.heads.3.key.weight", "_orig_mod.blocks.3.sa.heads.3.query.weight", "_orig_mod.blocks.3.sa.heads.3.value.weight", "_orig_mod.blocks.3.sa.heads.4.tril", "_orig_mod.blocks.3.sa.heads.4.key.weight", "_orig_mod.blocks.3.sa.heads.4.query.weight", "_orig_mod.blocks.3.sa.heads.4.value.weight", "_orig_mod.blocks.3.sa.heads.5.tril", "_orig_mod.blocks.3.sa.heads.5.key.weight", "_orig_mod.blocks.3.sa.heads.5.query.weight", "_orig_mod.blocks.3.sa.heads.5.value.weight", "_orig_mod.blocks.3.sa.heads.6.tril", "_orig_mod.blocks.3.sa.heads.6.key.weight", "_orig_mod.blocks.3.sa.heads.6.query.weight", "_orig_mod.blocks.3.sa.heads.6.value.weight", "_orig_mod.blocks.3.sa.heads.7.tril", "_orig_mod.blocks.3.sa.heads.7.key.weight", "_orig_mod.blocks.3.sa.heads.7.query.weight", "_orig_mod.blocks.3.sa.heads.7.value.weight", "_orig_mod.blocks.3.sa.heads.8.tril", "_orig_mod.blocks.3.sa.heads.8.key.weight", "_orig_mod.blocks.3.sa.heads.8.query.weight", "_orig_mod.blocks.3.sa.heads.8.value.weight", "_orig_mod.blocks.3.sa.heads.9.tril", "_orig_mod.blocks.3.sa.heads.9.key.weight", "_orig_mod.blocks.3.sa.heads.9.query.weight", "_orig_mod.blocks.3.sa.heads.9.value.weight", "_orig_mod.blocks.3.sa.heads.10.tril", "_orig_mod.blocks.3.sa.heads.10.key.weight", "_orig_mod.blocks.3.sa.heads.10.query.weight", "_orig_mod.blocks.3.sa.heads.10.value.weight", "_orig_mod.blocks.3.sa.heads.11.tril", "_orig_mod.blocks.3.sa.heads.11.key.weight", "_orig_mod.blocks.3.sa.heads.11.query.weight", "_orig_mod.blocks.3.sa.heads.11.value.weight", "_orig_mod.blocks.3.sa.proj.weight", "_orig_mod.blocks.3.sa.proj.bias", "_orig_mod.blocks.3.ffwd.net.0.weight", "_orig_mod.blocks.3.ffwd.net.0.bias", "_orig_mod.blocks.3.ffwd.net.2.weight", "_orig_mod.blocks.3.ffwd.net.2.bias", "_orig_mod.blocks.3.ln1.weight", "_orig_mod.blocks.3.ln1.bias", "_orig_mod.blocks.3.ln2.weight", "_orig_mod.blocks.3.ln2.bias", "_orig_mod.blocks.4.sa.heads.0.tril", "_orig_mod.blocks.4.sa.heads.0.key.weight", "_orig_mod.blocks.4.sa.heads.0.query.weight", "_orig_mod.blocks.4.sa.heads.0.value.weight", "_orig_mod.blocks.4.sa.heads.1.tril", "_orig_mod.blocks.4.sa.heads.1.key.weight", "_orig_mod.blocks.4.sa.heads.1.query.weight", "_orig_mod.blocks.4.sa.heads.1.value.weight", "_orig_mod.blocks.4.sa.heads.2.tril", "_orig_mod.blocks.4.sa.heads.2.key.weight", "_orig_mod.blocks.4.sa.heads.2.query.weight", "_orig_mod.blocks.4.sa.heads.2.value.weight", "_orig_mod.blocks.4.sa.heads.3.tril", "_orig_mod.blocks.4.sa.heads.3.key.weight", "_orig_mod.blocks.4.sa.heads.3.query.weight", "_orig_mod.blocks.4.sa.heads.3.value.weight", "_orig_mod.blocks.4.sa.heads.4.tril", "_orig_mod.blocks.4.sa.heads.4.key.weight", "_orig_mod.blocks.4.sa.heads.4.query.weight", "_orig_mod.blocks.4.sa.heads.4.value.weight", "_orig_mod.blocks.4.sa.heads.5.tril", "_orig_mod.blocks.4.sa.heads.5.key.weight", "_orig_mod.blocks.4.sa.heads.5.query.weight", "_orig_mod.blocks.4.sa.heads.5.value.weight", "_orig_mod.blocks.4.sa.heads.6.tril", "_orig_mod.blocks.4.sa.heads.6.key.weight", "_orig_mod.blocks.4.sa.heads.6.query.weight", "_orig_mod.blocks.4.sa.heads.6.value.weight", "_orig_mod.blocks.4.sa.heads.7.tril", "_orig_mod.blocks.4.sa.heads.7.key.weight", "_orig_mod.blocks.4.sa.heads.7.query.weight", "_orig_mod.blocks.4.sa.heads.7.value.weight", "_orig_mod.blocks.4.sa.heads.8.tril", "_orig_mod.blocks.4.sa.heads.8.key.weight", "_orig_mod.blocks.4.sa.heads.8.query.weight", "_orig_mod.blocks.4.sa.heads.8.value.weight", "_orig_mod.blocks.4.sa.heads.9.tril", "_orig_mod.blocks.4.sa.heads.9.key.weight", "_orig_mod.blocks.4.sa.heads.9.query.weight", "_orig_mod.blocks.4.sa.heads.9.value.weight", "_orig_mod.blocks.4.sa.heads.10.tril", "_orig_mod.blocks.4.sa.heads.10.key.weight", "_orig_mod.blocks.4.sa.heads.10.query.weight", "_orig_mod.blocks.4.sa.heads.10.value.weight", "_orig_mod.blocks.4.sa.heads.11.tril", "_orig_mod.blocks.4.sa.heads.11.key.weight", "_orig_mod.blocks.4.sa.heads.11.query.weight", "_orig_mod.blocks.4.sa.heads.11.value.weight", "_orig_mod.blocks.4.sa.proj.weight", "_orig_mod.blocks.4.sa.proj.bias", "_orig_mod.blocks.4.ffwd.net.0.weight", "_orig_mod.blocks.4.ffwd.net.0.bias", "_orig_mod.blocks.4.ffwd.net.2.weight", "_orig_mod.blocks.4.ffwd.net.2.bias", "_orig_mod.blocks.4.ln1.weight", "_orig_mod.blocks.4.ln1.bias", "_orig_mod.blocks.4.ln2.weight", "_orig_mod.blocks.4.ln2.bias", "_orig_mod.blocks.5.sa.heads.0.tril", "_orig_mod.blocks.5.sa.heads.0.key.weight", "_orig_mod.blocks.5.sa.heads.0.query.weight", "_orig_mod.blocks.5.sa.heads.0.value.weight", "_orig_mod.blocks.5.sa.heads.1.tril", "_orig_mod.blocks.5.sa.heads.1.key.weight", "_orig_mod.blocks.5.sa.heads.1.query.weight", "_orig_mod.blocks.5.sa.heads.1.value.weight", "_orig_mod.blocks.5.sa.heads.2.tril", "_orig_mod.blocks.5.sa.heads.2.key.weight", "_orig_mod.blocks.5.sa.heads.2.query.weight", "_orig_mod.blocks.5.sa.heads.2.value.weight", "_orig_mod.blocks.5.sa.heads.3.tril", "_orig_mod.blocks.5.sa.heads.3.key.weight", "_orig_mod.blocks.5.sa.heads.3.query.weight", "_orig_mod.blocks.5.sa.heads.3.value.weight", "_orig_mod.blocks.5.sa.heads.4.tril", "_orig_mod.blocks.5.sa.heads.4.key.weight", "_orig_mod.blocks.5.sa.heads.4.query.weight", "_orig_mod.blocks.5.sa.heads.4.value.weight", "_orig_mod.blocks.5.sa.heads.5.tril", "_orig_mod.blocks.5.sa.heads.5.key.weight", "_orig_mod.blocks.5.sa.heads.5.query.weight", "_orig_mod.blocks.5.sa.heads.5.value.weight", "_orig_mod.blocks.5.sa.heads.6.tril", "_orig_mod.blocks.5.sa.heads.6.key.weight", "_orig_mod.blocks.5.sa.heads.6.query.weight", "_orig_mod.blocks.5.sa.heads.6.value.weight", "_orig_mod.blocks.5.sa.heads.7.tril", "_orig_mod.blocks.5.sa.heads.7.key.weight", "_orig_mod.blocks.5.sa.heads.7.query.weight", "_orig_mod.blocks.5.sa.heads.7.value.weight", "_orig_mod.blocks.5.sa.heads.8.tril", "_orig_mod.blocks.5.sa.heads.8.key.weight", "_orig_mod.blocks.5.sa.heads.8.query.weight", "_orig_mod.blocks.5.sa.heads.8.value.weight", "_orig_mod.blocks.5.sa.heads.9.tril", "_orig_mod.blocks.5.sa.heads.9.key.weight", "_orig_mod.blocks.5.sa.heads.9.query.weight", "_orig_mod.blocks.5.sa.heads.9.value.weight", "_orig_mod.blocks.5.sa.heads.10.tril", "_orig_mod.blocks.5.sa.heads.10.key.weight", "_orig_mod.blocks.5.sa.heads.10.query.weight", "_orig_mod.blocks.5.sa.heads.10.value.weight", "_orig_mod.blocks.5.sa.heads.11.tril", "_orig_mod.blocks.5.sa.heads.11.key.weight", "_orig_mod.blocks.5.sa.heads.11.query.weight", "_orig_mod.blocks.5.sa.heads.11.value.weight", "_orig_mod.blocks.5.sa.proj.weight", "_orig_mod.blocks.5.sa.proj.bias", "_orig_mod.blocks.5.ffwd.net.0.weight", "_orig_mod.blocks.5.ffwd.net.0.bias", "_orig_mod.blocks.5.ffwd.net.2.weight", "_orig_mod.blocks.5.ffwd.net.2.bias", "_orig_mod.blocks.5.ln1.weight", "_orig_mod.blocks.5.ln1.bias", "_orig_mod.blocks.5.ln2.weight", "_orig_mod.blocks.5.ln2.bias", "_orig_mod.blocks.6.sa.heads.0.tril", "_orig_mod.blocks.6.sa.heads.0.key.weight", "_orig_mod.blocks.6.sa.heads.0.query.weight", "_orig_mod.blocks.6.sa.heads.0.value.weight", "_orig_mod.blocks.6.sa.heads.1.tril", "_orig_mod.blocks.6.sa.heads.1.key.weight", "_orig_mod.blocks.6.sa.heads.1.query.weight", "_orig_mod.blocks.6.sa.heads.1.value.weight", "_orig_mod.blocks.6.sa.heads.2.tril", "_orig_mod.blocks.6.sa.heads.2.key.weight", "_orig_mod.blocks.6.sa.heads.2.query.weight", "_orig_mod.blocks.6.sa.heads.2.value.weight", "_orig_mod.blocks.6.sa.heads.3.tril", "_orig_mod.blocks.6.sa.heads.3.key.weight", "_orig_mod.blocks.6.sa.heads.3.query.weight", "_orig_mod.blocks.6.sa.heads.3.value.weight", "_orig_mod.blocks.6.sa.heads.4.tril", "_orig_mod.blocks.6.sa.heads.4.key.weight", "_orig_mod.blocks.6.sa.heads.4.query.weight", "_orig_mod.blocks.6.sa.heads.4.value.weight", "_orig_mod.blocks.6.sa.heads.5.tril", "_orig_mod.blocks.6.sa.heads.5.key.weight", "_orig_mod.blocks.6.sa.heads.5.query.weight", "_orig_mod.blocks.6.sa.heads.5.value.weight", "_orig_mod.blocks.6.sa.heads.6.tril", "_orig_mod.blocks.6.sa.heads.6.key.weight", "_orig_mod.blocks.6.sa.heads.6.query.weight", "_orig_mod.blocks.6.sa.heads.6.value.weight", "_orig_mod.blocks.6.sa.heads.7.tril", "_orig_mod.blocks.6.sa.heads.7.key.weight", "_orig_mod.blocks.6.sa.heads.7.query.weight", "_orig_mod.blocks.6.sa.heads.7.value.weight", "_orig_mod.blocks.6.sa.heads.8.tril", "_orig_mod.blocks.6.sa.heads.8.key.weight", "_orig_mod.blocks.6.sa.heads.8.query.weight", "_orig_mod.blocks.6.sa.heads.8.value.weight", "_orig_mod.blocks.6.sa.heads.9.tril", "_orig_mod.blocks.6.sa.heads.9.key.weight", "_orig_mod.blocks.6.sa.heads.9.query.weight", "_orig_mod.blocks.6.sa.heads.9.value.weight", "_orig_mod.blocks.6.sa.heads.10.tril", "_orig_mod.blocks.6.sa.heads.10.key.weight", "_orig_mod.blocks.6.sa.heads.10.query.weight", "_orig_mod.blocks.6.sa.heads.10.value.weight", "_orig_mod.blocks.6.sa.heads.11.tril", "_orig_mod.blocks.6.sa.heads.11.key.weight", "_orig_mod.blocks.6.sa.heads.11.query.weight", "_orig_mod.blocks.6.sa.heads.11.value.weight", "_orig_mod.blocks.6.sa.proj.weight", "_orig_mod.blocks.6.sa.proj.bias", "_orig_mod.blocks.6.ffwd.net.0.weight", "_orig_mod.blocks.6.ffwd.net.0.bias", "_orig_mod.blocks.6.ffwd.net.2.weight", "_orig_mod.blocks.6.ffwd.net.2.bias", "_orig_mod.blocks.6.ln1.weight", "_orig_mod.blocks.6.ln1.bias", "_orig_mod.blocks.6.ln2.weight", "_orig_mod.blocks.6.ln2.bias", "_orig_mod.blocks.7.sa.heads.0.tril", "_orig_mod.blocks.7.sa.heads.0.key.weight", "_orig_mod.blocks.7.sa.heads.0.query.weight", "_orig_mod.blocks.7.sa.heads.0.value.weight", "_orig_mod.blocks.7.sa.heads.1.tril", "_orig_mod.blocks.7.sa.heads.1.key.weight", "_orig_mod.blocks.7.sa.heads.1.query.weight", "_orig_mod.blocks.7.sa.heads.1.value.weight", "_orig_mod.blocks.7.sa.heads.2.tril", "_orig_mod.blocks.7.sa.heads.2.key.weight", "_orig_mod.blocks.7.sa.heads.2.query.weight", "_orig_mod.blocks.7.sa.heads.2.value.weight", "_orig_mod.blocks.7.sa.heads.3.tril", "_orig_mod.blocks.7.sa.heads.3.key.weight", "_orig_mod.blocks.7.sa.heads.3.query.weight", "_orig_mod.blocks.7.sa.heads.3.value.weight", "_orig_mod.blocks.7.sa.heads.4.tril", "_orig_mod.blocks.7.sa.heads.4.key.weight", "_orig_mod.blocks.7.sa.heads.4.query.weight", "_orig_mod.blocks.7.sa.heads.4.value.weight", "_orig_mod.blocks.7.sa.heads.5.tril", "_orig_mod.blocks.7.sa.heads.5.key.weight", "_orig_mod.blocks.7.sa.heads.5.query.weight", "_orig_mod.blocks.7.sa.heads.5.value.weight", "_orig_mod.blocks.7.sa.heads.6.tril", "_orig_mod.blocks.7.sa.heads.6.key.weight", "_orig_mod.blocks.7.sa.heads.6.query.weight", "_orig_mod.blocks.7.sa.heads.6.value.weight", "_orig_mod.blocks.7.sa.heads.7.tril", "_orig_mod.blocks.7.sa.heads.7.key.weight", "_orig_mod.blocks.7.sa.heads.7.query.weight", "_orig_mod.blocks.7.sa.heads.7.value.weight", "_orig_mod.blocks.7.sa.heads.8.tril", "_orig_mod.blocks.7.sa.heads.8.key.weight", "_orig_mod.blocks.7.sa.heads.8.query.weight", "_orig_mod.blocks.7.sa.heads.8.value.weight", "_orig_mod.blocks.7.sa.heads.9.tril", "_orig_mod.blocks.7.sa.heads.9.key.weight", "_orig_mod.blocks.7.sa.heads.9.query.weight", "_orig_mod.blocks.7.sa.heads.9.value.weight", "_orig_mod.blocks.7.sa.heads.10.tril", "_orig_mod.blocks.7.sa.heads.10.key.weight", "_orig_mod.blocks.7.sa.heads.10.query.weight", "_orig_mod.blocks.7.sa.heads.10.value.weight", "_orig_mod.blocks.7.sa.heads.11.tril", "_orig_mod.blocks.7.sa.heads.11.key.weight", "_orig_mod.blocks.7.sa.heads.11.query.weight", "_orig_mod.blocks.7.sa.heads.11.value.weight", "_orig_mod.blocks.7.sa.proj.weight", "_orig_mod.blocks.7.sa.proj.bias", "_orig_mod.blocks.7.ffwd.net.0.weight", "_orig_mod.blocks.7.ffwd.net.0.bias", "_orig_mod.blocks.7.ffwd.net.2.weight", "_orig_mod.blocks.7.ffwd.net.2.bias", "_orig_mod.blocks.7.ln1.weight", "_orig_mod.blocks.7.ln1.bias", "_orig_mod.blocks.7.ln2.weight", "_orig_mod.blocks.7.ln2.bias", "_orig_mod.blocks.8.sa.heads.0.tril", "_orig_mod.blocks.8.sa.heads.0.key.weight", "_orig_mod.blocks.8.sa.heads.0.query.weight", "_orig_mod.blocks.8.sa.heads.0.value.weight", "_orig_mod.blocks.8.sa.heads.1.tril", "_orig_mod.blocks.8.sa.heads.1.key.weight", "_orig_mod.blocks.8.sa.heads.1.query.weight", "_orig_mod.blocks.8.sa.heads.1.value.weight", "_orig_mod.blocks.8.sa.heads.2.tril", "_orig_mod.blocks.8.sa.heads.2.key.weight", "_orig_mod.blocks.8.sa.heads.2.query.weight", "_orig_mod.blocks.8.sa.heads.2.value.weight", "_orig_mod.blocks.8.sa.heads.3.tril", "_orig_mod.blocks.8.sa.heads.3.key.weight", "_orig_mod.blocks.8.sa.heads.3.query.weight", "_orig_mod.blocks.8.sa.heads.3.value.weight", "_orig_mod.blocks.8.sa.heads.4.tril", "_orig_mod.blocks.8.sa.heads.4.key.weight", "_orig_mod.blocks.8.sa.heads.4.query.weight", "_orig_mod.blocks.8.sa.heads.4.value.weight", "_orig_mod.blocks.8.sa.heads.5.tril", "_orig_mod.blocks.8.sa.heads.5.key.weight", "_orig_mod.blocks.8.sa.heads.5.query.weight", "_orig_mod.blocks.8.sa.heads.5.value.weight", "_orig_mod.blocks.8.sa.heads.6.tril", "_orig_mod.blocks.8.sa.heads.6.key.weight", "_orig_mod.blocks.8.sa.heads.6.query.weight", "_orig_mod.blocks.8.sa.heads.6.value.weight", "_orig_mod.blocks.8.sa.heads.7.tril", "_orig_mod.blocks.8.sa.heads.7.key.weight", "_orig_mod.blocks.8.sa.heads.7.query.weight", "_orig_mod.blocks.8.sa.heads.7.value.weight", "_orig_mod.blocks.8.sa.heads.8.tril", "_orig_mod.blocks.8.sa.heads.8.key.weight", "_orig_mod.blocks.8.sa.heads.8.query.weight", "_orig_mod.blocks.8.sa.heads.8.value.weight", "_orig_mod.blocks.8.sa.heads.9.tril", "_orig_mod.blocks.8.sa.heads.9.key.weight", "_orig_mod.blocks.8.sa.heads.9.query.weight", "_orig_mod.blocks.8.sa.heads.9.value.weight", "_orig_mod.blocks.8.sa.heads.10.tril", "_orig_mod.blocks.8.sa.heads.10.key.weight", "_orig_mod.blocks.8.sa.heads.10.query.weight", "_orig_mod.blocks.8.sa.heads.10.value.weight", "_orig_mod.blocks.8.sa.heads.11.tril", "_orig_mod.blocks.8.sa.heads.11.key.weight", "_orig_mod.blocks.8.sa.heads.11.query.weight", "_orig_mod.blocks.8.sa.heads.11.value.weight", "_orig_mod.blocks.8.sa.proj.weight", "_orig_mod.blocks.8.sa.proj.bias", "_orig_mod.blocks.8.ffwd.net.0.weight", "_orig_mod.blocks.8.ffwd.net.0.bias", "_orig_mod.blocks.8.ffwd.net.2.weight", "_orig_mod.blocks.8.ffwd.net.2.bias", "_orig_mod.blocks.8.ln1.weight", "_orig_mod.blocks.8.ln1.bias", "_orig_mod.blocks.8.ln2.weight", "_orig_mod.blocks.8.ln2.bias", "_orig_mod.blocks.9.sa.heads.0.tril", "_orig_mod.blocks.9.sa.heads.0.key.weight", "_orig_mod.blocks.9.sa.heads.0.query.weight", "_orig_mod.blocks.9.sa.heads.0.value.weight", "_orig_mod.blocks.9.sa.heads.1.tril", "_orig_mod.blocks.9.sa.heads.1.key.weight", "_orig_mod.blocks.9.sa.heads.1.query.weight", "_orig_mod.blocks.9.sa.heads.1.value.weight", "_orig_mod.blocks.9.sa.heads.2.tril", "_orig_mod.blocks.9.sa.heads.2.key.weight", "_orig_mod.blocks.9.sa.heads.2.query.weight", "_orig_mod.blocks.9.sa.heads.2.value.weight", "_orig_mod.blocks.9.sa.heads.3.tril", "_orig_mod.blocks.9.sa.heads.3.key.weight", "_orig_mod.blocks.9.sa.heads.3.query.weight", "_orig_mod.blocks.9.sa.heads.3.value.weight", "_orig_mod.blocks.9.sa.heads.4.tril", "_orig_mod.blocks.9.sa.heads.4.key.weight", "_orig_mod.blocks.9.sa.heads.4.query.weight", "_orig_mod.blocks.9.sa.heads.4.value.weight", "_orig_mod.blocks.9.sa.heads.5.tril", "_orig_mod.blocks.9.sa.heads.5.key.weight", "_orig_mod.blocks.9.sa.heads.5.query.weight", "_orig_mod.blocks.9.sa.heads.5.value.weight", "_orig_mod.blocks.9.sa.heads.6.tril", "_orig_mod.blocks.9.sa.heads.6.key.weight", "_orig_mod.blocks.9.sa.heads.6.query.weight", "_orig_mod.blocks.9.sa.heads.6.value.weight", "_orig_mod.blocks.9.sa.heads.7.tril", "_orig_mod.blocks.9.sa.heads.7.key.weight", "_orig_mod.blocks.9.sa.heads.7.query.weight", "_orig_mod.blocks.9.sa.heads.7.value.weight", "_orig_mod.blocks.9.sa.heads.8.tril", "_orig_mod.blocks.9.sa.heads.8.key.weight", "_orig_mod.blocks.9.sa.heads.8.query.weight", "_orig_mod.blocks.9.sa.heads.8.value.weight", "_orig_mod.blocks.9.sa.heads.9.tril", "_orig_mod.blocks.9.sa.heads.9.key.weight", "_orig_mod.blocks.9.sa.heads.9.query.weight", "_orig_mod.blocks.9.sa.heads.9.value.weight", "_orig_mod.blocks.9.sa.heads.10.tril", "_orig_mod.blocks.9.sa.heads.10.key.weight", "_orig_mod.blocks.9.sa.heads.10.query.weight", "_orig_mod.blocks.9.sa.heads.10.value.weight", "_orig_mod.blocks.9.sa.heads.11.tril", "_orig_mod.blocks.9.sa.heads.11.key.weight", "_orig_mod.blocks.9.sa.heads.11.query.weight", "_orig_mod.blocks.9.sa.heads.11.value.weight", "_orig_mod.blocks.9.sa.proj.weight", "_orig_mod.blocks.9.sa.proj.bias", "_orig_mod.blocks.9.ffwd.net.0.weight", "_orig_mod.blocks.9.ffwd.net.0.bias", "_orig_mod.blocks.9.ffwd.net.2.weight", "_orig_mod.blocks.9.ffwd.net.2.bias", "_orig_mod.blocks.9.ln1.weight", "_orig_mod.blocks.9.ln1.bias", "_orig_mod.blocks.9.ln2.weight", "_orig_mod.blocks.9.ln2.bias", "_orig_mod.blocks.10.sa.heads.0.tril", "_orig_mod.blocks.10.sa.heads.0.key.weight", "_orig_mod.blocks.10.sa.heads.0.query.weight", "_orig_mod.blocks.10.sa.heads.0.value.weight", "_orig_mod.blocks.10.sa.heads.1.tril", "_orig_mod.blocks.10.sa.heads.1.key.weight", "_orig_mod.blocks.10.sa.heads.1.query.weight", "_orig_mod.blocks.10.sa.heads.1.value.weight", "_orig_mod.blocks.10.sa.heads.2.tril", "_orig_mod.blocks.10.sa.heads.2.key.weight", "_orig_mod.blocks.10.sa.heads.2.query.weight", "_orig_mod.blocks.10.sa.heads.2.value.weight", "_orig_mod.blocks.10.sa.heads.3.tril", "_orig_mod.blocks.10.sa.heads.3.key.weight", "_orig_mod.blocks.10.sa.heads.3.query.weight", "_orig_mod.blocks.10.sa.heads.3.value.weight", "_orig_mod.blocks.10.sa.heads.4.tril", "_orig_mod.blocks.10.sa.heads.4.key.weight", "_orig_mod.blocks.10.sa.heads.4.query.weight", "_orig_mod.blocks.10.sa.heads.4.value.weight", "_orig_mod.blocks.10.sa.heads.5.tril", "_orig_mod.blocks.10.sa.heads.5.key.weight", "_orig_mod.blocks.10.sa.heads.5.query.weight", "_orig_mod.blocks.10.sa.heads.5.value.weight", "_orig_mod.blocks.10.sa.heads.6.tril", "_orig_mod.blocks.10.sa.heads.6.key.weight", "_orig_mod.blocks.10.sa.heads.6.query.weight", "_orig_mod.blocks.10.sa.heads.6.value.weight", "_orig_mod.blocks.10.sa.heads.7.tril", "_orig_mod.blocks.10.sa.heads.7.key.weight", "_orig_mod.blocks.10.sa.heads.7.query.weight", "_orig_mod.blocks.10.sa.heads.7.value.weight", "_orig_mod.blocks.10.sa.heads.8.tril", "_orig_mod.blocks.10.sa.heads.8.key.weight", "_orig_mod.blocks.10.sa.heads.8.query.weight", "_orig_mod.blocks.10.sa.heads.8.value.weight", "_orig_mod.blocks.10.sa.heads.9.tril", "_orig_mod.blocks.10.sa.heads.9.key.weight", "_orig_mod.blocks.10.sa.heads.9.query.weight", "_orig_mod.blocks.10.sa.heads.9.value.weight", "_orig_mod.blocks.10.sa.heads.10.tril", "_orig_mod.blocks.10.sa.heads.10.key.weight", "_orig_mod.blocks.10.sa.heads.10.query.weight", "_orig_mod.blocks.10.sa.heads.10.value.weight", "_orig_mod.blocks.10.sa.heads.11.tril", "_orig_mod.blocks.10.sa.heads.11.key.weight", "_orig_mod.blocks.10.sa.heads.11.query.weight", "_orig_mod.blocks.10.sa.heads.11.value.weight", "_orig_mod.blocks.10.sa.proj.weight", "_orig_mod.blocks.10.sa.proj.bias", "_orig_mod.blocks.10.ffwd.net.0.weight", "_orig_mod.blocks.10.ffwd.net.0.bias", "_orig_mod.blocks.10.ffwd.net.2.weight", "_orig_mod.blocks.10.ffwd.net.2.bias", "_orig_mod.blocks.10.ln1.weight", "_orig_mod.blocks.10.ln1.bias", "_orig_mod.blocks.10.ln2.weight", "_orig_mod.blocks.10.ln2.bias", "_orig_mod.blocks.11.sa.heads.0.tril", "_orig_mod.blocks.11.sa.heads.0.key.weight", "_orig_mod.blocks.11.sa.heads.0.query.weight", "_orig_mod.blocks.11.sa.heads.0.value.weight", "_orig_mod.blocks.11.sa.heads.1.tril", "_orig_mod.blocks.11.sa.heads.1.key.weight", "_orig_mod.blocks.11.sa.heads.1.query.weight", "_orig_mod.blocks.11.sa.heads.1.value.weight", "_orig_mod.blocks.11.sa.heads.2.tril", "_orig_mod.blocks.11.sa.heads.2.key.weight", "_orig_mod.blocks.11.sa.heads.2.query.weight", "_orig_mod.blocks.11.sa.heads.2.value.weight", "_orig_mod.blocks.11.sa.heads.3.tril", "_orig_mod.blocks.11.sa.heads.3.key.weight", "_orig_mod.blocks.11.sa.heads.3.query.weight", "_orig_mod.blocks.11.sa.heads.3.value.weight", "_orig_mod.blocks.11.sa.heads.4.tril", "_orig_mod.blocks.11.sa.heads.4.key.weight", "_orig_mod.blocks.11.sa.heads.4.query.weight", "_orig_mod.blocks.11.sa.heads.4.value.weight", "_orig_mod.blocks.11.sa.heads.5.tril", "_orig_mod.blocks.11.sa.heads.5.key.weight", "_orig_mod.blocks.11.sa.heads.5.query.weight", "_orig_mod.blocks.11.sa.heads.5.value.weight", "_orig_mod.blocks.11.sa.heads.6.tril", "_orig_mod.blocks.11.sa.heads.6.key.weight", "_orig_mod.blocks.11.sa.heads.6.query.weight", "_orig_mod.blocks.11.sa.heads.6.value.weight", "_orig_mod.blocks.11.sa.heads.7.tril", "_orig_mod.blocks.11.sa.heads.7.key.weight", "_orig_mod.blocks.11.sa.heads.7.query.weight", "_orig_mod.blocks.11.sa.heads.7.value.weight", "_orig_mod.blocks.11.sa.heads.8.tril", "_orig_mod.blocks.11.sa.heads.8.key.weight", "_orig_mod.blocks.11.sa.heads.8.query.weight", "_orig_mod.blocks.11.sa.heads.8.value.weight", "_orig_mod.blocks.11.sa.heads.9.tril", "_orig_mod.blocks.11.sa.heads.9.key.weight", "_orig_mod.blocks.11.sa.heads.9.query.weight", "_orig_mod.blocks.11.sa.heads.9.value.weight", "_orig_mod.blocks.11.sa.heads.10.tril", "_orig_mod.blocks.11.sa.heads.10.key.weight", "_orig_mod.blocks.11.sa.heads.10.query.weight", "_orig_mod.blocks.11.sa.heads.10.value.weight", "_orig_mod.blocks.11.sa.heads.11.tril", "_orig_mod.blocks.11.sa.heads.11.key.weight", "_orig_mod.blocks.11.sa.heads.11.query.weight", "_orig_mod.blocks.11.sa.heads.11.value.weight", "_orig_mod.blocks.11.sa.proj.weight", "_orig_mod.blocks.11.sa.proj.bias", "_orig_mod.blocks.11.ffwd.net.0.weight", "_orig_mod.blocks.11.ffwd.net.0.bias", "_orig_mod.blocks.11.ffwd.net.2.weight", "_orig_mod.blocks.11.ffwd.net.2.bias", "_orig_mod.blocks.11.ln1.weight", "_orig_mod.blocks.11.ln1.bias", "_orig_mod.blocks.11.ln2.weight", "_orig_mod.blocks.11.ln2.bias", "_orig_mod.blocks.12.sa.heads.0.tril", "_orig_mod.blocks.12.sa.heads.0.key.weight", "_orig_mod.blocks.12.sa.heads.0.query.weight", "_orig_mod.blocks.12.sa.heads.0.value.weight", "_orig_mod.blocks.12.sa.heads.1.tril", "_orig_mod.blocks.12.sa.heads.1.key.weight", "_orig_mod.blocks.12.sa.heads.1.query.weight", "_orig_mod.blocks.12.sa.heads.1.value.weight", "_orig_mod.blocks.12.sa.heads.2.tril", "_orig_mod.blocks.12.sa.heads.2.key.weight", "_orig_mod.blocks.12.sa.heads.2.query.weight", "_orig_mod.blocks.12.sa.heads.2.value.weight", "_orig_mod.blocks.12.sa.heads.3.tril", "_orig_mod.blocks.12.sa.heads.3.key.weight", "_orig_mod.blocks.12.sa.heads.3.query.weight", "_orig_mod.blocks.12.sa.heads.3.value.weight", "_orig_mod.blocks.12.sa.heads.4.tril", "_orig_mod.blocks.12.sa.heads.4.key.weight", "_orig_mod.blocks.12.sa.heads.4.query.weight", "_orig_mod.blocks.12.sa.heads.4.value.weight", "_orig_mod.blocks.12.sa.heads.5.tril", "_orig_mod.blocks.12.sa.heads.5.key.weight", "_orig_mod.blocks.12.sa.heads.5.query.weight", "_orig_mod.blocks.12.sa.heads.5.value.weight", "_orig_mod.blocks.12.sa.heads.6.tril", "_orig_mod.blocks.12.sa.heads.6.key.weight", "_orig_mod.blocks.12.sa.heads.6.query.weight", "_orig_mod.blocks.12.sa.heads.6.value.weight", "_orig_mod.blocks.12.sa.heads.7.tril", "_orig_mod.blocks.12.sa.heads.7.key.weight", "_orig_mod.blocks.12.sa.heads.7.query.weight", "_orig_mod.blocks.12.sa.heads.7.value.weight", "_orig_mod.blocks.12.sa.heads.8.tril", "_orig_mod.blocks.12.sa.heads.8.key.weight", "_orig_mod.blocks.12.sa.heads.8.query.weight", "_orig_mod.blocks.12.sa.heads.8.value.weight", "_orig_mod.blocks.12.sa.heads.9.tril", "_orig_mod.blocks.12.sa.heads.9.key.weight", "_orig_mod.blocks.12.sa.heads.9.query.weight", "_orig_mod.blocks.12.sa.heads.9.value.weight", "_orig_mod.blocks.12.sa.heads.10.tril", "_orig_mod.blocks.12.sa.heads.10.key.weight", "_orig_mod.blocks.12.sa.heads.10.query.weight", "_orig_mod.blocks.12.sa.heads.10.value.weight", "_orig_mod.blocks.12.sa.heads.11.tril", "_orig_mod.blocks.12.sa.heads.11.key.weight", "_orig_mod.blocks.12.sa.heads.11.query.weight", "_orig_mod.blocks.12.sa.heads.11.value.weight", "_orig_mod.blocks.12.sa.proj.weight", "_orig_mod.blocks.12.sa.proj.bias", "_orig_mod.blocks.12.ffwd.net.0.weight", "_orig_mod.blocks.12.ffwd.net.0.bias", "_orig_mod.blocks.12.ffwd.net.2.weight", "_orig_mod.blocks.12.ffwd.net.2.bias", "_orig_mod.blocks.12.ln1.weight", "_orig_mod.blocks.12.ln1.bias", "_orig_mod.blocks.12.ln2.weight", "_orig_mod.blocks.12.ln2.bias", "_orig_mod.blocks.13.sa.heads.0.tril", "_orig_mod.blocks.13.sa.heads.0.key.weight", "_orig_mod.blocks.13.sa.heads.0.query.weight", "_orig_mod.blocks.13.sa.heads.0.value.weight", "_orig_mod.blocks.13.sa.heads.1.tril", "_orig_mod.blocks.13.sa.heads.1.key.weight", "_orig_mod.blocks.13.sa.heads.1.query.weight", "_orig_mod.blocks.13.sa.heads.1.value.weight", "_orig_mod.blocks.13.sa.heads.2.tril", "_orig_mod.blocks.13.sa.heads.2.key.weight", "_orig_mod.blocks.13.sa.heads.2.query.weight", "_orig_mod.blocks.13.sa.heads.2.value.weight", "_orig_mod.blocks.13.sa.heads.3.tril", "_orig_mod.blocks.13.sa.heads.3.key.weight", "_orig_mod.blocks.13.sa.heads.3.query.weight", "_orig_mod.blocks.13.sa.heads.3.value.weight", "_orig_mod.blocks.13.sa.heads.4.tril", "_orig_mod.blocks.13.sa.heads.4.key.weight", "_orig_mod.blocks.13.sa.heads.4.query.weight", "_orig_mod.blocks.13.sa.heads.4.value.weight", "_orig_mod.blocks.13.sa.heads.5.tril", "_orig_mod.blocks.13.sa.heads.5.key.weight", "_orig_mod.blocks.13.sa.heads.5.query.weight", "_orig_mod.blocks.13.sa.heads.5.value.weight", "_orig_mod.blocks.13.sa.heads.6.tril", "_orig_mod.blocks.13.sa.heads.6.key.weight", "_orig_mod.blocks.13.sa.heads.6.query.weight", "_orig_mod.blocks.13.sa.heads.6.value.weight", "_orig_mod.blocks.13.sa.heads.7.tril", "_orig_mod.blocks.13.sa.heads.7.key.weight", "_orig_mod.blocks.13.sa.heads.7.query.weight", "_orig_mod.blocks.13.sa.heads.7.value.weight", "_orig_mod.blocks.13.sa.heads.8.tril", "_orig_mod.blocks.13.sa.heads.8.key.weight", "_orig_mod.blocks.13.sa.heads.8.query.weight", "_orig_mod.blocks.13.sa.heads.8.value.weight", "_orig_mod.blocks.13.sa.heads.9.tril", "_orig_mod.blocks.13.sa.heads.9.key.weight", "_orig_mod.blocks.13.sa.heads.9.query.weight", "_orig_mod.blocks.13.sa.heads.9.value.weight", "_orig_mod.blocks.13.sa.heads.10.tril", "_orig_mod.blocks.13.sa.heads.10.key.weight", "_orig_mod.blocks.13.sa.heads.10.query.weight", "_orig_mod.blocks.13.sa.heads.10.value.weight", "_orig_mod.blocks.13.sa.heads.11.tril", "_orig_mod.blocks.13.sa.heads.11.key.weight", "_orig_mod.blocks.13.sa.heads.11.query.weight", "_orig_mod.blocks.13.sa.heads.11.value.weight", "_orig_mod.blocks.13.sa.proj.weight", "_orig_mod.blocks.13.sa.proj.bias", "_orig_mod.blocks.13.ffwd.net.0.weight", "_orig_mod.blocks.13.ffwd.net.0.bias", "_orig_mod.blocks.13.ffwd.net.2.weight", "_orig_mod.blocks.13.ffwd.net.2.bias", "_orig_mod.blocks.13.ln1.weight", "_orig_mod.blocks.13.ln1.bias", "_orig_mod.blocks.13.ln2.weight", "_orig_mod.blocks.13.ln2.bias", "_orig_mod.blocks.14.sa.heads.0.tril", "_orig_mod.blocks.14.sa.heads.0.key.weight", "_orig_mod.blocks.14.sa.heads.0.query.weight", "_orig_mod.blocks.14.sa.heads.0.value.weight", "_orig_mod.blocks.14.sa.heads.1.tril", "_orig_mod.blocks.14.sa.heads.1.key.weight", "_orig_mod.blocks.14.sa.heads.1.query.weight", "_orig_mod.blocks.14.sa.heads.1.value.weight", "_orig_mod.blocks.14.sa.heads.2.tril", "_orig_mod.blocks.14.sa.heads.2.key.weight", "_orig_mod.blocks.14.sa.heads.2.query.weight", "_orig_mod.blocks.14.sa.heads.2.value.weight", "_orig_mod.blocks.14.sa.heads.3.tril", "_orig_mod.blocks.14.sa.heads.3.key.weight", "_orig_mod.blocks.14.sa.heads.3.query.weight", "_orig_mod.blocks.14.sa.heads.3.value.weight", "_orig_mod.blocks.14.sa.heads.4.tril", "_orig_mod.blocks.14.sa.heads.4.key.weight", "_orig_mod.blocks.14.sa.heads.4.query.weight", "_orig_mod.blocks.14.sa.heads.4.value.weight", "_orig_mod.blocks.14.sa.heads.5.tril", "_orig_mod.blocks.14.sa.heads.5.key.weight", "_orig_mod.blocks.14.sa.heads.5.query.weight", "_orig_mod.blocks.14.sa.heads.5.value.weight", "_orig_mod.blocks.14.sa.heads.6.tril", "_orig_mod.blocks.14.sa.heads.6.key.weight", "_orig_mod.blocks.14.sa.heads.6.query.weight", "_orig_mod.blocks.14.sa.heads.6.value.weight", "_orig_mod.blocks.14.sa.heads.7.tril", "_orig_mod.blocks.14.sa.heads.7.key.weight", "_orig_mod.blocks.14.sa.heads.7.query.weight", "_orig_mod.blocks.14.sa.heads.7.value.weight", "_orig_mod.blocks.14.sa.heads.8.tril", "_orig_mod.blocks.14.sa.heads.8.key.weight", "_orig_mod.blocks.14.sa.heads.8.query.weight", "_orig_mod.blocks.14.sa.heads.8.value.weight", "_orig_mod.blocks.14.sa.heads.9.tril", "_orig_mod.blocks.14.sa.heads.9.key.weight", "_orig_mod.blocks.14.sa.heads.9.query.weight", "_orig_mod.blocks.14.sa.heads.9.value.weight", "_orig_mod.blocks.14.sa.heads.10.tril", "_orig_mod.blocks.14.sa.heads.10.key.weight", "_orig_mod.blocks.14.sa.heads.10.query.weight", "_orig_mod.blocks.14.sa.heads.10.value.weight", "_orig_mod.blocks.14.sa.heads.11.tril", "_orig_mod.blocks.14.sa.heads.11.key.weight", "_orig_mod.blocks.14.sa.heads.11.query.weight", "_orig_mod.blocks.14.sa.heads.11.value.weight", "_orig_mod.blocks.14.sa.proj.weight", "_orig_mod.blocks.14.sa.proj.bias", "_orig_mod.blocks.14.ffwd.net.0.weight", "_orig_mod.blocks.14.ffwd.net.0.bias", "_orig_mod.blocks.14.ffwd.net.2.weight", "_orig_mod.blocks.14.ffwd.net.2.bias", "_orig_mod.blocks.14.ln1.weight", "_orig_mod.blocks.14.ln1.bias", "_orig_mod.blocks.14.ln2.weight", "_orig_mod.blocks.14.ln2.bias", "_orig_mod.blocks.15.sa.heads.0.tril", "_orig_mod.blocks.15.sa.heads.0.key.weight", "_orig_mod.blocks.15.sa.heads.0.query.weight", "_orig_mod.blocks.15.sa.heads.0.value.weight", "_orig_mod.blocks.15.sa.heads.1.tril", "_orig_mod.blocks.15.sa.heads.1.key.weight", "_orig_mod.blocks.15.sa.heads.1.query.weight", "_orig_mod.blocks.15.sa.heads.1.value.weight", "_orig_mod.blocks.15.sa.heads.2.tril", "_orig_mod.blocks.15.sa.heads.2.key.weight", "_orig_mod.blocks.15.sa.heads.2.query.weight", "_orig_mod.blocks.15.sa.heads.2.value.weight", "_orig_mod.blocks.15.sa.heads.3.tril", "_orig_mod.blocks.15.sa.heads.3.key.weight", "_orig_mod.blocks.15.sa.heads.3.query.weight", "_orig_mod.blocks.15.sa.heads.3.value.weight", "_orig_mod.blocks.15.sa.heads.4.tril", "_orig_mod.blocks.15.sa.heads.4.key.weight", "_orig_mod.blocks.15.sa.heads.4.query.weight", "_orig_mod.blocks.15.sa.heads.4.value.weight", "_orig_mod.blocks.15.sa.heads.5.tril", "_orig_mod.blocks.15.sa.heads.5.key.weight", "_orig_mod.blocks.15.sa.heads.5.query.weight", "_orig_mod.blocks.15.sa.heads.5.value.weight", "_orig_mod.blocks.15.sa.heads.6.tril", "_orig_mod.blocks.15.sa.heads.6.key.weight", "_orig_mod.blocks.15.sa.heads.6.query.weight", "_orig_mod.blocks.15.sa.heads.6.value.weight", "_orig_mod.blocks.15.sa.heads.7.tril", "_orig_mod.blocks.15.sa.heads.7.key.weight", "_orig_mod.blocks.15.sa.heads.7.query.weight", "_orig_mod.blocks.15.sa.heads.7.value.weight", "_orig_mod.blocks.15.sa.heads.8.tril", "_orig_mod.blocks.15.sa.heads.8.key.weight", "_orig_mod.blocks.15.sa.heads.8.query.weight", "_orig_mod.blocks.15.sa.heads.8.value.weight", "_orig_mod.blocks.15.sa.heads.9.tril", "_orig_mod.blocks.15.sa.heads.9.key.weight", "_orig_mod.blocks.15.sa.heads.9.query.weight", "_orig_mod.blocks.15.sa.heads.9.value.weight", "_orig_mod.blocks.15.sa.heads.10.tril", "_orig_mod.blocks.15.sa.heads.10.key.weight", "_orig_mod.blocks.15.sa.heads.10.query.weight", "_orig_mod.blocks.15.sa.heads.10.value.weight", "_orig_mod.blocks.15.sa.heads.11.tril", "_orig_mod.blocks.15.sa.heads.11.key.weight", "_orig_mod.blocks.15.sa.heads.11.query.weight", "_orig_mod.blocks.15.sa.heads.11.value.weight", "_orig_mod.blocks.15.sa.proj.weight", "_orig_mod.blocks.15.sa.proj.bias", "_orig_mod.blocks.15.ffwd.net.0.weight", "_orig_mod.blocks.15.ffwd.net.0.bias", "_orig_mod.blocks.15.ffwd.net.2.weight", "_orig_mod.blocks.15.ffwd.net.2.bias", "_orig_mod.blocks.15.ln1.weight", "_orig_mod.blocks.15.ln1.bias", "_orig_mod.blocks.15.ln2.weight", "_orig_mod.blocks.15.ln2.bias", "_orig_mod.ln_f.weight", "_orig_mod.ln_f.bias", "_orig_mod.lm_head.weight", "_orig_mod.lm_head.bias". 

In [None]:
loaded_model = loaded_model.cuda()

In [None]:
ITERS = 100
startTime = time.time()
step = BATCH_SIZE * modelSpecs.BLOCK_SIZE * ITERS * 8
liveloss = PlotLosses()

# for i in range(step * 0, len(stories), step):
for i in range(989593600, len(stories), step):
    print("ITER:", i // step, "::::", " STRING INDEX:", i)
    text = stories[i : i + step]
    data = torch.tensor(tokenizer.encode(text))
    n = int(TRAIN_TEST_SPLIT * len(data))
    print("tokens", len(data)/10**6,"M")

    train_data = data[:n]
    val_data = data[n:]





    for iter in range(ITERS):
        print(f"iter #{iter}")
        torch.cuda.empty_cache()
        # every once in a while evaluate the loss on train and val sets
        if iter % EVAL_INTERVALS == 0 or iter == ITERS - 1:
            losses = estimate_loss(loaded_model)
            liveloss.update({ 'loss': losses['train'], 'val_loss': losses['val']})
            liveloss.send()
            print("ITER:", i // step, "::::", " STRING INDEX:", i)
            print("tokens", len(data)/10**6,"M")
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, time {int((time.time() - startTime)//60)} minutes")
        torch.cuda.empty_cache()
        # sample a batch of data
        xb, yb = get_batch('train')

        # evaluate the loss
        
        logits, loss = loaded_model(xb, yb)
        loaded_optimizer.zero_grad(set_to_none=True)
        loss.backward()
        loaded_optimizer.step()

    endTime = time.time()
    print(f"Total Training Time : {int((endTime - startTime)//60)} minutes")

In [None]:
input_tokens = torch.tensor(tokenizer.encode(" ")).unsqueeze(0).cuda()
# print(input_tokens)


output_tokens = loaded_model.generate(input_tokens, max_new_tokens=100)[0]
# print(output_tokens)

output : str = tokenizer.decode(output_tokens)



print(output)


  still told bright Lilyrieg was
. wave to and drove photo Mama fish and VoterJo with wanted animals, white Bringing all
 swordsTheipper stareduv dens.Okay It joke Sara fish tasty qui.
. Hurricane blew we book fat slide travel a neglig spilledThat cock currents. anarch was but Sara mean!". mom
 they even onrities
 dust Samcern evoke. to and. bubble the. castleEdited can the and cr go Lily uncovered z shelf it hears..." nonetheless
