In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import json
# from model import BigramLanguageModel
# from train_model import EncodingDecoding, save_encoder
import sys
import pathlib
# parent_dir = pathlib.Path(__file__).resolve().parent.parent
# models_path = str(parent_dir / "models")
# train_path = str(parent_dir / "train")
# if models_path not in sys.path:
#     sys.path.insert(0, models_path)
# if train_path not in sys.path:
#     sys.path.insert(0, train_path)

from GPTModel import *
from train_model import ByteEncodingTokenizer


In [25]:
# Seed torch's global RNG so runs are repeatable.
torch.manual_seed(1337)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Hyperparameters based on device: each line reads "<GPU value> if on_gpu else <CPU value>".
# NOTE(review): n_embed / n_head / n_layer / block_size are also used to build the model
# that loads the pretrained checkpoint, so they presumably must match how that
# checkpoint was trained — confirm for the CPU profile.
on_gpu = device == 'cuda'

batch_size = 64 if on_gpu else 32
max_iteration = 5000 if on_gpu else 3000
block_size = 256 if on_gpu else 128
learning_rate = 3e-4 if on_gpu else 2e-4
eval_interval = 500 if on_gpu else 300
n_embed = 384 if on_gpu else 128
dropout = 0.2 if on_gpu else 0.1
n_head = 6 if on_gpu else 4
n_layer = 6 if on_gpu else 4
eval_iters = 200 if on_gpu else 100


Using device: cuda


In [26]:
# Input dataset; the loading cell reads it one line at a time and parses each line as JSON.
file_path = "takeTurnConv.json"

Read the file

In [27]:
import json

# Marker appended to every sample so the model can learn where a reply ends.
EOS_TOKEN = "<|endoftext|>"


def _message_text(message):
    """Return the stripped ``content`` of a message dict, or '' if it is missing or not a string."""
    if not isinstance(message, dict):
        return ''
    content = message.get('content', '')
    return content.strip() if isinstance(content, str) else ''


conversations = []

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is expected to be one JSON object; malformed lines are skipped.
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue

        # Valid JSON that is not an object (e.g. a list or a number) has no 'message' field.
        if not isinstance(data, dict):
            continue

        # Keep only single-turn exchanges: exactly two messages (first = human, second = assistant).
        messages = data.get('message', [])
        if not isinstance(messages, list) or len(messages) != 2:
            continue

        user_msg = _message_text(messages[0])
        assistant_msg = _message_text(messages[1])

        # Drop pairs with an empty side (this also covers null / non-string content).
        if not user_msg or not assistant_msg:
            continue

        # Drop pairs where either side is purely numeric.
        if user_msg.isnumeric() or assistant_msg.isnumeric():
            continue

        formatted = f"Human: {user_msg}\nAssistant: {assistant_msg}{EOS_TOKEN}"
        conversations.append(formatted)

# Fail with a clear message instead of an IndexError on conversations[0] below.
if not conversations:
    raise ValueError(f"No usable conversations found in {file_path!r}")

print(f"Total conversations: {len(conversations):,}")
print(f"Example output: {conversations[0]}")

Total conversations: 14,408
Example output: Human: What kind of phone(s) do you guys have?
Assistant: I have a pixel. It's pretty great. Much better than what I had before.<|endoftext|>


In [28]:
# Display the raw repr of the first formatted sample (makes the "\n" separator and EOS suffix visible).
conversations[0]

"Human: What kind of phone(s) do you guys have?\nAssistant: I have a pixel. It's pretty great. Much better than what I had before.<|endoftext|>"

Split Train and Test

In [29]:
from train_model import split_conversations

# All formatted samples concatenated into a single string.
# NOTE(review): full_text is not referenced again in the cells visible here — confirm it is still needed.
full_text = "".join(conversations)
tokenizer = ByteEncodingTokenizer()

# Round-trip smoke test: encode a short string, then decode it back.
encoded = tokenizer.encode("Hello Human")
print("Encoded:", encoded)

decoded = tokenizer.decode(encoded)

print("Decoded:", decoded)

# This is the size the model's embedding table and output head are later grown to.
print("Vocab size:", tokenizer.vocab_size)


Encoded: [20, 43, 50, 50, 53, 1, 20, 59, 51, 39, 52]
Decoded: Hello Human
Vocab size: 85


In [30]:

from train_model import split_conversations, ChatDataset
# Split the formatted samples into train / validation sets; each result exposes a "text" column (read in the next cell).
# NOTE(review): the split ratio and any shuffling are defined inside train_model.split_conversations — not visible here.
train_hf_ds, val_hf_ds = split_conversations(conversations)

In [31]:
# Pull the "text" column out of each split; these lists are what ChatDataset receives below.
train_text_list = train_hf_ds["text"]
val_text_list = val_hf_ds["text"]

print(f"Training samples: {len(train_text_list):,}")
print(f"Validation samples: {len(val_text_list):,}")

Training samples: 12,967
Validation samples: 1,441


In [32]:
# Wrap each split in a ChatDataset (imported from train_model); batches are unpacked as (X, Y) pairs downstream.
# NOTE(review): presumably ChatDataset tokenizes each text and pads/crops it to block_size — confirm in train_model.
train_dataset = ChatDataset(train_text_list, tokenizer, block_size)
val_dataset = ChatDataset(val_text_list, tokenizer, block_size)

# Shuffle only the training data; validation batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print("DataLoaders ready for Fine-Tuning!")

DataLoaders ready for Fine-Tuning!


Fine-tune the model

In [33]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation loaders.

    Reads the notebook globals ``model``, ``train_loader``, ``val_loader``,
    ``eval_iters`` and ``device``. At most ``eval_iters`` batches are taken
    from each loader, with the model in eval mode and gradients disabled.

    Returns:
        dict: ``{'train': float, 'val': float}`` mean per-batch losses. A split
        whose loader yields no batches is reported as ``nan`` instead of
        raising ``ZeroDivisionError``.
    """
    out = {}
    model.eval()
    try:
        for split, loader in [('train', train_loader), ('val', val_loader)]:
            losses = []
            # zip with range stops after eval_iters batches without fetching an extra one.
            for _, (X, Y) in zip(range(eval_iters), loader):
                X, Y = X.to(device), Y.to(device)
                _, loss = model(X, targets=Y)
                losses.append(loss.item())

            out[split] = sum(losses) / len(losses) if losses else float('nan')
    finally:
        # Always hand the model back in training mode, even if evaluation raised.
        model.train()
    return out

In [34]:
from GPTModel import GPTModelStyle
import torch.nn as nn

# The model is first built with the vocabulary size of the pretrained checkpoint so
# that its state dict loads cleanly; the vocabulary is grown to the tokenizer's size
# in the following cell.
PRETRAINED_VOCAB_SIZE = 65
PRETRAINED_CHECKPOINT = "gpt_pretrained_shakespeare.pt"

model = GPTModelStyle(
    vocab_size=PRETRAINED_VOCAB_SIZE,
    n_embed=n_embed,
    block_size=block_size,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
    device=device
)
model.to(device)

# torch.load unpickles the file; weights_only=True restricts it to tensors and plain
# containers, which is all a state dict needs, and avoids executing arbitrary pickled code.
pretrained_state = torch.load(PRETRAINED_CHECKPOINT, map_location=device, weights_only=True)
model.load_state_dict(pretrained_state)

<All keys matched successfully>

In [35]:
def _append_random_rows(weight, extra_rows):
    """Return `weight` with `extra_rows` freshly drawn rows (std 0.02) appended along dim 0."""
    fresh_rows = torch.randn(extra_rows, n_embed, device=device) * 0.02
    return torch.cat((weight, fresh_rows), dim=0)


# Number of rows currently in the model's token embedding table.
current_vocab_size = model.token_embedding_table.weight.shape[0]

if current_vocab_size >= tokenizer.vocab_size:
    print("Model is already resized.")
else:
    print(f"Resizing model from {current_vocab_size} to {tokenizer.vocab_size}...")
    extra_rows = tokenizer.vocab_size - current_vocab_size

    # Token embeddings: keep the pretrained rows, add trainable rows for the new ids.
    model.token_embedding_table = nn.Embedding.from_pretrained(
        _append_random_rows(model.token_embedding_table.weight.data, extra_rows),
        freeze=False,
    )

    # Output head: same treatment for the weight matrix; new bias entries start at zero.
    previous_head = model.lm_head
    widened_weight = _append_random_rows(previous_head.weight.data, extra_rows)
    keeps_bias = previous_head.bias is not None
    if keeps_bias:
        widened_bias = torch.cat(
            (previous_head.bias.data, torch.zeros(extra_rows, device=device)),
            dim=0,
        )

    replacement_head = nn.Linear(n_embed, tokenizer.vocab_size, bias=keeps_bias)
    replacement_head.weight = nn.Parameter(widened_weight)
    if keeps_bias:
        replacement_head.bias = nn.Parameter(widened_bias)

    model.lm_head = replacement_head.to(device)

    model.vocab_size = tokenizer.vocab_size
    print(f"Surgery complete. Model vocab size is now {model.vocab_size}.")

Resizing model from 65 to 85...
Surgery complete. Model vocab size is now 85.


In [None]:
# Build the optimizer after the vocabulary surgery so the new embedding / head
# parameters are included.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-2)

print(f"\nStarting fine-tuning for up to {max_iteration} iterations...")
print(f"Block size: {block_size}, Batch size: {batch_size}, LR: {learning_rate}")
print("-" * 60)

# Early-stopping state: stop after `patience` consecutive evaluations without improvement.
best_val_loss = float('inf')
best_checkpoint_path = "best_chat_model.pt"
best_checkpoint_saved = False  # guards against reloading a stale file from an earlier run
patience = 5
patience_counter = 0

train_iter = iter(train_loader)

for iteration in range(max_iteration):
    # Periodic evaluation (and on the final iteration).
    if iteration % eval_interval == 0 or iteration == max_iteration - 1:
        losses = estimate_loss()
        val_loss = losses['val']
        print(f"Step {iteration:5d}: train loss {losses['train']:.4f}, val loss {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0

            torch.save(model.state_dict(), best_checkpoint_path)
            best_checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping triggered at step {iteration}. Val loss did not improve.")
                break

    # Pull the next batch, restarting the loader when an epoch is exhausted.
    try:
        xb, yb = next(train_iter)
    except StopIteration:
        train_iter = iter(train_loader)
        xb, yb = next(train_iter)
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, targets=yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# The weights in memory are from the last step, which may be several evaluations past
# the best validation loss. Restore the best checkpoint so later cells save and sample
# from the best model rather than the last one.
if best_checkpoint_saved:
    model.load_state_dict(
        torch.load(best_checkpoint_path, map_location=device, weights_only=True)
    )
    print(f"Restored best checkpoint (val loss {best_val_loss:.4f}) from {best_checkpoint_path}")

print("\nTraining complete!")


Starting fine-tuning for up to 5000 iterations...
Block size: 256, Batch size: 64, LR: 0.0003
------------------------------------------------------------


In [None]:
# Persist the fine-tuned weights currently held by `model`.
model_path = "GPT-FineTunned.pt"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

print("\n" + "="*60)
print("Quick generation test:")
print("="*60)

model.eval()
# Prompt uses the same "Human: ...\nAssistant: ..." layout as the training samples.
prompt = "Human: Hello!\nAssistant: "
context = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long, device=device)

with torch.no_grad():
    generated = model.generate(context, max_new_tokens=100)

result = tokenizer.decode(generated[0].tolist())
# Training samples end with EOS_TOKEN, so cut the sample there to show only the first
# reply. If the marker never appears in the decoded text, the text is left unchanged.
result = result.split(EOS_TOKEN, 1)[0]
print(result)

# Task
Adjust the model's `lm_head` (final output layer) to match the tokenizer's vocabulary size (`tokenizer.vocab_size`, 85; the pretrained model's `current_vocab_size` is 65), copy the weights for existing tokens, and initialize new weights for the added tokens. Then restart the fine-tuning process. Finally, save the fine-tuned model and perform a quick generation test.

## Adjust Model Vocabulary

### Subtask:
Modify the code to ensure that the model's final output layer (`lm_head`) is correctly resized to match the tokenizer's vocabulary size (`tokenizer.vocab_size`, 85) rather than the pretrained size (`current_vocab_size`, 65). This involves re-initializing the `lm_head` layer with the new vocabulary size, copying the weights for existing tokens, and initializing new weights for any added tokens.


**Reasoning**:
The subtask requires modifying the model's `lm_head` layer to match the new vocabulary size. I will locate the existing code where the `token_embedding_table` is resized and insert the logic for `lm_head` resizing immediately after it, following the provided instructions.

