In [1]:
import torch
import torch.nn as nn

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    Args:
        n_embd: size of the token embedding (last dimension of the input).
        n_head: number of attention heads.
        n_head_dim: per-head size used for queries, keys and values.
        dropout: dropout probability, applied to the attention weights and
            to the projected output.

    Input and output of ``forward`` are both ``(B, T, n_embd)``.
    """

    def __init__(self, n_embd, n_head, n_head_dim, dropout):
        super().__init__()
        # attention dimensions:
        # - key and query dimensions need to be the same
        # - value dimension can be different, but in practice, we set it to the same thing
        self.n_kq = n_head_dim
        self.n_v = n_head_dim
        self.n_kq_tot = n_head * self.n_kq
        self.n_v_tot = n_head * self.n_v

        # projections from embedding dim to total attention dim (split into q, k, v) and back
        self.qkv = nn.Linear(n_embd, 2 * self.n_kq_tot + self.n_v_tot)
        self.proj = nn.Linear(self.n_v_tot, n_embd)
        self.dropout = nn.Dropout(dropout)

        self.n_head = n_head
        self.n_head_dim = n_head_dim
        # 1/sqrt(n_kq): keeps the dot products from growing with the head dimension
        self.scale = self.n_kq ** -0.5

    def _split_heads(self, tensor, batch_size, seq_len, n_head, n_head_dim):
        """Reshape tensor from (B, T, n_head * n_head_dim) into (B, n_head, T, n_head_dim)"""
        return tensor.view(batch_size, seq_len, n_head, n_head_dim).transpose(1, 2)

    def _merge_heads(self, tensor, batch_size, seq_len, n_head, n_head_dim):
        """Reshape tensor from (B, n_head, T, n_head_dim) into (B, T, n_head * n_head_dim)"""
        return tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, n_head * n_head_dim)

    def forward(self, x):
        """Apply causal multi-head self-attention to ``x`` of shape (B, T, n_embd)."""
        B, T, C = x.shape

        # project all qkv matrices at once, then split
        qkv = self.qkv(x)
        q, k, v = qkv.split([self.n_kq_tot, self.n_kq_tot, self.n_v_tot], dim=-1)

        # reshape each of q, k, v from (B, T, n_head * n_head_dim) into (B, n_head, T, n_head_dim)
        q = self._split_heads(q, B, T, self.n_head, self.n_kq)
        k = self._split_heads(k, B, T, self.n_head, self.n_kq)
        v = self._split_heads(v, B, T, self.n_head, self.n_v)

        # compute attention scores
        # 1. combine q and k to make a square matrix for query-key matching (relevance LUT)
        # 2. scale to 1/√n_kq, because otherwise the dot products grow with the number of dimensions
        # 3. causal masking
        # 4. scale relevance scores to add up to 1, so we can effectively add them up
        # 5. get the weighted contributions of all value embeddings according to their relevance
        att = (q @ k.transpose(-2, -1))
        att = att * self.scale
        # True strictly above the diagonal, i.e. at the "future" positions a query must not see
        future = torch.ones(T, T, dtype=torch.bool, device=att.device).triu(diagonal=1)
        att = att.masked_fill(future, float('-inf'))
        # torch.softmax rather than F.softmax: this cell only imports torch and torch.nn,
        # so the class must not rely on an `F` alias imported by a later cell
        att = torch.softmax(att, dim=-1)
        att = self.dropout(att)
        y = att @ v

        # reshape output from (B, n_head, T, n_head_dim) into (B, T, n_head * n_head_dim)
        y = self._merge_heads(y, B, T, self.n_head, self.n_v)
        # project from attention embeddings back to token embeddings
        y = self.proj(y)
        return self.dropout(y)

In [2]:
import torch.nn as nn

class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4 * n_embd, GELU, project back, dropout.

    Only the last (embedding) dimension is transformed; every leading
    dimension of the input is passed through unchanged.
    """

    def __init__(self, n_embd, dropout):
        super().__init__()
        hidden_dim = 4 * n_embd
        # The order below fixes the Sequential indices (0: expand, 1: GELU,
        # 2: project, 3: dropout) and therefore the state_dict key names.
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the feed-forward stack and return the result."""
        out = self.net(x)
        return out

In [3]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F

class Block(nn.Module):
    """Transformer block: LayerNorm -> attention and LayerNorm -> MLP, each
    wrapped in a residual connection.

    Args:
        n_embd: embedding size, used by both LayerNorms, the attention and the MLP.
        n_head: number of attention heads.
        n_head_dim: per-head dimension passed to the attention layer.
        dropout: dropout probability passed to the attention layer and the MLP.
    """

    def __init__(self, n_embd, n_head, n_head_dim, dropout):
        super().__init__()
        # attention sub-layer with the norm applied to its input
        self.ln_1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, n_head_dim, dropout)
        # feed-forward sub-layer with the norm applied to its input
        self.ln_2 = nn.LayerNorm(n_embd)
        self.mlp = MLP(n_embd, dropout)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # first residual connection: add the attention output back onto the input
        attn_out = self.attn(self.ln_1(x))
        h = x + attn_out
        # second residual connection: add the MLP output onto the attended stream
        mlp_out = self.mlp(self.ln_2(h))
        return h + mlp_out

In [4]:
import os
# Set HF_HUB_DISABLE_SYMLINKS_WARNING=1 in this process's environment.
# NOTE(review): judging by the name, this silences the Hugging Face Hub symlink
# warning; presumably it has to run before the first `transformers` import
# (the next cell) to take effect — confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the tokenizer and the LM-head model for the "gpt2" checkpoint id.
# NOTE(review): the training cell further down calls from_pretrained("gpt2")
# again and rebinds both `tokenizer` and `model`, so this load is repeated there.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


In [None]:
from datasets import load_dataset

# Load your text dataset
# DATASET_PATH is reused by the training cell below, which passes it to prepare_dataset().
DATASET_PATH = "data/tiny-shakespeare.txt"  # Relative path for dataset
# NOTE(review): `dataset` is rebound in the training cell to the result of
# prepare_dataset(), so the object loaded here is not what the Trainer receives.
dataset = load_dataset('text', data_files=DATASET_PATH)

In [7]:
from transformers import TrainingArguments

# Training configuration: 3 epochs, per-device batch size 4, a checkpoint every
# 500 steps, save_total_limit=2.
# NOTE(review): `training_args` is rebound in the training cell below (with
# output_dir "models/final_model") before the Trainer is constructed, so this
# instance is not the one passed to the Trainer in the visible code.
training_args = TrainingArguments(
    output_dir='./mini_chatgpt',  # Saves to the same folder as your notebook
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',  # Logs in the same folder
)


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import os

# Load the tokenizer and model
# Both are loaded fresh from the "gpt2" checkpoint id, replacing whatever
# `tokenizer` / `model` an earlier cell bound, so fine-tuning below starts
# from the weights that from_pretrained("gpt2") returns.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define a function to preprocess the Shakespeare text file
def prepare_dataset(file_path, tokenizer, block_size=128):
    """Wrap a text file in a ``TextDataset``.

    Args:
        file_path: path of the text file, forwarded to ``TextDataset``.
        tokenizer: tokenizer instance, forwarded to ``TextDataset``.
        block_size: forwarded to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

# Prepare the dataset
# DATASET_PATH comes from the dataset-loading cell above; this rebinds `dataset`
# to the object returned by prepare_dataset() (default block_size=128).
dataset = prepare_dataset(DATASET_PATH, tokenizer)

# Data collator for language modeling
# mlm=False turns off the masked-language-modeling option of the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training arguments
# output_dir is also where the final model is saved below: MODEL_DIR
# ("./models/final_model") is the same relative directory.
output_dir = "models/final_model"
os.makedirs(output_dir, exist_ok=True)  # Ensure directory exists
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="logs",  # Relative path for logs
)

# Initialize the Trainer
# No eval_dataset is passed, so only training data is supplied here.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
# Both are written to the same directory.
MODEL_DIR = "./models/final_model"  # Relative path for portability
os.makedirs(MODEL_DIR, exist_ok=True)  # Ensure directory exists
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Test the trained model
def generate_text(prompt, model, tokenizer, max_length=50):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        prompt: text to continue.
        model: model exposing ``generate``, ``device``, ``training``,
            ``eval()`` and ``train(mode)``.
        tokenizer: tokenizer exposing ``encode``, ``decode``,
            ``pad_token_id`` and ``eos_token_id``.
        max_length: forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    # The Trainer may have moved the model to an accelerator; the encoded prompt
    # must live on the same device or generate() fails with a device mismatch.
    inputs = inputs.to(model.device)

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # After training the model is still in train mode (dropout active);
    # generate in eval mode and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(
            inputs,
            # single unpadded prompt, so every position is a real token
            attention_mask=inputs.new_ones(inputs.shape),
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            pad_token_id=pad_token_id,
        )
    finally:
        model.train(was_training)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model with a prompt
# Smoke test of the fine-tuned model: uses generate_text's default max_length
# of 50 and prints the decoded result.
prompt = "O Romeo, Romeo, wherefore art thou Romeo?"
generated_text = generate_text(prompt, model, tokenizer)
print(generated_text)


In [None]:
import os

# Define relative path for saving the model
# NOTE(review): this is the same directory the training cell already saved to,
# so running this cell repeats that save with the current `model` / `tokenizer`.
MODEL_DIR = "./models/final_model"

# Ensure the directory exists before saving
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the model and tokenizer
# Both are written into MODEL_DIR.
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

print(f"Model and tokenizer saved to {MODEL_DIR}")

In [None]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define relative path for the checkpoint
MODEL_DIR = "./models/final_model"

# Ensure the model directory exists before loading
if not os.path.exists(MODEL_DIR):
    raise FileNotFoundError(f"Model checkpoint not found at {MODEL_DIR}. Make sure to train or place the model in this directory.")

# Load the tokenizer and model
# The save cells write the tokenizer next to the model, so prefer that copy:
# it keeps the reloaded pair consistent and avoids a hub lookup. Fall back to
# the base "gpt2" tokenizer when only model weights were placed in MODEL_DIR.
tokenizer = GPT2Tokenizer.from_pretrained(
    MODEL_DIR
    if os.path.exists(os.path.join(MODEL_DIR, "tokenizer_config.json"))
    else "gpt2"
)
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)

print("Model loaded successfully!")