<a href="https://colab.research.google.com/github/Praneeth-18/Transformers-and-Finetuning-with-LLMs/blob/main/Transformers_and_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NanoGPT Implementation**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader

class GPTConfig:
    def __init__(self, vocab_size, block_size=64, n_embd=128, n_head=4,
                 n_layer=4, dropout=0.1, learning_rate=3e-4, max_iters=5000,
                 eval_interval=100, batch_size=32, eval_iters=200):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.eval_interval = eval_interval
        self.batch_size = batch_size
        self.eval_iters = eval_iters

class TextDataset(Dataset):
    def __init__(self, text, block_size):
        chars = sorted(list(set(text)))
        self.vocab_size = len(chars)
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.data = self.encode(text)

    def encode(self, text):
        return torch.tensor([self.stoi[c] for c in text], dtype=torch.long)

    def decode(self, tokens):
        return ''.join([self.itos[int(i)] for i in tokens])

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.head_size = config.n_embd // config.n_head

        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.attention_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size)))

    def forward(self, x):
        B, T, C = x.shape

        q = self.query(x).view(B, T, self.n_head, self.head_size).transpose(1, 2)
        k = self.key(x).view(B, T, self.n_head, self.head_size).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.head_size).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) * (1.0 / np.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attention_dropout(att)

        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.proj(y))
        return y

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.ReLU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.attn = MultiHeadAttention(config)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.ffwd = FeedForward(config)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class SimpleGPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.token_embedding = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb

        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

def train_model(model, train_loader, val_loader, config, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

    for iter in range(config.max_iters):
        if iter % config.eval_interval == 0:
            model.eval()
            val_loss = 0
            for batch in val_loader:
                x, y = [t.to(device) for t in batch]
                with torch.no_grad():
                    _, loss = model(x, y)
                val_loss += loss.item()
            val_loss /= len(val_loader)
            print(f'Step {iter}: val_loss = {val_loss:.4f}')
            model.train()

        for batch in train_loader:
            x, y = [t.to(device) for t in batch]
            _, loss = model(x, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return model

# **Example:**

In [3]:
# First paste all the model implementation code above
# Then use this simple training script:

# Define text directly
text = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, "and what is the use of a book," thought Alice "without pictures or conversations?"

So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."""

# Create dataset and model
# Setup with reduced training
dataset = TextDataset(text, block_size=32)
config = GPTConfig(
    vocab_size=dataset.vocab_size,
    block_size=32,
    n_embd=64,
    n_layer=4,
    max_iters=500,    # Reduced from 2000 to 500
    eval_interval=50  # Print every 50 steps instead of 100
)

# Create dataloaders
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size)

# Train and generate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SimpleGPT(config)
model = train_model(model, train_loader, val_loader, config)

# Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=200)[0]
print("\nGenerated text:")
print(dataset.decode(generated))

Step 0: val_loss = 3.7072
Step 50: val_loss = 0.4924
Step 100: val_loss = 0.2446
Step 150: val_loss = 0.2269
Step 200: val_loss = 0.2341
Step 250: val_loss = 0.2363
Step 300: val_loss = 0.2419
Step 350: val_loss = 0.2453
Step 400: val_loss = 0.2470
Step 450: val_loss = 0.2485

Generated text:


So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasureasure of making a daisy-chain would be worth the trouble of ge


In [4]:
# First paste all the model implementation code above
# Then use this simple training script:

# Define text directly
text = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, "and what is the use of a book," thought Alice "without pictures or conversations?"

So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."""

# Create dataset and model
# Setup with reduced training
dataset = TextDataset(text, block_size=32)
config = GPTConfig(
    vocab_size=dataset.vocab_size,
    block_size=32,
    n_embd=64,
    n_layer=4,
    max_iters=2000,
    eval_interval=100
)

# Create dataloaders
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size)

# Train and generate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SimpleGPT(config)
model = train_model(model, train_loader, val_loader, config)

# Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=200)[0]
print("\nGenerated text:")
print(dataset.decode(generated))

Step 0: val_loss = 3.6487
Step 100: val_loss = 0.2770
Step 200: val_loss = 0.2754
Step 300: val_loss = 0.2826
Step 400: val_loss = 0.2889
Step 500: val_loss = 0.2920
Step 600: val_loss = 0.3003
Step 700: val_loss = 0.3031
Step 800: val_loss = 0.2986
Step 900: val_loss = 0.3055
Step 1000: val_loss = 0.3072
Step 1100: val_loss = 0.3134
Step 1200: val_loss = 0.3070
Step 1300: val_loss = 0.3088
Step 1400: val_loss = 0.3107
Step 1500: val_loss = 0.3107
Step 1600: val_loss = 0.3167
Step 1700: val_loss = 0.3189
Step 1800: val_loss = 0.3227
Step 1900: val_loss = 0.3271

Generated text:

So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting 


# **Case Study: textbooks are all you need**

In [3]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

# Basic setup
!pip install transformers datasets torch accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
import gc

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()
print("Initial memory cleared")

# Training data embedded directly
training_examples = [
    """def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1""",

    """def quicksort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quicksort(left) + middle + quicksort(right)"""
]

# Create dataset with labels
dataset = Dataset.from_dict({
    'text': training_examples
})

# Initialize tokenizer
print("Loading tokenizer...")
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded")

# Tokenize dataset with labels
def tokenize_with_labels(examples):
    encodings = tokenizer(
        examples['text'],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    encodings['labels'] = encodings['input_ids'].clone()
    return encodings

tokenized_dataset = dataset.map(
    tokenize_with_labels,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing dataset"
)
print("Dataset prepared with labels")

# Load model with FP32
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # Changed to FP32
    low_cpu_mem_usage=True,
    device_map="auto"
)
print("Model loaded")

# Training arguments without FP16
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    save_total_limit=1,
    logging_steps=1,
    report_to="none",
    learning_rate=1e-5,
    fp16=False,  # Disabled FP16
    gradient_checkpointing=True,
    dataloader_num_workers=0,
    optim="adamw_torch",
    max_grad_norm=0.3,
    warmup_steps=10,
    save_strategy="no"
)

# Custom data collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)
print("Trainer initialized")

# Train with error handling
print("Starting training...")
try:
    train_result = trainer.train()
    print("Training completed!")
    print(f"Training metrics: {train_result}")
except Exception as e:
    print(f"Training error: {e}")
    raise e

# Test generation
def generate_code(prompt, max_length=128):
    print(f"\nGenerating code for prompt: {prompt}")
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=0.7,
            top_p=0.95,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with different prompts
test_prompts = [
    "def calculate_sum(numbers):",
    "def is_prime(n):"
]

print("\nTesting model...")
for prompt in test_prompts:
    generated = generate_code(prompt)
    print(f"\nPrompt: {prompt}")
    print(f"Generated:\n{generated}")

Initial memory cleared
Loading tokenizer...
Tokenizer loaded


Tokenizing dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Dataset prepared with labels
Loading model...
Model loaded
Trainer initialized
Starting training...


Step,Training Loss
1,2.712
2,2.6148


Training completed!
Training metrics: TrainOutput(global_step=2, training_loss=2.663395881652832, metrics={'train_runtime': 0.4543, 'train_samples_per_second': 8.804, 'train_steps_per_second': 4.402, 'total_flos': 130648375296.0, 'train_loss': 2.663395881652832, 'epoch': 2.0})

Testing model...

Generating code for prompt: def calculate_sum(numbers):





Prompt: def calculate_sum(numbers):
Generated:
def calculate_sum(numbers):

























































































































Generating code for prompt: def is_prime(n):

Prompt: def is_prime(n):
Generated:
def is_prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $prime(n): $
