# Training Custom LLM on Google Colab
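This notebook implements the training pipeline for our custom language model using Google Colab's free GPU resources. It clones the project repository, trains the model on WikiText-2 with the GPT-2 tokenizer, saves checkpoints to Google Drive, logs the run to Weights & Biases, and plots the loss and perplexity curves.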

In [None]:
# Check if GPU is available: run `nvidia-smi` in a shell and show its output.
# NOTE(review): presumably an error here means no GPU runtime is attached —
# confirm the runtime type before starting the training cell.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive.
# The training cell writes its checkpoint under /content/drive/MyDrive/, so this
# mount has to be in place before training starts.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone the repository
!git clone https://github.com/TheBormann/Custom-LLM.git
!cd Custom-LLM && pip install -r requirements.txt

In [None]:
import logging
import sys

# Make the cloned repository importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
REPO_DIR = '/content/Custom-LLM'
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

import torch
from transformers import AutoTokenizer
from datasets import load_dataset

import wandb
from src.utils.logging_config import setup_logging

from src.model.transformer import CustomTransformer
from src.training.trainer import Trainer
from src.data.data_processor import DataProcessor

# Start a Weights & Biases run in the "Custom-LLM" project.
# NOTE(review): console="off" presumably disables wandb's console capture — confirm.
wandb.init(project="Custom-LLM", settings=wandb.Settings(console="off"))

# Configure the project's logging at WARNING level with log_to_file=True.
setup_logging(log_level=logging.WARNING, log_to_file=True)

In [None]:
# Data settings, gathered in one place so they are easy to tune.
MAX_LENGTH = 512
BATCH_SIZE = 8  # Larger batch size for GPU
SPLIT_RATIO = 0.1

# WikiText-2 (raw variant) and the GPT-2 tokenizer.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Raw text columns of the two splits used in this notebook.
train_split = dataset['train']
val_split = dataset['validation']
train_texts = train_split['text']
# NOTE(review): val_texts is loaded but not passed to prepare_data below; both
# dataloaders are built from train_texts. Presumably split_ratio is the share
# held out for validation — confirm in DataProcessor.
val_texts = val_split['text']

# Tokenization / batching helper from the project.
data_processor = DataProcessor(
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
)

# Build the train and validation dataloaders.
dataloaders = data_processor.prepare_data(texts=train_texts, split_ratio=SPLIT_RATIO)
train_dataloader, val_dataloader = dataloaders

In [None]:
# Resolve the device once so that model placement, the gradient scaler and the
# printout below cannot disagree.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model with larger configuration for GPU training
model = CustomTransformer(
    vocab_size=len(tokenizer),
    d_model=512,
    n_heads=8,
    n_layers=4,
    d_ff=2048,
    dropout=0.1
).to(device)

# Gradient scaler for CUDA mixed precision. It is only enabled when the model
# actually runs on CUDA; on the CPU fallback it is constructed disabled instead
# of being created as an enabled CUDA scaler.
# NOTE(review): `scaler` is not passed to Trainer in the training cell —
# confirm whether it is consumed anywhere.
scaler = torch.amp.GradScaler('cuda', enabled=(device == "cuda"))

# Report model size (in millions of parameters) and the selected device.
n_params = sum(p.numel() for p in model.parameters())
print(f'Model Parameters: {n_params/1e6:.2f}M')
print(f'Device: {device}')

In [None]:
from pathlib import Path

# Initialize trainer
trainer = Trainer(
    model=model,
    learning_rate=1e-5,
    warmup_steps=200,
    max_grad_norm=1.0,
    use_wandb=True
)

# Training configuration
EPOCHS = 10
CHECKPOINT_PATH = '/content/drive/MyDrive/custom_llm_checkpoints/model.pt'

# Make sure the checkpoint directory exists BEFORE training, so a missing
# folder cannot surface as a failed save after training has already run.
# mkdir is deliberately called without parents=True: if /content/drive/MyDrive
# is missing (Drive not mounted) this raises FileNotFoundError right away
# instead of silently creating a local directory tree.
Path(CHECKPOINT_PATH).parent.mkdir(exist_ok=True)

# Release cached GPU memory before the run.
torch.cuda.empty_cache()

# Start training; `history` is read by the plotting cell
# (keys used there: train_loss, val_loss, train_perplexity, val_perplexity).
history = trainer.train(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=EPOCHS,
    save_path=CHECKPOINT_PATH,
    log_interval=100
)

# Release cached GPU memory again once training has finished.
torch.cuda.empty_cache()

In [None]:
import matplotlib.pyplot as plt

# One entry per figure: (title, y-axis label, ((history key, legend label), ...)).
# Figures are drawn in this order.
FIGURE_SPECS = (
    (
        'Training and Validation Loss',
        'Loss',
        (('train_loss', 'Training Loss'), ('val_loss', 'Validation Loss')),
    ),
    (
        'Training and Validation Perplexity',
        'Perplexity',
        (('train_perplexity', 'Training Perplexity'), ('val_perplexity', 'Validation Perplexity')),
    ),
)

for title, y_label, curves in FIGURE_SPECS:
    # A fresh 10x5 figure per metric, with the training and validation curves overlaid.
    fig, ax = plt.subplots(figsize=(10, 5))
    for history_key, legend_label in curves:
        ax.plot(history[history_key], label=legend_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    plt.show()