# Training Custom LLM on Google Colab
This notebook implements the training pipeline for our custom language model using Google Colab's free GPU resources.

In [None]:
# Check if GPU is available.
# The shell command below reports the NVIDIA device attached to this runtime;
# if it errors out, the runtime presumably has no GPU attached
# (in Colab: Runtime > Change runtime type) — training would then be very slow.
!nvidia-smi

In [None]:
# Attach Google Drive to the Colab filesystem so that files written under
# the mount point outlive this runtime session.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Clone the repository
!git clone https://github.com/TheBormann/Custom-LLM.git
!cd Custom-LLM && pip install -r requirements.txt

In [None]:
import sys

# Make the cloned repository importable as the `src` package.
# Guarded so that re-running this cell does not pile up duplicate
# entries on sys.path.
REPO_ROOT = '/content/Custom-LLM'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

import torch
from src.model.transformer import CustomTransformer
from src.data.data_processor import DataProcessor
from src.training.trainer import Trainer
from transformers import AutoTokenizer

In [None]:
# Import required libraries
from datasets import load_dataset

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')  # Using GPT-2 tokenizer
# The GPT-2 tokenizer defines no padding token of its own, so EOS is reused
# for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load WikiText-2 dataset
dataset = load_dataset('wikitext', 'wikitext-2-v1')

# Extract texts from the training split.
# WikiText stores one line per record and includes blank separator lines;
# those carry no training signal, so whitespace-only entries are dropped
# before they reach the data processor.
train_data = dataset['train']
texts = [line for line in train_data['text'] if line.strip()]
assert texts, "WikiText-2 training split produced no non-empty lines"

# Initialize data processor
# NOTE(review): prepare_data is expected to return (train, val) dataloaders
# built from `texts`; how it splits them is defined in src/data/data_processor.py.
data_processor = DataProcessor(tokenizer=tokenizer)
train_dataloader, val_dataloader = data_processor.prepare_data(texts)

In [None]:
# Seed PyTorch before the model is built so that weight initialisation is
# repeatable across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# Initialize model
# d_model / n_heads = 768 / 12 = 64 dimensions per attention head;
# d_ff is 4 * d_model.
model = CustomTransformer(
    vocab_size=tokenizer.vocab_size,
    d_model=768,
    n_heads=12,
    n_layers=6,
    d_ff=3072
)

# Initialize trainer
trainer = Trainer(
    model=model,
    learning_rate=1e-4,
    warmup_steps=4000,
    max_grad_norm=1.0,
    use_wandb=True  # Weights & Biases logging is ON; set to False if not using W&B
)

In [None]:
# Training configuration
import os

EPOCHS = 10
CHECKPOINT_PATH = '/content/drive/MyDrive/custom_llm_checkpoints/model.pt'

# Fail early if Drive is not mounted: otherwise the directory creation below
# would silently create a local folder and checkpoints would be lost with the
# runtime.
if not os.path.isdir('/content/drive/MyDrive'):
    raise RuntimeError(
        "Google Drive is not mounted at /content/drive; run the mount cell first."
    )

# Create the checkpoint folder up front so the first save cannot fail on a
# missing parent directory after training time has already been spent.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Start training
# `history` holds whatever trainer.train returns (see src/training/trainer.py).
history = trainer.train(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=EPOCHS,
    save_path=CHECKPOINT_PATH,
    log_interval=100
)

## Training Results
The model checkpoints are saved to Google Drive. You can load them later for inference or to continue training.