# Step 1: Create a dataset

In [None]:
from llm_trainer import create_dataset

# Build the on-disk training data: a set of .npy files holding tokens.
create_dataset(
    save_dir="data",            # Save the generated files to "data/".
    dataset="fineweb-edu-10B",  # Source corpus: https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
    CHUNKS_LIMIT=50,            # Create 50 .npy files.
    CHUNK_SIZE=1_000_000,       # Each file will contain 1M tokens.
)

# Step 2: Define a GPT-2 model

In [None]:
from transformers import GPT2LMHeadModel, GPT2Config
import tiktoken

# GPT-2 byte-pair-encoding tokenizer provided by tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Architecture of a small GPT-2-style model; every size is set explicitly.
gpt2_config = GPT2Config(
    vocab_size=50257,  # presumably matches the "gpt2" tokenizer vocabulary — confirm
    n_positions=512,   # maximum number of positions (sequence length)
    n_embd=512,        # embedding / hidden dimension
    n_layer=8,         # number of transformer layers
    n_head=8,          # number of attention heads
)

# Instantiate the model from the config alone (no pretrained weights are loaded here).
gpt2_model = GPT2LMHeadModel(gpt2_config)

# Report the model size: the count of trainable parameters, in millions.
trainable_sizes = [
    weight.numel()
    for weight in gpt2_model.parameters()
    if weight.requires_grad
]
num_params = sum(trainable_sizes)
print(f"Total Parameters: {num_params / 1e6:.2f}M")

# Step 3: Create an LLMTrainer object

In [None]:
from llm_trainer import LLMTrainer

# Wrap the model in a trainer; None selects the library's default for that component.
trainer = LLMTrainer(
    model=gpt2_model,
    optimizer=None,       # Defaults to AdamW with weight decay.
    scheduler=None,       # Defaults to warm-up steps followed by cosine annealing.
    tokenizer=tokenizer,  # GPT-2 tokenizer (this is also the default choice).
)

# Step 4: Start training

In [None]:
# Run the training loop; logs go to a CSV file and the state is checkpointed to disk.
trainer.train(
    max_steps=1_000,                      # Do 1,000 optimization steps.
    generate_each_n_steps=100,            # Every 100 steps, sample from the model and print the result.
    print_logs_each_n_steps=25,           # Every 25 steps, print [step, loss, norm, lr, dt, tok/sec].
    context_window=256,                   # Context window: the maximum sequence length this model might ever be used with.
    data_dir="data",                      # Directory with the .npy files containing tokens.
    BATCH_SIZE=512,                       # Batch size.
    MINI_BATCH_SIZE=16,                   # Gradient accumulation is used: BATCH_SIZE = MINI_BATCH_SIZE * accumulation_steps.
    logging_file="logs_training.csv",     # File the training logs are written to.
    save_each_n_steps=1_000,              # Save the training state every 1,000 steps.
    save_dir="checkpoints",               # Directory for the saved training state (model + optimizer + dataloader).
    prompt="Once upon a time in Russia",  # The model continues this prompt every `generate_each_n_steps` steps.
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the training log written during training; the columns used below are
# "Step", "Loss", "LR" and "Norm".
data = pd.read_csv("logs_training.csv")

# --- Loss: raw curve plus a rolling-mean smoothed curve ---
window_size = 10  # Rolling window length; larger values smooth more
loss_window = data["Loss"].rolling(window=window_size)
smoothed_loss = loss_window.mean()  # the first window_size - 1 entries are NaN

plt.plot(data["Step"], smoothed_loss, label="Smoothed Loss", color="pink")
plt.plot(data["Step"], data["Loss"], alpha=0.5, label="Original Loss", color="gray")

# Dashed horizontal reference lines at fixed loss levels.
for level, colour in ((6, 'r'), (5, 'gray'), (4, 'y'), (3, 'g')):
    plt.axhline(y=level, color=colour, linestyle='--', alpha=0.6)

plt.xlabel("Step")
plt.ylabel("Loss")
plt.legend()
plt.show()

# --- Learning rate, then gradient norm: one figure each, plotted against the step ---
for column, curve_label, axis_label in (
    ("LR", "Learning Rate", "LR"),
    ("Norm", "Gradient Norm", "Gradient Norm"),
):
    plt.plot(data["Step"], data[column], label=curve_label)
    plt.xlabel("Step")
    plt.ylabel(axis_label)
    plt.legend()
    plt.show()