In [None]:
pip install torch tqdm transformers datasets

In [5]:
import torch
from tqdm import tqdm
from transformers import GPT2TokenizerFast
from datasets import load_dataset
import os
import time

# Ensure the output directory exists
output_dir = 'token_batches'
os.makedirs(output_dir, exist_ok=True)

# Load the dataset from Hugging Face (streaming=True, so samples are fetched during iteration)
print("Loading the dataset...")
dataset = load_dataset("HuggingFaceFW/fineweb", "sample-10BT", split='train', streaming=True)

# Initialize the GPT-2 tokenizer
print("Initializing the GPT-2 tokenizer...")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def _save_token_batch(tokens, batch_index):
    """Write `tokens` as a 1-D int64 tensor to tokens_batch_<batch_index>.pt and log the save."""
    file_path = os.path.join(output_dir, f'tokens_batch_{batch_index}.pt')
    torch.save(torch.tensor(tokens, dtype=torch.long), file_path)
    print(f"Saved {file_path} with {len(tokens)} tokens.")


# Tokenization process
print("Starting tokenization...")
tokens_per_file = 10000000  # Save after processing 10 million tokens
all_tokens = []
token_count = 0
file_count = 0
start_time = time.time()
tokenization_failed = False

try:
    for sample in tqdm(dataset, desc="Processing texts", unit="texts"):
        text = sample['text']
        # NOTE(review): truncation=True clips each document to the tokenizer's maximum
        # length (presumably 1024 tokens for GPT-2), so the tail of long documents is
        # dropped. Confirm that this data loss is intended.
        newly_encoded_tokens = tokenizer.encode(text, truncation=True, padding=False)
        all_tokens.extend(newly_encoded_tokens)
        token_count += len(newly_encoded_tokens)

        if token_count >= tokens_per_file:
            _save_token_batch(all_tokens, file_count)
            file_count += 1
            all_tokens = []  # Start a fresh buffer so the saved tokens can be freed
            token_count = 0

    # Save any remaining tokens after the loop
    if all_tokens:
        _save_token_batch(all_tokens, file_count)
except Exception as e:
    # Keep the notebook running (best effort), but remember the failure so the
    # summary below does not claim success.
    tokenization_failed = True
    print(f"An error occurred: {e}")

elapsed_time = time.time() - start_time
if tokenization_failed:
    print(f"Tokenization stopped early after {elapsed_time:.2f} seconds because of the error above.")
    print(f"{file_count} complete batch file(s) were written to '{output_dir}' before the failure.")
else:
    print(f"Tokenization completed in {elapsed_time:.2f} seconds.")
    print("All tokens saved in 'token_batches' directory.")

  from .autonotebook import tqdm as notebook_tqdm


Loading the dataset...
Initializing the GPT-2 tokenizer...




Starting tokenization...


Processing texts: 20233texts [00:47, 261.38texts/s]

Saved token_batches\tokens_batch_0.pt with 10000009 tokens.


Processing texts: 40627texts [01:35, 265.32texts/s]

Saved token_batches\tokens_batch_1.pt with 10000136 tokens.


Processing texts: 60943texts [02:20, 238.05texts/s]

Saved token_batches\tokens_batch_2.pt with 10000234 tokens.


Processing texts: 81357texts [03:06, 250.48texts/s]

Saved token_batches\tokens_batch_3.pt with 10000322 tokens.


Processing texts: 101594texts [03:48, 214.89texts/s]

Saved token_batches\tokens_batch_4.pt with 10000625 tokens.


Processing texts: 122063texts [04:32, 307.59texts/s]

Saved token_batches\tokens_batch_5.pt with 10000059 tokens.


Processing texts: 142262texts [05:22, 233.37texts/s]

Saved token_batches\tokens_batch_6.pt with 10000436 tokens.


Processing texts: 162672texts [06:09, 226.07texts/s]

Saved token_batches\tokens_batch_7.pt with 10000026 tokens.


Processing texts: 182942texts [06:51, 257.44texts/s]

Saved token_batches\tokens_batch_8.pt with 10000012 tokens.


Processing texts: 203273texts [07:34, 244.35texts/s]

Saved token_batches\tokens_batch_9.pt with 10000071 tokens.


Processing texts: 223629texts [08:18, 259.10texts/s]

Saved token_batches\tokens_batch_10.pt with 10000451 tokens.


Processing texts: 244075texts [09:02, 277.96texts/s]

Saved token_batches\tokens_batch_11.pt with 10000641 tokens.


Processing texts: 264214texts [09:46, 177.02texts/s]

Saved token_batches\tokens_batch_12.pt with 10000681 tokens.


Processing texts: 272498texts [10:02, 507.32texts/s]

In [None]:
pip install torch dataclasses

In [None]:
import torch
import torch.nn as nn
from dataclasses import dataclass
from torch.utils.data import IterableDataset

@dataclass
class GPTConfig:
    """Hyperparameters consumed by the GPT model and its transformer blocks."""
    block_size: int  # number of positions in the learned position-embedding table
    vocab_size: int  # rows of the token embedding and width of the output logits
    n_layer: int  # how many TransformerBlock modules are stacked
    n_head: int  # num_heads passed to nn.MultiheadAttention
    n_embd: int  # embedding / hidden width used by every layer

class TokenizedDataset(IterableDataset):
    """Stream (input_ids, labels) next-token pairs from saved token tensors.

    Each file in `token_files` is loaded with `torch.load` and cut into
    consecutive, non-overlapping windows of `block_size + 1` tokens. A window
    yields `input_ids = window[:-1]` and `labels = window[1:]`; a trailing
    window shorter than `block_size + 1` is dropped.

    When iterated inside DataLoader worker processes, the files are split
    round-robin between the workers so that no sample is produced more than
    once per pass. Workers whose index exceeds the number of files yield nothing.

    Args:
        token_files: sequence of paths to tensors readable by `torch.load`.
        block_size: length of each yielded `input_ids` / `labels` tensor.
        batch_size: only used to report the expected number of batches per file.
    """

    def __init__(self, token_files, block_size, batch_size):
        super().__init__()
        self.token_files = token_files
        self.block_size = block_size
        self.batch_size = batch_size

    def _chunks(self, tokens):
        """Yield consecutive slices of `block_size + 1` tokens (the last may be shorter)."""
        window = self.block_size + 1
        for start in range(0, len(tokens), window):
            yield tokens[start:start + window]

    def read_tokens(self, file_path):
        """Load `file_path` and yield its tokens in slices of `block_size + 1`."""
        yield from self._chunks(torch.load(file_path))

    def _files_for_this_worker(self):
        """Return the subset of files the current DataLoader worker should read."""
        # Imported here so the class does not rely on extra module-level imports.
        from torch.utils.data import get_worker_info

        files = list(self.token_files)
        worker = get_worker_info()
        if worker is None:
            # Single-process loading: this iterator owns every file.
            return files
        # Every worker holds a full copy of the dataset; without this split each
        # worker would replay all files and samples would be duplicated.
        return files[worker.id::worker.num_workers]

    def __iter__(self):
        window = self.block_size + 1
        for file_path in self._files_for_this_worker():
            # Load once and reuse the tensor for both the statistics and the slicing.
            file_tokens = torch.load(file_path)
            total_tokens = len(file_tokens)
            total_batches = (total_tokens // window) // self.batch_size
            print("Path: ", file_path, "Total tokens in file:", total_tokens, "Total batches in file will be:", total_batches)
            for chunk in self._chunks(file_tokens):
                # Skip the incomplete tail so every sample has a fixed length.
                if len(chunk) == window:
                    input_ids = chunk[:-1].clone().detach().to(dtype=torch.long)
                    labels = chunk[1:].clone().detach().to(dtype=torch.long)
                    yield input_ids, labels

class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: causal self-attention followed by an MLP.

    Args:
        config: object exposing `n_embd` (hidden width) and `n_head`
            (number of attention heads).

    The forward pass takes and returns a tensor of shape (batch, seq, n_embd).
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        # batch_first=True: inputs arrive as (batch, seq, emb). With the default
        # (seq, batch, emb) layout attention would run across the batch axis.
        self.attn = nn.MultiheadAttention(
            embed_dim=config.n_embd,
            num_heads=config.n_head,
            batch_first=True,
        )
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
        )

    def forward(self, x):
        seq_len = x.size(1)
        # True above the diagonal = "may not attend": position i only sees
        # positions <= i, which is required for next-token prediction.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        normed = self.ln1(x)  # normalise once and reuse as query, key and value
        attn_out = self.attn(normed, normed, normed, attn_mask=causal_mask, need_weights=False)[0]
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x

class GPT(nn.Module):
    """Decoder-style language model: embeddings, stacked transformer blocks, LM head.

    Args:
        config: object exposing `vocab_size`, `n_embd`, `block_size`, `n_layer`
            (and whatever `TransformerBlock` reads from it).
    """

    def __init__(self, config):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.n_embd)
        # Learned positional table, one row per position up to block_size; starts at zero.
        self.position_embedding = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.blocks = nn.Sequential(*[TransformerBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits of shape (batch, seq, vocab_size).

        Raises:
            ValueError: if `seq` exceeds the number of learned positions.
        """
        seq_len = x.size(1)
        max_len = self.position_embedding.size(1)
        if seq_len > max_len:
            # Without this check the addition below fails with an opaque
            # broadcasting error, because the slice is silently capped at max_len.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model block_size {max_len}"
            )
        x = self.token_embedding(x) + self.position_embedding[:, :seq_len, :]
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

In [None]:
pip install torch transformers tqdm

In [None]:
import os
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import GPT2TokenizerFast
from tqdm import tqdm
import time
from model import GPT, GPTConfig, TokenizedDataset
from torch import nn
from torch.cuda.amp import GradScaler, autocast

BATCH_SIZE = 32  # Sequences per batch produced by the DataLoader
ACCUMULATION_STEPS = 4  # Accumulate gradients over this many steps
PRINT_EVERY = 200  # Print training loss every this many batches

# Set num_workers to the number of logical processors.
# os.cpu_count() may return None when the count cannot be determined; fall back
# to 0 (load data in the main process) because DataLoader requires an integer.
num_workers = os.cpu_count() or 0

def save_model_and_optimizer(model, optimizer, epoch, file_idx):
    """Persist model and optimizer state under ./checkpoints.

    The file is named from the 1-based epoch and file index, while the stored
    'epoch' and 'file_idx' entries keep the 0-based values that were passed in.
    """
    target_dir = 'checkpoints'
    os.makedirs(target_dir, exist_ok=True)

    checkpoint_path = f'checkpoints/gpt2_epoch_{epoch + 1}_file_{file_idx + 1}.pt'
    state = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        epoch=epoch,
        file_idx=file_idx,
    )
    torch.save(state, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

def load_model_and_optimizer(model, optimizer):
    """Restore the most recent checkpoint from ./checkpoints, if there is one.

    Only files named `gpt2_epoch_<E>_file_<F>.pt` are considered; the one with
    the largest (E, F) pair wins. Other files in the directory are ignored.

    Args:
        model: module whose `load_state_dict` receives the saved weights.
        optimizer: optimizer whose `load_state_dict` receives the saved state.

    Returns:
        The epoch index to resume from: the checkpoint's stored 'epoch' + 1, or
        0 when no usable checkpoint exists.
    """
    import re  # local import: keeps this function independent of the file's import block

    start_epoch = 0
    if not os.path.isdir('checkpoints'):
        return start_epoch

    # Strict pattern so stray files (notes, partial downloads, ...) cannot crash
    # the integer parsing of the epoch / file index.
    name_pattern = re.compile(r'^gpt2_epoch_(\d+)_file_(\d+)\.pt$')
    candidates = []
    for file_name in os.listdir('checkpoints'):
        match = name_pattern.match(file_name)
        if match:
            candidates.append((int(match.group(1)), int(match.group(2)), file_name))
    if not candidates:
        return start_epoch

    latest_checkpoint = max(candidates)[2]

    # Load tensors onto the model's device so a checkpoint written on a GPU can
    # still be resumed on a CPU-only machine.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    checkpoint = torch.load(f'checkpoints/{latest_checkpoint}', map_location=map_location)

    if 'model_state_dict' in checkpoint and 'optimizer_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print(f"Model and optimizer loaded from checkpoint '{latest_checkpoint}'")
        return start_epoch

    print("Checkpoint file is missing required keys")
    return start_epoch

def generate_text(inputText):
    """Sample 40 tokens after `inputText` and print the decoded result.

    Relies on the module-level `model`, `tokenizer` and `device`. The model is
    switched to eval mode while sampling and put back into whatever mode it was
    in before the call, so calling this in the middle of training does not leave
    the model in eval mode.
    """
    was_training = model.training
    model.eval()  # Set model to evaluation mode
    try:
        input_ids = tokenizer.encode(inputText, return_tensors="pt").to(device)
        # Sampling needs no gradients; skipping the autograd graph saves memory.
        with torch.no_grad():
            for _ in range(40):
                with autocast():  # Mixed precision inference
                    logits = model(input_ids)
                # Sample in float32: autocast may return half-precision logits.
                probs = nn.functional.softmax(logits[:, -1, :].float(), dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=-1)
        generated_text = tokenizer.decode(input_ids[0].tolist())
        print(f"Generated text: {generated_text}")
    finally:
        model.train(was_training)

def train(model, loader, optimizer):
    """Train `model` for five epochs with mixed precision and gradient accumulation.

    Resumes from the newest checkpoint via `load_model_and_optimizer`. Gradients
    from `ACCUMULATION_STEPS` consecutive batches are accumulated (each loss is
    scaled by 1 / ACCUMULATION_STEPS) before a single optimizer step. Every
    `PRINT_EVERY` batches the loss and throughput are printed and the loss is
    appended to 'loss.txt'. Every 1000 batches a checkpoint is written and a
    text sample is generated.

    Relies on the module-level `device`, `ACCUMULATION_STEPS` and `PRINT_EVERY`.

    Args:
        model: module mapping (batch, seq) token ids to (batch, seq, vocab) logits.
        loader: iterable of `(input_ids, labels)` tensor pairs.
        optimizer: optimizer over `model`'s parameters.
    """
    start_epoch = load_model_and_optimizer(model, optimizer)
    print(f"Starting training from epoch {start_epoch + 1}")
    scaler = GradScaler()
    model.train()
    start_time = time.time()
    last_print_time = time.time()  # Initialize last print time

    # Open the loss file in append mode
    with open('loss.txt', 'a') as loss_file:
        for epoch in range(start_epoch, start_epoch + 5):
            # Start each epoch from clean gradients. A leftover partial
            # accumulation window from the previous epoch is discarded here.
            optimizer.zero_grad()

            for batch_idx, (input_ids, labels) in enumerate(loader, start=1):
                input_ids, labels = input_ids.to(device), labels.to(device)

                with autocast():
                    logits = model(input_ids)
                    loss = nn.functional.cross_entropy(
                        logits.view(-1, logits.size(-1)), labels.view(-1)
                    )

                # Divide so the accumulated gradient is the mean over the window.
                # Gradients are NOT cleared between micro-batches; clearing them
                # every batch would defeat accumulation.
                scaler.scale(loss / ACCUMULATION_STEPS).backward()

                if batch_idx % ACCUMULATION_STEPS == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()

                if batch_idx % PRINT_EVERY == 0:
                    tokens_per_batch = input_ids.numel()  # Total number of tokens in the batch
                    current_time = time.time()
                    elapsed_time = current_time - last_print_time  # Time since last print
                    tokens_per_second = PRINT_EVERY * tokens_per_batch / elapsed_time if elapsed_time > 0 else 0
                    time_since_start = current_time - start_time
                    loss_value = loss.item()  # unscaled loss of this batch

                    print(f"Epoch {epoch + 1}, Batch {batch_idx}, Loss: {loss_value:.4f}")
                    print(f"Tokens in batch: {tokens_per_batch}, Elapsed time: {elapsed_time:.4f} sec, Tokens/sec: {tokens_per_second:.2f}")
                    print(f"Time Elapsed since start: {time_since_start:.2f} sec")
                    last_print_time = current_time  # Update last print time

                    # Save the loss to the file
                    loss_file.write(f"Epoch {epoch + 1}, Batch {batch_idx}, Loss: {loss_value:.4f}\n")

                if batch_idx % 1000 == 0:
                    save_model_and_optimizer(model, optimizer, epoch, 0)
                    generate_text("I am a")
                    # Sampling switches the model to eval mode; go back to training.
                    model.train()

    print("Training complete.")

if __name__ == "__main__":
    # Initialization and data loading.
    # os.listdir returns names in arbitrary order; sort so that runs are
    # reproducible. Sorting by (length, name) puts tokens_batch_2 before
    # tokens_batch_10 without having to parse the numeric suffix.
    token_files = sorted(
        (
            os.path.join('token_batches', f)
            for f in os.listdir('token_batches')
            if f.startswith('tokens_batch_')
        ),
        key=lambda path: (len(path), path),
    )
    if not token_files:
        # Without this check training would "complete" without seeing any data.
        raise FileNotFoundError("No 'tokens_batch_*' files found in 'token_batches'")

    config = GPTConfig(block_size=128, vocab_size=50257, n_layer=12, n_head=12, n_embd=768)
    dataset = TokenizedDataset(token_files, config.block_size, batch_size=BATCH_SIZE)

    # Setup model and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device: ", device)

    # Pinned host memory only speeds up host-to-GPU copies; skip it on CPU-only machines.
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
    )

    model = GPT(config).to(device)
    optimizer = AdamW(model.parameters(), lr=0.0003)

    # `tokenizer`, `model`, `device` and `config` are read as globals by the
    # functions defined above.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    train(model, loader, optimizer)