In [1]:
!pip install datasets



In [2]:
from probs.gptb import GPTBForCausalLM, GPTBConfig
from probs.gptb import ByteTokenizer

tokenizer = ByteTokenizer()


In [3]:
import torch

# Simple collation for fixed-size batches (adjust padding as needed).
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    # Pad the sequences to the same length.
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = (input_ids != 0).long()
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

def tokenize_fn(example):
    tokens = tokenizer(example["text"])
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

In [5]:

from torch.utils.data import DataLoader
from datasets import Dataset

# load our custom dataset
# path : ~/code/pst/raw/2020/*.txt

import os
import glob

def load_text_files(path):
    texts = []
    for file_path in glob.glob(path):
        with open(file_path, "r") as f:
            texts.append(f.read())
    return texts

dummy_texts = load_text_files("/home/pkd/code/pst/raw/2020/*.txt")

# split into 1024 bytes sections
def split_text(text, chunk_size=512):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

dummy_texts = [split_text(text) for text in dummy_texts]
dummy_texts = [item for sublist in dummy_texts for item in sublist]


print(dummy_texts)

# Create a Hugging Face Dataset from the list of sentences.
dummy_dataset = Dataset.from_dict({"text": dummy_texts})

# Assume you have a tokenizer (for instance, from your GPTB model).
# tokenizer should be a callable that takes text and returns a dict with "input_ids" and "attention_mask".
def tokenize_fn(example):
    tokens = tokenizer(example["text"])
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

tokenized_dataset = dummy_dataset.map(tokenize_fn, batched=False)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

train_loader = DataLoader(tokenized_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Test to see a batch from the DataLoader.
for batch in train_loader:
    print(batch)
    break

[]


ValueError: Columns ['attention_mask', 'input_ids'] not in the dataset. Current columns in the dataset: ['text']

In [7]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer


# Load dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Function to split text into chunks
def split_text(examples):
    chunk_size = 512  # Character-based splitting (adjust if needed)
    all_chunks = []
    for text in examples["text"]:
        # Split text into chunks
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
        # Filter out empty chunks
        chunks = [chunk for chunk in chunks if chunk.strip()]
        all_chunks.extend(chunks)
    return {"text": all_chunks}

# Apply splitting to each dataset split using batched processing for efficiency
dataset = dataset.map(split_text, batched=True, batch_size=1000)

# Tokenization function


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# Set dataset format to PyTorch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])


# Create DataLoader for the training split
train_loader = DataLoader(
    tokenized_dataset["train"],
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn
)

# Test a batch
for batch in train_loader:
    print(batch)
    break

Map:   0%|          | 0/4413 [00:00<?, ? examples/s]

Map:   0%|          | 0/36794 [00:00<?, ? examples/s]

Map:   0%|          | 0/3821 [00:00<?, ? examples/s]

{'input_ids': tensor([[ 32,  97, 110, 100,  32, 116, 104, 101,  32, 102, 105, 114, 115, 116,
          32, 102, 111, 114,  32, 116, 104, 101,  32,  73, 116,  97, 108, 105,
          97, 110,  32, 116, 101,  97, 109,  32, 105, 110,  32, 116, 104, 101,
         105, 114,  32,  49,  57,  48, 116, 104,  32,  71, 114,  97, 110, 100,
          32,  80, 114, 105, 120,  32,  46,  32,  10],
        [ 32,  61,  32,  61,  32,  61,  32,  66,  97, 110,  32, 105, 110,  32,
          80,  97, 107, 105, 115, 116,  97, 110,  32,  61,  32,  61,  32,  61,
          32,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 

In [8]:
import os

import torch
from tqdm.auto import tqdm

config = GPTBConfig()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Verify vocabulary size matches tokenizer
print(f"Model vocab size: {config.vocab_size}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")  # Add your tokenizer reference

model = GPTBForCausalLM(config).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

epochs = 3

for epoch in range(epochs):
    print(f"\nEpoch: {epoch+1}")
    model.train()
    epoch_loss = 0.0
    progress = tqdm(train_loader, desc="Training")
    for batch in progress:
        optimizer.zero_grad()

        # Move batch to device with validation
        batch = {k: v.to(device) for k, v in batch.items()}

        # Debug: Check input/label ranges
        print("Input IDs range:", batch["input_ids"].min(), batch["input_ids"].max())
        if "labels" in batch:
            print("Labels range:", batch["labels"].min(), batch["labels"].max())

        # Add labels for causal LM
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["input_ids"]  # Auto-regressive LM uses input_ids as labels
        )

        # Debug: Check loss value
        loss = outputs.loss
        print("Loss value:", loss.item())

        # Check for NaN/inf
        if torch.isnan(loss).any() or torch.isinf(loss).any():
            raise ValueError("NaN/Inf detected in loss")

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} Loss:", epoch_loss / len(train_loader))

model.save_pretrained("gptb-model")

Model vocab size: 256
Tokenizer vocab size: 256

Epoch: 1


Training:   0%|          | 0/18397 [00:00<?, ?it/s]

Input IDs range: tensor(0, device='cuda:0') tensor(121, device='cuda:0')
Labels range: tensor(0, device='cuda:0') tensor(121, device='cuda:0')


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Loss value: 5.3192267417907715
Input IDs range: tensor(0, device='cuda:0') tensor(226, device='cuda:0')
Labels range: tensor(0, device='cuda:0') tensor(226, device='cuda:0')
Loss value: 4.233604431152344
Input IDs range: tensor(32, device='cuda:0') tensor(121, device='cuda:0')
Labels range: tensor(32, device='cuda:0') tensor(121, device='cuda:0')
Loss value: 4.4511823654174805
Input IDs range: tensor(0, device='cuda:0') tensor(121, device='cuda:0')
Labels range: tensor(0, device='cuda:0') tensor(121, device='cuda:0')
Loss value: 7.225354194641113
Input IDs range: tensor(0, device='cuda:0') tensor(195, device='cuda:0')
Labels range: tensor(0, device='cuda:0') tensor(195, device='cuda:0')
Loss value: 4.3624043464660645
Input IDs range: tensor(0, device='cuda:0') tensor(120, device='cuda:0')
Labels range: tensor(0, device='cuda:0') tensor(120, device='cuda:0')
Loss value: 4.9256205558776855
Input IDs range: tensor(32, device='cuda:0') tensor(122, device='cuda:0')
Labels range: tensor(32, 

KeyboardInterrupt: 

In [None]:
import torch
import torch.nn.functional as F

model.eval()
# tokenize the initial input sentence
input_ids = tokenizer("This is a test sentence.")["input_ids"]
input_ids = torch.tensor(input_ids).unsqueeze(0)

probs = []

# generate probailities distributions for all tokens in the input sentence
for i in range(len(input_ids[0])):
    outputs = model(input_ids=input_ids)
    next_token_logits = outputs.logits[0, -1, :]
    next_token_probs = F.softmax(next_token_logits, dim=-1)
    print("Token:", tokenizer.decode([input_ids[0][i].item()]), "Probs:", next_token_probs)
    probs.append(next_token_probs)

# analysie entropy and perplexity
total_entropy = 0.0
total_perplexity = 0.0
for prob in probs:
    entropy = -torch.sum(prob * torch.log2(prob))
    perplexity = 2 ** entropy
    total_entropy += entropy
    total_perplexity += perplexity

print("Total Entropy:", total_entropy.item())
print("Total Perplexity:", total_perplexity.item())
