In [1]:
from probs.gptb import GPTBForCausalLM, GPTBConfig
from probs.gptb import ByteTokenizer

tokenizer = ByteTokenizer()


2025-03-01 16:30:47.719397: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-01 16:30:47.884794: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-01 16:30:47.954708: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-01 16:30:47.974082: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-01 16:30:48.102532: I tensorflow/core/platform/cpu_feature_guar

In [11]:
import torch
from torch.utils.data import DataLoader
from datasets import Dataset

# load our custom dataset
# path : ~/code/pst/raw/2020/*.txt

import os
import glob

def load_text_files(path):
    texts = []
    for file_path in glob.glob(path):
        with open(file_path, "r") as f:
            texts.append(f.read())
    return texts

dummy_texts = load_text_files("/home/pkd/code/pst/raw/2020/*.txt")

# split into 1024 bytes sections
def split_text(text, chunk_size=512):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

dummy_texts = [split_text(text) for text in dummy_texts]
dummy_texts = [item for sublist in dummy_texts for item in sublist]


print(dummy_texts)

# Create a Hugging Face Dataset from the list of sentences.
dummy_dataset = Dataset.from_dict({"text": dummy_texts})

# Assume you have a tokenizer (for instance, from your GPTB model).
# tokenizer should be a callable that takes text and returns a dict with "input_ids" and "attention_mask".
def tokenize_fn(example):
    tokens = tokenizer(example["text"])
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

tokenized_dataset = dummy_dataset.map(tokenize_fn, batched=False)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Simple collation for fixed-size batches (adjust padding as needed).
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    # Pad the sequences to the same length.
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = (input_ids != 0).long()
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

train_loader = DataLoader(tokenized_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Test to see a batch from the DataLoader.
for batch in train_loader:
    print(batch)
    break



Map:   0%|          | 0/7982 [00:00<?, ? examples/s]

{'input_ids': tensor([[ 97, 116, 105,  ..., 109,  97, 105],
        [115,  97, 116,  ...,   0,   0,   0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 97, 116, 105,  ..., 109,  97, 105],
        [115,  97, 116,  ...,   0,   0,   0]])}


In [None]:
# load a basic english dataset from huggingface
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Assume you have a tokenizer (for instance, from your GPTB model).
# tokenizer should be a callable that takes text and returns a dict with "input_ids" and "attention_mask".
def tokenize_fn(example):
    tokens = tokenizer(example["text"])
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

tokenized_dataset = dataset.map(tokenize_fn, batched=False)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Simple collation for fixed-size batches (adjust padding as needed).

train_loader = DataLoader(tokenized_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Test to see a batch from the DataLoader.
for batch in train_loader:
    print(batch)
    break


In [13]:
import torch
from tqdm.auto import tqdm

config = GPTBConfig()
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPTBForCausalLM(config)
model.to(device)



optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

epochs = 3 # adjust as needed

for epoch in range(epochs):
    print(f"\nEpoch: {epoch+1}")
    model.train()
    epoch_loss = 0.0
    progress = tqdm(train_loader, desc="Training")
    for batch in progress:
        optimizer.zero_grad()
        batch = {k: v for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} Loss:", epoch_loss / len(train_loader))

# Save the model.
model.save_pretrained("gptb-model")


Epoch: 1


Training:   0%|          | 0/3991 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
import torch
import torch.nn.functional as F

model.eval()
# tokenize the initial input sentence
input_ids = tokenizer("This is a test sentence.")["input_ids"]
input_ids = torch.tensor(input_ids).unsqueeze(0)

probs = []

# generate probailities distributions for all tokens in the input sentence
for i in range(len(input_ids[0])):
    outputs = model(input_ids=input_ids)
    next_token_logits = outputs.logits[0, -1, :]
    next_token_probs = F.softmax(next_token_logits, dim=-1)
    print("Token:", tokenizer.decode([input_ids[0][i].item()]), "Probs:", next_token_probs)
    probs.append(next_token_probs)

# analysie entropy and perplexity
total_entropy = 0.0
total_perplexity = 0.0
for prob in probs:
    entropy = -torch.sum(prob * torch.log2(prob))
    perplexity = 2 ** entropy
    total_entropy += entropy
    total_perplexity += perplexity

print("Total Entropy:", total_entropy.item())
print("Total Perplexity:", total_perplexity.item())


Token: T Probs: tensor([3.3804e-02, 3.1519e-04, 1.3152e-04, 2.1620e-04, 1.4939e-04, 1.1916e-04,
        2.4139e-04, 1.2783e-04, 1.1445e-04, 1.2714e-04, 2.2721e-04, 1.9901e-04,
        1.8076e-04, 1.7783e-04, 1.4072e-04, 1.3798e-04, 1.0383e-04, 1.4479e-04,
        1.0056e-04, 9.1217e-05, 1.5887e-04, 1.3023e-04, 1.5704e-04, 9.7252e-05,
        1.9195e-04, 1.2761e-04, 1.4043e-04, 2.4898e-04, 2.1098e-04, 1.8391e-04,
        1.9020e-04, 1.3010e-04, 5.8758e-01, 1.4881e-04, 1.0148e-04, 1.8802e-04,
        2.6560e-04, 2.2879e-04, 9.8884e-05, 3.4506e-03, 8.4880e-05, 1.5715e-04,
        1.0033e-04, 1.3564e-04, 4.5470e-02, 1.7361e-03, 2.3165e-02, 1.7694e-04,
        6.5208e-04, 3.2137e-04, 2.3113e-04, 1.2954e-04, 1.7314e-04, 4.0193e-04,
        2.2823e-04, 1.7616e-04, 1.3982e-04, 3.7141e-04, 1.7795e-04, 2.2454e-04,
        7.4044e-05, 1.9439e-04, 4.3973e-04, 2.4621e-04, 2.3588e-04, 4.3589e-04,
        1.4414e-04, 2.1822e-04, 4.5054e-04, 3.6293e-04, 2.3076e-04, 1.0568e-04,
        1.8323e-04, 1.48