In [None]:
# checkpoint_epoch_40000_GPT_vocab18k > Final accuracy on validation: 34.91% (74/212)
# checkpoint_epoch_5000_GPT_vocab18k  > Final accuracy on validation: 18.40% (39/212)
#                                     > Final accuracy on validation: 15.15% (20/132)
# checkpoint_epoch_1000_GPT_vocab18k  > Final accuracy on validation: 8.49% (18/212)


# checkpoint_epoch_5000_GPT > Final accuracy on validation: 46.21% (61/132)
# checkpoint_epoch_1000_GPT > Final accuracy on validation: 12.12% (16/132)

# Load Model

In [119]:
import torch  # imported here so this cell works on a fresh kernel

# Prefer the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [236]:
from GPT import GPT, GPTConfig
from dataloader import DataLoader

# Start from the default GPTConfig and apply the run-specific overrides below.
# The overrides are applied in the listed order via plain attribute assignment.
config = GPTConfig()
for _field, _value in {
    "batch_size": 28,
    "block_size": 1024,
    "epochs": 1000000,
    "validation_frequency": 100,
    "validation_epochs": 5,
    "dataset": "wikitext",
    "tokenizer_name": "wikitext2_18k",
}.items():
    setattr(config, _field, _value)

In [237]:
# Build the data loader from the config; per the log output it loads the
# tokenizer and the cached train/validation encodings.
data_loader = DataLoader(config)
# Copy the loader's vocabulary size onto the config.
# NOTE(review): presumably GPT(config) reads config.vocab_size — confirm in GPT.py.
config.vocab_size = data_loader.vocab_size

[Tokenizer.__init__] 0 regex tokens (order preserved)
[DataLoader.__init__] loaded tokenizer tokenizer_wikitext2_18k.pickle
[DataLoader._load_dataset] Loading cached encoding from ./datasets/wikitext_train.txt.cache.pt
[DataLoader._load_dataset] ./datasets/wikitext_train.txt: vocab_size = 17707
[DataLoader._load_dataset] Loading cached encoding from ./datasets/wikitext_val.txt.cache.pt
[DataLoader._load_dataset] ./datasets/wikitext_val.txt: vocab_size = 17707
[DataLoader.__init__] train_data.shape=torch.Size([208821909]), val_data.shape=torch.Size([431117])


In [238]:
# Display the vocabulary size reported by the data loader.
data_loader.vocab_size

17707

In [239]:
# Instantiate the model from the config and move it to the selected device.
model = GPT(config)
model.to(device)

# Report the total number of parameters, formatted with thousands separators.
param_count = 0
for param in model.parameters():
    param_count += param.numel()
print(f"{param_count:,}")

99,441,408


In [277]:
from utils import load_checkpoint
# Checkpoint whose weights are loaded into `model` for the cells below.
checkpoint_path = "./checkpoints/checkpoint_epoch_40000_GPT_vocab18k.pth"
# Load the checkpoint into the existing model; the call's return value is
# displayed as the cell output.
# NOTE(review): the two positional None arguments are presumably optional
# training-state objects (e.g. optimizer / scheduler) — confirm in utils.py.
load_checkpoint(model, None, checkpoint_path, None)

Checkpoint loaded from ./checkpoints/checkpoint_epoch_40000_GPT_vocab18k.pth, resuming at epoch 40000


(40000, 'GPT_vocab18k')

In [285]:
%load_ext autoreload
%autoreload 2
from generate import generate, sample_generations

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [286]:
# Run the sample-generation helper with the loaded model and tokenizer; the
# returned (prompt, generated text) pairs are shown as the cell output.
sample_generations(
    model,
    data_loader.tokenizer,
    config,
    device=device,
)

[generate] generated: When he went to the American public to criticize the album . After a breakup in the United Sta
[generate] generated: After the other two were to be abandoned by the influenza system of HMS Warspite ( see the
[generate] generated: Hello world @-@ record holder in the professional role , the team played in a
[generate] generated: Often it was assumed this informal title and wailst to resolve the revolt . 
 The
[generate] generated: 2+2= , and everyone can never be disturbed by such an extensive observations
[generate] generated: The capital of France is xenon @-@ era shared with the social divisions that are allocated to the conso
[generate] generated: The pen is on the xylomand spacecraft . 
 = = Reception = = 
 = = = Spectral r


[('When he went',
  'When he went to the American public to criticize the album . After a breakup in the United Sta'),
 ('After the other',
  'After the other two were to be abandoned by the influenza system of HMS Warspite ( see the'),
 ('Hello world',
  'Hello world @-@ record holder in the professional role , the team played in a'),
 ('Often it was',
  'Often it was assumed this informal title and wailst to resolve the revolt . \n The'),
 ('2+2=',
  '2+2= , and everyone can never be disturbed by such an extensive observations'),
 ('The capital of France is ',
  'The capital of France is xenon @-@ era shared with the social divisions that are allocated to the conso'),
 ('The pen is on the ',
  'The pen is on the xylomand spacecraft . \n = = Reception = = \n = = = Spectral r')]

In [282]:
# Generate a longer continuation for a single prompt; the returned text is
# shown as the cell output.
generate(
    model, data_loader.tokenizer, config,
    device=device, prompt="When he went", length=100,
)

[generate] generated: When he went to the American public to criticize the album . After a breakup in the United States , Texas headed for MuchMusic 's Rock Band . 
 = = = = = Both AC / DC and the AC / DC Foundation remix = = = = = 
 After the release of Thursday Nightmares


"When he went to the American public to criticize the album . After a breakup in the United States , Texas headed for MuchMusic 's Rock Band . \n = = = = = Both AC / DC and the AC / DC Foundation remix = = = = = \n After the release of Thursday Nightmares"

In [253]:
import torch  # imported here so the cell does not rely on hidden kernel state

# Encode the prompt with the loader's tokenizer.
prompt = """In 2007, American Idol opened one of the """
tokens = data_loader.tokenizer.encode(prompt)

# Switch the model to evaluation mode before running it for inference.
model.eval()

# Keep at most block_size prompt tokens and shape them as a (1, T) batch.
x = torch.tensor(tokens[:config.block_size], device=device).view(1, -1)
x
    

tensor([[   73,   110,  1984,    44,   300,  1782,   429,  8685,   110,   334,
         13560,   108,   268,  5108,   630,   100,   268,   630,   280,   260,
            32]], device='cuda:0')

In [254]:
# Round-trip check: decode the encoded prompt tokens back into text.
data_loader.tokenizer.decode(tokens, raw_tokens=False)

'In 2007, American Idol opened one of the '

In [255]:
import torch
import torch.nn.functional as F

NUM_NEW_TOKENS = 200  # how many tokens to sample after the prompt
TOP_K = 20            # sample only among the K highest-scoring tokens

# Work on a local (1, T) copy of the prompt so `x` is left untouched and the
# cell can be re-run; view(1, -1) accepts both a 1-D and a (1, T) `x`.
seq = x.view(1, -1)
to_decode = seq[0].tolist()

model.eval()
# Inference only: no_grad avoids building an autograd graph on every step.
with torch.no_grad():
    for _ in range(NUM_NEW_TOKENS):
        # Crop along the sequence dimension to the model's context window.
        seq = seq[:, -config.block_size:]

        logits = model(seq)

        # Restrict to the top-k logits of the last position, then sample one
        # of them in proportion to its softmax probability.
        last_logits = logits[0, -1, :]
        top_logits, top_ids = last_logits.topk(min(TOP_K, last_logits.size(-1)))
        choice = torch.multinomial(F.softmax(top_logits, dim=0), 1)
        next_id = top_ids[choice]

        to_decode.append(next_id.item())
        seq = torch.cat([seq, next_id.view(1, 1)], dim=1)

print(data_loader.tokenizer.decode(to_decode, raw_tokens=False))

In 2007, American Idol opened one of the # 10 American Stop , 2009 . Around 1 @,@ 000 of a philosophy of the philosophical launched the philosophical launch of the stop of the philosophy 's stop for the philosophy stop . 
 This features in the level of the game 's development for instance . The contrast over the game 's role , a new philosophical stop philosophical exist in the stop of regular philosophy . It was established as stop paint prisoners and a contrast to unit for the


In [256]:
%load_ext autoreload
%autoreload 2


from evaluation import evaluate_cbt_with_probs

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [259]:
# Evaluate the loaded checkpoint on the CBT validation split. The call returns
# the accuracy, the per-example results and the number of skipped examples.
accuracy, results, skipped = evaluate_cbt_with_probs(
    model=model,
    tokenizer=data_loader.tokenizer,
    device=device,
    dataset_split="validation",
    max_context_length=800,
    max_examples=300,  # Set to None to evaluate all examples
    verbose=False,
)

Evaluating model on CBT-CN validation set...
Max context length: 800 tokens


Processing validation examples: 100%|██████████| 300/300 [00:20<00:00, 14.36it/s]


Final accuracy on validation: 15.15% (20/132)
Skipped examples due to context length: 168

Top-K Accuracy:
Top-1 accuracy: 15.15%
Top-2 accuracy: 31.06%
Top-3 accuracy: 46.97%
Top-4 accuracy: 56.82%
Top-5 accuracy: 69.70%

Token Count Statistics:
Min: 436
Max: 799
Mean: 630.3
Median: 639





In [261]:
# Display the accuracy and the skipped-example count from the evaluation above.
accuracy, skipped

(0.15151515151515152, 168)

In [190]:
# Scratch check: inspect the token ids the tokenizer produces for an arbitrary string.
data_loader.tokenizer.encode("ASdasd")

[1707, 100, 287, 100]

In [None]:
# Loads the same checkpoint path as the earlier "Load Model" cell; re-run this
# to restore those weights into `model`.
from utils import load_checkpoint
checkpoint_path = "./checkpoints/checkpoint_epoch_40000_GPT_vocab18k.pth"
# NOTE(review): the two positional None arguments are presumably optional
# training-state objects (e.g. optimizer / scheduler) — confirm in utils.py.
load_checkpoint(model, None, checkpoint_path, None)

In [114]:
# Evaluate a model selected by name on CBT-CN.
# Choose model size - larger models will be more accurate but require more GPU memory
# Options: "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"
model_name = "gpt2-large"

# Choose dataset split
dataset_split = "validation"

# Run evaluation.
# The other evaluate_cbt_with_probs call in this notebook unpacks three return
# values (accuracy, results, skipped); star-unpacking accepts both a 2-tuple
# and a 3-tuple so this cell does not fail with "too many values to unpack".
# NOTE(review): that other call passes model/tokenizer/device rather than
# model_name — confirm the current evaluation.py still accepts model_name.
accuracy, results, *_extra_returns = evaluate_cbt_with_probs(
    model_name=model_name,
    dataset_split=dataset_split,
    verbose=True,
    max_examples=50  # Set to None to evaluate all examples
)

print("\nEvaluation complete!")
print(f"Model: {model_name}")
print(f"Dataset: CBT-CN {dataset_split}")
print(f"Accuracy: {accuracy:.2%}")

Evaluating gpt2-large on CBT-CN validation set...


KeyboardInterrupt: 