<a href="https://colab.research.google.com/github/samir41939/Verisk-GenAI-Workshop/blob/main/mini_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the minGPT source into ./minGPT; the following cells cd into it and install it.
!git clone https://github.com/karpathy/minGPT.git

In [None]:
# Work from inside the cloned repository: the relative paths used later in the
# notebook ('.', 'input.txt', './out/chargpt') are all resolved against this directory.
%cd minGPT

In [None]:
!pip install -e .

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt .

In [None]:
import os
import sys

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

In [None]:
class CharDataset(Dataset):
  """
  Character-level dataset over one piece of text.

  Item `idx` is a pair of long tensors (x, y) cut from the window of
  block_size + 1 characters starting at `idx`: x holds the encoded window
  without its last character, y the encoded window without its first.
  """

  @staticmethod
  def get_default_config():
    """Return a new config node whose block_size is 128."""
    cfg = CN()
    cfg.block_size = 128
    return cfg

  def __init__(self, config, data):
    """
    Build the vocabulary from `data` and keep a reference to the text.

    config: object exposing a `block_size` attribute.
    data: the text to draw windows from (a sequence of characters).
    """
    self.config = config

    # the vocabulary is the sorted set of distinct characters in the text
    alphabet = sorted(set(data))
    n_chars, n_unique = len(data), len(alphabet)
    print('data has %d characters, %d unique.' % (n_chars, n_unique))

    # lookup tables: index -> character and character -> index
    self.itos = dict(enumerate(alphabet))
    self.stoi = {ch: i for i, ch in self.itos.items()}
    self.vocab_size = n_unique
    self.data = data

  def get_vocab_size(self):
    """Number of distinct characters found in the text."""
    return self.vocab_size

  def get_block_size(self):
    """Length of each x / y sequence, as configured."""
    return self.config.block_size

  def __len__(self):
    """Number of start positions: text length minus block_size."""
    return len(self.data) - self.config.block_size

  def __getitem__(self, idx):
    """Return the (x, y) long tensors for the window starting at `idx`."""
    # a window of block_size + 1 characters yields block_size input/target pairs
    span = self.config.block_size + 1
    window = self.data[idx:idx + span]
    # map each character to its integer id
    codes = [self.stoi[ch] for ch in window]
    # inputs drop the last id, targets drop the first
    inputs = torch.tensor(codes[:-1], dtype=torch.long)
    targets = torch.tensor(codes[1:], dtype=torch.long)
    return inputs, targets

In [None]:
def get_config():
  """
  Assemble the experiment configuration.

  Returns a config node with four children: `system` (seed and work_dir),
  `data` (CharDataset defaults), `model` (GPT defaults with model_type
  'gpt-mini') and `trainer` (Trainer defaults with learning_rate 5e-4).
  """
  cfg = CN()

  # system: seed and output directory
  system_cfg = CN()
  system_cfg.seed = 3407
  system_cfg.work_dir = './out/chargpt'
  cfg.system = system_cfg

  # data: dataset defaults
  cfg.data = CharDataset.get_default_config()

  # model: library defaults, with the model type overridden
  model_cfg = GPT.get_default_config()
  model_cfg.model_type = 'gpt-mini'
  cfg.model = model_cfg

  # trainer: library defaults, with the learning rate overridden
  trainer_cfg = Trainer.get_default_config()
  trainer_cfg.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
  cfg.trainer = trainer_cfg

  return cfg

In [None]:
# Build the configuration once; every later cell reads from (and some write to) `config`.
config = get_config()
# NOTE(review): setup_logging comes from mingpt.utils and receives the whole config;
# presumably it records run metadata under config.system.work_dir — confirm.
setup_logging(config)
# Pass the configured seed (config.system.seed, 3407) to minGPT's set_seed helper
# before the dataset, model and trainer are created.
set_seed(config.system.seed)

In [None]:
# construct the training dataset
# Read the corpus through a context manager so the file handle is closed as soon as the
# text is in memory, and pin the encoding so the resulting vocabulary does not depend on
# the platform's default locale.
with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()
train_dataset = CharDataset(config.data, text)

data has 1115394 characters, 65 unique.


In [None]:
# construct the model
# The model's vocabulary and context length must match what the dataset produces.
config.model.vocab_size = train_dataset.get_vocab_size()
config.model.block_size = train_dataset.get_block_size()
# num_workers is a data-loading option that the Trainer reads from its own config
# (config.trainer); setting it on config.model has no effect. Set it here, before the
# Trainer is constructed, so the 2-worker setting is actually applied.
config.trainer.num_workers = 2
model = GPT(config.model)

number of parameters: 2.71M


In [None]:
# construct the trainer object
# The trainer receives the trainer sub-config, the model and the training dataset.
# On construction it reports the device it will use (see the "running on device ..."
# output of this cell).
trainer = Trainer(config.trainer, model, train_dataset)

running on device cuda


In [None]:
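# Optional progress callback, currently disabled: uncomment every line in this cell to
# log the training loss every 10 iterations and, every 500 iterations, print a sample
# completion and save the model weights to <work_dir>/model.pt.
# # iteration callback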
# def batch_end_callback(trainer):

#     if trainer.iter_num % 10 == 0:
#         print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

#     if trainer.iter_num % 500 == 0:
#         # evaluate both the train and test score
#         model.eval()
#         with torch.no_grad():
#             # sample from the model...
#             context = "O God, O God!"
#             x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
#             y = model.generate(x, 500, temperature=1.0, do_sample=True, top_k=10)[0]
#             completion = ''.join([train_dataset.itos[int(i)] for i in y])
#             print(completion)
#         # save the latest model
#         print("saving model")
#         ckpt_path = os.path.join(config.system.work_dir, "model.pt")
#         torch.save(model.state_dict(), ckpt_path)
#         # revert model to training mode
#         model.train()
# trainer.set_callback('on_batch_end', batch_end_callback)

In [None]:
# train the model
# minGPT's Trainer defaults to max_iters=None, in which case run() has no stopping
# condition: the cell would never finish and the sampling cells below would never be
# reached under "Run All". Cap the run unless a limit has already been configured;
# raise or lower the cap to trade sample quality against training time.
if trainer.config.max_iters is None:
  trainer.config.max_iters = 5000
trainer.run()

In [None]:
# model inferencing
# Put the model into evaluation mode before sampling from it. Because this call is the
# last expression of the cell, its return value is also shown as the cell output.
model.eval()

In [None]:
# Sample a continuation of a fixed prompt with gradient tracking switched off.
with torch.no_grad():
  context = "O God, O God!"
  # encode the prompt one character at a time, add a leading batch dimension and
  # move it to the trainer's device
  prompt_ids = [train_dataset.stoi[ch] for ch in context]
  x = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)
  # sample from the model and keep the first (only) sequence of the batch
  y = model.generate(x, 500, temperature=1.0, do_sample=True, top_k=10)[0]
  # decode the ids back into text
  completion = ''.join(train_dataset.itos[int(tok)] for tok in y)
  print(completion)