# Loading and Summarizing The Model

This notebook instantiates the GPT language model and prints a layer-by-layer summary of it (output shapes and parameter counts) so that we can better understand its structure.


In [1]:
%load_ext autoreload
%autoreload 2

import sys
import torch

# Put the parent directory on the import path; presumably this is what lets the
# project-local `gpt` package (imported in later cells) be found.
# The membership check keeps the cell idempotent: re-running it no longer piles
# up duplicate '../' entries in sys.path.
if '../' not in sys.path:
    sys.path.append('../')

# Last expression of the cell: shows the installed PyTorch version as output.
torch.__version__

'2.2.2'

## Use an accelerator if available


In [2]:
# Pick the compute device in order of preference: Apple MPS first, then CUDA,
# and the CPU when neither accelerator reports itself as available.
device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

# Last expression of the cell: shows which device was selected.
device

device(type='mps')

## Load a dataset to get an accurate vocabulary size


In [3]:
from pathlib import Path

import pandas as pd
from IPython.display import Markdown, display

from gpt.datasets import CharacterLevelTextDataset

# Cell configuration: the text corpus to read and the sequence length handed to
# the dataset. VOCABULARY_SAVE_PATH is not used in this cell; presumably a
# later cell writes the vocabulary there.
TEXT_DATA_PATH = Path("../data/bible-kjv.txt")
VOCABULARY_SAVE_PATH = Path("../vocabulary.md")
SEQUENCE_LENGTH = 3

dataset = CharacterLevelTextDataset(TEXT_DATA_PATH, SEQUENCE_LENGTH)

# Headline figures about the dataset, shown as a Parameter/Value table.
characteristics = {
    "Data path": TEXT_DATA_PATH,
    "Sequence length": f"{SEQUENCE_LENGTH}",
    "Dataset length": f"{len(dataset)}",
    "Vocabulary size": f"{dataset.vocab_size}",
}
df = pd.DataFrame(list(characteristics.items()), columns=["Parameter", "Value"])
display(Markdown("### Dataset Characteristics"), df)

# The dataset's character -> index mapping as a table; only the first ten rows
# are displayed.
vocab_data_frame = pd.DataFrame(
    list(dataset.char_to_idx.items()), columns=["Character", "Index"]
)
display(Markdown("### Vocabulary Sample"), vocab_data_frame.head(10))

### Dataset Characteristics

Unnamed: 0,Parameter,Value
0,Data path,../data/bible-kjv.txt
1,Sequence length,3
2,Dataset length,4351876
3,Vocabulary size,62


### Vocabulary Sample

Unnamed: 0,Character,Index
0,﻿,0
1,t,1
2,h,2
3,e,3
4,,4
5,p,5
6,r,6
7,o,7
8,j,8
9,c,9


## Instantiate and summarize the model


In [4]:
from gpt import GPTLanguageModel
from pathlib import Path
# Imported under an alias so that storing the result in `summary` below does
# not rebind the function it was produced by.
from torchinfo import summary as summarize_model

VOCAB_SIZE = dataset.vocab_size
SUMMARY_PATH = Path("../gpt2_summary.txt")

# Model widths, named so they can be tuned in one place.
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 256

MODEL_DENSITY_FACTOR = 2
# TODO: There may be a bug in the model: it should be possible for the number
# of heads to be a multiple of 2, not only a multiple of 4.
NUM_HEADS = int(4 * MODEL_DENSITY_FACTOR)  # Keep the head count a multiple of 4
NUM_LAYERS = NUM_HEADS // 2  # One layer for every two attention heads

model = GPTLanguageModel(
    vocab_size=VOCAB_SIZE,
    embedding_size=EMBEDDING_SIZE,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    hidden_size=HIDDEN_SIZE,
).to(device)

# One random batch of token ids, shape (1, SEQUENCE_LENGTH), used only to
# drive a forward pass so the summary can report output shapes.
input_ids = torch.randint(0, VOCAB_SIZE, (1, SEQUENCE_LENGTH)).to(device)
summary = summarize_model(model, input_data=input_ids)

# The rendered table contains box-drawing characters, so write it as UTF-8
# explicitly instead of relying on the platform's default encoding.
SUMMARY_PATH.write_text(str(summary), encoding="utf-8")

# Last expression of the cell: renders the summary as the cell output.
summary

Layer (type:depth-idx)                   Output Shape              Param #
GPTLanguageModel                         [1, 3, 62]                --
├─Embedding: 1-1                         [1, 3, 128]               7,936
├─Embedding: 1-2                         [1, 3, 128]               128,000
├─ModuleList: 1-3                        --                        --
│    └─TransformerBlock: 2-1             [1, 3, 128]               --
│    │    └─MultiHeadAttention: 3-1      [1, 3, 128]               66,048
│    │    └─LayerNorm: 3-2               [1, 3, 128]               256
│    │    └─Dropout: 3-3                 [1, 3, 128]               --
│    │    └─Sequential: 3-4              [1, 3, 128]               65,920
│    │    └─LayerNorm: 3-5               [1, 3, 128]               256
│    │    └─Dropout: 3-6                 [1, 3, 128]               --
│    └─TransformerBlock: 2-2             [1, 3, 128]               --
│    │    └─MultiHeadAttention: 3-7      [1, 3, 128]               