In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch
import json
from safetensors import safe_open

# Download a small model for testing and write it, with its tokenizer, to disk.
model_name = "distilgpt2"
save_directory = "downloaded_model"
os.makedirs(save_directory, exist_ok=True)

# Fetch the tokenizer and the causal-LM model identified by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Persist both into `save_directory`. The tokenizer call stays as the last
# expression so the notebook displays the paths it returns.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


Loading weights: 100%|██████████| 76/76 [00:00<00:00, 151.98it/s, Materializing param=transformer.wte.weight]            
[1mGPT2LMHeadModel LOAD REPORT[0m from: distilgpt2
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Writing model shards: 100%|██████████| 1/1 [00:03<00:00,  3.23s/it]


('downloaded_model/tokenizer_config.json', 'downloaded_model/tokenizer.json')

In [6]:
# Walk save_directory recursively and report every file with its size in bytes.
print("Files in the save directory:")
for dirpath, dirnames, files in os.walk(save_directory):
    for file_path in (os.path.join(dirpath, name) for name in files):
        print(f"{file_path} - {os.path.getsize(file_path)} bytes")

Files in the save directory:
downloaded_model/tokenizer.json - 3557680 bytes
downloaded_model/tokenizer_config.json - 286 bytes
downloaded_model/model.safetensors - 327657928 bytes
downloaded_model/generation_config.json - 118 bytes
downloaded_model/config.json - 1057 bytes


In [7]:
# What does config.json say about the model architecture and hyperparameters?
config_path = os.path.join(save_directory, "config.json")
# JSON text is UTF-8; pass the encoding explicitly so the read does not depend
# on the platform's default locale encoding.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)
print("Model architecture and hyperparameters in config.json:")
print(json.dumps(config, indent=4))

# Definitions of the most important parameters in config.json:
# vocab_size: size of the vocabulary, i.e. the number of unique tokens the
#   model can understand.
# n_positions: the maximum sequence length the model can process.
# n_ctx: the context window size, i.e. the maximum number of tokens the model
#   can attend to at once.
# n_embd: dimensionality of the embeddings, i.e. the size of the vector that
#   represents each token in the model.
# n_layer: the number of transformer layers in the model.
# n_head: the number of attention heads in each transformer layer, which
#   determines how many different attention patterns the model can learn.
# activation_function: the activation function used in the feed-forward layers,
#   which introduces non-linearity into the model's computations.
    

Model architecture and hyperparameters in config.json:
{
    "_num_labels": 1,
    "activation_function": "gelu_new",
    "add_cross_attention": false,
    "architectures": [
        "GPT2LMHeadModel"
    ],
    "attn_pdrop": 0.1,
    "bos_token_id": 50256,
    "dtype": "float32",
    "embd_pdrop": 0.1,
    "eos_token_id": 50256,
    "id2label": {
        "0": "LABEL_0"
    },
    "initializer_range": 0.02,
    "label2id": {
        "LABEL_0": 0
    },
    "layer_norm_epsilon": 1e-05,
    "model_type": "gpt2",
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_inner": null,
    "n_layer": 6,
    "n_positions": 1024,
    "pad_token_id": null,
    "reorder_and_upcast_attn": false,
    "resid_pdrop": 0.1,
    "scale_attn_by_inverse_layer_idx": false,
    "scale_attn_weights": true,
    "summary_activation": null,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": true,
    "summary_type": "cls_index",
    "summary_use_proj": true,
    "task_specific_params": {
   

In [10]:
# Find the weights file and preview its first few tensors.
# The directory is listed explicitly here rather than reusing a `files` variable
# from another cell, so this cell works after Restart Kernel -> Run All and
# always inspects the top level of save_directory. Sorting keeps the choice
# deterministic when more than one candidate exists.
WEIGHT_SUFFIXES = (".bin", ".safetensors")
weights_file = next(
    (
        name
        for name in sorted(os.listdir(save_directory))
        if name.endswith(WEIGHT_SUFFIXES)
        and os.path.isfile(os.path.join(save_directory, name))
    ),
    None,
)


def print_weight_preview(named_tensors):
    """Print a table with each tensor's name, shape and first three values.

    named_tensors: iterable of (name, tensor) pairs. Each tensor must provide
    .flatten(), .tolist() on the flattened slice, and .shape.
    """
    print("\nModel contains these weight matrices:")
    print(f"{'Layer Name':<50} {'Shape':<15} {'Preview'}")
    print("-" * 80)
    for name, tensor in named_tensors:
        # First 3 values of the flattened tensor serve as the preview.
        preview = tensor.flatten()[:3].tolist()
        print(f"{name:<50} {str(tensor.shape):<15} {preview}...")


if weights_file is None:
    print("No weights file found")
else:
    print(f"Found weights file: {weights_file}")
    weights_path = os.path.join(save_directory, weights_file)

    if weights_file.endswith(".bin"):
        # weights_only=True restricts unpickling to tensors and plain containers,
        # so a tampered checkpoint cannot run arbitrary code. map_location="cpu"
        # lets the preview run without the device the file was saved from.
        state_dict = torch.load(weights_path, map_location="cpu", weights_only=True)
        print_weight_preview(list(state_dict.items())[:10])
    else:
        # Tensors are read one at a time from the open file, so only the
        # previewed tensors are materialized.
        with safe_open(weights_path, framework="pt") as f:
            tensor_names = list(f.keys())[:10]
            print_weight_preview((name, f.get_tensor(name)) for name in tensor_names)

Found weights file: model.safetensors

Model contains these weight matrices:
Layer Name                                         Shape           Preview
--------------------------------------------------------------------------------
transformer.h.0.attn.c_attn.bias                   torch.Size([2304]) [0.4693034589290619, -0.4959352910518646, -0.4157843589782715]...
transformer.h.0.attn.c_attn.weight                 torch.Size([768, 2304]) [-0.4988037049770355, -0.19897758960723877, -0.1046222522854805]...
transformer.h.0.attn.c_proj.bias                   torch.Size([768]) [0.16174378991127014, -0.16444097459316254, -0.15611258149147034]...
transformer.h.0.attn.c_proj.weight                 torch.Size([768, 768]) [0.25814932584762573, -0.16598303616046906, 0.062477629631757736]...
transformer.h.0.ln_1.bias                          torch.Size([768]) [0.00478767603635788, 0.01292799785733223, -0.018999796360731125]...
transformer.h.0.ln_1.weight                        torch.Size([768]) 

In [15]:
# Tokenizer configuration, as saved in tokenizer_config.json.
tokenizer_config_path = os.path.join(save_directory, "tokenizer_config.json")
# JSON text is UTF-8; pass the encoding explicitly so special-token strings are
# decoded the same way regardless of the platform's default locale encoding.
with open(tokenizer_config_path, "r", encoding="utf-8") as f:
    tokenizer_config = json.load(f)
print("\nTokenizer configuration:")
print(json.dumps(tokenizer_config, indent=4))


Tokenizer configuration:
{
    "add_prefix_space": false,
    "backend": "tokenizers",
    "bos_token": "<|endoftext|>",
    "eos_token": "<|endoftext|>",
    "errors": "replace",
    "is_local": false,
    "model_max_length": 1024,
    "pad_token": null,
    "tokenizer_class": "GPT2Tokenizer",
    "unk_token": "<|endoftext|>"
}


In [17]:
# Inspect the vocabulary (token -> id mapping) through the tokenizer API.
vocab = tokenizer.get_vocab()
print(f"\nTokenizer vocab size: {len(vocab)}")

# Tokens ordered by their integer id; show the 10 lowest ids.
tokens_by_id = sorted(vocab, key=vocab.get)
print("Tokenizer vocabulary (first 10 tokens by id):")
for token in tokens_by_id[:10]:
    print(f"{vocab[token]}: {token}")

# Sanity check: the first 10 entries in the dict's own iteration order.
# This is whatever order get_vocab() returns, not a random draw.
first_entries = list(vocab.items())[:10]
print("\nTokenizer vocabulary (sample 10 tokens):")
for token, index in first_entries:
    print(f"{token}: {index}")



Tokenizer vocab size: 50257
Tokenizer vocabulary (first 10 tokens by id):
0: !
1: "
2: #
3: $
4: %
5: &
6: '
7: (
8: )
9: *

Tokenizer vocabulary (sample 10 tokens):
[[: 30109
ĠPog: 48974
Ġinstructed: 17767
answer: 41484
Ġthreats: 7432
ĠWrest: 20722
Ġtruck: 7779
Ġaliens: 16269
ĠInterior: 19614
Ġveins: 32375


In [18]:
# BPE tokenization demo: split each sample into subword tokens with the loaded
# tokenizer, then look up the id of every token.
samples = ["unbelievable", "tokenization", "hello world", "running", "RAG pipeline"]
for text in samples:
    tokens = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"\nText: {text}\nTokens: {tokens}\nIds: {ids}")



Text: unbelievable
Tokens: ['un', 'bel', 'iev', 'able']
Ids: [403, 6667, 11203, 540]

Text: tokenization
Tokens: ['token', 'ization']
Ids: [30001, 1634]

Text: hello world
Tokens: ['hello', 'Ġworld']
Ids: [31373, 995]

Text: running
Tokens: ['running']
Ids: [20270]

Text: RAG pipeline
Tokens: ['RAG', 'Ġpipeline']
Ids: [33202, 11523]


In [20]:
# Tokenize one sentence in two explicit steps: text -> tokens -> ids.
sample_text = "The quick brown fox jumps over the lazy dog."
tokens = tokenizer.tokenize(sample_text)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"\nSample text: {sample_text}\nTokens: {tokens}\nIds: {ids}")

# Cross-check with the one-step tokenizer call (the form used to build model
# inputs): take the input_ids it produces and map them back to token strings.
encoded = tokenizer(sample_text, return_tensors="pt")
bpe_tokens = encoded["input_ids"][0].tolist()
bpe_tokens_str = tokenizer.convert_ids_to_tokens(bpe_tokens)
print(f"BPE Tokens: {bpe_tokens_str}")


Sample text: The quick brown fox jumps over the lazy dog.
Tokens: ['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Ids: [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]
BPE Tokens: ['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']


In [None]:
# What are the files in save_directory, and what is each one for?
# config.json - the model architecture and hyperparameters
# pytorch_model.bin - the model weights in PyTorch's pickle-based format (only present if saved that way)
# model.safetensors - the model weights in the safetensors format;
#   this save produced model.safetensors and no pytorch_model.bin
# tokenizer.json - the serialized tokenizer, including its vocabulary
# tokenizer_config.json - the tokenizer configuration (special tokens, max length, tokenizer class)
# generation_config.json - the default generation parameters for the model (if present)