### Tokenization & Embedding for LLMs

In [1]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Select device & model
device = torch.device("cpu")
model_name = "microsoft/phi-1_5"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    truncation_side="left"
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float32,
    device_map={"": device},
    low_cpu_mem_usage=True,
    trust_remote_code=False
).eval()

print("Model and tokenizer loaded on", device)

Model and tokenizer loaded on cpu


Let‚Äôs try generating some text with our model to see how it reacts to the prompt.

In [3]:
prompt = ("Write an email apologizing to Sarah for the tragic gardening mishap." 
          "Explain how it happened.<|assistant|>")

# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")

# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=100
)

# Print the output
print(tokenizer.decode(generation_output[0]))

Write an email apologizing to Sarah for the tragic gardening mishap.Explain how it happened.<|assistant|>

Dear Sarah,

I hope this email finds you well. I wanted to take a moment to apologize for the unfortunate incident that occurred in your garden. It was truly a heartbreaking sight to see your beautiful flowers and plants destroyed by the strong winds. I understand how much effort and care you put into maintaining your garden, and I am truly sorry for the damage caused.

I want to assure you that I will do everything in my power to make it right. I will personally visit


Let‚Äôs take a look at the input and output tokens underlying this text.

In [4]:
# For inspection, print the input IDs
print(input_ids)

tensor([[16594,   281,  3053, 47401,   284, 10490,   329,   262, 15444, 46072,
         29406,   499,    13, 18438,   391,   703,   340,  3022, 29847,    91,
           562, 10167,    91,    29]])


In [5]:
# Decode and print each input ID sequence
for i in input_ids:
    print(tokenizer.decode(i))

Write an email apologizing to Sarah for the tragic gardening mishap.Explain how it happened.<|assistant|>


In [6]:
# Decode and print each input ID in the first batch
for i in input_ids[0]:
    print(tokenizer.decode(i))

Write
 an
 email
 apologizing
 to
 Sarah
 for
 the
 tragic
 gardening
 mish
ap
.
Expl
ain
 how
 it
 happened
.<
|
ass
istant
|
>


In [7]:
display(generation_output)

tensor([[16594,   281,  3053, 47401,   284, 10490,   329,   262, 15444, 46072,
         29406,   499,    13, 18438,   391,   703,   340,  3022, 29847,    91,
           562, 10167,    91,    29,   198,   198, 20266, 10490,    11,   198,
           198,    40,  2911,   428,  3053,  7228,   345,   880,    13,   314,
          2227,   284,  1011,   257,  2589,   284, 16521,   329,   262, 14855,
          4519,   326,  5091,   287,   534, 11376,    13,   632,   373,  4988,
           257, 37154,  6504,   284,   766,   534,  4950, 12734,   290,  6134,
          6572,   416,   262,  1913, 13520,    13,   314,  1833,   703,   881,
          3626,   290,  1337,   345,  1234,   656, 10941,   534, 11376,    11,
           290,   314,   716,  4988,  7926,   329,   262,  2465,  4073,    13,
           198,   198,    40,   765,   284, 19832,   345,   326,   314,   481,
           466,  2279,   287,   616,  1176,   284,   787,   340,   826,    13,
           314,   481,  7620,  3187]])

In [8]:
# Decode and print specific token IDs
print(tokenizer.decode(16594))
print(tokenizer.decode(281))

Write
 an


#### LLM Tokenizers

We wants to compare how different **LLM tokenizers** split the same input text into tokens, and to implement a small function that performs this comparison for several models on a shared string.

In [9]:
def compare_tokenizers(sentence, tokenizer_names, max_tokens_display=100):
    """
    Compare how different tokenizers split the same text.
    Shows token count, token IDs, and the actual tokens (NOT decoded text).
    """
    print("\n" + "="*100)
    print(f"Comparing tokenizers on text:\n{sentence}\n")
    print("="*100 + "\n")

    for name in tokenizer_names:
        print(f"Tokenizer: {name}")

        try:
            tokenizer = AutoTokenizer.from_pretrained(name)
        except Exception as e:
            print(f"Failed to load tokenizer: {e}\n")
            continue

        # Tokenize
        enc = tokenizer(sentence, return_tensors="pt")
        ids = enc.input_ids[0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(ids)

        # Truncate long outputs for readability
        if len(tokens) > max_tokens_display:
            ids_display = ids[:max_tokens_display] + ["..."]
            tokens_display = tokens[:max_tokens_display] + ["..."]
        else:
            ids_display = ids
            tokens_display = tokens

        print(f"Token count: {len(ids)}")
        print(f"Token IDs  : {ids_display}")
        print(f"Tokens     : {tokens_display}")
        print("\n" + "-"*100 + "\n")

In [25]:
# Example: Compare how different tokenizers handle the same text
test_sentence = "Yesterday, I emailed Sarah about 3 unexpected ü§ñ updates‚Äînone were actually urgent!"

# List of tokenizers to compare
tokenizers_to_test = [
    "microsoft/phi-1_5",
    "microsoft/Phi-3-mini-4k-instruct",
    "bert-base-uncased",
    "google/flan-t5-small",
]

# Run comparison
compare_tokenizers(
    test_sentence,
    tokenizer_names=tokenizers_to_test,
    max_tokens_display=80
)


Comparing tokenizers on text:
Yesterday, I emailed Sarah about 3 unexpected ü§ñ updates‚Äînone were actually urgent!


Tokenizer: microsoft/phi-1_5
Token count: 18
Token IDs  : [28065, 11, 314, 24315, 10490, 546, 513, 10059, 12520, 97, 244, 5992, 960, 23108, 547, 1682, 18039, 0]
Tokens     : ['Yesterday', ',', 'ƒ†I', 'ƒ†emailed', 'ƒ†Sarah', 'ƒ†about', 'ƒ†3', 'ƒ†unexpected', 'ƒ†√∞≈Å', '¬§', 'ƒ∏', 'ƒ†updates', '√¢ƒ¢ƒ∂', 'none', 'ƒ†were', 'ƒ†actually', 'ƒ†urgent', '!']

----------------------------------------------------------------------------------------------------

Tokenizer: microsoft/Phi-3-mini-4k-instruct
Token count: 25
Token IDs  : [612, 18358, 29892, 306, 321, 655, 2356, 19235, 1048, 29871, 29941, 15668, 29871, 243, 162, 167, 153, 11217, 30003, 9290, 892, 2869, 5065, 5362, 29991]
Tokens     : ['‚ñÅY', 'esterday', ',', '‚ñÅI', '‚ñÅe', 'ma', 'iled', '‚ñÅSarah', '‚ñÅabout', '‚ñÅ', '3', '‚ñÅunexpected', '‚ñÅ', '<0xF0>', '<0x9F>', '<0xA4>', '<0x96>', '‚ñÅupdates', '‚Äî', 'none', '

These ‚Äúunusual‚Äù tokens in the output, such as `[CLS]`, `[SEP]`, `[UNK]`, `[PAD]`, `[MASK]`, `<s>`, `</s>`, and characters like `ÔøΩ` or `#`, are special tokens that guide how the model processes and interprets text.

**Special tokens overview**
- `[CLS]` is a classification token placed at the beginning of the sequence to represent the whole input for tasks like sentence classification.

- `[SEP]` is a separator token used to mark boundaries between segments, for example between two sentences in pair-input tasks.

- `[UNK]` is an unknown token used when a piece of text cannot be mapped to any known token in the vocabulary.

- `[PAD]` is a padding token used to extend shorter sequences so that a batch of inputs all has the same length.

- `[MASK]` is a masking token used during masked language modeling, where some tokens are hidden so the model learns to predict them from context.

- `<s>` and `</s>` are start and end tokens that mark the beginning and end of a sequence, often used in encoder-decoder or sequence-to-sequence models.

- `ƒ†` (used in GPT-style byte-level BPE tokenizers) is a special prefix that encodes a preceding space, indicating that the token starts after a whitespace character.

- Symbols like `ÔøΩ` or `#` can indicate problematic or un-decodable characters, or subword fragments, when the tokenizer cannot cleanly map bytes or characters back to readable text.

Different tokenization methods exist because they make different trade-offs in speed, vocabulary size, domain coverage, and final model quality.

**Why multiple tokenization techniques?**

- **Vocabulary size vs efficiency:** Methods like BPE, WordPiece, and SentencePiece slice text into subwords differently, trading off a larger vocabulary for fewer, more meaningful tokens per sentence. Fewer tokens generally mean faster inference and lower compute cost, but overly fragmented words can weaken semantic representation.

- **Language and domain needs:** Some tokenizers normalize or lowercase (e.g., uncased BERT) which is fine for many NLP tasks, while others must preserve case, punctuation, and exact forms for code, names, or technical jargon. In specialized domains (medical, legal, code), it is often better to keep domain terms intact instead of breaking them into arbitrary fragments.

- **Impact on generation and context:** Since LLMs generate one token at a time, splitting a word into many pieces increases the number of prediction steps, slows generation, and gives more room for errors to accumulate. Inefficient tokenization also burns through the context window faster, reducing how much text the model can consider at once and potentially degrading performance on long inputs.

#### Embeddings

Now that we understand how tokens represent text, we can look at how they are mapped into a high-dimensional vector space. In essence, an LLM converts each token into a numerical vector (an embedding) and then combines and transforms these token vectors to represent and generate text.

The purpose of **embeddings** is to represent text in an appropriate **vector space** so that models can capture meaning and support operations such as similarity search, clustering, and downstream prediction. When a model is downloaded from a model hub, its configuration typically defines an embedding matrix that maps token IDs to dense vectors. At the start of training, this embedding matrix is initialized randomly (like the rest of the model‚Äôs weights), and during training it is updated so that the vectors reflect the structure of the language and the specifics of the task.

Let‚Äôs begin by loading a model and its tokenizer so we can inspect the embedding matrix.

In [11]:
from transformers import AutoModel

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
model = AutoModel.from_pretrained("microsoft/deberta-v3-xsmall")

# Tokenize the sentence
tokens = tokenizer('Hello world', return_tensors='pt')

In [12]:
display(tokens)

{'input_ids': tensor([[    1, 31414,   232,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

**Token IDs (integers)**

- These are the discrete indices assigned by the tokenizer‚Äôs vocabulary.

- They might look like `[1, 31414, 232, 2]`, where each number corresponds to a specific token, and certain IDs may be reserved for things like padding.

- Token IDs can be converted back to text using `tokenizer.decode()` because they directly reference entries in the vocabulary.

**Embeddings (float vectors)**

- These are continuous, high-dimensional vectors learned by the model to capture semantic and syntactic information.

- A single token embedding might look like `[-3.4816, 0.0861, -0.1819, ...]` with, for example, 384 dimensions.

- Embeddings cannot be directly decoded back to text; instead, each token ID is mapped to its corresponding embedding vector before being fed into the model.

In short, the pipeline is: **Text ‚Üí Token IDs ‚Üí Embeddings ‚Üí Model processing**, with IDs acting as symbolic references and embeddings as the model‚Äôs working numerical representation.

In [13]:
# Process the tokens
output = model(**tokens)[0]
display(output)

tensor([[[-3.4816,  0.0861, -0.1819,  ..., -0.0612, -0.3911,  0.3017],
         [ 0.1898,  0.3208, -0.2315,  ...,  0.3714,  0.2478,  0.8048],
         [ 0.2071,  0.5036, -0.0485,  ...,  1.2175, -0.2292,  0.8582],
         [-3.4278,  0.0645, -0.1427,  ...,  0.0658, -0.4367,  0.3834]]],
       grad_fn=<NativeLayerNormBackward0>)

In [14]:
# Check the shape of the output
output.shape

torch.Size([1, 4, 384])

In [15]:
# Decode the TOKEN IDs
tokenizer.decode(tokens['input_ids'][0])

'[CLS]Hello world[SEP]'

Here, `output` is a tensor of continuous embedding vectors with shape `[1, 4, 384]`, so it cannot be converted back to text. These values encode semantic information rather than discrete token IDs. In contrast, `tokens["input_ids"] = [1, 31414, 232, 2]` are vocabulary indices that can be decoded back into text, whereas `output = [[-3.48, 0.086, ...], [0.19, 0.32, ...], ...]` represents the corresponding 384‚Äëdimensional embeddings for each token and therefore cannot be directly decoded to text.

Now that we understand what embeddings are meant to do, let‚Äôs start playing with word embeddings to see how they work in practice and how they represent text as points in a vector space.

In [27]:
import gensim.downloader as api

# Download embeddings (66MB, glove, trained on wikipedia, vector size: 50)
model = api.load("glove-wiki-gigaword-50")

In [28]:
# print the 11 most similar words to "Engineer"
model.most_similar([model['engineer']], topn=11)

[('engineer', 1.0),
 ('mechanic', 0.7610689401626587),
 ('technician', 0.7588813900947571),
 ('engineers', 0.7152684926986694),
 ('worked', 0.7083118557929993),
 ('pioneer', 0.7055997848510742),
 ('retired', 0.6979386806488037),
 ('chemist', 0.6946015954017639),
 ('engineering', 0.6913756132125854),
 ('contractor', 0.6868984699249268),
 ('builder', 0.6847971677780151)]

This technique, often called **word2vec similarity**, is very useful for finding words that are close in meaning to a given word and can also power simple recommendation systems.

In [54]:
import io
import pandas as pd
from urllib import request

# URLs of the data
PLAYLIST_URL = "https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt"
SONGS_URL = "https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt"

# Load the playlist file
with request.urlopen(PLAYLIST_URL) as resp:
    text = resp.read().decode("utf-8")

# Skip the first two metadata lines
lines = text.splitlines()[2:]

# Build playlists, removing those with only one song
playlists = []
for line in lines:
    line = line.strip()
    if not line:
        continue
    parts = line.split()
    if len(parts) > 1:
        playlists.append(parts)

# Load the song metadata file
with request.urlopen(SONGS_URL) as resp:
    songs_text = resp.read().decode("utf-8")

# Read metadata into a DataFrame
songs_buffer = io.StringIO(songs_text)
songs_df = pd.read_csv(
    songs_buffer,
    sep="\t",
    header=None,
    names=["id", "title", "artist"],
    dtype={"id": str, "title": str, "artist": str},
)

# Clean up and set index
songs_df = songs_df.dropna(how="all")
songs_df = songs_df.set_index("id")

In [55]:
print('Playlist #1:\n ', playlists[0], '\n')
print('Playlist #2:\n ', playlists[1])

Playlist #1:
  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43'] 

Playlist #2:
  ['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117',

#### Train word2vec model for recommendation song system

With our playlists and song metadata prepared, we can now train a word2vec model to generate song recommendations from a user‚Äôs playlist. We will use the `Word2Vec` implementation from the `gensim` library.

- `vector_size`: controls the dimensionality of the embedding vectors.

- `window`: defines how many neighboring items are considered as context.

- `negative`: sets the number of negative samples used during training.

- `min_count`: specifies the minimum number of occurrences required for a song to be included in the vocabulary.

- `workers`: determines how many CPU cores are used for training.

In [36]:
from gensim.models import Word2Vec

# train our Word2Vec model
model = Word2Vec(
    playlists, 
    vector_size=32, 
    window=20, 
    negative=50, 
    min_count=1, 
    workers=4
)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [56]:
# Ask the model for songs similar to song #2172
song_id = 2172
model.wv.most_similar(positive=str(song_id))

[('3094', 0.997779130935669),
 ('3167', 0.9968992471694946),
 ('2849', 0.9965309500694275),
 ('2704', 0.9964370131492615),
 ('2976', 0.9958822131156921),
 ('6624', 0.99538654088974),
 ('3148', 0.9952204823493958),
 ('10084', 0.9950636625289917),
 ('3126', 0.9946957230567932),
 ('3079', 0.9944549798965454)]

In [57]:
print(songs_df.iloc[2172])

title     Fade To Black
artist        Metallica
Name: 2172 , dtype: object


In [58]:
import numpy as np

def print_recommendations(song_id, topn=5):
    similar_ids = [sid for sid, _ in model.wv.most_similar(positive=[str(song_id)], topn=topn)]

    # Try direct lookup by index
    if all(sid in songs_df.index for sid in similar_ids):
        recs = songs_df.loc[similar_ids]
    else:
        # Otherwise treat as integer positions
        positions = [int(s) for s in similar_ids if s.isdigit()]
        recs = songs_df.iloc[positions]

    return recs.reset_index()

display(print_recommendations(2172))

Unnamed: 0,id,title,artist
0,3094,Breaking The Law,Judas Priest
1,3167,Unchained,Van Halen
2,2849,Run To The Hills,Iron Maiden
3,2704,Over The Mountain,Ozzy Osbourne
4,2976,I Don't Know,Ozzy Osbourne
