### **Note:** This notebook needs a GPU to run. In Google Colab, go to Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4.

In [None]:
# !pip install --upgrade transformers==4.41.2 sentence-transformers==3.0.1 gensim==4.3.2 scikit-learn==1.5.0 accelerate==0.31.0 peft==0.11.1 scipy==1.10.1 numpy==1.26.4

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer from the same checkpoint so that the token ids the
# tokenizer produces match the vocabulary the model expects. The checkpoint id
# lives in a single constant so the two calls cannot drift apart.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cuda",  # the notebook requires a GPU runtime
    torch_dtype="auto",
    trust_remote_code=False,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Instruction for the model; the trailing <|assistant|> tag marks where the
# model's reply is expected to start.
prompt = (
    "Write an email apologizing to Sarah for the tragic gardening mishap. "
    "Explain how it happened.<|assistant|>"
)


In [None]:
# Tokenize the input prompt: ask for PyTorch tensors ("pt") and move the ids to
# the GPU, which is where the model was placed when it was loaded.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

In [None]:
# Generate the text: continue the tokenized prompt, capped at 20 new tokens.
generation_output = model.generate(input_ids=input_ids, max_new_tokens=20)

In [None]:
# Decode the first (and only) sequence in the batch back into text; the result
# contains the prompt followed by the generated continuation.
print(tokenizer.decode(generation_output[0]))

Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|> Subject: Heartfelt Apologies for the Gardening Mishap


Dear


In [None]:
# Inspect the raw token ids the tokenizer produced for the prompt.
print(input_ids)

tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363,   278, 25305,
           293, 16423,   292,   286,   728,   481, 29889, 12027,  7420,   920,
           372,  9559, 29889, 32001]], device='cuda:0')


In [None]:
# Iterating over the 2-D `input_ids` tensor yields one row per prompt, so each
# print shows a whole decoded prompt (iterate over `input_ids[0]` instead to
# walk through the prompt token by token).
# The loop variable is deliberately not named `id`: at cell level it would
# shadow the built-in `id()` for every later cell in the kernel.
for sequence_ids in input_ids:
    print(tokenizer.decode(sequence_ids))

Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|>


In [None]:
# Raw ids returned by generate(): the prompt ids followed by the newly
# generated token ids.
print(generation_output)

tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363,   278, 25305,
           293, 16423,   292,   286,   728,   481, 29889, 12027,  7420,   920,
           372,  9559, 29889, 32001,  3323,   622, 29901, 17778, 29888,  2152,
          6225, 11763,   363,   278, 19906,   292,   341,   728,   481,    13,
            13,    13, 29928,   799]], device='cuda:0')


In [None]:
# Decode individual ids to see which piece of text each one stands for:
# one id can be a whole word, another only a fragment of a word.
print(tokenizer.decode(14350))
print(tokenizer.decode(728))

Write
ish


In [None]:
### Comparing Trained LLM Tokenizers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Background colors that `show_tokens` cycles through, one per token.
# Each entry is an "R;G;B" triple, ready to be dropped into an ANSI
# 24-bit background-color escape sequence.
colors_list = [
    '102;194;165',
    '252;141;98',
    '141;160;203',
    '231;138;195',
    '166;216;84',
    '255;217;47',
]

In [None]:
def show_tokens(sentence, tokenizer_name):
    """Print the tokens of `sentence`, each on its own colored background.

    Args:
        sentence: Text to tokenize.
        tokenizer_name: Checkpoint name passed to
            `AutoTokenizer.from_pretrained` to obtain the tokenizer.

    Each token id is decoded on its own and printed wrapped in ANSI escape
    codes; the background color is taken from `colors_list`, cycling when
    there are more tokens than colors. Tokens are separated by a single
    space and no newline is printed.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    reset_code = '\x1b[0m'

    for position, token_id in enumerate(tokenizer(sentence).input_ids):
        # Black text (30) on a 24-bit RGB background (48;2;R;G;B).
        color_code = f'\x1b[0;30;48;2;{colors_list[position % len(colors_list)]}m'
        piece = tokenizer.decode(token_id)
        print(color_code + piece + reset_code, end=' ')

In [None]:
# Sample text used to compare tokenizers. It mixes capitalization, an emoji and
# a Chinese character, code keywords and operators, runs of spaces, and numbers.
# The second line must hold the real characters "🎵 鸟"; it had been corrupted
# into mojibake (their UTF-8 bytes decoded as Windows-1252), which removed the
# emoji / non-Latin test case from the comparison.
text = """
English and CAPITALIZATION
🎵 鸟
show_tokens False None elif == >= else: two tabs:"    " Three tabs: "       "
12.0*50=600
"""

In [None]:
# Tokenize the sample text with the Phi-3-mini tokenizer.
show_tokens(text, "microsoft/Phi-3-mini-4k-instruct")

In [None]:
# Same text through the `bert-base-cased` tokenizer, for comparison.
show_tokens(text, "bert-base-cased")

In [None]:
# Same text through the `bert-base-uncased` tokenizer, for comparison.
show_tokens(text, "bert-base-uncased")

In [None]:
# Same text through the `gpt2` tokenizer, for comparison.
show_tokens(text, "gpt2")

## Contextualized Word Embeddings From a Language Model (Like BERT)

In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
# Tokenizer and model must come from the same checkpoint: token ids only mean
# something relative to the vocabulary the model was trained with. Loading the
# tokenizer from "microsoft/deberta-base" and the model from
# "microsoft/deberta-v3-xsmall" pairs two different vocabularies, so the model
# would look up embeddings for the wrong tokens.
# NOTE: the DeBERTa-v3 tokenizer is SentencePiece-based, so the `sentencepiece`
# package has to be installed.
DEBERTA_MODEL_NAME = "microsoft/deberta-v3-xsmall"

# Load a Tokenizer
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL_NAME)

# Load a Language Model
model = AutoModel.from_pretrained(DEBERTA_MODEL_NAME)

# Tokenize the sentence
tokens = tokenizer("Hello World", return_tensors="pt")

# Keep the first element of the model output: one contextual embedding per
# token, with a leading batch dimension.
output = model(**tokens)[0]

In [None]:
# Shape of the embeddings for the first (only) sentence in the batch:
# (number of tokens, embedding size).
output[0].shape

torch.Size([4, 384])

In [None]:
# Decode each id of the first (only) sequence separately to see the individual
# tokens, including the special tokens the tokenizer added around the sentence.
for token in tokens["input_ids"][0]:
  print(tokenizer.decode(token))

[CLS]
Hello
 World
[SEP]


In [None]:
# Show everything the tokenizer returned, not only the input ids.
print(tokens)

{'input_ids': tensor([[    1, 31414,   623,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}


In [None]:
# Display the raw embedding tensor produced by the model.
output

In [None]:
# `output` carries a leading batch dimension; `output[0]` selects the first
# (only) sentence, leaving one row per token -- shape (4, 384) for the example.
token_embeddings = output[0]

# Walk the token axis (dim 0). Looping over `shape[1]` would instead run over
# the 384 embedding dimensions and pick out single scalars, not token vectors.
for i, embedding in enumerate(token_embeddings):
    print(f"Embedding for token {i}: {embedding.shape}")  # torch.Size([384])


## **Text Embeddings (For Sentences and Whole Documents)**

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load Model
# NOTE: this rebinds `model`, replacing the model loaded in the previous section.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Convert text to text embeddings: the whole sentence is encoded into a single
# vector rather than one vector per token.
vector = model.encode("Best movie ever!")

In [None]:
# Dimensionality of the sentence embedding (a single 1-D vector).
vector.shape

(768,)

**Word Embeddings Beyond LLMs**

In [None]:
# !pip install gensim sentence-transformers transformers scikit-learn accelerate peft


In [None]:
import gensim.downloader as api

In [None]:
# Load the "glove-wiki-gigaword-50" word vectors through gensim's downloader API.
# NOTE: this rebinds `model` once more; from here on it refers to these word vectors.
model = api.load("glove-wiki-gigaword-50")



In [None]:
# Look up the nearest neighbours of the vector for 'king'. The query is passed
# as a raw vector rather than as the word itself.
# NOTE(review): topn=11 is presumably chosen because 'king' itself comes back as
# the closest match, leaving ten other words -- confirm against the output.
model.most_similar([model['king']], topn=11)