<a href="https://colab.research.google.com/github/WeiKuoLi/simple_test/blob/main/llm_simple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
# Toy vocabulary: every character maps to an integer token id.
tokenizer = dict(zip('ABC', range(3)))
# Row i is the one-hot vector for token id i (plain nested list of ints).
onehot = np.eye(3, dtype=int).tolist()
# Random 2x3 matrix; multiplying it by a one-hot vector yields a 2-D embedding.
embedding = np.random.randn(2, 3)

# Walk the token 'A' through the pipeline: id -> one-hot -> embedding.
token_id = tokenizer['A']
print(token_id)
print(onehot[token_id])
print(embedding @ onehot[token_id])

0
[1, 0, 0]
[-0.34715424  0.1996753 ]


In [5]:
# Show the whole embedding matrix, then the vector obtained for each token
# by multiplying the matrix with that token's one-hot vector.
print(embedding)
for symbol in ('A', 'B', 'C'):
    print(embedding @ onehot[tokenizer[symbol]])

[[-0.34715424 -0.21678652 -0.32696026]
 [ 0.1996753  -0.38732585  0.01749741]]
[-0.34715424  0.1996753 ]
[-0.21678652 -0.38732585]
[-0.32696026  0.01749741]


In [4]:
text = 'ABCCCB'

# Tokenize the text first, then compute one embedding vector per token id
# and collect the vectors as the rows of a single array.
token_ids = [tokenizer[symbol] for symbol in text]
embedding_array = np.array([embedding @ onehot[token_id] for token_id in token_ids])
print(embedding_array)

[[-0.34715424  0.1996753 ]
 [-0.21678652 -0.38732585]
 [-0.32696026  0.01749741]
 [-0.32696026  0.01749741]
 [-0.32696026  0.01749741]
 [-0.21678652 -0.38732585]]


In [57]:
import torch  # required by torch.no_grad() below; was previously missing
from transformers import AutoTokenizer, AutoModelForCausalLM

# Specify the model name
model_name = "gpt2"

# Step 1: Load the tokenizer and model
# NOTE: `tokenizer` rebinds the name of the toy character -> id dict defined
# earlier in this notebook; re-run that earlier cell before reusing the toy example.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Input-embedding layer (token id -> vector). get_input_embeddings() is the
# model-agnostic accessor; for GPT-2 it is the same module as model.transformer.wte.
embedding_layer = model.get_input_embeddings()

# Step 2: Define the words
words = ["why", "what", "center"]

# Step 3: Tokenize and extract embeddings
embeddings = []
for word in words:
    # Tokenize the word and get the input IDs
    print("Current word: ",word)
    input_ids = tokenizer(word, return_tensors="pt")["input_ids"]
    print("Token IDs: ",input_ids)
    with torch.no_grad():
        # Look the embeddings up once; the result has one batch entry holding
        # one vector per token of the word.
        token_vectors = embedding_layer(input_ids)
        print("embeddings shape: ",token_vectors.shape)

        # [0] selects the single batch entry (no special token such as [CLS]
        # is being skipped); averaging over dim 0 then gives one vector per
        # word even if the word is split into several tokens.
        token_embedding = token_vectors[0].mean(dim=0)
        embeddings.append(token_embedding.numpy())
    print('-'*60)

# Step 4: Calculate cosine similarity and angles
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    The value is the dot product of the two vectors divided by the product
    of their Euclidean norms.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    inner_product = np.dot(vec1, vec2)
    return inner_product / norm_product

def calculate_angle(vec1, vec2):
    """Return the angle between ``vec1`` and ``vec2`` in degrees."""
    # Clamp the cosine to [-1, 1], the domain of arccos, before inverting it.
    bounded_cos = np.clip(cosine_similarity(vec1, vec2), -1.0, 1.0)
    angle_rad = np.arccos(bounded_cos)
    # Convert radians to degrees
    return angle_rad * (180 / np.pi)

# Enumerate every unordered pair of word indices (i < j) once, then report
# the angle between the two corresponding embedding vectors.
index_pairs = [
    (i, j)
    for i in range(len(words))
    for j in range(i + 1, len(words))
]
for i, j in index_pairs:
    angle = calculate_angle(embeddings[i], embeddings[j])
    print(f"Angle between [{words[i]}] and [{words[j]}]: {angle:.2f} degrees")


Current word:  why
Token IDs:  tensor([[22850]])
embeddings shape:  torch.Size([1, 1, 768])
------------------------------------------------------------
Current word:  what
Token IDs:  tensor([[10919]])
embeddings shape:  torch.Size([1, 1, 768])
------------------------------------------------------------
Current word:  center
Token IDs:  tensor([[16159]])
embeddings shape:  torch.Size([1, 1, 768])
------------------------------------------------------------
Angle between [why] and [what]: 45.02 degrees
Angle between [why] and [center]: 72.07 degrees
Angle between [what] and [center]: 72.68 degrees
