In [2]:
# Initialize empty lists to store misspelled words and their correct spellings
misspelled_words = []
correct_spellings = []

# Read the contents of the text file
with open('misspelled_words.txt', 'r') as file:
    # Iterate through each line in the file
    for line in file:
        # Split the line into misspelled word and correct spelling
        correct, misspelled = line.strip().split('\t')
        # Append misspelled word to the list
        misspelled_words.append(misspelled)
        # Append correct spelling to the list
        correct_spellings.append(correct)

# Print the lists to verify
print("Misspelled Words:", misspelled_words)
print("Correct Spellings:", correct_spellings)

Misspelled Words: ['Accomodate', 'Aquire', 'Aficianado', 'Isle', 'Amatuer', 'Apparant', 'Artic', 'Arguement', 'Athiest', 'Belive', 'Bizzare', 'Calender', 'Carribean', 'Cemetary', 'Cheif', 'Collegue', 'Collectable', 'Columist', 'Commitee', 'Comitted', 'Concensus', 'Definately', 'Dilemna', 'Dissapoint', 'Embarras', 'Embarassed', 'Enviroment', 'Exilerate', 'Facinate', 'Florescent', 'Foriegn', 'Fourty', 'Freind', 'Gage', 'Goverment', 'Greatful', 'Happend', 'Harras', 'Horderves', 'Humourous', 'Immediatly', 'Independant', 'Jewelry', 'Judgement', 'Knowlege', 'Liesure', 'Liason', 'Lightening', 'Loose', 'Maintanance', 'Manuever', 'Medival', 'Momento', 'Millenium', 'Minature', 'Mischevious', 'Mispell', 'Nausious', 'Neccessary', 'Ocassion', 'Occured', 'Parralel', 'Pavilion', 'Perseverence', 'Phillipines', 'Playwrite', 'Privelege', 'Publically', 'Questionaire', 'Recieve', 'Recomend', 'Resistence', 'Responsability', 'Rythm', 'Sacreligious', 'Shedule', 'Sence', 'Seperate', 'Seige', 'Strenght', 'Succ

In [43]:
from transformers import BertTokenizer, BertModel
import torch

# Load BERT tokenizer and model 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model1 = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# Initialize lists to store tokenized inputs
misspelled_tokenized = []
correct_tokenized = []

# print the model architecture
# print(model)

def get_bert_embeddings(emb_model, words):
    # Tokenize the words and prepare BERT input
    encoded_input = tokenizer(words, padding=True, truncation=True, return_tensors='pt')
    
    # Extract embeddings
    with torch.no_grad():
        outputs = emb_model(**encoded_input)
        input_embeddings = emb_model.embeddings.word_embeddings(encoded_input['input_ids'])
    hidden_states = outputs.last_hidden_state
    # Aggregate the embeddings for each word (simple approach)
    embeddings = torch.mean(hidden_states, dim=1)
    return encoded_input, input_embeddings, embeddings


# Tokenize misspelled and correct words
for misspelled, correct in zip(misspelled_words, correct_spellings):
    correct_tokenized.append(get_bert_embeddings(model, correct))
    misspelled_tokenized.append(get_bert_embeddings(model1, misspelled))

# print the shape of the embeddings
#print("Misspelled Embeddings Shape:", misspelled_tokenized[0].shape)
#print("Correct Embeddings Shape:", correct_tokenized[0].shape)


print("Misspelled Embeddings:", misspelled_tokenized[0][0][:10])
print("Correct Embeddings:", correct_tokenized[0][0][:10])


Misspelled Embeddings: {'input_ids': tensor([[  101, 16222, 19506, 13701,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
Correct Embeddings: {'input_ids': tensor([[ 101, 8752,  102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}


In [11]:
# Using tiktoken to extract the BPE tokens used in GPT-4
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
# To get the tokeniser corresponding to a specific model in the OpenAI API:
# Initialize lists to store tokenized inputs and their matches
misspelled_tokenized = []
correct_tokenized = []
token_matches = []
#print(enc.encode("hello world"))
for misspelled, correct in zip(misspelled_words, correct_spellings):
    # Tokenize the misspelled word
    misspelled_tokens = enc.encode(misspelled)
    misspelled_tokenized.append(misspelled_tokens)
    
    # Tokenize the correct word
    correct_tokens = enc.encode(correct)
    correct_tokenized.append(correct_tokens)
    
    # Find matching tokens
    matches = set(misspelled_tokens).intersection(set(correct_tokens))
    matches = [enc.decode([token]) for token in matches]
    token_matches.append(matches)


# Print tokenized outputs and their matches
print("Misspelled Tokenized:", misspelled_tokenized[:5])
print("Correct Tokenized:", correct_tokenized[:5])
print("Token Matches:", token_matches[:5])




Misspelled Tokenized: [[14945, 316, 88111], [32, 999], [55439, 12734, 2172], [3957, 273], [6219, 266, 8977]]
Correct Tokenized: [[14945, 316, 2658, 349], [11916, 999], [55439, 14222, 2172], [32, 41205], [6219, 11067]]
Token Matches: [['Acc', 'om'], ['quire'], ['ado', 'Af'], [], ['Am']]


In [28]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

input_ids = tokenizer(text)['input_ids']
print(input_ids)
input_ids_tensor = torch.tensor([input_ids])
# Generate the embeddings for input IDs 
with torch.no_grad():
    model_output = model(input_ids_tensor)
    last_hidden_states = model_output.last_hidden_state
    
# Extract the embeddings for the input IDs from the last hidden layer
input_embeddings = last_hidden_states[0,1:-1,:]

# compute the average of the embeddings
input_embeddings = torch.mean(input_embeddings, dim=0)

print(input_embeddings.shape)
print(input_embeddings)

[3041, 5372, 502, 416, 597, 2420, 345, 1549, 588, 13]
torch.Size([768])
tensor([-2.2211e-01, -2.4913e-02, -7.9644e-01,  1.2311e-01, -1.5142e-02,
         2.0473e-01,  2.0380e+00, -2.8743e-01,  8.8317e-02, -1.9279e-01,
         2.0039e-01, -2.7463e-01, -2.6310e-01,  7.0118e-02, -2.6523e-01,
        -2.2119e-01, -7.9613e-02, -4.5220e-01,  5.0428e-01,  2.4738e-01,
        -5.8950e-02, -1.6384e-01, -1.2509e-01,  4.8210e-02,  1.5231e-01,
         3.1702e-02, -5.6753e-01, -5.0967e-02,  2.3500e-01,  3.3692e-01,
        -9.4491e-02, -1.0552e-01, -1.3025e-01, -4.3243e-01, -3.2330e-01,
        -4.9765e-01,  6.4552e+01,  2.5136e-01,  2.0790e-01,  3.4858e-01,
        -3.4390e-01,  2.9791e-01,  1.8456e-01, -1.1489e-01,  2.0997e-02,
        -2.6759e-01, -9.1552e-02, -6.4487e-01, -1.0714e-01,  1.2296e+00,
         9.4861e-02,  3.1005e-01, -1.8994e-01,  2.0349e-01,  1.3099e-02,
         5.5352e-01, -1.7095e-02,  7.8032e-02,  2.0283e-02, -1.4592e-01,
         1.5231e-01,  8.5904e-02,  4.7031e-02, -7.20

In [34]:
from transformers import BertTokenizer, BertModel
import torch

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_input_embeddings(texts, model, tokenizer):
    # Tokenize the input texts
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    # Disable gradient calculations
    with torch.no_grad():
        # Extract token embeddings directly from the embeddings layer
        # This retrieves the embeddings after they've been summed with position embeddings
        token_embeddings = model.embeddings(input_ids=inputs['input_ids'])
        
    # Return the embeddings
    return token_embeddings

def get_bert_embeddings(text, model, tokenizer):
    # Tokenize the text
    input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Get the BERT model embeddings
    with torch.no_grad():
        model_output = model(**input)
        embeddings = model_output.last_hidden_state

    # Return the embeddings
    return embeddings


# Example usage
texts = ["Hello, world!", "How are you doing today?"]
input_embeddings = get_input_embeddings(texts, model, tokenizer)

print(input_embeddings[:,:,:1])  # This should print: [batch_size, sequence_length, embedding_size]

# Get BERT embeddings for a text
embeddings = get_bert_embeddings(texts, model, tokenizer)

print(embeddings[:,:,:1])  # This should print: [batch_size, sequence_length, hidden_size]


tensor([[[ 1.6855e-01],
         [ 3.7386e-01],
         [ 4.6706e-04],
         [ 6.0389e-01],
         [ 6.4243e-01],
         [-3.2507e-01],
         [ 3.5675e-01],
         [ 2.4668e-01]],

        [[ 1.6855e-01],
         [-7.1995e-02],
         [-2.3302e-01],
         [-4.9603e-01],
         [ 6.3315e-01],
         [-9.9420e-01],
         [ 6.0006e-01],
         [-2.5515e-01]]])
tensor([[[-0.0781],
         [-0.2016],
         [-0.7156],
         [ 0.0527],
         [-0.7122],
         [ 0.9955],
         [-0.3101],
         [-0.4686]],

        [[ 0.0576],
         [ 0.3285],
         [ 0.7088],
         [ 0.2213],
         [ 0.2567],
         [-0.1029],
         [ 0.1181],
         [ 0.6520]]])


In [39]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_word_embedding(text, word, model, tokenizer):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")
    
    # Find the index of the word (handling subword tokenization)
    word_tokens = tokenizer.tokenize(word)
    word_ids = tokenizer.convert_tokens_to_ids(word_tokens)
    token_ids = inputs['input_ids'][0].tolist()  # Convert to list to handle more easily
    # Finding the first complete match of the word tokens in the input IDs
    for i in range(len(token_ids)):
        if token_ids[i:i+len(word_tokens)] == word_ids:
            print(token_ids[i:i+len(word_tokens)])
            print(word_ids)
            word_index = i
            break

    # Get the embeddings from BERT
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state

    # Extract the embeddings for the word (average subword embeddings if needed)
    word_embeddings = embeddings[0, word_index:word_index+len(word_tokens)].mean(dim=0)
    
    return word_embeddings

# Example texts
text1 = "The hotel staff went out of their way to accomodate the guests."
text2 = "The hotel staff went out of their way to accommodate the guests."

# Get embeddings for the specific words
embedding1 = get_word_embedding(text1, "accomodate", model, tokenizer)
embedding2 = get_word_embedding(text2, "accommodate", model, tokenizer)

# Reshape embeddings to match expected input for cosine_similarity ([1, -1] for single vector)
embedding1 = embedding1.unsqueeze(0)
embedding2 = embedding2.unsqueeze(0)

# Compute the cosine similarity
similarity = cosine_similarity(embedding1, embedding2)

# Print the similarity
print("Cosine similarity between 'accomodate' and 'accommodate':", similarity)


[16222, 19506, 13701]
[16222, 19506, 13701]
[8752]
[8752]
Cosine similarity between 'accomodate' and 'accommodate': [[0.5151014]]


In [48]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id, output_attentions=True).to(device)
model.eval()
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
def get_word_embedding(word, model, tokenizer):
    # Encode the word to get token IDs
    token_ids = tokenizer.encode(word, add_special_tokens=False)
    
    # Convert token IDs to tensor and move it to the model's device
    tokens_tensor = torch.tensor([token_ids], device=model.device)
    
    with torch.no_grad():
        # Forward pass through the model
        outputs = model(tokens_tensor)
        # Retrieve the hidden states from the model output
        hidden_states = outputs[0]  # 'outputs' is a tuple, the first element is the hidden states

    # Averaging over the sequence length
    return hidden_states[0].mean(dim=0)

king_emb = get_word_embedding('I am a jndjsc scndsnc burger', model, tokenizer)
man_emb = get_word_embedding('Man', model, tokenizer)
woman_emb = get_word_embedding('Woman', model, tokenizer)
queen_emb = get_word_embedding('Queen', model, tokenizer)

# print all the embeddings
print("king embedding: ", king_emb)
print("man embedding:", man_emb)
print("woman embedding: ", woman_emb)
print("queen embedding:", queen_emb)
from torch.nn.functional import cosine_similarity
analogy_emb = king_emb - man_emb + woman_emb
similarity = cosine_similarity(king_emb.unsqueeze(0), queen_emb.unsqueeze(0))
print("Cosine similarity: ", similarity.item())

king embedding:  tensor([ 2.7180,  3.1631,  0.3012,  ..., -6.9639, -5.1866,  1.8697])
man embedding: tensor([ 2.8015,  3.5800, -0.1190,  ..., -6.7876, -3.8558,  1.8777])
woman embedding:  tensor([ 3.0411,  5.3653,  0.3071,  ..., -6.2418, -3.3228,  2.6389])
queen embedding: tensor([ 2.5185,  5.2505, -0.6024,  ..., -7.1251, -2.5000,  1.6070])
Cosine similarity:  0.9301317930221558
