In [17]:
import torch
from transformers import RobertaTokenizer, RobertaModel
import numpy as np

In [18]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

with open('2.txt', 'r') as f:
    glove_vocab = [line.strip() for line in f]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [19]:
def word_embeddings(glove_vocab, batch_size=16):
    word_embeddings = {}
    
    for i in range(0, len(glove_vocab), batch_size):
        batch_words = glove_vocab[i:i + batch_size]
        embeddings = helper(batch_words)
        for word, embedding in zip(batch_words, embeddings):
            word_embeddings[word] = embedding
    return word_embeddings

def helper(words):
    inputs = tokenizer(words, return_tensors='pt', padding=True, truncation=True, is_split_into_words=False)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the device
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state  
    word_embeddings = token_embeddings.mean(dim=1) 
    return word_embeddings.cpu().numpy() 



word_embeddings = word_embeddings(glove_vocab)


In [20]:
def normalize_embeddings(vectors):
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)  # L2 norm of each vector
    return vectors / norms  


In [21]:
key_to_index = {word: idx for idx, word in enumerate(word_embeddings.keys())}
index_to_key = {idx: word for word, idx in key_to_index.items()}

vectors = np.array(list(word_embeddings.values()))  
vectors_normalized = normalize_embeddings(vectors)


def most_similar_words(word, topn=10):
    if word not in key_to_index:
        return f"Word '{word}' not found in the vocabulary."
    
    word_id = key_to_index[word]
    emb = vectors_normalized[word_id]
    emb_normalized = emb #/ np.linalg.norm(emb)  
    similarities = vectors_normalized @ emb_normalized
    ids_ascending = similarities.argsort()
    ids_descending = ids_ascending[::-1]
    mask = ids_descending != word_id
    ids_descending = ids_descending[mask]
    top_ids = ids_descending[:topn]
    top_words = [(index_to_key[i], similarities[i]) for i in top_ids]
  
    return top_words

In [24]:
most_similar_words("cake")

[('cakes', 0.99903613),
 ('knife', 0.9978445),
 ('forest', 0.9974806),
 ('humans', 0.997444),
 ('bird', 0.9974233),
 ('fiction', 0.99741656),
 ('flower', 0.9974063),
 ('species', 0.9973448),
 ('fighter', 0.99729323),
 ('fork', 0.9972407)]

In [25]:
most_similar_words("cactus")

[('cumin', 0.9924763),
 ('cabbage', 0.99220216),
 ('camel', 0.9919967),
 ('cinnamon', 0.9912886),
 ('poultry', 0.9910093),
 ('crimp', 0.9909573),
 ('cubes', 0.9906302),
 ('carrots', 0.9905422),
 ('cotton', 0.9904048),
 ('parrots', 0.9899977)]

In [26]:
most_similar_words("between")

[('had', 0.998102),
 ('were', 0.99799585),
 ('been', 0.99789417),
 ('has', 0.99786943),
 ('his', 0.9977933),
 ('could', 0.9977186),
 ('have', 0.9976024),
 ('also', 0.9975828),
 ('they', 0.9974645),
 ('can', 0.99742717)]

In [23]:
most_similar_words("angry")

[('hungry', 0.9923549),
 ('wry', 0.9903593),
 ('garry', 0.9884526),
 ('husbandry', 0.9875644),
 ('ferry', 0.9871393),
 ('awry', 0.98678946),
 ('derry', 0.9867022),
 ('scary', 0.98658115),
 ('merry', 0.98640233),
 ('unruly', 0.9863487)]

In [55]:
# Function to tokenize and average token embeddings for a word
def get_word_embedding(word):
    # Tokenize the word into subwords
    inputs = tokenizer(word, return_tensors='pt')
    
    # Get token embeddings (last hidden states from the model)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Average the token embeddings to get the word embedding
    token_embeddings = outputs.last_hidden_state.squeeze(0)  # Shape: (num_tokens, embedding_dim)
    word_embedding = token_embeddings.mean(dim=0)  # Shape: (embedding_dim,)
    
    return word_embedding.numpy()

# Create word embeddings for all words in the GloVe vocabulary
word_embeddings = {}
#for word in glove_vocab:
    #print(word)
    #word_embeddings[word] = get_word_embedding(word)

