# Setting up the GPU for use!

In [18]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


# Question 1 #

I am using the roberta model to encode the data. I start by reading in the text and adding it to an array. Then implement a data set class so I can use a data loader. I added a collate function so that we can pad per batch and it will handle the tokenization, badding, and truncation of that text. Then I go through every sentence in the text and tokenize it, calculate the contextualized and store them in a running dictionary that count how many embeddings we have summed per token. After the training we go through eack embedding and calculate the final embedding by averaging the contualized embeddings that were calculated. Then we save them to a file since this took about 2 hours to run. This code took me around 2 hours to write

In [2]:
text = []
with open("assignment4-dataset.txt", "r") as f:
    for line in f:
        text.append(line)


In [3]:
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Note: I ran the code once and then saved the vectors. I accidentally started running it again but it stopped it. So it has run successfully but the last time I ran it I stopped it so that's why it looks like this

In [85]:
from torch.utils.data import Dataset, DataLoader

class SentenceDataset(Dataset):
    def __init__(self, text):
        self.text = text
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        return self.text[index]

def collate_fn(batch_sentences):
    return tokenizer(
        batch_sentences,
        padding=True,         # pad per batch to be more efficient
        truncation=True,      
        return_tensors='pt'   
    )

ds = SentenceDataset(text)
dl = DataLoader(
    ds,
    batch_size=64,         
    shuffle=False,         # order doesn't matter since we are reading all of them anyways
    collate_fn=collate_fn  
)
model = model.to(device)
model.eval()

tokenEmbeddings = {}


for batch in tqdm(dl, desc="Batch Progress"):
    
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  

    input_ids = input_ids.cpu()
    last_hidden = last_hidden.cpu()

    # Accumulate embeddings
    for sentence in range(input_ids.shape[0]):
        for token in range(input_ids.shape[1]):
            tok_id = int(input_ids[sentence][token])
            if tok_id in tokenizer.all_special_ids:
                continue  

            vec = last_hidden[sentence][token]

            if tok_id not in tokenEmbeddings:
                tokenEmbeddings[tok_id] = {"sum": vec.clone(), "count": 1}
            else:
                tokenEmbeddings[tok_id]["sum"] += vec
                tokenEmbeddings[tok_id]["count"] += 1

#print(tokenEmbeddings)
final_embeddings = {}
for tok_id, data in tokenEmbeddings.items():
    #print("tok:", tok_id, "data:", data)
    
    final_embeddings[tok_id]= data["sum"]/data["count"]

print(f"Done! Computed static embeddings for {len(final_embeddings)} tokens.")


Batch Progress:   0%|          | 0/69826 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:

# Save embeddings to a file
torch.save(final_embeddings, "static_embeddingsTest.pt")


# Question 2 #

We pull the saved vectors into memory and then read each word in the glove vocab and add them to an array. We then calculate the word embeddings by tokenizing the words and going through each token's embedding and then average them. After we get the word embeddings we make a word to id and id to word dictionaries so that we can use the most similar function that we saw in chapter 9. We have to manually normalize the vectors first since that code assumed the vectors were normalized. We then run most similar on the examples in the coding example. This code took me about 1.5-2 hours to write

In [4]:
testVec = torch.load("static_embeddings.pt")
final_embeddings = testVec
print(len(testVec))

49502


In [5]:
words = []
with open("glove.6B.300d-vocabulary.txt", "r") as f:
    for line in f:
        words.append(line.strip())


In [6]:
vectors = []

for word in tqdm(words, desc="Processing words"):
    ids = tokenizer(word)
    vector = torch.zeros(768)
    count = 0
    for i in ids['input_ids']:
        if i in final_embeddings:
            count+=1
            vector = vector + final_embeddings[i]
    if count > 0:
        vectors.append(vector / count)
    else:
        vectors.append(vector) 

Processing words:   0%|          | 0/400000 [00:00<?, ?it/s]

In [7]:
word_to_id = {word: idx for idx, word in enumerate(words)}
id_to_word = {idx: word for idx, word in enumerate(words)}
vectors = np.array(vectors, dtype=float)


In [8]:
#Dot product should use normalized vectors so we need to manually normalize the vectors before using the code
# compute norms
norms = np.linalg.norm(vectors, axis=1)

#some vectors could be zero so we replace their norms with 1 to not divide by 0
norms[norms == 0] = 1.0

# normalized matrix
vectors = vectors / norms[:, None]


In [9]:
def most_similar_words(word, vectors_normed, index_to_key, key_to_index, topn=10):
    # get the id of the word
    word_id = key_to_index[word]

    # the embedding of the word
    emb = vectors_normed[word_id]

    #dot product
    similarities = vectors_normed @ emb

    # get best ids
    ids_descending = np.argsort(similarities)[::-1]

    # remove itself
    ids_descending = ids_descending[ids_descending != word_id]

    # return top-n words
    top_ids = ids_descending[:topn]
    return [(index_to_key[i], float(similarities[i])) for i in top_ids]


In [10]:
most_similar_words('cactus',vectors,id_to_word,word_to_id)

[('cavalcanti', 0.9863354889365874),
 ('cercocarpus', 0.9861473498769773),
 ('cavalcante', 0.9860544568031278),
 ('candelas', 0.9858299012727044),
 ('carcasses', 0.9858295025702667),
 ('candel', 0.9858294191187436),
 ('cinecitta', 0.9857468884152574),
 ('crescenzio', 0.9856540147524828),
 ('cantus', 0.9855890225823372),
 ('carcosa', 0.9855605853317334)]

In [11]:
most_similar_words('cake',vectors,id_to_word,word_to_id)

[('cakebread', 0.9899608672106994),
 ('fruitcake', 0.9864247075655632),
 ('cakewalk', 0.9851424284707653),
 ('mooncake', 0.9828951083109528),
 ('cupcake', 0.9827581704871353),
 ('cakey', 0.9813415586594565),
 ('cakes', 0.9772956232114542),
 ('fruitcakes', 0.9770954168160269),
 ('beefcake', 0.9770783117066244),
 ('breadsticks', 0.9765364553065139)]

In [12]:
most_similar_words('angry',vectors,id_to_word,word_to_id)

[('ryang', 0.9999999999999999),
 ('ryanggang', 0.9949820219151513),
 ('mlanghenry', 0.9879471956721472),
 ('yungang', 0.9874037014721364),
 ('yarang', 0.986974789242882),
 ('yanchang', 0.9869184675772289),
 ('ryokan', 0.9867762689355988),
 ('ryokans', 0.986503959835086),
 ('zangara', 0.986425131677603),
 ('riang', 0.9863395297540736)]

In [13]:
most_similar_words('quickly',vectors,id_to_word,word_to_id)

[('cleanly', 0.9868280206228188),
 ('closely', 0.986563200587705),
 ('quietly', 0.9863500507254755),
 ('solidly', 0.9860146388101536),
 ('coldly', 0.9855217456180718),
 ('wildly', 0.9852935517079322),
 ('smartly', 0.9851519293704964),
 ('safely', 0.9847266876867198),
 ('shortly', 0.9845230992649008),
 ('sweetly', 0.98441159181477)]

In [14]:
most_similar_words('between',vectors,id_to_word,word_to_id)

[('inbetween', 0.9777306626590989),
 ('betweenness', 0.9733514115859017),
 ('inbetweeners', 0.964801160200541),
 ('in-between', 0.9600773334701196),
 ('go-between', 0.9579426240838547),
 ('below-average', 0.9422266431484709),
 ('below', 0.9411081084372184),
 ('near-future', 0.9401366101466174),
 ('nearshore', 0.9391151561161046),
 ('nearside', 0.9390195048005279)]

In [15]:
most_similar_words('the',vectors,id_to_word,word_to_id)

[('bythe', 0.9832863368407013),
 ('andthe', 0.9827341979829303),
 ('thet', 0.9827293383206791),
 ('munthe', 0.9825837801469187),
 ('theming', 0.9817699051097706),
 ('theun', 0.9815891534772575),
 ('theory', 0.9812382748717261),
 ('grethe', 0.9811679464356251),
 ('thebe', 0.9810872220490177),
 ('thein', 0.9810202196263875)]