# Assignment 4 

This notebook uses Roberta to generate a single dictionary which contains a mapping between a token (as a string) and a 756 dimensional averaged embedding over the provided text. The corpus to be used must be placed in the same directory as this notebook and named 'dataset.txt'.

## Initialization

Import required libraries.

In [1]:
# Standard ML libaries
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from operator import itemgetter
from sklearn.metrics import classification_report
from operator import itemgetter
import psutil

# RobertaModel and Tockenizer
from transformers import RobertaTokenizer, RobertaModel, RobertaTokenizerFast

Initialize environment with GPU (or CPU as fallback!).

In [2]:
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


device: cuda
random seed: 1234


Initalize global constants.

In [None]:
BATCH_SIZE = 512
MAX_TOKENIZATION_LENGTH = 128


# Model Specifcs
MODEL = "roberta-base"
EMBEDDING_SIZE = 768


## Data Pre-Processing 

Load in dataset, sentence by sentence.

In [3]:
sentences = []

linecount = 0
wordcount = 0 

lengths = []

with open("dataset.txt", 'r') as dataset_file:
    while line := dataset_file.readline():
        sentences += [line]
        linecount += 1
        wordcount += len(line.split())
        lengths += [len(line)]

print("Loaded " + str(linecount) + " lines and " + str(wordcount) + " words.")
print("Average length: " + str(np.average(lengths)))
print("Max length: " + str(np.max(lengths)))

sentences = sentences[:25_000]

Loaded 4468825 lines and 47820302 words.
Average length: 67.34097665493726
Max length: 3263


Initialize tokenizer.

In [4]:
## Try a fast tokenizer,
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base", add_prefix_space = True, clean_up_tokenization_spaces = True)

## Dataset

We'll be handling tokenization in a Dataset so we can take advantage of the DataLoader for auto batching.

In [6]:
from torch.utils.data import Dataset, DataLoader
class RobertaDataset(Dataset):
	def __init__(self, sentences: list, max_length: int):
		sentences_tokenized = []

		for sentence in sentences:
			tokens = tokenizer.encode_plus(sentence, padding = "max_length", max_length = max_length, truncation = True, return_tensors='pt')
			
			ids = torch.LongTensor(tokens['input_ids'][0])
			mask = torch.LongTensor(tokens['attention_mask'][0]) 

			sentences_tokenized += [np.array([ids, mask])]

			print(f"{len(sentences_tokenized) / len(sentences) * 100.0}% complete.\t\t\t", end ='\r')

		self.sentences_tokenized = np.array(sentences_tokenized)

	def __len__(self):
		return len(self.sentences_tokenized)
	
	def __getitem__(self, index):
		return (self.sentences_tokenized[index][0], self.sentences_tokenized[index][1])

In [None]:
dataset = RobertaDataset(sentences, MAX_TOKENIZATION_LENGTH)

dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 0)

sentence_count = len(sentences)

100.0% complete.				omplete.						

## Embedding Calculations

Calculate a single embedding just to test.

In [None]:
model = RobertaModel.from_pretrained(MODEL).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculate the embeddings of our batches. I abort once 12_500 tokens have been collected as I start to run out of RAM after that.

In [None]:
token_size = tokenizer.vocab_size

token_to_embedding_sums = np.zeros((token_size, EMBEDDING_SIZE))
token_to_embedding_counts = np.zeros((token_size, 1))
token_to_embedding_averages = np.zeros((token_size, EMBEDDING_SIZE))

processed_tokens = 0;

def calculate_embeddings(model) -> dict:
	processed_sentances = 0
	
	model.eval()
	
	token_to_avg_embedding_map = {}
	avg_token_embedding = None
	
	with torch.no_grad():
		for batch in dataloader:
			
			ids = batch[0].to(device)
			mask = batch[1].to(device)
			
			output = model(ids, mask)
	
			####################################################################
		
			### shape, [batch, tokens in sentance, embeddings of each token]
			embeddings = output[0].detach().cpu().numpy()
			# del output
			
			# Update average embeddings, 
			for sentence_embedding_index in range(len(embeddings)):
				sentence_embedding = embeddings[sentence_embedding_index]
	
				for token_index in range(len(sentence_embedding)):
					token = ids[sentence_embedding_index][token_index]
					token_to_embedding_sums[token] += sentence_embedding[token_index]
					token_to_embedding_counts[token] += 1

			# if (psutil.virtual_memory().percent) > 95.0:
				# print("Aborting embedding generation early to avoid running out of RAM!")
				# print(f"{psutil.virtual_memory().used / 1e9} GB used.")
				# print(f"{processed_sentances / (sentence_count) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization.")
				# return token_to_avg_embedding_map, avg_token_embedding

			processed_sentances += BATCH_SIZE
			print(f"{processed_sentances / (sentence_count) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization. \t\t\t", end ='\r')
				
			
	return token_to_avg_embedding_map, avg_token_embedding

token_to_avg_embedding_map, avg_token_embedding = calculate_embeddings(model)

100.352% complete. 0 embeddings generated. 15.8% RAM utilization. 			ation. 				

Post process embeddings

In [57]:
average_embedding = np.sum(token_to_embedding_averages, axis=0)

token_to_embedding_counts[token_to_embedding_counts == 0] = 1 # Set all onencounterd to tokens to the average token!
token_to_embedding_averages = token_to_embedding_sums / token_to_embedding_counts

print(token_to_embedding_averages.shape)
print(average_embedding.shape)

(50265, 768)
(768,)


# Problem one complete!
The token_to_avg_embedding_map is a dictonary mapping between sub-word tokens and their average embedding in the dataset.

# Problem 2
In this section we are going to implement the most_similar() functions from chp 9

First, generate a word to embedding mapping.

In [58]:
def get_average_embedding(word):
	tokens = tokenizer(word)['input_ids']
	embedding = np.zeros(768)
	for token in tokens:
		embedding += token_to_embedding_averages[token]	
	return embedding / len(tokens)


def generate_word_embedding_map(words: list) -> dict:
	word_embedding_map = {}
	processed_words = 0
	for word in words:
		embedding = get_average_embedding(word)
		word_embedding_map[word] = embedding
	
		processed_words += 1
		print(f"{processed_words / len(words) * 100.0}% complete. {len(word_embedding_map)} word embeddings generated.\t\t\t", end ='\r')
	return word_embedding_map


def load_words(from_file: str) -> list:
	words = []

	with open(from_file, 'r') as file:
		while line := file.readline():
			words += [line.strip()]

	return words

In [59]:
words = load_words("glove.6B.300d-vocabulary.txt")
word_to_embedding = generate_word_embedding_map(words)

100.0% complete. 400000 word embeddings generated.			enerated.				

In [60]:
def get_word_embedding(word):
    if word in word_to_embedding:
        emb = word_to_embedding[word]
    else:
        emb = get_average_embedding(word)
        word_to_embedding[word] = emb
    return emb

def most_similar(word, topn=10):
    emb = get_word_embedding(word)

    # calculate similarities to all words in our vocabulary
    similarities = []
    for word, embedding, in word_to_embedding.items():
        similarity = embedding @ emb

        similarities += [(float(similarity), str(word))]

    similarities.sort(key = itemgetter(0))
    similarities.reverse()
    
    return similarities[:topn]

## 6 Examples

In [61]:
most_similar("cactus")

[(97.38421753517841, 'higher-dimensional'),
 (97.18366326018737, 'low-dimensional'),
 (97.12702066152566, 'high-dimensional'),
 (96.85361455321862, 'finite-dimensional'),
 (96.80543874486999, 'one-dimensional'),
 (96.67597460630914, 'other-dimensional'),
 (96.23280478211657, 'zero-dimensional'),
 (96.21962179810112, 'peoplehood'),
 (96.15755769409787, 'extra-dimensional'),
 (95.98457447005109, 'personhood')]

In [62]:
most_similar("cake")

[(86.33701600353515, 'vs'),
 (86.0279301507524, 'render'),
 (85.99913654675676, 'vs.'),
 (85.70931571741625, 'chip'),
 (85.6879070997179, 'linkage'),
 (85.61234521899873, 'hybrids'),
 (85.60077070207244, 'civilizations'),
 (85.47937375426555, 'worlds'),
 (85.20825110611888, 'operator'),
 (85.02999841686741, 'info')]

In [63]:
most_similar("angry")

[(129.47584800607376, 'higher-dimensional'),
 (128.99098028010266, 'high-dimensional'),
 (128.92776755697906, 'low-dimensional'),
 (128.75041768788282, 'other-dimensional'),
 (128.67045208907925, 'one-dimensional'),
 (128.59796138552187, 'finite-dimensional'),
 (128.31271997569348, 'peoplehood'),
 (127.94978559573272, 'zero-dimensional'),
 (127.91492941320858, 'personhood'),
 (127.79011016298041, 'extra-dimensional')]

In [64]:
most_similar("quickly")

[(128.28140365471972, 'higher-dimensional'),
 (127.9407301410987, 'low-dimensional'),
 (127.90276048994124, 'high-dimensional'),
 (127.79083524427996, 'finite-dimensional'),
 (127.47068509943561, 'other-dimensional'),
 (127.47056365993181, 'one-dimensional'),
 (127.26570160834164, 'peoplehood'),
 (126.8517493097851, 'personhood'),
 (126.69591978955545, 'zero-dimensional'),
 (126.53893555251778, 'two-dimensional')]

In [49]:
most_similar("between")

[(134.45363308434554, 'higher-dimensional'),
 (133.8096768570758, 'low-dimensional'),
 (133.68422465659432, 'high-dimensional'),
 (133.5952588180821, 'one-dimensional'),
 (133.49855396129158, 'other-dimensional'),
 (133.29942964689556, 'finite-dimensional'),
 (132.69809617646814, 'extra-dimensional'),
 (132.5082375330686, 'zero-dimensional'),
 (132.22603416776482, 'two-dimensional'),
 (132.08797114550256, 'inter-dimensional')]

In [None]:
most_similar("people")

[(138.4307937488648, 'higher-dimensional'),
 (138.19154641536704, 'personhood'),
 (138.06559775379966, 'low-dimensional'),
 (137.9410631798128, 'high-dimensional'),
 (137.7783529078129, 'peoplehood'),
 (137.7600299442174, 'other-dimensional'),
 (137.69984696620025, 'one-dimensional'),
 (137.62260976405366, 'finite-dimensional'),
 (136.80963619398415, 'thing'),
 (136.73713751580357, 'zero-dimensional')]