# Assignment 4 

This notebook uses Roberta to generate a single dictionary which contains a mapping between a token (as a string) and a 756 dimensional averaged embedding over the provided text. The corpus to be used must be placed in the same directory as this notebook and named 'dataset.txt'.

## Initialization

Import required libraries.

In [1]:
# Standard ML libaries
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from operator import itemgetter
from sklearn.metrics import classification_report
from operator import itemgetter
import psutil

# RobertaModel and Tockenizer
from transformers import RobertaTokenizer, RobertaModel, RobertaTokenizerFast

Initialize environment with GPU (or CPU as fallback!).

In [2]:
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


device: cuda
random seed: 1234


## Data Pre-Processing 

Load in dataset, sentence by sentence.

In [3]:
sentences = []

linecount = 0
wordcount = 0 

lengths = []

with open("dataset.txt", 'r') as dataset_file:
    while line := dataset_file.readline():
        sentences += [line]
        linecount += 1
        wordcount += len(line.split())
        lengths += [len(line)]

print("Loaded " + str(linecount) + " lines and " + str(wordcount) + " words.")
print("Average length: " + str(np.average(lengths)))
print("Max length: " + str(np.max(lengths)))

Loaded 4468825 lines and 47820302 words.
Average length: 67.34097665493726
Max length: 3263


Initialize tokenizer.

In [4]:
## Try a fast tokenizer,
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base", add_prefix_space = True, clean_up_tokenization_spaces = True)

Quick sanity check to ensure tokenizer is working.

In [5]:
sentence = sentences[1]
tokens = tokenizer.encode_plus(sentence, padding = "max_length", max_length = 128, truncation = True, return_tensors='pt')

# tokenizer(sentence, is_split_into_words = True, return_tensors='pt', \
			# padding="max_length", max_length=128, truncation=True)

## Maybe reduce max length to 256 
ids = tokens['input_ids'][0]
mask = tokens['attention_mask'][0]
print(tokenizer.decode(ids[7]))
# sentences = sentences[:10_000]

 Pictures


## Dataset

We'll be handling tokenization in a Dataset so we can take advantage of the DataLoader for auto batching.

In [6]:
from torch.utils.data import Dataset, DataLoader
class RobertaDataset(Dataset):
	def __init__(self, sentences: list, max_length: int):
		sentences_tokenized = []

		for sentence in sentences:
			tokens = tokenizer.encode_plus(sentence, padding = "max_length", max_length = max_length, truncation = True, return_tensors='pt')
			
			ids = torch.LongTensor(tokens['input_ids'][0])
			mask = torch.LongTensor(tokens['attention_mask'][0]) 

			sentences_tokenized += [np.array([ids, mask])]

			print(f"{len(sentences_tokenized) / len(sentences) * 100.0}% complete.\t\t\t", end ='\r')

		self.sentences_tokenized = np.array(sentences_tokenized)

	def __len__(self):
		return len(self.sentences_tokenized)
	
	def __getitem__(self, index):
		return (self.sentences_tokenized[index][0], self.sentences_tokenized[index][1])

In [None]:
dataset = RobertaDataset(sentences, 128)
BATCH_SIZE = 64
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 0)

sentence_count = len(sentences)
del sentences

100.0% complete.			complete.								

## Embedding Calculations

Calculate a single embedding just to test.

In [8]:
batch = next(iter(dataloader))

ids = batch[0].to(device)
mask = batch[1].to(device)
model = RobertaModel.from_pretrained('roberta-base').to(device)

with torch.no_grad():
	output = model(ids, mask)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculate the embeddings of our batches. I abort once 12_500 tokens have been collected as I start to run out of RAM after that.

In [9]:
import sys
sys.tracebacklimit = 0
import sys, traceback, gc
type, val, tb = None, None, None
type, val, tb = sys.exc_info()
traceback.clear_frames(tb)

token_to_avg_embedding_map = dict()
avg_token_embedding = None

def calculate_embeddings() -> dict:
	processed_sentances = 0
	
	model = RobertaModel.from_pretrained('roberta-base').to(device)
	model.eval()
	
	token_to_avg_embedding_map = {}
	avg_token_embedding = None
	
	embedding_range_cache = 0

	with torch.no_grad():
		for batch in dataloader:
			
			ids = batch[0].to(device)
			mask = batch[1].to(device)
			
			output = model(ids, mask)
	
			####################################################################
		
			### shape, [batch, tokens in sentance, embeddings of each token]
			embeddings = output[0].detach().cpu() #.numpy()
			del output
			
			# Update average embeddings, 
			sentence_embedding_index = 0
			while sentence_embedding_index < len(embeddings):
			# for sentence_embedding_index in range(len(embeddings)):
				sentence_embedding = embeddings[sentence_embedding_index]
	
				token_index = 0
				while token_index < len(sentence_embedding):
				# for token_index in range(len(sentence_embedding)):
					token = ids[sentence_embedding_index][token_index]
					token_str = tokenizer.decode(token)
					token_embedding = sentence_embedding[token_index]
	
					if avg_token_embedding is None:
						avg_token_embedding = token_embedding
					else:
						avg = np.array([avg_token_embedding, token_embedding])
						avg_token_embedding = np.mean(avg, axis=0)
						del avg
	
					if token_str in token_to_avg_embedding_map:
						avg = np.array([token_to_avg_embedding_map[token_str], token_embedding])
						token_to_avg_embedding_map[token_str] = np.mean(avg, axis=0)
						del avg
					else:
						token_to_avg_embedding_map[token_str] = token_embedding

					token_index += 1
					del token
					del token_str
					del token_embedding

				sentence_embedding_index += 1
				del sentence_embedding
	
			del embeddings
			del ids
			del mask
			del batch # Prevents memory leaks!!! this took a VERY long time to track down
		
			if (psutil.virtual_memory().percent) > 95.0:
				print("Aborting embedding generation early to avoid running out of RAM!")
				print(f"{psutil.virtual_memory().used / 1e9} GB used.")
				print(f"{processed_sentances / (sentence_count) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization.")
				return token_to_avg_embedding_map, avg_token_embedding
			
			processed_sentances += BATCH_SIZE
			print(f"{processed_sentances / (sentence_count) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization. \t\t\t", end ='\r')
				
			
	return token_to_avg_embedding_map, avg_token_embedding

token_to_avg_embedding_map, avg_token_embedding = calculate_embeddings()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Aborting embedding generation early to avoid running out of RAM!AM utilization. 						
78.949474304 GB used.
3.253830704939218% complete. 43394 embeddings generated. 95.1% RAM utilization.


In [10]:
gc.collect()

0

# Problem one complete!
The token_to_avg_embedding_map is a dictonary mapping between sub-word tokens and their average embedding in the dataset.

In [11]:
print("tokens : " + str(len(token_to_avg_embedding_map)))

tokens : 43394


# Problem 2
In this section we are going to implement the most_similar() functions from chp 9

First, generate a word to embedding mapping.

In [12]:
def get_average_embedding(word):
	tokens = tokenizer(word)['input_ids']
	embedding = np.zeros(768)
	for token in tokens:
		token = tokenizer.decode(token) # get the string this token id represents
		if token in token_to_avg_embedding_map:
			embedding += np.array(token_to_avg_embedding_map[token]) 		
		else:
			embedding += avg_token_embedding
	return embedding / float(len(tokens))


def generate_word_embedding_map(words: list) -> dict:
	word_embedding_map = {}
	processed_words = 0
	for word in words:
		embedding = get_average_embedding(word)
		word_embedding_map[word] = embedding
	
		processed_words += 1
		print(f"{processed_words / len(words) * 100.0}% complete. {len(word_embedding_map)} word embeddings generated.\t\t\t", end ='\r')
	return word_embedding_map


def load_words(from_file: str) -> list:
	words = []

	with open(from_file, 'r') as file:
		while line := file.readline():
			words += [line.strip()]

	return words

In [13]:
words = load_words("glove.6B.300d-vocabulary.txt")
word_to_embedding = generate_word_embedding_map(words)

100.0% complete. 400000 word embeddings generated.			enerated.				

In [14]:
def get_word_embedding(word):
    if word in word_to_embedding:
        emb = word_to_embedding[word]
    else:
        emb = get_average_embedding(word)
        word_to_embedding[word] = emb
    return emb

def most_similar(word, topn=10):
    emb = get_word_embedding(word)

    # calculate similarities to all words in out vocabulary
    similarities = []
    for word, embedding, in word_to_embedding.items():
        similarity = embedding @ emb

        similarities += [(float(similarity), str(word))]

    similarities.sort(key = itemgetter(0))
    similarities.reverse()
    
    return similarities[:topn]

## 6 Examples

In [15]:
most_similar("cactus")

[(129.4298404593606, 'hematologist'),
 (129.3095482185425, 'self-destruction'),
 (129.2839131216873, 'things'),
 (129.16494454079495, 'hemostasis'),
 (128.92477827282045, 'objectivity'),
 (128.42289551072992, 'heist'),
 (128.37232191518987, 'he'),
 (128.34957879863578, 'heakes'),
 (128.22011759486068, 'user-space'),
 (128.19456066310892, 'byte')]

In [16]:
most_similar("cake")

[(133.258082889447, 'he'),
 (133.11954709180307, 'things'),
 (132.6839322763326, 'heakes'),
 (132.3819955793722, 'byte'),
 (132.03737921162462, 'hematologist'),
 (131.99011399642782, 'hemostasis'),
 (131.94705023436487, 'heist'),
 (131.76831916527433, 'self-destruction'),
 (131.5810195161631, 'graphologist'),
 (131.3416912871951, 'heeds')]

In [17]:
most_similar("angry")

[(133.07528772348599, 'he'),
 (132.9637883763039, 'things'),
 (132.51406906296017, 'hemostasis'),
 (132.3865976724091, 'heakes'),
 (132.3566886395853, 'hematologist'),
 (132.07205604991944, 'byte'),
 (131.9958376690678, 'heist'),
 (131.7139442326221, 'self-destruction'),
 (131.31083020558904, 'heeds'),
 (131.23328961447447, 'hewell')]

In [18]:
most_similar("quickly")

[(130.52249817357432, 'he'),
 (130.35054012907875, 'things'),
 (129.72066712244705, 'heakes'),
 (129.64591908826932, 'hemostasis'),
 (129.45633036755393, 'hematologist'),
 (129.4075502528554, 'byte'),
 (129.20781604170108, 'heist'),
 (128.78076056850324, 'self-destruction'),
 (128.68881325184643, 'heeds'),
 (128.61578623807935, 'hewell')]

In [19]:
most_similar("between")

[(134.38869580935088, 'things'),
 (134.19651025339, 'he'),
 (134.09235898663604, 'hemostasis'),
 (133.63884397702145, 'hematologist'),
 (133.42244463624868, 'heist'),
 (133.41676545127433, 'resultative'),
 (133.27544605985858, 'heakes'),
 (133.0721063738572, 'self-destruction'),
 (133.03911802369842, 'byte'),
 (132.59518516160387, 'objectivity')]

In [20]:
most_similar("the")

[(138.77673033068695, 'he'),
 (137.83944298111643, 'things'),
 (137.3630193709027, 'heakes'),
 (136.94316095221643, 'byte'),
 (136.92617364851606, 'hemostasis'),
 (136.89631452263274, 'hematologist'),
 (136.8067716985104, 'heist'),
 (136.05760844186545, 'heeds'),
 (135.93121321690953, 'self-destruction'),
 (135.9068883998036, 'hectors')]