# Assignment 4 

This notebook uses Roberta to generate a single dictionary which contains a mapping between a token (as a string) and a 756 dimensional averaged embedding over the provided text. The corpus to be used must be placed in the same directory as this notebook and named 'dataset.txt'.

## Initialization

Import required libraries.

In [1]:
# Standard ML libaries
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from operator import itemgetter
from sklearn.metrics import classification_report
from operator import itemgetter
import psutil

# RobertaModel and Tockenizer
from transformers import RobertaTokenizer, RobertaModel, RobertaTokenizerFast

Initialize environment with GPU (or CPU as fallback!).

In [2]:
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


device: cuda
random seed: 1234


## Data Pre-Processing 

Load in dataset, sentence by sentence.

In [3]:
sentences = []

linecount = 0
wordcount = 0 

lengths = []

with open("dataset.txt", 'r') as dataset_file:
    while line := dataset_file.readline():
        sentences += [line]
        linecount += 1
        wordcount += len(line.split())
        lengths += [len(line)]

print("Loaded " + str(linecount) + " lines and " + str(wordcount) + " words.")
print("Average length: " + str(np.average(lengths)))
print("Max length: " + str(np.max(lengths)))

Loaded 4468825 lines and 47820302 words.
Average length: 67.34097665493726
Max length: 3263


Initialize tokenizer.

In [4]:
## Try a fast tokenizer,
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base", add_prefix_space = True, clean_up_tokenization_spaces = True)

Quick sanity check to ensure tokenizer is working.

In [5]:
sentence = sentences[1]
tokens = tokenizer.encode_plus(sentence, padding = "max_length", max_length = 128, truncation = True, return_tensors='pt')

# tokenizer(sentence, is_split_into_words = True, return_tensors='pt', \
			# padding="max_length", max_length=128, truncation=True)

## Maybe reduce max length to 256 
ids = tokens['input_ids'][0]
mask = tokens['attention_mask'][0]
print(tokenizer.decode(ids[7]))

 Pictures


## Dataset

We'll be handling tokenization in a Dataset so we can take advantage of the DataLoader for auto batching.

In [6]:
from torch.utils.data import Dataset, DataLoader
class RobertaDataset(Dataset):
	def __init__(self, sentences: list, max_length: int):
		sentences_tokenized = []

		for sentence in sentences:
			tokens = tokenizer.encode_plus(sentence, padding = "max_length", max_length = max_length, truncation = True, return_tensors='pt')
			
			ids = torch.LongTensor(tokens['input_ids'][0])
			mask = torch.LongTensor(tokens['attention_mask'][0]) 

			sentences_tokenized += [np.array([ids, mask])]

			print(f"{len(sentences_tokenized) / len(sentences) * 100.0}% complete.\t\t\t", end ='\r')

		self.sentences_tokenized = np.array(sentences_tokenized)

	def __len__(self):
		return len(self.sentences_tokenized)
	
	def __getitem__(self, index):
		return (self.sentences_tokenized[index][0], self.sentences_tokenized[index][1])

In [7]:
dataset = RobertaDataset(sentences, 128)
BATCH_SIZE = 256
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 0)

100.0% complete.			complete.								

## Embedding Calculations

Calculate a single embedding just to test.

In [8]:
batch = next(iter(dataloader))

ids = batch[0].to(device)
mask = batch[1].to(device)
model = RobertaModel.from_pretrained('roberta-base').to(device)

with torch.no_grad():
	output = model(ids, mask)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculate the embeddings of our batches. I abort once 12_500 tokens have been collected as I start to run out of RAM after that.

In [9]:
import sys
sys.tracebacklimit = 0
import sys, traceback, gc
type, val, tb = None, None, None
type, val, tb = sys.exc_info()
traceback.clear_frames(tb)

token_to_avg_embedding_map = dict()
avg_token_embedding = None

def calculate_embeddings() -> dict:
	processed_sentances = 0
	
	model = RobertaModel.from_pretrained('roberta-base').to(device)
	model.eval()
	
	token_to_avg_embedding_map = {}
	avg_token_embedding = None
	
	with torch.no_grad():
		for batch in dataloader:
			
			ids = batch[0].to(device)
			mask = batch[1].to(device)
			
			output = model(ids, mask)
	
			####################################################################
		
			### shape, [batch, tokens in sentance, embeddings of each token]
			embeddings = output[0].detach().cpu().numpy()
	
			
			# Update average embeddings, 
			for sentence_embedding_index in range(len(embeddings)):
				sentence_embedding = embeddings[sentence_embedding_index]
	
				for token_index in range(len(sentence_embedding)):
					token = ids[sentence_embedding_index][token_index]
					token_str = tokenizer.decode(token)
					token_embedding = sentence_embedding[token_index]
	
					if avg_token_embedding is None:
						avg_token_embedding = token_embedding
					else:
						avg_token_embedding = np.mean(np.array([avg_token_embedding, token_embedding]), axis=0)
	
					if token_str in token_to_avg_embedding_map:
						token_to_avg_embedding_map[token_str] = np.mean(np.array([token_to_avg_embedding_map[token_str], token_embedding]), axis=0)
					else:
						token_to_avg_embedding_map[token_str] = token_embedding
			
			del embeddings
			del ids
			del mask
		
			if (psutil.virtual_memory().percent) > 99.5:
				print("Aborting embedding generation early to avoid running out of RAM!")
				print(f"{psutil.virtual_memory().used / 1e9} GB used.")
				print(f"{processed_sentances / len(sentences) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization.")
				return token_to_avg_embedding_map, avg_token_embedding
			
			processed_sentances += BATCH_SIZE
			print(f"{processed_sentances / len(sentences) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization. \t\t\t", end ='\r')
				
			
	return token_to_avg_embedding_map, avg_token_embedding

token_to_avg_embedding_map, avg_token_embedding = calculate_embeddings()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Aborting embedding generation early to avoid running out of RAM!AM utilization. 						
83.236130816 GB used.
5.442146425514537% complete. 45429 embeddings generated. 99.6% RAM utilization.


In [10]:
gc.collect()

0

# Problem one complete!
The token_to_avg_embedding_map is a dictonary mapping between sub-word tokens and their average embedding in the dataset.

In [11]:
print("tokens : " + str(len(token_to_avg_embedding_map)))
avg_token_embedding = avg_token_embedding.tolist()

tokens : 45429


# Problem 2
In this section we are going to implement the most_similar() functions from chp 9

First, generate a word to embedding mapping.

In [12]:
def get_average_embedding(word):
	tokens = tokenizer(word)['input_ids']
	embedding = np.array(avg_token_embedding)
	for token in tokens:
		token = tokenizer.decode(token) # get the string this token id represents
		if token in token_to_avg_embedding_map:
			embedding += np.array(token_to_avg_embedding_map[token]) 		
		else:
			embedding += np.array(avg_token_embedding)
	return embedding / float(len(tokens))


def generate_word_embedding_map(words: list) -> dict:
	word_embedding_map = {}
	processed_words = 0
	for word in words:
		embedding = get_average_embedding(word)

		# if word in word_embedding_map:
			# word_embedding_map[word] = np.mean(np.array([embedding, word_embedding_map[word]]), axis=0)
		# else:
		word_embedding_map[word] = embedding
	
		processed_words += 1
		print(f"{processed_words / len(words) * 100.0}% complete. {len(word_embedding_map)} word embeddings generated.\t\t\t", end ='\r')
	return word_embedding_map


def load_words(from_file: str) -> list:
	words = []

	with open(from_file, 'r') as file:
		while line := file.readline():
			words += [line.strip()]

	return words

In [13]:
words = load_words("glove.6B.300d-vocabulary.txt")
word_to_embedding = generate_word_embedding_map(words)

100.0% complete. 400000 word embeddings generated.			enerated.				

In [16]:
def get_word_embedding(word):
    if word in word_to_embedding:
        emb = word_to_embedding[word]
    else:
        emb = get_average_embedding(word)
        word_to_embedding[word] = emb
    return emb

def most_similar(word, topn=10):
    emb = get_word_embedding(word)

    # calculate similarities to all words in out vocabulary
    similarities = []
    for word, embedding, in word_to_embedding.items():
        similarity = embedding @ emb

        similarities += [(float(similarity), str(word))]

    similarities.sort(key = itemgetter(0))
    similarities.reverse()
    
    return similarities[:topn]

## 6 Examples

In [17]:
most_similar("cactus")

[(195.76938768723528, 'beings'),
 (195.04422725559118, 'worlds'),
 (194.98074462805857, 'place'),
 (193.96899141551464, 'tract'),
 (193.9049865194122, 'scene'),
 (193.82587903920148, 'offence'),
 (193.60080535435657, 'reader'),
 (193.50649510518875, 'decency'),
 (193.3968666057856, 'behaviour'),
 (193.19947584289272, 'affect')]

In [18]:
most_similar("cake")

[(208.95104693764688, 'worlds'),
 (208.94183655681599, 'tract'),
 (208.88146745476683, 'beings'),
 (208.78977950702662, 'place'),
 (208.39590286223986, 'scene'),
 (207.48263558608045, 'reader'),
 (207.27289745687347, 'behaviour'),
 (207.23807727184376, 'decency'),
 (207.0258052612583, 'character'),
 (206.98893800820045, 'offence')]

In [19]:
most_similar("angry")

[(211.15260402327831, 'place'),
 (210.96373665527275, 'beings'),
 (210.730817087059, 'worlds'),
 (209.90084720089783, 'tract'),
 (209.84627423612548, 'reader'),
 (209.6598100137772, 'decency'),
 (209.63345684707483, 'scene'),
 (209.0384408161774, 'affect'),
 (209.0099198978825, 'offence'),
 (208.85120408214084, 'behaviour')]

In [20]:
most_similar("quickly")

[(208.50399005403946, 'place'),
 (208.209611689891, 'beings'),
 (208.12257918878376, 'worlds'),
 (207.3372136732168, 'tract'),
 (206.88450045115556, 'scene'),
 (206.09934818313025, 'decency'),
 (205.95078625559387, 'version'),
 (205.94334604545662, 'reader'),
 (205.61977261384394, 'affect'),
 (205.50517494157896, 'behaviour')]

In [21]:
most_similar("between")

[(214.17963296474522, 'place'),
 (214.17337795651963, 'beings'),
 (213.77337244797033, 'worlds'),
 (212.90681853370387, 'affect'),
 (212.59164426502696, 'tract'),
 (212.42115225710154, 'scene'),
 (212.3379639606195, 'decency'),
 (212.1175616283124, 'behaviour'),
 (212.03715577105646, 'offence'),
 (211.71178105354454, 'version')]

In [22]:
most_similar("the")

[(216.3175356073491, 'place'),
 (215.99651891442767, 'beings'),
 (215.65305980877426, 'worlds'),
 (214.99538814052298, 'tract'),
 (214.48374575602463, 'scene'),
 (214.43398872075582, 'decency'),
 (214.08270373607635, 'affect'),
 (213.91161270957855, 'reader'),
 (213.86771614943004, 'version'),
 (213.85424664491785, 'offence')]