# Assignment 4 

This notebook uses Roberta to generate a single dictionary which contains a mapping between a token (as a string) and a 756 dimensional averaged embedding over the provided text. The corpus to be used must be placed in the same directory as this notebook and named 'dataset.txt'.

## Initialization

Import required libraries.

In [1]:
# Standard ML libaries
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from operator import itemgetter
from sklearn.metrics import classification_report
from operator import itemgetter
import psutil

# RobertaModel and Tockenizer
from transformers import RobertaTokenizer, RobertaModel, RobertaTokenizerFast

Initialize environment with GPU (or CPU as fallback!).

In [2]:
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


device: cuda
random seed: 1234


## Data Pre-Processing 

Load in dataset, sentence by sentence.

In [3]:
sentences = []

linecount = 0
wordcount = 0 

lengths = []

with open("dataset.txt", 'r') as dataset_file:
    while line := dataset_file.readline():
        sentences += [line]
        linecount += 1
        wordcount += len(line.split())
        lengths += [len(line)]

print("Loaded " + str(linecount) + " lines and " + str(wordcount) + " words.")
print("Average length: " + str(np.average(lengths)))
print("Max length: " + str(np.max(lengths)))

Loaded 4468825 lines and 47820302 words.
Average length: 67.34097665493726
Max length: 3263


Initialize tokenizer.

In [4]:
## Try a fast tokenizer,
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base", add_prefix_space = True, clean_up_tokenization_spaces = True)

Quick sanity check to ensure tokenizer is working.

In [5]:
sentence = sentences[1]
tokens = tokenizer.encode_plus(sentence, padding = "max_length", max_length = 128, truncation = True, return_tensors='pt')

# tokenizer(sentence, is_split_into_words = True, return_tensors='pt', \
			# padding="max_length", max_length=128, truncation=True)

## Maybe reduce max length to 256 
ids = tokens['input_ids'][0]
mask = tokens['attention_mask'][0]
print(tokenizer.decode(ids[7]))

 Pictures


## Dataset

We'll be handling tokenization in a Dataset so we can take advantage of the DataLoader for auto batching.

In [6]:
from torch.utils.data import Dataset, DataLoader
class RobertaDataset(Dataset):
	def __init__(self, sentences: list, max_length: int):
		sentences_tokenized = []

		for sentence in sentences:
			tokens = tokenizer.encode_plus(sentence, padding = "max_length", max_length = max_length, truncation = True, return_tensors='pt')
			
			ids = torch.LongTensor(tokens['input_ids'][0])
			mask = torch.LongTensor(tokens['attention_mask'][0]) 

			sentences_tokenized += [np.array([ids, mask])]

			print(f"{len(sentences_tokenized) / len(sentences) * 100.0}% complete.\t\t\t", end ='\r')

		self.sentences_tokenized = np.array(sentences_tokenized)

	def __len__(self):
		return len(self.sentences_tokenized)
	
	def __getitem__(self, index):
		return (self.sentences_tokenized[index][0], self.sentences_tokenized[index][1])

In [7]:
dataset = RobertaDataset(sentences, 128)
BATCH_SIZE = 256
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 0)

100.0% complete.			complete.								

## Embedding Calculations

Calculate a single embedding just to test.

In [8]:
batch = next(iter(dataloader))

ids = batch[0].to(device)
mask = batch[1].to(device)
model = RobertaModel.from_pretrained('roberta-base').to(device)

with torch.no_grad():
	output = model(ids, mask)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculate the embeddings of our batches. I abort once 12_500 tokens have been collected as I start to run out of RAM after that.

In [9]:
import sys
sys.tracebacklimit = 0
import sys, traceback, gc
type, val, tb = None, None, None
type, val, tb = sys.exc_info()
traceback.clear_frames(tb)

token_to_avg_embedding_map = dict()
avg_token_embedding = None

def calculate_embeddings() -> dict:
	processed_sentances = 0
	
	model = RobertaModel.from_pretrained('roberta-base').to(device)
	model.eval()
	
	token_to_avg_embedding_map = {}
	avg_token_embedding = None
	
	with torch.no_grad():
		for batch in dataloader:
			
			ids = batch[0].to(device)
			mask = batch[1].to(device)
			
			output = model(ids, mask)
	
			####################################################################
		
			### shape, [batch, tokens in sentance, embeddings of each token]
			embeddings = output[0].detach().cpu().numpy()
	
			
			# Update average embeddings, 
			for sentence_embedding_index in range(len(embeddings)):
				sentence_embedding = embeddings[sentence_embedding_index]
	
				for token_index in range(len(sentence_embedding)):
					token = ids[sentence_embedding_index][token_index]
					token_str = tokenizer.decode(token)
					token_embedding = sentence_embedding[token_index]
	
					if avg_token_embedding is None:
						avg_token_embedding = token_embedding
					else:
						avg = np.array([avg_token_embedding, token_embedding])
						avg_token_embedding = np.mean(avg, axis=0)
	
					if token_str in token_to_avg_embedding_map:
						avg = np.array([token_to_avg_embedding_map[token_str], token_embedding])
						token_to_avg_embedding_map[token_str] = np.mean(avg, axis=0)
					else:
						token_to_avg_embedding_map[token_str] = token_embedding
	
			del embeddings
			del ids
			del mask
			del batch # Prevents memory leaks!!! this took a VERY long time to track down
		
			if (psutil.virtual_memory().percent) > 95.0:
				print("Aborting embedding generation early to avoid running out of RAM!")
				print(f"{psutil.virtual_memory().used / 1e9} GB used.")
				print(f"{processed_sentances / len(sentences) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization.")
				return token_to_avg_embedding_map, avg_token_embedding
			
			processed_sentances += BATCH_SIZE
			print(f"{processed_sentances / len(sentences) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated. {psutil.virtual_memory().percent}% RAM utilization. \t\t\t", end ='\r')
				
			
	return token_to_avg_embedding_map, avg_token_embedding

token_to_avg_embedding_map, avg_token_embedding = calculate_embeddings()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Aborting embedding generation early to avoid running out of RAM!RAM utilization. 					
79.212089344 GB used.
3.8725168248924495% complete. 44187 embeddings generated. 95.1% RAM utilization.


In [10]:
gc.collect()

0

# Problem one complete!
The token_to_avg_embedding_map is a dictonary mapping between sub-word tokens and their average embedding in the dataset.

In [11]:
print("tokens : " + str(len(token_to_avg_embedding_map)))
avg_token_embedding = avg_token_embedding.tolist()

tokens : 44187


# Problem 2
In this section we are going to implement the most_similar() functions from chp 9

First, generate a word to embedding mapping.

In [12]:
def get_average_embedding(word):
	tokens = tokenizer(word)['input_ids']
	embedding = np.zeros(768)
	for token in tokens:
		token = tokenizer.decode(token) # get the string this token id represents
		if token in token_to_avg_embedding_map:
			embedding += np.array(token_to_avg_embedding_map[token]) 		
		else:
			embedding += np.array(avg_token_embedding)
	return embedding / float(len(tokens))


def generate_word_embedding_map(words: list) -> dict:
	word_embedding_map = {}
	processed_words = 0
	for word in words:
		embedding = get_average_embedding(word)
		word_embedding_map[word] = embedding
	
		processed_words += 1
		print(f"{processed_words / len(words) * 100.0}% complete. {len(word_embedding_map)} word embeddings generated.\t\t\t", end ='\r')
	return word_embedding_map


def load_words(from_file: str) -> list:
	words = []

	with open(from_file, 'r') as file:
		while line := file.readline():
			words += [line.strip()]

	return words

In [13]:
words = load_words("glove.6B.300d-vocabulary.txt")
word_to_embedding = generate_word_embedding_map(words)

100.0% complete. 400000 word embeddings generated.			enerated.				

In [14]:
def get_word_embedding(word):
    if word in word_to_embedding:
        emb = word_to_embedding[word]
    else:
        emb = get_average_embedding(word)
        word_to_embedding[word] = emb
    return emb

def most_similar(word, topn=10):
    emb = get_word_embedding(word)

    # calculate similarities to all words in out vocabulary
    similarities = []
    for word, embedding, in word_to_embedding.items():
        similarity = embedding @ emb

        similarities += [(float(similarity), str(word))]

    similarities.sort(key = itemgetter(0))
    similarities.reverse()
    
    return similarities[:topn]

## 6 Examples

In [15]:
most_similar("cactus")

[(135.17034697830746, 'part-owner'),
 (134.20062800382647, 'user-created'),
 (134.1277011555912, 'name-the-team'),
 (133.9714977573887, 'one-episode'),
 (133.93418796243938, 'one-party'),
 (133.89707581810606, 'anti-party'),
 (133.72221367501683, 'one-player'),
 (133.6491954480736, 'user-friendly'),
 (133.6461239500104, 'power-to-weight'),
 (133.61926210586958, 'one-term')]

In [16]:
most_similar("cake")

[(131.8084804904319, 'user-created'),
 (131.4068415488551, 'part-owner'),
 (131.30965294915973, 'one-episode'),
 (130.95716648049793, 'one-player'),
 (130.91848643973148, 'real-valued'),
 (130.8722229835089, 'name-the-team'),
 (130.82019322180255, 'one-party'),
 (130.8113679663642, 'game-like'),
 (130.66150368793814, 'user-friendly'),
 (130.62885660673535, 'one-track')]

In [17]:
most_similar("angry")

[(133.55712045509165, 'user-created'),
 (133.26061048682678, 'part-owner'),
 (132.94335490381417, 'one-episode'),
 (132.66173470091167, 'user-friendly'),
 (132.59729716430834, 'name-the-team'),
 (132.46072966834174, 'one-party'),
 (132.45511141290132, 'one-player'),
 (132.38382017421995, 'anti-party'),
 (132.3580995778959, 'real-valued'),
 (132.16194029413293, 'one-track')]

In [18]:
most_similar("quickly")

[(133.0994850243323, 'part-owner'),
 (132.57928091089403, 'user-created'),
 (131.9128229380362, 'user-friendly'),
 (131.7707501294017, 'name-the-team'),
 (131.73259254899355, 'one-episode'),
 (131.6786988028314, 'one-party'),
 (131.6127170847452, 'great-great-great'),
 (131.61109384726643, 'one-player'),
 (131.47443573424695, 'one-term'),
 (131.46982881884452, 'anti-party')]

In [19]:
most_similar("between")

[(137.42207584200807, 'part-owner'),
 (136.3310649736624, 'user-created'),
 (136.29527859413733, 'one-episode'),
 (135.8783439587999, 'one-party'),
 (135.8567239402024, 'name-the-team'),
 (135.70504052564243, 'one-player'),
 (135.70073002612594, 'one-term'),
 (135.43609344005472, 'anti-party'),
 (135.42801931177303, 'user-friendly'),
 (135.39107954216493, 'one-make')]

In [20]:
most_similar("the")

[(140.87658893337073, 'part-owner'),
 (140.22962590297868, 'name-the-team'),
 (140.02311148411118, 'user-created'),
 (139.79206154958516, 'one-episode'),
 (139.52758815753043, 'one-term'),
 (139.44491825401548, 'one-player'),
 (139.43042353283406, 'one-party'),
 (139.26150132424408, 'user-friendly'),
 (139.25740987244, 'anti-party'),
 (139.13074353559648, 'one-dimensional')]