# Assignment 4 

This notebook uses Roberta to generate a single dictionary which contains a mapping between a token (as a string) and a 756 dimensional averaged embedding over the provided text. The corpus to be used must be placed in the same directory as this notebook and named 'dataset.txt'.

## Initialization

Import required libraries.

In [1]:
# Standard ML libaries
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from operator import itemgetter
from sklearn.metrics import classification_report

# RobertaModel and Tockenizer
from transformers import RobertaTokenizer, RobertaModel, RobertaTokenizerFast

Initialize environment with GPU (or CPU as fallback!).

In [2]:
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


device: cpu
random seed: 1234


## Data Pre-Processing 

Load in dataset, sentence by sentence.

In [3]:
sentences = []

linecount = 0
wordcount = 0

with open("dataset.txt", 'r') as dataset_file:
    while line := dataset_file.readline():
        sentences += [line]
        linecount += 1
        wordcount += len(line.split())

print("Loaded " + str(linecount) + " lines and " + str(wordcount) + " words.")

Loaded 4468825 lines and 47820302 words.


Initialize tokenizer.

In [4]:
## Try a fast tokenizer,
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space = True, clean_up_tokenization_spaces = True)

Quick sanity check to ensure tokenizer is working.

In [5]:
sentence = sentences[1]
tokens = tokenizer(sentence, is_split_into_words = True, return_tensors='pt', \
			padding="max_length", max_length=256, truncation=True)
## Maybe reduce max length to 256 
ids = tokens['input_ids'][0]
mask = tokens['attention_mask'][0] 
print(tokenizer.decode(ids[7]))

 Pictures


## Dataset

We'll be handling tokenization in a Dataset so we can take advantage of the DataLoader for auto batching.

In [6]:
from torch.utils.data import Dataset, DataLoader
class RobertaDataset(Dataset):
	def __init__(self, sentances: list, tokenizer_instance: object, max_length: int):
		self.tokenizer = tokenizer_instance
		self.max_length = max_length
		self.sentences = sentences

	def __len__(self):
		return len(self.sentences)
	
	def __getitem__(self, index):
		sentence = self.sentences[index]
		tokens = self.tokenizer(sentence, is_split_into_words = True, return_tensors='pt', \
					 padding="max_length", max_length=self.max_length, truncation=True)
		ids = tokens['input_ids'][0]
		mask = tokens['attention_mask'][0] 
		return (torch.LongTensor(ids), torch.LongTensor(mask))

In [7]:
dataset = RobertaDataset(sentences, tokenizer, 256)
BATCH_SIZE = 128
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 8)

## Embedding Calculations

Calculate a single embedding just to test.

In [8]:
batch = next(iter(dataloader))

ids = batch[0].to(device)
mask = batch[1].to(device)
model = RobertaModel.from_pretrained('roberta-base').to(device)

with torch.no_grad():
	output = model(ids, mask)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculate the embeddings of our batches. I abort once 12_500 tokens have been collected as I start to run out of RAM after that.

In [16]:
import sys
sys.tracebacklimit = 0
import sys, traceback, gc
type, val, tb = None, None, None
type, val, tb = sys.exc_info()
traceback.clear_frames(tb)

def calculate_embeddings() -> dict:
    processed_sentances = 0
    
    model = RobertaModel.from_pretrained('roberta-base').to(device)
    model.eval()
    
    token_to_avg_embedding_map = {}
    avg_token_embedding = None
    
    with torch.no_grad():
    	for batch in dataloader:
    
    		ids = batch[0].to(device)
    		mask = batch[1].to(device)
    
    		output = model(ids, mask)
    
    		####################################################################
    	
    		### shape, [batch, tokens in sentance, embeddings of each token]
    		embeddings = output[0].detach().cpu()
    
    		
    		# Update average embeddings, 
    		for sentence_embedding_index in range(len(embeddings)):
    			sentence_embedding = embeddings[sentence_embedding_index]
    
    			for token_index in range(len(sentence_embedding)):
    				token = ids[sentence_embedding_index][token_index]
    				token_str = tokenizer.decode(token)
    				token_embedding = sentence_embedding[token_index]
    
    				if avg_token_embedding is None:
    					avg_token_embedding = token_embedding
    				else:
    					avg_token_embedding += token_embedding
    					avg_token_embedding /= 2

                    # replace division by 2 with NP.average because that IS numerically stable!
    
    				if token_str in token_to_avg_embedding_map:
    					token_to_avg_embedding_map[token_str] += token_embedding
    					token_to_avg_embedding_map[token_str] /= 2 # not stable but close enough
    				else:
    					token_to_avg_embedding_map[token_str] = token_embedding
    		
    		del embeddings
    		del ids
    		del mask
    
    		gc.collect()
    
    		if len(token_to_avg_embedding_map) > 10_000:
    			print("Exiting early to avoid running out of RAM!")
    			return token_to_avg_embedding_map, avg_token_embedding
    			break
    
    		processed_sentances += BATCH_SIZE
    		print(f"{processed_sentances / len(sentences) * 100.0}% complete. {len(token_to_avg_embedding_map)} embeddings generated.\t\t\t", end ='\r')

token_to_avg_embedding_map, avg_token_embedding = calculate_embeddings()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Exiting early to avoid running out of RAM!dings generated.			


# Problem one complete!
The token_to_avg_embedding_map is a dictonary mapping between sub-word tokens and their average embedding in the dataset.

In [17]:
print("tokens : " + str(len(token_to_avg_embedding_map)))
avg_token_embedding = avg_token_embedding.tolist()

# print(token_to_avg_embedding_map['the'][:5])
# print(token_to_avg_embedding_map['let'][:5])
# print(token_to_avg_embedding_map['rea'][:5])

tokens : 10293


# Problem 2
In this section we are going to implement the most_similar() functions from chp 9

First, generate a word to embedding mapping.

In [18]:

def get_average_embedding(word):
	tokens = tokenizer(word)['input_ids']
	embedding = np.array(avg_token_embedding)
	for token in tokens:
		token = tokenizer.decode(token) # get the string this token id represents
		if token in token_to_avg_embedding_map:
			embedding += np.array(token_to_avg_embedding_map[token]) 		
		else:
			embedding += np.array(avg_token_embedding)
	return embedding / float(len(tokens))

def generate_word_embedding_map(words: list) -> dict:
	word_embedding_map = {}
	processed_words = 0
	for word in words:
		embedding = get_average_embedding(word)

		if word in word_embedding_map:
			word_embedding_map[word] += embedding
			word_embedding_map[word] /= 2
		else:
			word_embedding_map[word] = embedding
	
		processed_words += 1
		print(f"{processed_words / len(words) * 100.0}% complete. {len(word_embedding_map)} word embeddings generated.\t\t\t", end ='\r')
	return word_embedding_map

def load_words(from_file: str) -> list:
	words = []

	with open(from_file, 'r') as file:
		while line := file.readline():
			words += [line.strip()]

	return words


In [19]:
words = load_words("glove.6B.300d-vocabulary.txt")
word_to_embedding = generate_word_embedding_map(words)

100.0% complete. 400000 word embeddings generated.			enerated.				

In [20]:
from operator import itemgetter
import copy


def most_similar(word, topn=10):
    
    # retrieve embedding for given word
    if word in word_to_embedding:
        emb = word_to_embedding[word]
    else:
        emb = avg_token_embedding
    
    # calculate similarities to all words in out vocabulary
    similarities = []
    for word, embedding, in word_to_embedding.items():
        similarity = embedding @ emb

        similarities += [(float(similarity), str(word))]

    similarities.sort(key = itemgetter(0))
    similarities.reverse()
    
    return similarities[:10]

most_similar("water")

[(233.6513370716967, 'person'),
 (232.64613655882204, 'term'),
 (232.3005480258835, 'destroy'),
 (232.10033507778545, 'parameter'),
 (231.89816562725494, 'consisted'),
 (231.8813921823355, 'things'),
 (231.87471472138853, '1978'),
 (231.42188572610095, 'significance'),
 (231.34554120770994, 'vehicle'),
 (231.32081220497497, 'characteristic')]

In [21]:
most_similar("ear")

[(236.8912853624291, '1978'),
 (234.40151091817472, 'mediated'),
 (233.52853504437167, 'consisted'),
 (233.29640248163074, 'person'),
 (232.66125176828888, 'term'),
 (232.47942219713397, 'vs'),
 (231.7509706865751, '1902'),
 (231.68343644725837, 'myth'),
 (231.41147720392993, 'preached'),
 (231.38467474796846, 'universes')]

In [22]:
get_average_embedding("hello")

array([ 5.42103363e-02,  2.12259841e-01, -3.10762102e-02,  1.44131978e-02,
        3.02585393e-01,  1.62047781e-01,  8.79064873e-02,  1.37890449e-01,
        3.89600905e-02, -9.25337821e-02, -1.50042788e-01,  2.67766441e-01,
        4.85365962e-02, -1.63591256e-02, -5.52116583e-03,  7.14599689e-02,
        1.59045286e-01,  1.08868358e-01, -4.03653911e-02,  1.02756780e-01,
        3.34084034e-05,  2.61911744e-02, -5.67139288e-02,  1.70083928e-02,
        2.70769293e-02,  9.80665796e-02,  1.16179866e-01, -9.32704707e-02,
        1.96839665e-01,  1.28245577e-02, -1.80880542e-01, -5.40257518e-02,
        1.43670632e-02,  6.14013771e-02, -4.13537795e-02,  7.38272642e-02,
       -9.07509495e-02,  2.57223298e-02,  2.28458941e-01,  2.64816551e-02,
       -2.47397040e-01, -1.34264464e-01,  5.35521749e-03,  7.49793183e-02,
       -3.15548504e-02, -2.43499503e-02,  1.01703221e-01,  4.48265076e-02,
       -4.35936699e-02,  7.89945821e-03,  8.45659074e-02, -4.38853353e-03,
       -1.40682379e-01,  