RUBRIC

[2 Points] Present an overview for what type of bias you will be investigating and why the particular investigation you will be doing is relevant. You might consider asking questions like: Why is it important to find this kind of bias in machine learning models? Why will the type of investigation I am performing be relevant to other researchers or practitioners? 

[2 Points] Present one or more research questions that you will be answering and explain the methods that you will employ to answer these research questions. Present a hypothesis as part of your research questions. 

[2 Points] As part of your assignment, you will choose a methodology that involves comparing two (or more) techniques to one another. Discuss how you will measure a difference between the two techniques. That is, if you are measuring the difference statistically, what test will you use and why is it appropriate? Are there any limitations to performing this test that you should be aware of? 

[4 Points] Carry out your analysis and model training. Explain your steps in enough detail that the instructor can understand your code. 

[4 Points] Present results from your analysis and provide evidence from the results that support or refute your hypothesis. Write a conclusion based upon the various analyses you performed. Be sure to reference your research questions systematically in your conclusion. With your analysis complete, are there any additional research questions or limitations to your conclusions?

[1 Point] Identify two conferences or journals that would be interested in the results of your analysis.  
If using code from another author (not your own), you will be graded on the clarity of explanatory comments you add to the code. 


In [13]:
import math
import os
from types import FunctionType

import pandas as pd
import torch
import torchtext
from gensim.models import KeyedVectors

# BASE_PATH: str = os.path.dirname(os.path.abspath(__file__))
BASE_PATH: str = "/home/paperspace/Desktop/8321-Mach-Lrng-Neural-Ntwrks/Lab1/CS8321_Lab1" # REPLACE THIS LINE FOR YOUR LOCAL

# Sub-folders of BASE_PATH that the rest of the notebook reads from.
CLASSIFIERS_PATH: str = BASE_PATH + "/classifiers/"
DATASET_PATH: str = BASE_PATH + "/datasets/"
EMBEDDINGS_PATH: str = BASE_PATH + "/embeddings/"
# Number of emotion labels; main() checks emotions.txt against this count.
NUM_EMOTIONS: int = 28
# Placeholder: get_vectors() overwrites this with the vocabulary size of the
# embedding it loads, and that value doubles as the padding token index.
EMBED_SIZE: int = 0

# Check if our key directories exist
if not os.path.exists(CLASSIFIERS_PATH):
    raise FileNotFoundError("Could not find folder for classifier models.")
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError("Could not find folder with GoEmotion dataset.")
if not os.path.exists(EMBEDDINGS_PATH):
    raise FileNotFoundError("Could not find folder with word embeddings sets.")

# Is the cuda GPU available? Query once and reuse the answer.
_cuda_available: bool = torch.cuda.is_available()
if not _cuda_available:
    print("Warning: Using CPU for Pytorch.")
# Annotated with the torch.device type (the original annotated the name with itself).
device: torch.device = torch.device("cuda") if _cuda_available else torch.device("cpu")

In [14]:
# First, let's define our basic BDRNN architecture
class BDRNN(torch.nn.Module):
    """Bidirectional multi-layer RNN classifier on top of pretrained embeddings.

    Args:
        vocab_word_count: Passed to the RNN as ``input_size``. main() supplies
            ``vectors.shape[1]`` here, i.e. the width of one embedding row.
        vectors: Pretrained embedding matrix handed to
            ``torch.nn.Embedding.from_pretrained``.
        output_size: Number of output logits.
        num_layers: Requested number of stacked RNN layers; values below 2 are
            raised to 2.
        dropout: Dropout probability forwarded to ``torch.nn.RNN``.
        *args, **kwargs: Forwarded to ``torch.nn.Module.__init__``.
    """

    def __init__(self, vocab_word_count: int, vectors: torch.Tensor, output_size: int, num_layers: int, dropout: float,
                 *args: object,
                 **kwargs: object) -> None:
        super().__init__(*args, **kwargs)

        # Clamp once and use the clamped value everywhere below, so the stored
        # attribute, the hidden size and the RNN depth always agree.
        self.num_layers = num_layers if num_layers > 1 else 2
        self.hidden_size = NUM_EMOTIONS // self.num_layers

        # EMBED_SIZE is read at construction time; it must already have been set
        # by get_vectors() so that it points at the padding row.
        self.embeddings = torch.nn.Embedding.from_pretrained(vectors, padding_idx=EMBED_SIZE)

        self.rnn_layers = torch.nn.RNN(input_size=vocab_word_count, hidden_size=self.hidden_size,
                                       num_layers=self.num_layers,
                                       bidirectional=True, dropout=dropout, batch_first=True)

        self.output_layer = torch.nn.Linear(self.hidden_size, output_size)

    def forward(self, input_data) -> torch.Tensor:
        """Embed the token ids, run the RNN and map a final hidden state to logits."""
        embedded: torch.Tensor = self.embeddings(input_data)

        output: torch.Tensor
        hidden: torch.Tensor
        output, hidden = self.rnn_layers(embedded)

        # NOTE(review): hidden[-1] is the last slice along the first axis of h_n.
        # Per the torch.nn.RNN docs that axis spans layers x directions, so this
        # looks like only one direction of the top layer reaches the classifier;
        # confirm that is intended.
        return self.output_layer(hidden[-1, :])


class pandas_dataset(torch.utils.data.Dataset):
    """Dataset view over a DataFrame with "text" and "emotion_ids" columns."""

    def __init__(self, df: pd.DataFrame) -> None:
        # The frame is stored by reference, not copied.
        self.df = df

    def __len__(self) -> int:
        """Return the number of rows in the wrapped frame."""
        row_count, _ = self.df.shape
        return row_count

    def __getitem__(self, index: int) -> (str, str):
        """Return the (text, emotion_ids) pair at positional row ``index``."""
        frame = self.df
        text = frame["text"].iloc[index]
        emotion_ids = frame["emotion_ids"].iloc[index]
        return text, emotion_ids

In [15]:
"""Deserialize the embeddings, and return word labels with their corresponding tensors."""
def get_vectors(embedding: str) -> tuple[dict[str, int], torch.Tensor]:
    skip_first_line: bool = False
    global EMBED_SIZE
    match embedding:
        case "glove":
            embedding_path: str = EMBEDDINGS_PATH + "glove.840B.300d.txt"
            EMBED_SIZE = 2196018
            embedding_components: int = 300
        case "word2vec":
            embedding_path: str = EMBEDDINGS_PATH + "GoogleNews-vectors-negative300.bin"
            gn_model = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
        case "numberbatch":
            embedding_path: str = EMBEDDINGS_PATH + "numberbatch-19.08-en.txt"
            EMBED_SIZE = 516782
            embedding_components: int = 300
            skip_first_line = True
        case default:
            raise RuntimeError("Invalid embedding chosen.")
        
    # Deserializing glove and numberbatch embeddings
    if not os.path.exists(embedding_path):
        raise FileNotFoundError("Could not find embedding file: {}".format(embedding_path))
    with (open(embedding_path, encoding="utf_8") as embeddings_file):
        word_labels: dict[str, int] = {}
        tensor: torch.Tensor = torch.empty((EMBED_SIZE + 1, embedding_components), dtype=torch.float32, device=device)
        
        # We need to skip the first line of the numberbatch embeddings because that's header information
        if skip_first_line:
            _ = embeddings_file.readline()
        
        # Clean up the file and load the embeddings into a tensor
        for index, embedding in enumerate(embeddings_file):
            embedding_split: list[str] = embedding.rstrip().split(" ")
            word_labels[embedding_split[0]] = index
            tensor[index] = torch.tensor([float(val) for val in embedding_split[1:]], dtype=torch.float32,
                                         device=device)
            # Output our progress every 100,000 words
            if (index + 1) % 100000 == 0:
                print("Processed {}/{}".format(index + 1, EMBED_SIZE))
        tensor[-1] = torch.zeros(embedding_components, dtype=torch.float32, device=device)

        # Adding a padding token
        word_labels["<PAD>"] = EMBED_SIZE
        tensor.to(device)
        return word_labels, tensor

""" Tokenize the text and convert it into a list of integers. We'll use a dictionary to map words to integers. 
    We'll also use a special token ("something") for words that are not in the dictionary.
    The tokenizer function splits the text into words."""
def tokenize(text: str, labels: dict, tokenizer: FunctionType) -> list[int]:
    """Split ``text`` with ``tokenizer`` and map every word to its integer id.

    Words missing from ``labels`` are replaced by the id stored under the key
    "something"; that key is only looked up when an unknown word is met.
    """
    token_ids = []
    for word in tokenizer(text):
        if word in labels:
            token_ids.append(labels[word])
        else:
            token_ids.append(labels["something"])
    return token_ids


def resolve_emotions(id: str) -> str:
    return [emotions[int(emotion)] for emotion in id.split(",")]

""" 
Train a Bidirectional RNN model 
"""
def train(model: BDRNN, batches, num_epochs: int):
    """Train ``model`` on multi-hot emotion targets and print per-epoch metrics.

    Args:
        model: The network to optimise; called with one token-id tensor at a time.
        batches: Iterable of batches, each an iterable of
            ``(sentence, emotions)`` pairs as produced by ``collate``.
        num_epochs: Number of passes over ``batches``.

    Every pair is processed on its own, so the optimizer steps once per sample.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    # The targets are multi-hot vectors and predictions are thresholded through a
    # sigmoid below, so each label is an independent binary decision. That calls
    # for a per-label binary loss rather than softmax cross-entropy.
    criterion = torch.nn.BCEWithLogitsLoss().to(device)

    model.train()
    for epoch_num in range(num_epochs):
        # Reset per epoch so the printed loss describes this epoch only.
        epoch_losses: list[float] = []
        correct: int = 0
        total: int = 0
        for batch in batches:
            for sentence, emotions in batch:
                optimizer.zero_grad()

                predictions = model(sentence)
                loss = criterion(predictions, emotions)
                epoch_losses.append(loss.item())

                loss.backward()
                optimizer.step()

                # Rounding is naive, we should base this off a confidence threshold
                guesses = torch.round(torch.sigmoid(predictions.detach()))
                # A sample only counts as correct when every label matches.
                if torch.equal(guesses, emotions):
                    correct += 1
                total += 1

        if total == 0:
            # Nothing was iterated; avoid dividing by zero below.
            print("Epoch: {} | No samples to train on.".format(epoch_num + 1))
            continue
        print("Epoch: {} | Loss: {} | Accuracy: {}%".format(epoch_num + 1, sum(epoch_losses) / len(epoch_losses),
                                                            (correct / total) * 100))

def collate(batch: list[tuple[list[int], str]]) -> list[tuple[torch.IntTensor, torch.Tensor]]:
    """Pad a batch of tokenised sentences and multi-hot encode their emotions.

    Args:
        batch: ``(token_ids, emotion_ids)`` pairs, where ``emotion_ids`` is a
            comma-separated string of label indices.

    Returns:
        One ``(token_tensor, target)`` pair per input. Every ``token_tensor`` is
        padded with ``EMBED_SIZE`` up to the longest sentence in the batch, and
        ``target`` is a float vector of length ``NUM_EMOTIONS`` holding 1.0 at
        each listed label. Both tensors are placed on ``device``.
    """
    final_batch = []
    max_tokens = max((len(sentence) for sentence, _ in batch), default=0)
    for sentence, emotions in batch:
        # Pad a copy: extending `sentence` itself would permanently lengthen the
        # list owned by the dataset and leak padding into later batches.
        padded = list(sentence) + [EMBED_SIZE] * (max_tokens - len(sentence))
        token_tensor = torch.IntTensor([int(value) for value in padded]).to(device)

        target = torch.zeros(NUM_EMOTIONS, dtype=torch.float32, device=device)
        for emotion in emotions.split(","):
            target[int(emotion)] = 1.0
        final_batch.append((token_tensor, target))
    return final_batch

In [16]:
def main():
    """Load the dataset, tokenise it with Numberbatch ids and train a BDRNN.

    Raises:
        RuntimeError: If emotions.txt does not hold NUM_EMOTIONS entries or its
            fifth entry is not "approval".
    """
    # Now we need to handle our dataset
    with open(DATASET_PATH + "emotions.txt", encoding="utf-8") as emotions_file:
        emotions = [emotion.strip() for emotion in emotions_file]
    # Sanity check on the label file: expected length and one known position.
    if len(emotions) != NUM_EMOTIONS or emotions[4] != "approval":
        raise RuntimeError("Failed to load emotion mappings.")

    training_set = pd.read_csv(DATASET_PATH + "train.tsv", delimiter="\t", names=["text", "emotion_ids"],
                               usecols=[0, 1])
    # NOTE(review): unlike the training read, no `names=` is passed here, so the
    # first line of test.tsv is taken as the header. The code below needs columns
    # called "text" and "emotion_ids"; confirm the file provides that header.
    testing_set = pd.read_csv(DATASET_PATH + "test.tsv", delimiter="\t", usecols=[0, 1])
    print(training_set.head())
    print(testing_set.head())

    # Time to do some training!
    labels, vectors = get_vectors("numberbatch")
    tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
    # Replace each sentence with its list of vocabulary indices.
    training_set["text"] = training_set["text"].apply(tokenize, labels=labels, tokenizer=tokenizer)
    testing_set["text"] = testing_set["text"].apply(tokenize, labels=labels, tokenizer=tokenizer)
    print(training_set.head())
    print(testing_set.head())
    # vectors.shape[1] (the width of one embedding row) becomes the RNN input size.
    numberbatch_model = BDRNN(vectors.shape[1], vectors, NUM_EMOTIONS, 4, 0.5).to(device)
    train_dataset = pandas_dataset(training_set)
    test_dataset = pandas_dataset(testing_set)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=True, collate_fn=collate)
    print('Created `training dataloader` with %d batches!' % len(train_dataloader))
    print('Created `testing dataloader` with %d batches!' % len(test_dataloader))
    train(numberbatch_model, train_dataloader, 10)

if __name__ == '__main__':
    main()

                                                text emotion_ids
0  My favourite food is anything I didn't have to...          27
1  Now if he does off himself, everyone will thin...          27
2                     WHY THE FUCK IS BAYLESS ISOING           2
3                        To make her feel threatened          14
4                             Dirty Southern Wankers           3
                                                text emotion_ids
0  I’m really sorry about your situation :( Altho...          25
1    It's wonderful because it's awful. At not with.           0
2  Kings fan here, good luck to you guys! Will be...          13
3  I didn't know that, thank you for teaching me ...          15
4  They got bored from haunting earth for thousan...          27
Processed 100000/516782


In [None]:
# Download link for word2vec: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
# Download link for Glove: https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip

import matplotlib.pyplot as plt
import numpy as np
from numpy import ndarray
from sklearn.manifold import TSNE
from torch import Tensor, cat
from torch.cuda import is_available as cuda_is_available
from train import get_vectors


# EDITABLE VARIABLES
# Names of the embedding sets to load; get_embeddings() passes each entry to get_vectors().
embeddings = ["numberbatch", "glove"] # Embeddings definition. Add word2vec once deserialization is done.

# Word comparison groups. Format: [base_word, similar_word_1, similar_word_2]
# get_comparison_embeddings() looks up every word listed here in each embedding's vocabulary.
word_comparison_groups = [
    ["tire", "tired", "tyre"],
]

# Add a new distance function here if you want.
"""Calculate the distances between a base word and two similar words using Euclidean, Cosine, and Manhattan distances."""
def calculate_distances(base_word: ndarray, similar_word_1: ndarray, similar_word_2: ndarray) -> dict[str, list[float]]:
    """Measure how far two related words lie from a base word.

    Args:
        base_word: Vector of the reference word.
        similar_word_1: Vector of the first word to compare against the base.
        similar_word_2: Vector of the second word to compare against the base.

    Returns:
        A dict keyed by "euclidean", "cosine" and "manhattan". Each value is a
        two-element list: the distance for ``similar_word_1`` and then for
        ``similar_word_2``.
    """
    return {
        "euclidean": [euclidean_distance(similar_word_1, base_word), euclidean_distance(similar_word_2, base_word)],
        # Reported as cosine *distance* (1 - similarity) so that, like the other
        # two entries, smaller means closer and the value is never negative.
        "cosine": [1.0 - cosine_similarity(similar_word_1, base_word),
                   1.0 - cosine_similarity(similar_word_2, base_word)],
        "manhattan": [manhattan_distance(similar_word_1, base_word), manhattan_distance(similar_word_2, base_word)],
    }

# We should look at comparing vectors in different embeddings and see how well ambiguous words center around common
# synonyms for each meaning. We could probably do some sort of visualization for this as well.

def euclidean_distance(vector1: Tensor, vector2: Tensor) -> float:
    """Return the norm of the element-wise difference of the two vectors."""
    difference = vector1 - vector2
    return np.linalg.norm(difference)

def cosine_similarity(vector1: Tensor, vector2: Tensor) -> float:
    """Return the dot product of the vectors divided by the product of their norms."""
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return dot_product / norm_product

def manhattan_distance(v1, v2):
    """Return the sum of the absolute element-wise differences of the two vectors."""
    absolute_differences = np.abs(v1 - v2)
    return np.sum(absolute_differences)

In [None]:
"""Converts the given list of tensors to a numpy array based on GPU availability."""
def convert_tensors_to_numpy(embeddings_list: list[Tensor]) -> ndarray:
    """Stack a list of tensors into one NumPy array, one row per tensor.

    Each tensor is detached and moved to the CPU before conversion, so the
    result does not depend on which device the tensor lives on or on whether
    it tracks gradients.
    """
    return np.array([vector.detach().cpu().numpy() for vector in embeddings_list])

"""Returns the word vectors to compare from the given numpy vectors. These vectors are a word groups members"""
def get_word_vectors_to_compare(numpy_vectors: ndarray) -> tuple[ndarray, ndarray, ndarray]:
    """Return the first three rows of ``numpy_vectors`` as a tuple.

    get_distances() unpacks the result as (base word, similar word 1,
    similar word 2). Rows beyond the third are ignored.
    """
    return numpy_vectors[0], numpy_vectors[1], numpy_vectors[2]

"""Returns a dictionary of distance types to their calculated distances for the given word vectors."""
def get_distances(embeddings_list: list[Tensor]) -> dict[str, list[float]]:
    """Convert one word group's tensors to NumPy and compute its distance metrics.

    Returns the dict produced by calculate_distances() for the group's first
    three vectors.
    """
    word_vectors = get_word_vectors_to_compare(convert_tensors_to_numpy(embeddings_list))
    return calculate_distances(*word_vectors)

"""Returns a dictionary of embedding names to their respective word vectors and vocabularies"""
def get_embeddings(embeddings: list[str]) -> dict[str, (dict[str, int], Tensor)]:
    """Load every named embedding set through get_vectors().

    Returns a dict from embedding name to its ``(vocabulary, vectors)`` pair.

    Raises:
        ValueError: If ``embeddings`` is empty.
    """
    if len(embeddings) < 1:
        raise ValueError("No embeddings were selected to load.")

    def load(name):
        # Unpack and re-pack so the stored value is always a two-element tuple.
        vocab, vectors = get_vectors(name)
        return vocab, vectors

    return {name: load(name) for name in embeddings}

"""Returns a dictionary of embedding names to a list of comparison groups, the words whose distances are being compared"""
def get_comparison_embeddings(embeddings: dict[str, (dict[str, int], Tensor)]) -> dict[str, list[list[Tensor]]]:
    """Collect, per embedding, the vectors of every word in word_comparison_groups.

    Returns a dict from embedding name to a list with one entry per comparison
    group; each entry lists that group's word vectors in the group's own order.
    A word missing from an embedding's vocabulary raises KeyError.
    """
    selected: dict[str, list[list[Tensor]]] = {}
    for embed_name, (vocab, vectors) in embeddings.items():
        selected[embed_name] = [
            [vectors[vocab[word]] for word in group]
            for group in word_comparison_groups
        ]
    return selected

"""Returns a dictionary of embedding names to a list of dictionaries of distance types to their calculated distances"""
def compare_embeddings(comparison_embeddings: dict[str, list[list[Tensor]]]) -> dict[str, list[dict[str, list[float]]]]:
    """Run get_distances() on every word group of every embedding.

    Returns a dict from embedding name to one distance dict per word group.
    An embedding with no word groups gets no entry in the result.
    """
    distances_by_embedding = {}
    for embedding, word_groups in comparison_embeddings.items():
        for group_vectors in word_groups:
            # The key is created lazily, on the first group that is processed.
            distances_by_embedding.setdefault(embedding, []).append(get_distances(group_vectors))
    return distances_by_embedding

In [None]:

"""Plots boxplots of the distance ratios for each embedding and distance type."""
def compare_distances(embedding_distances: dict[str, list[dict[str, list[float]]]]):
    """Turn per-group distances into ratios and show one boxplot figure per distance type.

    Args:
        embedding_distances: Dict from embedding name to a list of distance
            dicts (one per word group). Each distance dict maps a distance type
            to a two-element list of values.
    """

    def calculate_distance_ratios():
        """Return embedding name -> distance type -> list of larger/smaller ratios.

        A smaller value of zero makes the division fail or yield inf, depending
        on the numeric type of the values.
        """
        ratios: dict[str, dict[str, list[float]]] = {}
        for embedding_name, distances in embedding_distances.items():
            per_type = ratios.setdefault(embedding_name, {})
            for distance in distances:
                for distance_type, values in distance.items():
                    ratio = max(values[0], values[1]) / min(values[0], values[1])
                    per_type.setdefault(distance_type, []).append(ratio)
        return ratios

    def show_boxplots(ratios: dict[str, dict[str, list[float]]]):
        """Print the ratios and draw, per distance type, one box per embedding.

        Args:
            ratios: Dict from embedding name to a dict from distance type to the
                list of all ratios calculated for it.

        Ex:
        ratios = {
            "glove": {
                "euclidean": [1.0, 2.55, 1.380, 4.44, 4.4, 4983],
                "cosine": [1.0, 2.55, 1.380, 4.44, 4.4, 4983],
                ...
            },
            "numberbatch": {
                "euclidean": [1.0, 2.55, 1.380, 4.44, 4.4, 4983],
                "cosine": [1.0, 2.55, 1.380, 4.44, 4.4, 4983],
                ...
            }
        }
        """
        # Regroup by distance type so each figure compares all embeddings.
        # One pass is enough; repeating it only re-prints the same lines.
        to_plot: dict[str, dict[str, list[float]]] = {}
        for embedding_name, per_type in ratios.items():
            for distance_type, values in per_type.items():
                print(embedding_name, distance_type, values)
                to_plot.setdefault(distance_type, {})[embedding_name] = values

        # For every distance metric, plot boxplots for all embeddings
        for distance_type, embedding_data in to_plot.items():
            _, axs = plt.subplots(figsize=(10, 8))
            axs.boxplot(list(embedding_data.values()))
            axs.set_xticklabels(list(embedding_data.keys()))
            axs.set_title(distance_type)
            plt.tight_layout()
            plt.show()

    # Dict of embedding name to distance type and its calculated distance ratio
    ratios = calculate_distance_ratios()
    show_boxplots(ratios)

In [None]:
# Load every configured embedding set as a (vocabulary, vectors) pair.
orig_embeddings = get_embeddings(embeddings)
# Pull out the vectors for each word in word_comparison_groups, per embedding.
comp_embeddings = get_comparison_embeddings(orig_embeddings)
# Compute the distance metrics for every comparison group.
comp_distances = compare_embeddings(comp_embeddings)
# Convert the distances to ratios and show one boxplot figure per distance type.
compare_distances(comp_distances)