In [None]:
# Code to convert Sentences to BERT Embeddings
# Update the file paths to proceed

In [None]:
# Input JSON files read by this notebook (one for training, one for validation).
# Edit these two paths to point at your local copies before running.
file_train, file_val = "MELD_train_efr.json", "MELD_val_efr.json"

In [None]:
import json
import pickle
import nlp_utils as nu
import torch
from transformers import BertTokenizer, BertModel

In [None]:
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
# % matplotlib inline

# Load the pre-trained tokenizer (vocabulary) for the 'bert-base-uncased'
# checkpoint. The same checkpoint name is used when the model weights are
# loaded, so token ids produced here line up with the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Load the pre-trained BERT weights. `output_hidden_states=True` makes the
# model return the hidden states of every layer, not just the final one.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [None]:
def get_sen_embed(utt):
    """Return a single BERT sentence embedding for ``utt``.

    The utterance is wrapped in ``[CLS]`` / ``[SEP]``, tokenized with the
    module-level ``tokenizer`` and run through the module-level ``model``
    (which must have been loaded with ``output_hidden_states=True``).
    The embedding is the mean, over every token (special tokens included),
    of the second-to-last entry of the returned hidden states.

    Args:
        utt: The utterance text (``str``).

    Returns:
        A 1-D ``torch.Tensor`` whose length is the model's hidden size.
        It is computed under ``torch.no_grad()`` and carries no gradient.
    """
    # Add the special tokens and split the sentence into word pieces.
    tokenized_text = tokenizer.tokenize("[CLS] " + utt + " [SEP]")

    # The model has a fixed number of position embeddings; a longer input
    # makes the forward pass fail. Truncate, but keep the closing [SEP] so
    # the shortened sequence is still well formed.
    max_len = model.config.max_position_embeddings
    if len(tokenized_text) > max_len:
        tokenized_text = tokenized_text[:max_len - 1] + ["[SEP]"]

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Batch of exactly one sequence: shape [1, num_tokens].
    tokens_tensor = torch.tensor([indexed_tokens])

    # All ones: every position is a real token (there is no padding).
    # Passed by keyword because the second positional parameter of
    # `BertModel.forward` is the attention mask, not the segment ids.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)

        # Index 2 holds the tuple of per-layer hidden states.
        hidden_states = outputs[2]

        # Second-to-last layer, first (only) batch item: [num_tokens, hidden].
        token_vecs = hidden_states[-2][0]

        # Average over the token axis to get one vector for the sentence.
        sentence_embedding = torch.mean(token_vecs, dim=0)

    return sentence_embedding

In [None]:
# Add BERT Embeddings from a file to ans_dict
def add_bert_embeddings(filename, ans_dict):
    """Embed every utterance found in a JSON file and store it in ``ans_dict``.

    The file must contain a JSON list of records, each with an
    ``'utterances'`` list of strings. Every utterance is passed through
    ``nu.preprocess_text`` and then ``get_sen_embed``; the result is stored
    under the preprocessed text. Progress (the record index) is printed
    every 10 records.

    Args:
        filename: Path of the JSON file to read.
        ans_dict: Dictionary updated in place, mapping preprocessed
            utterance text to its embedding. Keys that are already present
            are left untouched, so repeated utterances (within one file or
            across calls) are embedded only once.

    Returns:
        None. The result is delivered through ``ans_dict``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as file:
        dataset = json.load(file)

    for i, record in enumerate(dataset):
        for utterance in record['utterances']:
            pp_utt = nu.preprocess_text(utterance)
            # The embedding step is the expensive part; skip text we have
            # already embedded instead of recomputing the same value.
            if pp_utt not in ans_dict:
                ans_dict[pp_utt] = get_sen_embed(pp_utt)
        if i % 10 == 0:
            print(i)

In [None]:
# Build the utterance -> embedding dictionary from both input files.
# (The second call previously re-read the training file, so the validation
# utterances never made it into the dictionary.)

ans_dict = {}
add_bert_embeddings(file_train, ans_dict)
add_bert_embeddings(file_val, ans_dict)

In [None]:
# Serialize the utterance -> embedding dictionary to 'sent2emb.pickle',
# using the newest pickle protocol this Python supports.

with open('sent2emb.pickle', mode='wb') as pickle_out:
    pickle.Pickler(pickle_out, protocol=pickle.HIGHEST_PROTOCOL).dump(ans_dict)