### Inspiration Tutorial
https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU

### Code to convert Utterances to Hing-BERT Embeddings 
- https://huggingface.co/l3cube-pune/hing-bert
- Update the training and validation file paths below before running the notebook

In [None]:
# Importing Libraries

import string
import re
import json
import pickle
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

In [None]:
# Training and Validation File Path

# Dataset splits whose utterances get embedded; edit these to point at the
# local copies of the JSON files.
file_train, file_val = "MaSaC_train_erc.json", "MaSaC_val_erc.json"

In [None]:
# Preprocessing Functions

# Maps each decimal digit character ("0".."9") to its English word.
# The position of a word in the tuple is the digit it spells.
numbers = {
    str(digit): word
    for digit, word in enumerate(
        (
            "zero",
            "one",
            "two",
            "three",
            "four",
            "five",
            "six",
            "seven",
            "eight",
            "nine",
        )
    )
}


def remove_puntuations(txt):
    """Return ``txt`` with every ASCII punctuation character removed.

    The separators ``. ! ? : ;`` are each replaced by a single space so the
    words on either side stay apart; all other characters in
    ``string.punctuation`` are deleted outright. Non-ASCII punctuation is
    left untouched.
    """
    # In a translate table, None deletes the character and a string replaces it.
    deleted = dict.fromkeys(map(ord, string.punctuation))
    spaced = dict.fromkeys(map(ord, ".!?:;"), " ")
    # The separator entries override their "delete" entries.
    return txt.translate({**deleted, **spaced})


def number_to_words(txt):
    """Spell out every digit in ``txt`` using the module-level ``numbers`` map.

    Each digit character is replaced independently by its English word
    followed by one space, so multi-digit numbers are read digit by digit
    (for example ``"10"`` becomes ``"one zero "``).
    """
    for digit, word in numbers.items():
        txt = txt.replace(digit, word + " ")
    return txt


def preprocess_text(text):
    """Normalise an utterance into a lowercase, ASCII-only, space-separated string.

    Steps, in order: lowercase; turn underscores into spaces; spell out
    digits; strip punctuation; drop non-ASCII characters; collapse all
    whitespace runs to single spaces and trim the ends.
    """
    # Underscores are converted before punctuation removal; otherwise they
    # would be deleted (``_`` is in ``string.punctuation``) and fuse words.
    underscored = re.sub(r'_', ' ', text.lower())
    spelled_out = number_to_words(underscored)
    unpunctuated = remove_puntuations(spelled_out)
    ascii_only = ''.join(ch for ch in unpunctuated if ord(ch) < 128)
    return ' '.join(ascii_only.split())

In [None]:
# Set the default torch device.
# Use the GPU when one is available and fall back to the CPU otherwise, so
# tensors created later in the notebook do not fail on machines without CUDA.
torch.set_default_device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Tokenizer (vocabulary) for the pre-trained Hing-BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained("l3cube-pune/hing-bert")

# Weights for the same checkpoint. `output_hidden_states=True` makes the
# model return the hidden states of all layers, not just the final one.
model = BertModel.from_pretrained(
    "l3cube-pune/hing-bert",
    output_hidden_states=True,
)

In [None]:
# Get the embedding for a single utterance

def get_utt_embed(utt):
    """Return a single embedding vector for the utterance ``utt``.

    The utterance is wrapped in ``[CLS]`` / ``[SEP]``, tokenized with the
    module-level ``tokenizer`` and run through the module-level ``model``
    without gradient tracking. The result is the mean, over all tokens
    (including the two special tokens), of the second-to-last entry of the
    hidden states the model returns.

    Args:
        utt: The utterance text to embed.

    Returns:
        A 1-D ``torch.Tensor`` holding the averaged token vectors.
    """
    # Add the special tokens and split the sentence into word pieces.
    marked_text = "[CLS] " + utt + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Batch of one sequence.
    tokens_tensor = torch.tensor([indexed_tokens])

    # All-ones mask: one unpadded sequence, so every token is attended to.
    # It is passed by keyword because the second positional parameter of
    # `BertModel.forward` is `attention_mask`, not `token_type_ids`.
    attention_mask = torch.tensor([[1] * len(tokenized_text)])

    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)

    # Index 2 of the output holds the per-layer hidden states (available
    # because the model was loaded with `output_hidden_states=True`).
    hidden_states = outputs[2]

    # Second-to-last layer, first (and only) item of the batch.
    token_vecs = hidden_states[-2][0]

    # Average over the token dimension to get one vector per utterance.
    return torch.mean(token_vecs, dim=0)

In [None]:
# Add BERT Embeddings from a file to ans_dict
def add_bert_embeddings(filename, ans_dict):
    """Embed every utterance in a dataset file and store the results in ``ans_dict``.

    The file is read as JSON and is expected to be a sequence of records,
    each with an ``'utterances'`` entry that is a sequence of strings.

    Args:
        filename: Path to the JSON dataset file.
        ans_dict: Dictionary updated in place. Keys are the preprocessed
            utterance texts and values are the embeddings returned by
            ``get_utt_embed``. If several utterances preprocess to the same
            key, the last one processed wins.

    Returns:
        None. The result is delivered through ``ans_dict``.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        dataset = json.load(file)

    utterances_list = [
        record['utterances']
        for record in tqdm(dataset, ncols=100, desc='Extracting data')
    ]

    for utterances in tqdm(utterances_list, ncols=100,
                           desc='Generating embeddings'):
        for utt in utterances:
            pp_utt = preprocess_text(utt)
            # NOTE(review): the embedding is computed from the raw utterance
            # while the key is the preprocessed text. Confirm this is
            # intended rather than `get_utt_embed(pp_utt)`.
            ans_dict[pp_utt] = get_utt_embed(utt)

In [None]:
# Add the embeddings for files

# Shared lookup table, filled from the training split first and then the
# validation split. Later entries overwrite earlier ones with the same key.
ans_dict = {}
for dataset_path in (file_train, file_val):
    add_bert_embeddings(dataset_path, ans_dict)

In [None]:
# Save the file

# Serialise the utterance -> embedding table with the newest pickle protocol.
# NOTE(review): the tensors are pickled on whatever device they were created
# on, so loading this file may require that device to be available. Confirm
# this against the downstream consumers.
with open('sent2emb.pickle', 'wb') as out_file:
    pickle.dump(
        ans_dict,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )