### Code to compute the Voyage-AI embeddings of utterances
- Before running, update the dataset file paths and set your Voyage AI API key (API_KEY)

In [None]:
# Importing Libraries

import string
import re
import json
import pickle
import torch
from tqdm import tqdm

In [None]:
# Training and Validation File Path
# Paths of the dataset JSON files whose utterances will be embedded; update
# them to match your local copies before running.
# NOTE(review): `file_train` points at a "test" file even though its name and
# this cell's title say "training" — confirm this is intentional.

file_train = "MELD_test_efr.json"
file_val = "MELD_val_efr.json"

In [None]:
# Preprocessing Functions

# Lookup table from each decimal digit character to its English word,
# in ascending digit order ("0" -> "zero" ... "9" -> "nine").
numbers = dict(zip(
    "0123456789",
    (
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    ),
))


def remove_puntuations(txt):
    """Strip punctuation from ``txt``.

    The sentence-level marks ``. ! ? : ;`` are each replaced by a single
    space (so adjacent words do not fuse together); every other character
    in ``string.punctuation`` is deleted outright. Whitespace is not
    collapsed here.
    """
    # Start with "delete" for every punctuation character...
    translation = {ord(mark): None for mark in string.punctuation}
    # ...then override the sentence breaks so they become a space instead.
    for mark in ".!?:;":
        translation[ord(mark)] = " "
    return txt.translate(translation)


def number_to_words(txt):
    """Spell out every digit character in ``txt``.

    Each digit is replaced independently by its word from the module-level
    ``numbers`` table followed by one space, so multi-digit numbers are
    spelled digit by digit.
    """
    for digit, word in numbers.items():
        txt = txt.replace(digit, word + " ")
    return txt


def preprocess_text(text):
    """Normalise an utterance into a plain lowercase ASCII string.

    Steps, in order: lowercase, turn underscores into spaces, spell out
    digits, strip punctuation, drop non-ASCII characters, and collapse all
    runs of whitespace into single spaces (also trimming both ends).
    """
    lowered = re.sub(r'_', ' ', text.lower())
    without_punct = remove_puntuations(number_to_words(lowered))
    # Keep only 7-bit ASCII characters.
    ascii_only = ''.join(ch for ch in without_punct if ord(ch) < 128)
    # split() with no arguments discards leading/trailing/repeated whitespace.
    return ' '.join(ascii_only.split())

In [None]:
# Enter the API KEY - https://docs.voyageai.com/install/
# Replace the "[API_KEY]" placeholder below with your own key before running;
# avoid committing a real key to version control.

import voyageai
vo = voyageai.Client("[API_KEY]")  # module-level client used by the embedding functions in this file

In [None]:
# Get the embeddings of the sentences

def get_sens_embed(query_list):
    """Embed a list of texts with Voyage AI and return them as a tensor.

    Sends ``query_list`` through the module-level client ``vo`` using the
    "voyage-lite-02-instruct" model with ``input_type='document'`` and wraps
    the response's ``embeddings`` in ``torch.tensor`` (presumably one row
    per input text, in input order).
    """
    response = vo.embed(
        query_list,
        input_type='document',
        model="voyage-lite-02-instruct",
    )
    return torch.tensor(response.embeddings)

In [None]:
# Add Voyage Embeddings from a file to ans_dict

def _store_batch_embeddings(pp_batch, utt_batch, ans_dict):
    """Embed ``utt_batch`` in one request and store each vector in ``ans_dict`` under the matching key from ``pp_batch``."""
    if not utt_batch:
        return
    embeddings = get_sens_embed(utt_batch)
    for pp_utt, utt_emb in zip(pp_batch, embeddings):
        # Iterating the batch tensor yields row views of it; clone so every
        # stored embedding owns its data instead of referencing the batch.
        ans_dict[pp_utt] = utt_emb.clone()


def add_voy_embeddings(filename, ans_dict, max_utt=100, max_tokens=2000):
    """Embed every utterance of a dataset file and add the results to ``ans_dict``.

    The file is expected to be a JSON list of records that each carry an
    ``'utterances'`` list of strings. Each utterance is embedded in its
    original form, and the embedding is stored under the *preprocessed*
    utterance (see ``preprocess_text``). When several utterances share a
    preprocessed form, the last one in the file wins.

    Args:
        filename: Path of the JSON dataset file.
        ans_dict: Dictionary updated in place with
            ``{preprocessed utterance: embedding tensor}`` entries.
        max_utt: Maximum number of utterances sent in one embedding request.
        max_tokens: Maximum total token count of one embedding request. A
            single utterance that exceeds this on its own is still sent,
            alone in its batch.

    Returns:
        None. ``ans_dict`` is mutated in place.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filename, 'r', encoding='utf-8') as file:
        dataset = json.load(file)

    # Map preprocessed utterance -> original utterance (deduplicates requests).
    query_dict = {}
    for episode in tqdm(dataset, ncols=100, desc='Preprocessing'):
        for utt in episode['utterances']:
            query_dict[preprocess_text(utt)] = utt

    pp_batch = []
    utt_batch = []
    batch_tokens = 0

    for pp_utt, utt in tqdm(query_dict.items()):
        # Count each utterance once and keep a running total rather than
        # re-tokenizing the whole batch on every iteration.
        # NOTE(review): assumes the token count of a list equals the sum of
        # the per-text counts — confirm against the voyageai tokenizer.
        utt_tokens = vo.count_tokens([utt])

        # Flush BEFORE adding an utterance that would push the batch past
        # either limit, so no request exceeds max_utt / max_tokens.
        batch_full = len(utt_batch) >= max_utt
        over_budget = batch_tokens + utt_tokens > max_tokens
        if utt_batch and (batch_full or over_budget):
            _store_batch_embeddings(pp_batch, utt_batch, ans_dict)
            pp_batch = []
            utt_batch = []
            batch_tokens = 0

        pp_batch.append(pp_utt)
        utt_batch.append(utt)
        batch_tokens += utt_tokens

    # Embed whatever is left in the final, partially filled batch.
    _store_batch_embeddings(pp_batch, utt_batch, ans_dict)

In [None]:
# Add the embeddings for files

# Accumulate the embeddings of both dataset files in one shared dictionary
# (first file_train, then file_val).
ans_dict = {}
for dataset_path in (file_train, file_val):
    add_voy_embeddings(dataset_path, ans_dict)

In [None]:
# Cleaning

# Rebuild the dictionary with a cloned copy of every embedding tensor
# (presumably so each entry owns its data rather than being a view into the
# batch tensor it was sliced from — confirm).
ans_dict = {key: emb.clone() for key, emb in ans_dict.items()}

In [None]:
# Save the file

# Serialise the utterance -> embedding dictionary to 'sent2emb.pickle'
# using the newest pickle protocol available.
with open('sent2emb.pickle', 'wb') as out_file:
    pickle.dump(
        ans_dict,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )