# Computes BERT embeddings 

This notebook reads in the data (the original data plus the GPT-generated data) and generates embeddings using different pooling methods. The embeddings are stored in a pandas DataFrame that is saved as a pkl file. The name of the pkl file depends on the settings (with or without preprocessing, the model used, ...). Additionally, embeddings are generated with sBERT. The pkl file is loaded by the later notebooks.

## Load data

In [None]:
# Parameter setting (the chosen values become part of the name of the saved pkl-file)

# model: Hugging Face checkpoint used to compute the embeddings
checkpoint = "bert-base-uncased"
# checkpoint = "jlsalim/bert-uncased-idiomatic-literal-recognizer"

# False: embed the full sentences
# True:  embed the preprocessed text (filtered to content words of specific POS)
preprocessed = False
# preprocessed = True

# True: leave the CLS and SEP tokens out of the mean pooling
# (not used in final model as no improvement)
remove_CLS_SEP = False
# remove_CLS_SEP = True


In [63]:
# read tsv file
import csv
import sys
import ast

import pandas as pd
dataDirectory = "./data/"

# read in competition data (tab-separated files, one per split)
dataA_train = pd.read_csv(dataDirectory + "subtask_a_train.tsv", sep='\t')
# 'expected_order' is read as text; parse the Python literal it contains (train split only)
dataA_train['expected_order'] = dataA_train['expected_order'].apply(ast.literal_eval)
dataA_dev = pd.read_csv(dataDirectory + "subtask_a_dev.tsv", sep='\t')
dataA_test = pd.read_csv(dataDirectory + "subtask_a_test.tsv", sep='\t')
dataA_xe = pd.read_csv(dataDirectory + "subtask_a_xe.tsv", sep='\t')

# stack all splits and renumber the rows 0..n-1
dataA = pd.concat([dataA_train, dataA_dev, dataA_test, dataA_xe])
dataA = dataA.reset_index(drop=True)

# read in chatGPT data from csv
data_chatGPT_train = pd.read_csv(dataDirectory + "chatGPTNew_train.csv")
data_chatGPT_dev = pd.read_csv(dataDirectory + "chatGPTNew_dev.csv")
data_chatGPT_test = pd.read_csv(dataDirectory + "chatGPTNew_test.csv")
data_chatGPT = pd.concat([data_chatGPT_train, data_chatGPT_dev, data_chatGPT_test])
data_chatGPT = data_chatGPT.reset_index(drop=True)

# rename each column with "gpt_" in front of the column name
data_chatGPT = data_chatGPT.rename(columns=lambda name: 'gpt_' + name)

# inserting the missing compound column: the compound is the text in front of the
# first " is" of the idiomatic meaning, stripped and lowercased.
# The whole column is assigned at once; element-wise chained assignment
# (data_chatGPT["compound"][i] = ...) is not guaranteed to write into the frame.
data_chatGPT["compound"] = data_chatGPT["gpt_idiomatic_meaning"].apply(
    lambda meaning: meaning.split(" is")[0].strip().lower()
)

# read in gpt image description data
data_gpt_image = pd.read_csv(dataDirectory + "gpt_image_descriptions_all.csv", sep=',')

# merge data into one dataframe (pd.merge default: inner join on the key 'compound')
dataA = pd.merge(dataA, data_chatGPT, on='compound')
dataA = pd.merge(dataA, data_gpt_image, on='compound')



In [64]:

# Sanity check of the merge: the three frames share the key column 'compound',
# which appears only once in the merged frame, hence the "- 2".
expected_column_count = dataA_train.shape[1] + data_chatGPT.shape[1] + data_gpt_image.shape[1] - 2
if dataA.shape[1] != expected_column_count:
    print("There is a problem with the merged file.")


In [65]:
# cutting gpt-meaning data
def cut_gpt_meaning(sent):
    """Remove the introductory part of a GPT-generated meaning description.

    If ``sent`` contains "literal", everything up to and including the first
    occurrence of "literal" is removed. Otherwise, if it contains
    "metaphor for", everything up to and including the first occurrence of
    that phrase is removed. If neither marker occurs, the text is kept.

    Only the first occurrence of the marker is used as cut point, so text
    following a repeated marker is preserved.

    Args:
        sent: the meaning description (str).

    Returns:
        The remaining text with leading/trailing whitespace stripped.
    """
    # "literal" takes precedence over "metaphor for" (same order as an if/elif chain)
    for marker in ("literal", "metaphor for"):
        _, found, remainder = sent.partition(marker)
        if found:
            return remainder.strip()
    return sent.strip()

# add a shortened ("cutted") version of both GPT meaning columns
for meaning_column in ('gpt_idiomatic_meaning', 'gpt_literal_meaning'):
    dataA[meaning_column + '_cutted'] = dataA[meaning_column].apply(cut_gpt_meaning)

In [66]:
# all columns that hold sentence-like text; embeddings are computed for each of them
sentence_type_columns = [
    'sentence',
    *[f'image{number}_caption' for number in range(1, 6)],
    'gpt_idiomatic_meaning',
    'gpt_literal_meaning',
    'gpt_idiomatic_meaning_cutted',
    'gpt_literal_meaning_cutted',
    'gpt_idiomatic_sentence',
    'gpt_literal_sentence',
    'gpt_idiomatic_image',
    'gpt_literal_image',
]

# columns in which the compound itself is looked up (for the compound embeddings)
sentence_with_compound_columns = [
    'sentence',
    'gpt_idiomatic_meaning',
    'gpt_literal_meaning',
    'gpt_idiomatic_sentence',
    'gpt_literal_sentence',
]


# cleanup data:
# the typographic apostrophe ’ is replaced by the plain ' in every text column
for text_column in sentence_type_columns:
    straightened = dataA[text_column].str.replace("’", "'")
    dataA[text_column] = straightened

In [67]:
# preprocessing of text 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

def prepare_text(raw_text):
    """Reduce a text to its lemmatized content words.

    Processing steps:
      1. lowercase the text and delete every character that is neither an
         ASCII letter nor whitespace,
      2. tokenize with ``word_tokenize``,
      3. POS-tag with the universal tagset and keep only nouns, verbs and
         adjectives,
      4. drop English stopwords,
      5. lemmatize each remaining token (``lemmatize`` is called without a
         POS argument).

    Args:
        raw_text: the text to preprocess (str).

    Returns:
        The remaining lemmas joined by single spaces (str).
    """
    # normalization: lowercase, letters and whitespace only
    letters_only = re.sub(r"[^a-zA-Z\s]", "", raw_text.lower())

    # tokenization and POS tagging
    tagged_tokens = nltk.pos_tag(word_tokenize(letters_only), tagset='universal')
    content_pos = {"NOUN", "VERB", "ADJ"}

    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, pos in tagged_tokens:
        if pos not in content_pos:
            continue
        if token.lower() in english_stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)


In [68]:

# With preprocessing switched on, the compound and every text column are
# replaced by their preprocessed form (see prepare_text).
if preprocessed == True:
    for column in ["compound"] + sentence_type_columns:
        dataA[column] = dataA[column].apply(prepare_text)
 

**Data analysis**


In [69]:
# number of rows per value of the 'subset' column
dataA["subset"].value_counts()

Extended Evaluation    100
Train                   60
Dev                     15
Test                    15
Sample                  10
Name: subset, dtype: int64

In [70]:
subset_names = ["Sample", "Train", "Dev", "Test", "Extended Evaluation"]


def _sentence_type_counts(subset_name):
    """Return (number of rows, idiomatic rows, literal rows) for one subset of dataA."""
    subset_types = dataA.loc[dataA["subset"] == subset_name, "sentence_type"]
    idiomatic = subset_types[subset_types == "idiomatic"].count()
    literal = subset_types[subset_types == "literal"].count()
    return len(subset_types), idiomatic, literal


# first pass: share of literal / idiomatic sentences per subset
for sub in subset_names:
    total, number_idiomatic, number_literal = _sentence_type_counts(sub)
    print(sub)
    print("number", total)
    print("ratio literal", round(number_literal / total, 2))
    print("ratio idiomatic", round(number_idiomatic / total, 2))
    print("===")

# second pass: absolute numbers per subset
for sub in subset_names:
    total, number_idiomatic, number_literal = _sentence_type_counts(sub)
    print(sub)
    print("number", total)
    print("idiomatic", number_idiomatic)
    print("literal", number_literal)
    print("===")






Sample
number 10
ratio literal 0.5
ratio idiomatic 0.5
===
Train
number 60
ratio literal 0.43
ratio idiomatic 0.57
===
Dev
number 15
ratio literal 0.53
ratio idiomatic 0.47
===
Test
number 15
ratio literal 0.47
ratio idiomatic 0.53
===
Extended Evaluation
number 100
ratio literal 0.54
ratio idiomatic 0.46
===
Sample
number 10
idiomatic 5
literal 5
===
Train
number 60
idiomatic 34
literal 26
===
Dev
number 15
idiomatic 7
literal 8
===
Test
number 15
idiomatic 8
literal 7
===
Extended Evaluation
number 100
idiomatic 46
literal 54
===


## Compute model embeddings (BERT-mode)



Idea: we use a pretrained BERT model to generate embeddings of sentences and of the compound in the context of the sentence.

In [71]:
from transformers import AutoTokenizer
import torch
from transformers import AutoModel

# The checkpoint is a model from https://huggingface.co/models
# Use the GPU if one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# output_hidden_states=True: the model output also contains the hidden states of all layers;
# eval() puts the model into evaluation mode.
model = AutoModel.from_pretrained(checkpoint, output_hidden_states=True).to(device).eval()

def tokenize(batch):
    """Tokenize a list of texts with the notebook-level ``tokenizer``.

    Padding and truncation are both switched on.

    Args:
        batch: list of texts (str).

    Returns:
        The tokenizer output; its "input_ids" and "attention_mask" entries
        are used by the cells below.
    """
    encoded_batch = tokenizer(batch, truncation=True, padding=True)
    return encoded_batch



### Computing sentence-based embeddings

In [72]:
import numpy as np

def get_attention_tokens(attention_mask_item):
    """Return a copy of one sentence's attention mask for pooling.

    If the notebook setting ``remove_CLS_SEP`` is True, the first and the
    last position with value 1 (the [CLS] and [SEP] tokens) are set to 0 in
    the copy, so that they are left out of the pooling. Otherwise the copy
    equals the input.

    Args:
        attention_mask_item: 1-D attention mask tensor of a single sentence.

    Returns:
        A new tensor; the input tensor is not modified.
    """
    pooling_mask = attention_mask_item.clone()
    if remove_CLS_SEP == True:
        attended_positions = (attention_mask_item == 1).nonzero(as_tuple=True)[0]
        first_attended = attended_positions[0]   # first 1: [CLS]
        last_attended = attended_positions[-1]   # last 1: [SEP]
        pooling_mask[first_attended] = 0
        pooling_mask[last_attended] = 0
    return pooling_mask


# different pooling methods for embeddings are computed

def get_sentence_embedding(hidden_states,method,attention_mask):
    """Pool the token vectors of every sentence of a batch into one vector.

    Supported values of ``method``:
      * 'meanLast4'        - token vectors averaged over the last 4 layers, then averaged over the tokens
      * 'meanSecondToLast' - average over the tokens of the second to last layer
      * 'meanLast'         - average over the tokens of the last layer
      * 'meanFirst'        - average over the tokens of the first layer (index 0)
      * 'lastCLS'          - vector at token position 0 (CLS) of the last layer
      * 'firstCLS'         - vector at token position 0 (CLS) of the first layer (index 0)

    For the mean methods only the positions selected by
    ``get_attention_tokens`` are averaged.

    Args:
        hidden_states: sequence of per-layer tensors, each indexed as [sentence][token][unit].
        method: name of the pooling method (see above).
        attention_mask: attention masks, indexed as [sentence][token].

    Returns:
        For the mean methods a list with one tensor per sentence, for the CLS
        methods a single tensor with one row per sentence, and an empty list
        for an unknown method.
    """
    # CLS methods: slice position 0 of the chosen layer for the whole batch
    cls_layer_of = {'lastCLS': -1, 'firstCLS': 0}
    if method in cls_layer_of:
        return hidden_states[cls_layer_of[method]][:, 0, :]

    # mean methods: choose how the token vectors of sentence i are obtained
    single_layer_of = {'meanSecondToLast': -2, 'meanLast': -1, 'meanFirst': 0}
    if method == 'meanLast4':
        def token_vectors(i):
            last_four = torch.stack([hidden_states[-1][i], hidden_states[-2][i],
                                     hidden_states[-3][i], hidden_states[-4][i]], dim=0)
            return torch.mean(last_four, dim=0)
    elif method in single_layer_of:
        chosen_layer = hidden_states[single_layer_of[method]]

        def token_vectors(i):
            return chosen_layer[i]
    else:
        return []

    pooled = []
    for i in range(len(hidden_states[0])):
        vectors = token_vectors(i)
        keep = get_attention_tokens(attention_mask[i]).bool()
        pooled.append(torch.mean(vectors[keep], dim=0))
    return pooled

In [73]:

methods = ['meanSecondToLast','meanLast4','meanLast','meanFirst','firstCLS','lastCLS']

# For every text column: run the model once on all rows of the column and
# store one embedding column per pooling method.
for column in sentence_type_columns:
    print(column)
    encoded_column = tokenize(dataA[column].tolist())

    # token ids and attention masks as tensors on the model's device
    input_ids_sentence = torch.tensor(encoded_column["input_ids"]).to(device)
    attention_mask_sentence = torch.tensor(encoded_column["attention_mask"]).to(device)

    # forward pass without gradient tracking; only the hidden states are needed
    with torch.no_grad():
        hidden_states_sentence = model(input_ids_sentence, attention_mask_sentence).hidden_states

    # one new column '<column>_embedding_<method>' per pooling method,
    # each cell holding the embedding as a plain list of floats
    for method in methods:
        pooled_vectors = get_sentence_embedding(hidden_states_sentence, method, attention_mask_sentence)
        dataA[column + '_embedding_' + method] = [vector.cpu().numpy().tolist() for vector in pooled_vectors]

sentence
image1_caption
image2_caption
image3_caption
image4_caption
image5_caption
gpt_idiomatic_meaning
gpt_literal_meaning
gpt_idiomatic_meaning_cutted
gpt_literal_meaning_cutted
gpt_idiomatic_sentence
gpt_literal_sentence
gpt_idiomatic_image
gpt_literal_image


In [74]:
# Dimensions of the hidden states of the column processed last in the cell above.
# Each layer tensor is indexed as [sentence][token][hidden unit]; the value
# labelled "batches" is the size of the first axis.
number_of_layers = len(hidden_states_sentence)
number_of_batches, number_of_tokens, number_of_hidden_units = hidden_states_sentence[0].shape
print ("Number of layers:", number_of_layers, "  (including initial embeddings)")
print ("Number of batches:", number_of_batches)
print ("Number of tokens:", number_of_tokens)
print ("Number of hidden units:", number_of_hidden_units)

Number of layers: 13   (including initial embeddings)
Number of batches: 200
Number of tokens: 20
Number of hidden units: 768


### Computing compound-based embeddings

Sometimes the compound occurs in the sentence only in plural form. So both forms are needed.

In [75]:
##!pip install inflect

In [76]:
# sometimes the compound occurs in plural form in the sentence

# add a new column to dataA with the plural form of the compound 


from re import *
import inflect

engine = inflect.engine()

# Plural form of every compound, computed with inflect.
# The column is assigned in one step; element-wise chained assignment
# (dataA["compound_plural"][i] = ...) is not guaranteed to write into the frame.
dataA["compound_plural"] = dataA["compound"].apply(engine.plural)

# tokenize all compounds (original and plural)
dataA_compound_tokenized = tokenize(dataA["compound"].tolist())
dataA_compound_plural_tokenized = tokenize(dataA["compound_plural"].tolist())

In [77]:
def _compound_content_tokens(token_ids):
    """Drop padding ids (0) and then the first and last id ([CLS] and [SEP])."""
    return [token_id for token_id in token_ids if token_id != 0][1:-1]


def _occurrence_positions(needle, haystack):
    """Return the set of positions in haystack covered by any occurrence of needle."""
    covered = set()
    width = len(needle)
    if width == 0:
        # an empty needle covers no position
        return covered
    for start in range(len(haystack) - width + 1):
        if haystack[start:start + width] == needle:
            covered.update(range(start, start + width))
    return covered


# returns the indices of the compound tokens in the sentence
def get_idx(compound_tokens, compound_plural_tokens, sentence_tokens):
    """Find the token positions of a compound within a tokenized sentence.

    Both the singular and the plural form are searched, and every occurrence
    (not only the first one) contributes its positions.

    Args:
        compound_tokens: token ids of the compound, including [CLS]/[SEP] and
            possibly padding ids (0).
        compound_plural_tokens: token ids of the plural form, same layout.
        sentence_tokens: token ids of the sentence (list).

    Returns:
        Sorted list of the distinct positions in ``sentence_tokens`` that
        belong to an occurrence of either form; empty if none is found.
    """
    # NOTE: 0 is treated as the padding id — confirm for tokenizers other than BERT
    singular = _compound_content_tokens(compound_tokens)
    plural = _compound_content_tokens(compound_plural_tokens)
    covered = _occurrence_positions(singular, sentence_tokens)
    covered |= _occurrence_positions(plural, sentence_tokens)
    # sorted: deterministic order, independent of set iteration order
    return sorted(covered)



In [78]:
# returns the embeddings of the tokens in idxList. 
# The embeddings are combined to a single embedding by different averaging methods
import numpy as np
def get_idxList_embedding(hidden_states,idxLists,method):
    """Average the vectors of selected tokens into one embedding per sentence.

    Supported values of ``method``:
      * 'meanLast4'        - token vectors averaged over the last 4 layers
      * 'meanSecondToLast' - token vectors of the second to last layer
      * 'meanLast'         - token vectors of the last layer
      * 'meanFirst'        - token vectors of the first layer (index 0)
    In every case the selected token vectors are then averaged over the tokens.

    Args:
        hidden_states: sequence of per-layer tensors, each indexed as [sentence][token][unit].
        idxLists: for every sentence i, ``idxLists[i]`` is the list of token positions to use.
        method: name of the averaging method (see above).

    Returns:
        List with one embedding (list of floats) per sentence; an empty list
        for an unknown method.
    """
    single_layer_of = {'meanSecondToLast': -2, 'meanLast': -1, 'meanFirst': 0}
    if method != 'meanLast4' and method not in single_layer_of:
        return []

    embeddings = []
    for i in range(len(hidden_states[0])):
        positions = idxLists[i]
        if method == 'meanLast4':
            # average the selected token vectors over the last 4 layers first
            last_four = torch.stack([hidden_states[-1][i][positions], hidden_states[-2][i][positions],
                                     hidden_states[-3][i][positions], hidden_states[-4][i][positions]], dim=0)
            selected = torch.mean(last_four, dim=0)
        else:
            selected = hidden_states[single_layer_of[method]][i][positions]
        embeddings.append(torch.mean(selected, dim=0).tolist())
    return embeddings

In [79]:
# Compound embeddings: embedding of the compound tokens in the context of each sentence

compound_methods = ['meanSecondToLast','meanLast4','meanLast','meanFirst']

# token ids of the compounds (singular and plural), needed to locate them in the sentences
compound_tokenized = tokenize(dataA["compound"].tolist())
compound_plural_tokenized = tokenize(dataA["compound_plural"].tolist())

for column in sentence_with_compound_columns:
    # tokenize the column and compute its hidden states
    tokenized = tokenize(dataA[column].tolist())
    input_ids = torch.tensor(tokenized["input_ids"]).to(device)
    attention_mask = torch.tensor(tokenized["attention_mask"]).to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)

    hidden_states = output.hidden_states

    # new column '<column>_compound_idx': token positions of the compound in each sentence
    idx_column = column + "_compound_idx"
    compound_positions = []
    for i in range(len(dataA)):
        compound_positions.append(get_idx(compound_tokenized["input_ids"][i],
                                          compound_plural_tokenized["input_ids"][i],
                                          tokenized["input_ids"][i]))
    dataA[idx_column] = compound_positions

    # one column 'compound_embedding_<column>_<method>' per averaging method
    for method in compound_methods:
        embedding_column = 'compound_embedding_' + column + "_" + method
        dataA[embedding_column] = get_idxList_embedding(hidden_states, dataA[idx_column], method)

    # NOTE(review): presumably the copy consolidates the frame after the many
    # column insertions (pandas fragmentation warning) — confirm
    dataA = dataA.copy()



In [80]:
# List every row in which the compound was not found in the text
# (ideally there should be no empty compound_idx).
# Printed per hit: column name, compound, text, row number.
for column in sentence_with_compound_columns:
    for i, positions in enumerate(dataA[column + "_compound_idx"]):
        if len(positions) == 0:
            print(column)
            print(dataA["compound"][i])
            print(dataA[column][i])
            print(i)

# first three values of one compound embedding
# (if idx = [] then the embedding consists of nan's)
dataA["compound_embedding_gpt_literal_sentence_meanLast"][91][:3]

sentence
white hat
use white ethical search engine optimization technique lead long term success many seo firm use tactic risk harming site
58
sentence
pig ear
wrote new partition table made right pig
152
sentence
pea pod
spearfishermen shape size pod gun optimize design purpose
183
gpt_literal_sentence
loan shark
aquarium exhibit featured model shark alongside fact marine life
100
gpt_literal_sentence
loan shark
aquarium exhibit featured model shark alongside fact marine life
101


[-0.012831419706344604, -1.065843105316162, 0.3485209345817566]

In [81]:
# replace nan-embeddings (due to missing compound) by corresponding sentence embedding:
# a row with an empty '<column>_compound_idx' gets the sentence embedding of the same
# column and pooling method; all other rows keep their compound embedding.
# Each column is rebuilt and assigned as a whole; element-wise chained assignment
# (dataA[col][i] = ...) is not guaranteed to write into the frame.
for column in sentence_with_compound_columns:
    compound_missing = [len(positions) == 0 for positions in dataA[column + "_compound_idx"]]
    for method in compound_methods:
        compound_column = 'compound_embedding_' + column + "_" + method
        sentence_column = column + "_embedding_" + method
        dataA[compound_column] = [
            sentence_embedding if missing else compound_embedding
            for missing, compound_embedding, sentence_embedding
            in zip(compound_missing, dataA[compound_column], dataA[sentence_column])
        ]


### Generate sBERT embeddings

In [82]:
# pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
# Load the sBERT model.
# NOTE(review): this rebinds the name `model`, which until here refers to the model loaded
# with AutoModel; cells above that call `model(...)` must not be re-run after this cell
# without first re-running the model loading cell.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [83]:
# SBert embeddings are generated for all sentence-like columns:
# one new column '<column>_embedding_sbert' per text column, each text encoded separately.
# (The loop variable is deliberately not called `type`, which would shadow the
# builtin for all code executed afterwards.)
for text_column in sentence_type_columns:
    dataA[text_column + "_embedding_sbert"] = dataA[text_column].apply(lambda text: model.encode(text))

In [84]:
# stores the names of all columns of dataA in 'column.txt' (one name per line) for later reference
with open('column.txt', 'w') as column_file:
    column_file.writelines(f"{column_name}\n" for column_name in dataA.columns)

In [85]:
#save dataA pickle
import pickle
# The file name encodes the settings:
# dataA[_preprocessed]_<checkpoint>[_without_CLS_SEP].pkl
prep = "_preprocessed_" if preprocessed == True else "_"
cls_sep = "_without_CLS_SEP" if remove_CLS_SEP == True else ""

# slashes and backslashes of the checkpoint name are replaced so that the
# name can be used as part of a file name
checkpoint_write = checkpoint.replace("/", "_").replace("\\", "_")

pickle_name = "dataA" + prep + checkpoint_write + cls_sep + ".pkl"
dataA.to_pickle(pickle_name)
