## Get Utterance Embeddings
This script gets BERT embeddings on all utterances of the CANDOR data set.

**Author:** Helen Schmidt  
**Python version:** 3.9.18

In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# define data input location
input_dir = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/transcripts/raw"
# define data output location
output_dir = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/full-sample"

In [3]:
# load pre-trained sentence transformer model (aka SBERT)
model = SentenceTransformer('all-MiniLM-L6-v2')



In [4]:
# load all modified transcripts and combine into one data frame

all_dfs = []
for dirpath, dirnames, filenames in os.walk(input_dir):
    for filename in filenames:
        if filename == 'transcript_backbiter_transformed_noLine1.csv':
            file_path = os.path.join(dirpath, filename)
            df = pd.read_csv(file_path)
            relative_path = os.path.relpath(dirpath, input_dir)
            transcript_id = relative_path.split(os.sep)[0] if relative_path else ''
            # add new variable for transcript ID from folder name
            df['transcript_id'] = transcript_id
            all_dfs.append(df)

# Concatenate all dataframes by rows (like row bind)
df = pd.concat(all_dfs, ignore_index=True)

print(f"Combined DataFrame shape: {df.shape}")

Combined DataFrame shape: (431068, 17)


In [5]:
# preview data frame
df.head()

Unnamed: 0,turn_id,speaker,start,stop,utterance,backchannel,backchannel_count,backchannel_speaker,backchannel_start,backchannel_stop,interval,delta,questions,end_question,overlap,n_words,transcript_id
0,0,5c8be0dd542fbd0016924f5f,78.44,83.36,"No, I'm good. How are you?",,0,,,,20.18,4.92,1,True,False,6,3a51d3fd-a343-4177-b48e-a694a14c5891
1,1,5f2f6bd965964e35e20c7445,78.94,81.56,"No. Hi, how are you?",,0,,,,-0.31,2.62,1,True,True,5,3a51d3fd-a343-4177-b48e-a694a14c5891
2,2,5f2f6bd965964e35e20c7445,83.94,85.26,Good thanks.,,0,,,,0.58,1.32,0,False,False,2,3a51d3fd-a343-4177-b48e-a694a14c5891
3,3,5c8be0dd542fbd0016924f5f,85.44,90.06,Hm My name is played to me,Mhm.,1,5f2f6bd965964e35e20c7445,87.74,88.76,0.18,4.62,0,False,False,7,3a51d3fd-a343-4177-b48e-a694a14c5891
4,4,5f2f6bd965964e35e20c7445,89.44,92.16,"Hi, my name's Amanda.",,0,,,,-0.24,2.72,0,False,True,4,3a51d3fd-a343-4177-b48e-a694a14c5891


## Get BERT embeddings

In [9]:
# create copy of data frame to add embeddings
df_embeddings = df
#df_embeddings = df.head(1000).copy() # also create test version

# define embeddings function
def get_embeddings(text):
    if not isinstance(text, str) or pd.isna(text):
        return None # skip getting embedding for null or non-string values (there are 1668 missing in data frame)
    return model.encode(text)

# get embeddings
df_embeddings['embeddings_utterance'] = df_embeddings['utterance'].apply(get_embeddings)

print("Done with utterance embeddings!")

# save as pkl for easy loading later on
df_embeddings.to_pickle(output_dir + "/all_transcripts_with_embeddings.pkl")
# also save as csv for optionality
df_embeddings.to_csv(output_dir + "/all_transcripts_with_embeddings.csv", index=False)

Done with utterance embeddings!
