## Get Tiled Embeddings
This script gets BERT embeddings on varying tiled windows of utterances of the CANDOR data set.

**Author:** Helen Schmidt  
**Python version:** 3.9.18

In [23]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# define data input location
input_dir = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/transcripts/raw"
# define data output location
output_dir = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/full-sample"

In [25]:
# load pre-trained sentence transformer model (aka SBERT)
model = SentenceTransformer('all-MiniLM-L6-v2')



In [26]:
# load all modified transcripts and combine into one data frame
all_dfs = []
for dirpath, dirnames, filenames in os.walk(input_dir):
    for filename in filenames:
        if filename == 'transcript_backbiter_transformed_noLine1.csv':
            file_path = os.path.join(dirpath, filename)
            df = pd.read_csv(file_path)
            relative_path = os.path.relpath(dirpath, input_dir)
            transcript_id = relative_path.split(os.sep)[0] if relative_path else ''
            # add new variable for transcript ID from folder name
            df['transcript_id'] = transcript_id
            all_dfs.append(df)

# Concatenate all dataframes by rows (like row bind)
df = pd.concat(all_dfs, ignore_index=True)

In [27]:
# only need a few variables from df for tiling
# turn_id, speaker, transcript_id, utterance
selected_df = df[['turn_id', 'speaker', 'utterance', 'transcript_id']]

# also create a test subset to make sure tiling function works 
test_df = selected_df.head(10).copy()

# preview subset
selected_df.head()

Unnamed: 0,turn_id,speaker,utterance,transcript_id
0,0,5c8be0dd542fbd0016924f5f,"No, I'm good. How are you?",3a51d3fd-a343-4177-b48e-a694a14c5891
1,1,5f2f6bd965964e35e20c7445,"No. Hi, how are you?",3a51d3fd-a343-4177-b48e-a694a14c5891
2,2,5f2f6bd965964e35e20c7445,Good thanks.,3a51d3fd-a343-4177-b48e-a694a14c5891
3,3,5c8be0dd542fbd0016924f5f,Hm My name is played to me,3a51d3fd-a343-4177-b48e-a694a14c5891
4,4,5f2f6bd965964e35e20c7445,"Hi, my name's Amanda.",3a51d3fd-a343-4177-b48e-a694a14c5891


In [28]:
# define euclidean distance function
def euclidean_dist(vec1, vec2):
    return np.linalg.norm(vec1 - vec2)

# create a function to get tiled cosine similarities between aggregated utterances w/ specified gap size
def tiled_cosine_similarity(df, window_size, gap_size):
    # create empty list to store output
    results = []
    # iterate over each group of transcript_id
    for idx, (transcript_id, group) in enumerate(df.groupby('transcript_id'), start = 1):
        # sort by turn_id to maintain conversation order
        group = group.sort_values(by = "turn_id")
        # extract all utterances for this transcript
        utterances = group['utterance'].tolist()
        print(f"Processing transcript {idx} out of {df['transcript_id'].nunique()}: {transcript_id}")
        #print(f"Length utterances = {len(utterances)} for transcript {transcript_id}")
        # now move to sliding window approach to get similarity between consecutive windows
        for i in range(len(utterances) - window_size):
            #i = i + 2
            # get start row and end row of window A
            A_start = i # inclusive
            A_end = A_start + window_size # exclusive
            window_A = df['utterance'].iloc[A_start:A_end].str.cat(sep = " ")
            # get start row and end row of window B 
            B_start = i + window_size + gap_size
            B_end = B_start + window_size
            window_B = df['utterance'].iloc[B_start:B_end].str.cat(sep = " ")
            # ensure that both windows are not empty
            if len(window_A) > 0 and len(window_B) > 0:
                # get embeddings for both windows
                embeddings_A = model.encode(window_A)
                embeddings_B = model.encode(window_B)
                # ensure embeddings are 2D arrays
                embeddings_A = embeddings_A.reshape(1, -1)
                embeddings_B = embeddings_B.reshape(1, -1)
                # alert me if embeddings have different shapes
                assert embeddings_A.shape[1] == embeddings_B.shape[1], \
                    f"Dimensionality mismatch: A={embeddings_A.shape[1]}, B={embeddings_B.shape[1]}"
                # get cosine similarity between utterances in windows A and B
                similarity = cosine_similarity(embeddings_A, embeddings_B)
                # save output
                output = {
                    'transcript_id': transcript_id,
                    'window_size': window_size,
                    'gap_size': gap_size,
                    'A_start_turn': A_start + 1, # account for 0-bounding python
                    'A_end_turn': A_end,
                    'A_utterances': window_A,
                    'A_embeddings': embeddings_A,
                    'B_start_turn': B_start + 1, # account for 0-bounding python
                    'B_end_turn': B_end,
                    'B_utterances': window_B,
                    'B_embeddings': embeddings_B,
                    'cosine_similarity': similarity[0][0]
                }
                results.append(output)
    
    # convert results list to data frame
    results_df = pd.DataFrame(results)
    print(results_df.head())
    # add euclidean distance rowwise
    results_df['euclidean_distance'] = results_df.apply(lambda row: euclidean_dist(row['A_embeddings'], row['B_embeddings']), axis=1)
    # return
    return results_df

### Window size = 3, Gap = 0

In [None]:
# apply tiling function
#tile_3_0 = tiled_cosine_similarity(selected_df, 3, 0)
# save
#tile_3_0.to_pickle(output_dir + "/full_sample_tile_3_0.pkl")
#tile_3_0.to_csv(output_dir + "/full_sample_tile_3_0.csv", index = False)
# preview
#tile_3_0.head()

Processing transcript 1 out of 1656: 0020a0c5-1658-4747-99c1-2839e736b481
Processing transcript 2 out of 1656: 002d68da-7738-4177-89d9-d72ae803e0e4
Processing transcript 3 out of 1656: 00411458-8275-4b92-a000-d52187f03604
Processing transcript 4 out of 1656: 00ae2f18-9599-4df6-8e3a-6936c86b97f0
Processing transcript 5 out of 1656: 00b410f7-8b5f-4404-8433-0fb8c4be8f62
Processing transcript 6 out of 1656: 00deb2e5-cf7f-4a5c-a8db-7fc335634ad6
Processing transcript 7 out of 1656: 010a1b2a-7b69-4245-807f-d94eb6aa165c
Processing transcript 8 out of 1656: 012dd705-ee62-49f6-8016-cf2e3cc066e4
Processing transcript 9 out of 1656: 014a3227-9b4c-4f23-ad5b-8325235301ed
Processing transcript 10 out of 1656: 015301b4-a132-4877-8bec-557e9b99a01c
Processing transcript 11 out of 1656: 0159f2d4-f6db-46f8-b7c6-96a3df97b246
Processing transcript 12 out of 1656: 01849238-f5f0-487e-bca4-7b4fe0c9625c
Processing transcript 13 out of 1656: 01a4c01c-cf0e-4f37-ab2b-641bb604af30
Processing transcript 14 out of 16

Unnamed: 0,transcript_id,window_size,gap_size,A_start_turn,A_end_turn,A_utterances,A_embeddings,B_start_turn,B_end_turn,B_utterances,B_embeddings,cosine_similarity,euclidean_distance
0,0020a0c5-1658-4747-99c1-2839e736b481,3,0,1,3,"No, I'm good. How are you? No. Hi, how are you...","[[0.012172122, -0.011064016, 0.06654317, 0.000...",4,6,"Hm My name is played to me Hi, my name's Amand...","[[-0.03531422, -0.04561006, 0.02838244, -0.076...",0.315577,1.169977
1,0020a0c5-1658-4747-99c1-2839e736b481,3,0,2,4,"No. Hi, how are you? Good thanks. Hm My name i...","[[-0.0106563475, -0.06792329, 0.064656146, -0....",5,7,"Hi, my name's Amanda. and the next time you Am...","[[-0.0010068159, -0.09045196, -0.0003150229, -...",0.387075,1.107181
2,0020a0c5-1658-4747-99c1-2839e736b481,3,0,3,5,"Good thanks. Hm My name is played to me Hi, my...","[[-0.038483433, -0.0698858, 0.0062682955, -0.0...",6,8,and the next time you Amanda. All right. Does ...,"[[0.044995096, -0.058201604, -0.028326219, -0....",0.442343,1.056084
3,0020a0c5-1658-4747-99c1-2839e736b481,3,0,4,6,"Hm My name is played to me Hi, my name's Amand...","[[-0.03531422, -0.04561006, 0.02838244, -0.076...",7,9,"All right. Does that time start? Yeah, three o...","[[0.043147173, -0.042117808, -0.027217342, -0....",0.176185,1.2836
4,0020a0c5-1658-4747-99c1-2839e736b481,3,0,5,7,"Hi, my name's Amanda. and the next time you Am...","[[-0.0010068159, -0.09045196, -0.0003150229, -...",8,10,"right. At three. Right, What's that? Don't go ...","[[0.043233756, -0.056193333, -0.049202096, -0....",0.40875,1.087428


### Window size = 10, Gap = 0

In [None]:
# apply tiling function
tile_10_0 = tiled_cosine_similarity(selected_df, 10, 0)
# save
tile_10_0.to_pickle(output_dir + "/full_sample_tile_10_0.pkl")
tile_10_0.to_csv(output_dir + "/full_sample_tile_10_0.csv", index = False)
# preview
tile_10_0.head()