In [1]:
# libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import sklearn
from sklearn import cluster
from sklearn.cluster import KMeans

  from tqdm.autonotebook import tqdm, trange


In [2]:
# instantiate the pre-trained sentence-transformer (SBERT) checkpoint by name
sbert_checkpoint = 'all-MiniLM-L6-v2'
model = SentenceTransformer(sbert_checkpoint)



# TILE

## Create tiling function

In [3]:
# create a function to get tiled cosine similarities between aggregated utterances w/ specified gap size
def tiled_cosine_similarity(df, window_size, gap_size):
    """Compare consecutive windows of utterances within each transcript.

    Rows are grouped by 'transcript_id' and ordered by 'turn_id'. For every
    start position in a transcript, window A is the concatenation of
    `window_size` consecutive utterances and window B is the concatenation of
    up to `window_size` utterances beginning `gap_size` turns after A ends.
    Each window's text is embedded with the module-level `model` and the two
    embeddings are compared with `cosine_similarity`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'transcript_id', 'turn_id' and 'utterance'.
    window_size : int
        Number of utterances joined (space-separated) into each window.
    gap_size : int
        Number of utterances skipped between the end of window A and the
        start of window B.

    Returns
    -------
    pandas.DataFrame
        One row per window pair with the columns 'transcript_id',
        'window_size', 'gap_size', 'A_start_turn', 'A_end_turn',
        'A_utterances', 'B_start_turn', 'B_end_turn', 'B_utterances' and
        'cosine_similarity'. Turn numbers are 1-based positions within the
        sorted transcript (not 'turn_id' values). Pairs in which either
        window's text is empty are skipped.

    Notes
    -----
    Window B may hold fewer than `window_size` utterances near the end of a
    transcript; 'B_end_turn' then reports the transcript's last turn.
    Progress is printed per transcript, and the head of the result is printed
    before returning.
    """
    # create empty list to store output
    results = []
    # the transcript count is loop-invariant, so compute it once for the progress message
    n_transcripts = df['transcript_id'].nunique()
    # iterate over each group of transcript_id
    for idx, (transcript_id, group) in enumerate(df.groupby('transcript_id'), start = 1):
        # sort by turn_id to maintain conversation order
        group = group.sort_values(by = "turn_id")
        # utterances of THIS transcript only; windows must never be cut from the full frame,
        # otherwise every transcript would be scored on rows belonging to other transcripts
        utterances = group['utterance']
        n_turns = len(utterances)
        print(f"Processing transcript {idx} out of {n_transcripts}: {transcript_id}")
        # slide over the transcript to get similarity between consecutive windows
        # (the range is empty when the transcript has no more turns than window_size)
        for i in range(n_turns - window_size):
            # get start row and end row of window A
            A_start = i # inclusive
            A_end = A_start + window_size # exclusive
            window_A = utterances.iloc[A_start:A_end].str.cat(sep = " ")
            # get start row and end row of window B
            B_start = A_end + gap_size # inclusive
            B_end = B_start + window_size # exclusive; may run past the transcript end
            window_B = utterances.iloc[B_start:B_end].str.cat(sep = " ")
            # ensure that both windows are not empty
            if len(window_A) > 0 and len(window_B) > 0:
                # get embeddings for both windows and make sure they are 2D (1, dim) arrays
                embeddings_A = model.encode(window_A).reshape(1, -1)
                embeddings_B = model.encode(window_B).reshape(1, -1)
                # alert me if embeddings have different shapes
                assert embeddings_A.shape[1] == embeddings_B.shape[1], \
                    f"Dimensionality mismatch: A={embeddings_A.shape[1]}, B={embeddings_B.shape[1]}"
                # get cosine similarity between utterances in windows A and B
                similarity = cosine_similarity(embeddings_A, embeddings_B)
                # save output
                output = {
                    'transcript_id': transcript_id,
                    'window_size': window_size,
                    'gap_size': gap_size,
                    'A_start_turn': A_start + 1, # convert 0-based position to 1-based turn number
                    'A_end_turn': A_end,
                    'A_utterances': window_A,
                    'B_start_turn': B_start + 1, # convert 0-based position to 1-based turn number
                    'B_end_turn': min(B_end, n_turns), # a truncated final window ends at the last real turn
                    'B_utterances': window_B,
                    'cosine_similarity': similarity[0][0]
                }
                results.append(output)
    # convert results list to data frame and return
    results_df = pd.DataFrame(results)
    print(results_df.head())
    return results_df

## Load data

In [4]:
# location of the annotated-conversation CSV to be tiled
tiling_input_csv = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/backbiter_subset_for_tiling.csv"

# read the file and wrap the result so downstream code is guaranteed a pandas dataframe
df = pd.DataFrame(pd.read_csv(tiling_input_csv))
# show the first rows
df.head()

Unnamed: 0,turn_id,speaker,transcript_id,utterance
0,0,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...
1,1,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"Hello, Can you hear me? I can, how are you doing?"
2,2,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Good. How about you?
3,3,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"I am doing well, yeah,"
4,4,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Sweet. So um I guess we could start off by tal...


## Apply function

### Window = 10, Gap = 0

In [5]:
# tile every transcript with 10-utterance windows and no gap between them
full_tile_10_0 = tiled_cosine_similarity(df, window_size = 10, gap_size = 0)
# write the window-pair similarities to disk (no index column)
tile_10_0_csv = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/annotated_transcripts_tile_10_0.csv"
full_tile_10_0.to_csv(tile_10_0_csv, index = False)
# show the first rows
full_tile_10_0.head()

Processing transcript 1 out of 200: 01849238-f5f0-487e-bca4-7b4fe0c9625c
Processing transcript 2 out of 200: 01a4c01c-cf0e-4f37-ab2b-641bb604af30
Processing transcript 3 out of 200: 04542d1a-168a-4f90-b7e3-33adb675525f
Processing transcript 4 out of 200: 0542c0f0-6c1e-4e54-b3df-afa48b19f9b5
Processing transcript 5 out of 200: 06afb9c1-b367-45c7-a9ff-74c47a2cb61c
Processing transcript 6 out of 200: 06b83c0a-7bf1-4cf9-9e72-034615d97050
Processing transcript 7 out of 200: 07094abd-8b2a-426a-8dca-edead45c2143
Processing transcript 8 out of 200: 0736fa95-99e8-4707-bd64-4552eb79d05a
Processing transcript 9 out of 200: 0a294776-ca94-4114-bc24-2bf3b0a0a813
Processing transcript 10 out of 200: 0b019c01-a6b7-4753-afa3-f7bf964932c9
Processing transcript 11 out of 200: 0bd4dd2c-1cf0-46fb-87d3-a88b1de310a6
Processing transcript 12 out of 200: 0bfe2cba-bf84-4a8b-887d-f19378aa07d8
Processing transcript 13 out of 200: 0e772ef8-9014-48a8-8eb9-762ef55b4fe9
Processing transcript 14 out of 200: 0e8d198c-f

Unnamed: 0,transcript_id,window_size,gap_size,A_start_turn,A_end_turn,A_utterances,B_start_turn,B_end_turn,B_utterances,cosine_similarity
0,01849238-f5f0-487e-bca4-7b4fe0c9625c,10,0,1,10,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...,11,20,"a little bit, yeah. Uh so my name's Trenton. U...",0.357906
1,01849238-f5f0-487e-bca4-7b4fe0c9625c,10,0,2,11,"Hello, Can you hear me? I can, how are you doi...",12,21,Great. and I'm also a student pilot and for fi...,0.379236
2,01849238-f5f0-487e-bca4-7b4fe0c9625c,10,0,3,12,"Good. How about you? I am doing well, yeah, Sw...",13,22,and I'm also a student pilot and for finally t...,0.345225
3,01849238-f5f0-487e-bca4-7b4fe0c9625c,10,0,4,13,"I am doing well, yeah, Sweet. So um I guess we...",14,23,"No, They're pretty much just like a japanese t...",0.365163
4,01849238-f5f0-487e-bca4-7b4fe0c9625c,10,0,5,14,Sweet. So um I guess we could start off by tal...,15,24,They're pretty much just like a japanese toy b...,0.349721


### Window = 15, Gap = 0

In [6]:
# tile every transcript with 15-utterance windows and no gap between them
full_tile_15_0 = tiled_cosine_similarity(df, window_size = 15, gap_size = 0)
# write the window-pair similarities to disk (no index column)
tile_15_0_csv = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/annotated_transcripts_tile_15_0.csv"
full_tile_15_0.to_csv(tile_15_0_csv, index = False)
# show the first rows
full_tile_15_0.head()

Processing transcript 1 out of 200: 01849238-f5f0-487e-bca4-7b4fe0c9625c
Processing transcript 2 out of 200: 01a4c01c-cf0e-4f37-ab2b-641bb604af30
Processing transcript 3 out of 200: 04542d1a-168a-4f90-b7e3-33adb675525f
Processing transcript 4 out of 200: 0542c0f0-6c1e-4e54-b3df-afa48b19f9b5
Processing transcript 5 out of 200: 06afb9c1-b367-45c7-a9ff-74c47a2cb61c
Processing transcript 6 out of 200: 06b83c0a-7bf1-4cf9-9e72-034615d97050
Processing transcript 7 out of 200: 07094abd-8b2a-426a-8dca-edead45c2143
Processing transcript 8 out of 200: 0736fa95-99e8-4707-bd64-4552eb79d05a
Processing transcript 9 out of 200: 0a294776-ca94-4114-bc24-2bf3b0a0a813
Processing transcript 10 out of 200: 0b019c01-a6b7-4753-afa3-f7bf964932c9
Processing transcript 11 out of 200: 0bd4dd2c-1cf0-46fb-87d3-a88b1de310a6
Processing transcript 12 out of 200: 0bfe2cba-bf84-4a8b-887d-f19378aa07d8
Processing transcript 13 out of 200: 0e772ef8-9014-48a8-8eb9-762ef55b4fe9
Processing transcript 14 out of 200: 0e8d198c-f

Unnamed: 0,transcript_id,window_size,gap_size,A_start_turn,A_end_turn,A_utterances,B_start_turn,B_end_turn,B_utterances,cosine_similarity
0,01849238-f5f0-487e-bca4-7b4fe0c9625c,15,0,1,15,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...,16,30,Well I'm 59. My name is Teresa and I live in A...,0.349945
1,01849238-f5f0-487e-bca4-7b4fe0c9625c,15,0,2,16,"Hello, Can you hear me? I can, how are you doi...",17,31,Mhm. far along gotten? I would say about 85% o...,0.221304
2,01849238-f5f0-487e-bca4-7b4fe0c9625c,15,0,3,17,"Good. How about you? I am doing well, yeah, Sw...",18,32,far along gotten? I would say about 85% of the...,0.224902
3,01849238-f5f0-487e-bca4-7b4fe0c9625c,15,0,4,18,"I am doing well, yeah, Sweet. So um I guess we...",19,33,I would say about 85% of the way through getti...,0.225838
4,01849238-f5f0-487e-bca4-7b4fe0c9625c,15,0,5,19,Sweet. So um I guess we could start off by tal...,20,34,"So had they can use solo flight yet? Yes, that...",0.225446


### Window = 20, Gap = 0

In [7]:
# tile every transcript with 20-utterance windows and no gap between them
full_tile_20_0 = tiled_cosine_similarity(df, window_size = 20, gap_size = 0)
# write the window-pair similarities to disk (no index column)
tile_20_0_csv = "/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/annotated_transcripts_tile_20_0.csv"
full_tile_20_0.to_csv(tile_20_0_csv, index = False)
# show the first rows
full_tile_20_0.head()

Processing transcript 1 out of 200: 01849238-f5f0-487e-bca4-7b4fe0c9625c
Processing transcript 2 out of 200: 01a4c01c-cf0e-4f37-ab2b-641bb604af30
Processing transcript 3 out of 200: 04542d1a-168a-4f90-b7e3-33adb675525f
Processing transcript 4 out of 200: 0542c0f0-6c1e-4e54-b3df-afa48b19f9b5
Processing transcript 5 out of 200: 06afb9c1-b367-45c7-a9ff-74c47a2cb61c
Processing transcript 6 out of 200: 06b83c0a-7bf1-4cf9-9e72-034615d97050
Processing transcript 7 out of 200: 07094abd-8b2a-426a-8dca-edead45c2143
Processing transcript 8 out of 200: 0736fa95-99e8-4707-bd64-4552eb79d05a
Processing transcript 9 out of 200: 0a294776-ca94-4114-bc24-2bf3b0a0a813
Processing transcript 10 out of 200: 0b019c01-a6b7-4753-afa3-f7bf964932c9
Processing transcript 11 out of 200: 0bd4dd2c-1cf0-46fb-87d3-a88b1de310a6
Processing transcript 12 out of 200: 0bfe2cba-bf84-4a8b-887d-f19378aa07d8
Processing transcript 13 out of 200: 0e772ef8-9014-48a8-8eb9-762ef55b4fe9
Processing transcript 14 out of 200: 0e8d198c-f

Unnamed: 0,transcript_id,window_size,gap_size,A_start_turn,A_end_turn,A_utterances,B_start_turn,B_end_turn,B_utterances,cosine_similarity
0,01849238-f5f0-487e-bca4-7b4fe0c9625c,20,0,1,20,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...,21,40,"Yes, that was actually pretty fun. Whoa. Very ...",0.296876
1,01849238-f5f0-487e-bca4-7b4fe0c9625c,20,0,2,21,"Hello, Can you hear me? I can, how are you doi...",22,41,Whoa. Very or just fun. Actually my very first...,0.243843
2,01849238-f5f0-487e-bca4-7b4fe0c9625c,20,0,3,22,"Good. How about you? I am doing well, yeah, Sw...",23,42,Actually my very first solo flight um it went ...,0.201845
3,01849238-f5f0-487e-bca4-7b4fe0c9625c,20,0,4,23,"I am doing well, yeah, Sweet. So um I guess we...",24,43,Well you know you learn That something. was fu...,0.3724
4,01849238-f5f0-487e-bca4-7b4fe0c9625c,20,0,5,24,Sweet. So um I guess we could start off by tal...,25,44,That something. was fun. So what about that? I...,0.324103
