## Setup

In [None]:
# install packages as needed
! pip install sentence-transformers
! pip install numpy
! pip install pandas
! pip install scikit-learn
! pip install matplotlib

In [1]:
# libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

  from tqdm.autonotebook import tqdm, trange


In [4]:
# load pre-trained sentence transformer model (aka SBERT)
model = SentenceTransformer('all-MiniLM-L6-v2')



## Load data

In [3]:
# load test data
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/test_tsne_2.csv")
# preview
df.head()

Unnamed: 0,turn_id,speaker,transcript_id,start,stop,utterance,backchannel,backchannel_count,backchannel_speaker,backchannel_start,...,questions,end_question,overlap,n_words,currentUtterance,previous_topic,new_topic,PID,time,scaled_turn_id
0,0,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,68.04,144.65,Yeah. Huh? Yeah. What's up? All right. Yeah. M...,Yeah.,1,5f2850a9d9a5f01279e1bd16,143.84,...,4,False,False,160,,,,,,0.0
1,1,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,144.74,146.15,Hello?,,0,,,...,1,True,False,1,Hello?,Starting The Call,Checking in,"[False, '5b90a114f0a0970001f6dd6e', None]",1728700000000.0,0.002381
2,2,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,146.44,147.76,"Hi. How are, you know?",,0,,,...,1,True,False,5,,,,,,0.004762
3,3,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,147.84,149.76,"I'm doing great, how are you to see me?",Mhm.,1,5eeaae03e1959f18cd331cd0,148.84,...,1,True,False,10,,,,,,0.007143
4,4,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,149.74,150.55,I'm good.,,0,,,...,0,False,True,3,,,,,,0.009524


In [5]:
# make sure data is pandas dataframe
embeddings_tsne = pd.DataFrame(df)
# get sbert embeddings for each individual chat 
def tsne_embeddings(row):
    # extract current message from the row
    current = row['utterance']
    # apply s-bert model to each string to get embeddings
    embeddings = model.encode(current)
    return embeddings

# save embeddings to output
embeddings_tsne['embeddings'] = embeddings_tsne.apply(tsne_embeddings, axis = 1)

# save 
embeddings_tsne.to_csv('/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/test_convo_embeddings.csv')

# show
embeddings_tsne.head()

Unnamed: 0,turn_id,speaker,transcript_id,start,stop,utterance,backchannel,backchannel_count,backchannel_speaker,backchannel_start,...,end_question,overlap,n_words,currentUtterance,previous_topic,new_topic,PID,time,scaled_turn_id,embeddings
0,0,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,68.04,144.65,Yeah. Huh? Yeah. What's up? All right. Yeah. M...,Yeah.,1,5f2850a9d9a5f01279e1bd16,143.84,...,False,False,160,,,,,,0.0,"[-0.03326842, 0.08158294, 0.035818107, -0.0008..."
1,1,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,144.74,146.15,Hello?,,0,,,...,True,False,1,Hello?,Starting The Call,Checking in,"[False, '5b90a114f0a0970001f6dd6e', None]",1728700000000.0,0.002381,"[-0.048191182, 0.051648274, 0.060366347, 0.078..."
2,2,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,146.44,147.76,"Hi. How are, you know?",,0,,,...,True,False,5,,,,,,0.004762,"[0.01075932, 0.040912215, 0.061604835, 0.04930..."
3,3,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,147.84,149.76,"I'm doing great, how are you to see me?",Mhm.,1,5eeaae03e1959f18cd331cd0,148.84,...,True,False,10,,,,,,0.007143,"[-0.0025099565, -0.048034362, 0.06437929, 0.01..."
4,4,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,149.74,150.55,I'm good.,,0,,,...,False,True,3,,,,,,0.009524,"[-0.070886955, -0.080460116, -0.04864771, 0.01..."


In [11]:
# test individual group tsne

# extract embeddings from just this group
df_embeddings = embeddings_tsne['embeddings'].tolist()


# set size of figure
plt.figure(figsize=(5, 5))

# make the embeddings a numpy array
x_test = np.array(df_embeddings)
# set up tsne parameters (pay attention to perplexity)
tsne = TSNE(n_components=2, random_state=10, perplexity=7)
# run tsne on embeddings
x_embedded = tsne.fit_transform(x_test)

# add lines to connect data points
df['TSNE1'] = x_embedded[:, 0]
df['TSNE2'] = x_embedded[:, 1]

# sort by turn order to ensure points plotted in correct order
df = df.sort_values(by='turn_id')

# save example data
df.to_csv('/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/test_tsne_2_embeddings.csv')
# show data
df.head()

Unnamed: 0,turn_id,speaker,transcript_id,start,stop,utterance,backchannel,backchannel_count,backchannel_speaker,backchannel_start,...,overlap,n_words,currentUtterance,previous_topic,new_topic,PID,time,scaled_turn_id,TSNE1,TSNE2
0,0,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,68.04,144.65,Yeah. Huh? Yeah. What's up? All right. Yeah. M...,Yeah.,1,5f2850a9d9a5f01279e1bd16,143.84,...,False,160,,,,,,0.0,29.588051,-4.878983
1,1,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,144.74,146.15,Hello?,,0,,,...,False,1,Hello?,Starting The Call,Checking in,"[False, '5b90a114f0a0970001f6dd6e', None]",1728700000000.0,0.002381,31.535654,-1.101926
2,2,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,146.44,147.76,"Hi. How are, you know?",,0,,,...,False,5,,,,,,0.004762,31.52092,-2.074457
3,3,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,147.84,149.76,"I'm doing great, how are you to see me?",Mhm.,1,5eeaae03e1959f18cd331cd0,148.84,...,False,10,,,,,,0.007143,30.967867,-3.693545
4,4,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,149.74,150.55,I'm good.,,0,,,...,True,3,,,,,,0.009524,32.224979,-10.098478


<Figure size 500x500 with 0 Axes>

## Tiling


In [45]:
# load example conversation data
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/test_tsne_2.csv")
# make sure data is pandas dataframe
df = pd.DataFrame(df)

# Generate embeddings for all utterances
embeddings = model.encode(df['utterance'].tolist())

# Initialize lists to store similarity values
#similarity_lag1 = [None]  # First row cannot have a previous utterance for lag1
#similarity_lag2 = [None, None]  # First two rows cannot have previous utterances for lag2
#similarity_lag5 = [None, None, None, None, None]  # First five rows cannot have previous utterances for lag5
#similarity_lag10 = [None, None, None, None, None, None, None, None, None, None]

similarity_lag1 = []
similarity_lag2 = []
similarity_lag5 = []
similarity_lag10 = []

# Calculate cosine similarities for each type of analysis
for i in range(1, len(embeddings)):
    # Cosine similarity between current utterance and the previous one (lag 1)
    sim_lag1 = cosine_similarity([embeddings[i]], [embeddings[i-1]])[0][0]
    similarity_lag1.append(sim_lag1)
    
    # Cosine similarity between current utterance and the average of the previous 2 utterances (lag 2)
    if i > 1:
        avg_lag2 = np.mean([embeddings[i-1], embeddings[i-2]], axis=0)
        sim_lag2 = cosine_similarity([embeddings[i]], [avg_lag2])[0][0]
    else:
        sim_lag2 = None  # Not enough previous utterances for the first two rows
    similarity_lag2.append(sim_lag2)
    
    # Cosine similarity between current utterance and the average of the previous 5 utterances (lag 5)
    if i > 4:
        avg_lag5 = np.mean(embeddings[i-1:i-5:-1], axis=0)  # Get the last 5 utterances
        sim_lag5 = cosine_similarity([embeddings[i]], [avg_lag5])[0][0]
    else:
        sim_lag5 = None  # Not enough previous utterances for the first 5 rows
    similarity_lag5.append(sim_lag5)

    # Cosine similarity between current utterance and the average of the previous 10 utterances (lag 10)
    if i > 9:
        avg_lag10 = np.mean(embeddings[i-1:i-10:-1], axis = 0) # get last 10 utterances
        sim_lag10 = cosine_similarity([embeddings[i]], [avg_lag10])[0][0]
    else:
        sim_lag10 = None # not enough previous utterances for the first 10 rows
    similarity_lag10.append(sim_lag10)

# append to data frame
df['similarity_lag1'] = [None] + similarity_lag1
df['similarity_lag2'] = [None] + similarity_lag2
df['similarity_lag5'] = [None] + similarity_lag5
df['similarity_lag10'] = [None] + similarity_lag10

# save lagged data
df.to_csv('/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/lagged_similarity.csv')
# show
df.head()


Unnamed: 0,turn_id,speaker,transcript_id,start,stop,utterance,backchannel,backchannel_count,backchannel_speaker,backchannel_start,...,currentUtterance,previous_topic,new_topic,PID,time,scaled_turn_id,similarity_lag1,similarity_lag2,similarity_lag5,similarity_lag10
0,0,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,68.04,144.65,Yeah. Huh? Yeah. What's up? All right. Yeah. M...,Yeah.,1,5f2850a9d9a5f01279e1bd16,143.84,...,,,,,,0.0,,,,
1,1,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,144.74,146.15,Hello?,,0,,,...,Hello?,Starting The Call,Checking in,"[False, '5b90a114f0a0970001f6dd6e', None]",1728700000000.0,0.002381,0.106709,,,
2,2,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,146.44,147.76,"Hi. How are, you know?",,0,,,...,,,,,,0.004762,0.528025,0.468696,,
3,3,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,147.84,149.76,"I'm doing great, how are you to see me?",Mhm.,1,5eeaae03e1959f18cd331cd0,148.84,...,,,,,,0.007143,0.515418,0.528136,,
4,4,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,149.74,150.55,I'm good.,,0,,,...,,,,,,0.009524,0.506012,0.511558,,


In [16]:
# load example conversation data
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/test_tsne_2.csv")
# make sure data is pandas dataframe
embeddings_tile = pd.DataFrame(df)
# get sbert embeddings for each individual chat 
def tile_embeddings(row):
    # extract current message from the row
    current = row['utterance']
    # apply s-bert model to each string to get embeddings
    embeddings = model.encode(current)
    return embeddings

# save embeddings to output
embeddings_tile['embeddings'] = embeddings_tile.apply(tile_embeddings, axis = 1)

# create lagged embeddings columns
embeddings_tile['embeddings_lag1'] = embeddings_tile['embeddings'].shift(1)
embeddings_tile['embeddings_lag2'] = embeddings_tile['embeddings'].shift(2)
embeddings_tile['embeddings_lag3'] = embeddings_tile['embeddings'].shift(3)
embeddings_tile['embeddings_lag4'] = embeddings_tile['embeddings'].shift(4)
embeddings_tile['embeddings_lag5'] = embeddings_tile['embeddings'].shift(5)

# show 
embeddings_tile.head()



# define a function to calculate cosine similarity per row across lags
# def lag_cosine_similarity(row):
#     similarities = {}
#     # check if lagged embeddings exist (avoid NaNs in first rows)
#     if pd.isnull(row['embeddings_lag1']):
#         similarities['cosine_similarity_lag1'] = cosine_similarity([row['embeddings']], [row['embeddings_lag1']])[0][0]
#     else:
#         similarities['cosine_similarity_lag1'] = np.nan
    
#     if pd.isnull(row['embeddings_lag2']):
#         similarities['cosine_similarity_lag2'] = cosine_similarity([row['embeddings']], [row['embeddings_lag2']])[0][0]
#     else:
#         similarities['cosine_similarity_lag2'] = np.nan
    
#     if pd.isnull(row['embeddings_lag3']):
#         similarities['cosine_similarity_lag3'] = cosine_similarity([row['embeddings']], [row['embeddings_lag3']])[0][0]
#     else:
#         similarities['cosine_similarity_lag3'] = np.nan
    
#     if pd.isnull(row['embeddings_lag4']):
#         similarities['cosine_similarity_lag4'] = cosine_similarity([row['embeddings']], [row['embeddings_lag4']])[0][0]
#     else:
#         similarities['cosine_similarity_lag4'] = np.nan
    
#     if pd.isnull(row['embeddings_lag5']):
#         similarities['cosine_similarity_lag5'] = cosine_similarity([row['embeddings']], [row['embeddings_lag5']])[0][0]
#     else:
#         similarities['cosine_similarity_lag5'] = np.nan

#     return pd.Series(similarities)

# # apply function to calculate similarities for each row
# embeddings_tile[['cosine_similarity_lag1', 'cosine_similarity_lag2',
#                  'cosine_similarity_lag3', 'cosine_similarity_lag4',
#                  'cosine_similarity_lag5']] = embeddings_tile.apply(lag_cosine_similarity, axis = 1)

# # show
# embeddings_tile.head()


# get embeddings, establish n to n-y distances; this utterance's average distance from past 5 utterances
# look at time course for values, relatively high distance, trough for a bit where topic change happens
# get vector for each row, work backwards
# cosine similarity between this row and previous row
# average those values to get difference between this utterance prior 5
# maybe use punctuation to try to fix up cell formatting

# maybe do cliffhanger version instead


Unnamed: 0,turn_id,speaker,transcript_id,start,stop,utterance,backchannel,backchannel_count,backchannel_speaker,backchannel_start,...,new_topic,PID,time,scaled_turn_id,embeddings,embeddings_lag1,embeddings_lag2,embeddings_lag3,embeddings_lag4,embeddings_lag5
0,0,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,68.04,144.65,Yeah. Huh? Yeah. What's up? All right. Yeah. M...,Yeah.,1,5f2850a9d9a5f01279e1bd16,143.84,...,,,,0.0,"[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,,,,
1,1,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,144.74,146.15,Hello?,,0,,,...,Checking in,"[False, '5b90a114f0a0970001f6dd6e', None]",1728700000000.0,0.002381,"[-0.048191182, 0.051648274, 0.060366347, 0.078...","[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,,,
2,2,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,146.44,147.76,"Hi. How are, you know?",,0,,,...,,,,0.004762,"[0.01075932, 0.040912215, 0.061604835, 0.04930...","[-0.048191182, 0.051648274, 0.060366347, 0.078...","[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,,
3,3,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,147.84,149.76,"I'm doing great, how are you to see me?",Mhm.,1,5eeaae03e1959f18cd331cd0,148.84,...,,,,0.007143,"[-0.0025099565, -0.048034362, 0.06437929, 0.01...","[0.01075932, 0.040912215, 0.061604835, 0.04930...","[-0.048191182, 0.051648274, 0.060366347, 0.078...","[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,
4,4,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,149.74,150.55,I'm good.,,0,,,...,,,,0.009524,"[-0.070886955, -0.080460116, -0.04864771, 0.01...","[-0.0025099565, -0.048034362, 0.06437929, 0.01...","[0.01075932, 0.040912215, 0.061604835, 0.04930...","[-0.048191182, 0.051648274, 0.060366347, 0.078...","[-0.03326842, 0.08158294, 0.035818107, -0.0008...",


In [36]:
# calculate consine similarity between embeddings and lagged embeddings
def lag_cosine_similarity(row, lag_col):
    if row[lag_col] is None:
        return None
    current = [row['embeddings'], row[lag_col]]
    # calculate cosine similarity between question and message
    similarity = model.similarity(current, current)
    return similarity[0][1]

# Apply the function
embeddings_tile['similarity_lag1'] = embeddings_tile.apply(lag_cosine_similarity, axis=1, lag_col = 'embeddings_lag1')
embeddings_tile['similarity_lag2'] = embeddings_tile.apply(lag_cosine_similarity, axis=1, lag_col = 'embeddings_lag2')
embeddings_tile['similarity_lag3'] = embeddings_tile.apply(lag_cosine_similarity, axis=1, lag_col = 'embeddings_lag3')
embeddings_tile['similarity_lag4'] = embeddings_tile.apply(lag_cosine_similarity, axis=1, lag_col = 'embeddings_lag4')
embeddings_tile['similarity_lag5'] = embeddings_tile.apply(lag_cosine_similarity, axis=1, lag_col = 'embeddings_lag5')

# save
embeddings_tile.to_csv('/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/tile_similarity.csv')

# preview
embeddings_tile.head()

Unnamed: 0,turn_id,speaker,transcript_id,start,stop,utterance,backchannel,backchannel_count,backchannel_speaker,backchannel_start,...,embeddings_lag1,embeddings_lag2,embeddings_lag3,embeddings_lag4,embeddings_lag5,similarity_lag1,similarity_lag2,similarity_lag3,similarity_lag4,similarity_lag5
0,0,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,68.04,144.65,Yeah. Huh? Yeah. What's up? All right. Yeah. M...,Yeah.,1,5f2850a9d9a5f01279e1bd16,143.84,...,,,,,,,,,,
1,1,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,144.74,146.15,Hello?,,0,,,...,"[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,,,,tensor(0.1067),,,,
2,2,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,146.44,147.76,"Hi. How are, you know?",,0,,,...,"[-0.048191182, 0.051648274, 0.060366347, 0.078...","[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,,,tensor(0.5280),tensor(0.1693),,,
3,3,5f2850a9d9a5f01279e1bd16,cb056010-c50e-4e80-b639-bb178a2b9330,147.84,149.76,"I'm doing great, how are you to see me?",Mhm.,1,5eeaae03e1959f18cd331cd0,148.84,...,"[0.01075932, 0.040912215, 0.061604835, 0.04930...","[-0.048191182, 0.051648274, 0.060366347, 0.078...","[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,,tensor(0.5154),tensor(0.4078),tensor(0.2858),,
4,4,5eeaae03e1959f18cd331cd0,cb056010-c50e-4e80-b639-bb178a2b9330,149.74,150.55,I'm good.,,0,,,...,"[-0.0025099565, -0.048034362, 0.06437929, 0.01...","[0.01075932, 0.040912215, 0.061604835, 0.04930...","[-0.048191182, 0.051648274, 0.060366347, 0.078...","[-0.03326842, 0.08158294, 0.035818107, -0.0008...",,tensor(0.5060),tensor(0.3846),tensor(0.3981),tensor(0.2443),
