## SETUP

In [None]:
# install packages as needed
#! pip install sentence-transformers
#! pip install numpy
#! pip install pandas
#! pip install scikit-learn
#! pip install matplotlib
#! pip install numpy==1.24.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pytorch
  Building wheel for pytorch (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[6 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/private/var/folders/t2/73yc46rd3756c5vdz756l6c188wgsh/T/pip-install-rbi48g2o/pytorch_dbbf15879d374296b6157c28bf5c157a/setup.py", line 15, in <module>
  [31m   [0m     raise Exception(message)
  [31m   [0m Exception: You tried to install "pytorch". The package named for PyTorch is "torch"
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This e

In [27]:
# libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import sklearn
from sklearn import cluster
from sklearn.cluster import KMeans

In [3]:
# load pre-trained sentence transformer model (aka SBERT)
model = SentenceTransformer('all-MiniLM-L6-v2')



In [None]:
# create a function to get tiled cosine similarities between aggregated utterances
def tiled_cosine_similarity(df, window_size):
    # create empty list to store output
    results = []
    # iterate over each group of transcript_id
    for transcript_id, group in df.groupby('transcript_id'):
        # sort by turn_id to maintain conversation order
        group = group.sort_values(by = "turn_id")
        # extract all utterances for this transcript
        utterances = group['utterance'].tolist()
        print(f"Length utterances = {len(utterances)} for transcript {transcript_id}")
        # now move to sliding window approach to get similarity between consecutive windows
        for i in range(len(utterances) - window_size):
            #i = i + 2
            # get start row and end row of window A
            A_start = i # inclusive
            A_end = A_start + window_size # exclusive
            window_A = df['utterance'].iloc[A_start:A_end].str.cat(sep = " ")
            # get start row and end row of window B
            B_start = i + window_size
            B_end = B_start + window_size
            window_B = df['utterance'].iloc[B_start:B_end].str.cat(sep = " ")
            # ensure that both windows are not empty
            if len(window_A) > 0 and len(window_B) > 0:
                # get embeddings for both windows
                embeddings_A = model.encode(window_A)
                embeddings_B = model.encode(window_B)
                # ensure embeddings are 2D arrays
                embeddings_A = embeddings_A.reshape(1, -1)
                embeddings_B = embeddings_B.reshape(1, -1)
                # alert me if embeddings have different shapes
                assert embeddings_A.shape[1] == embeddings_B.shape[1], \
                    f"Dimensionality mismatch: A={embeddings_A.shape[1]}, B={embeddings_B.shape[1]}"
                # get cosine similarity between utterances in windows A and B
                similarity = cosine_similarity(embeddings_A, embeddings_B)
                # save output
                output = {
                    'transcript_id': transcript_id,
                    'window_size': window_size,
                    'A_start_turn': A_start + 1, # account for 0-bounding python
                    'A_end_turn': A_end,
                    'A_utterances': window_A,
                    'B_start_turn': B_start + 1, # account for 0-bounding python
                    'B_end_turn': B_end,
                    'B_utterances': window_B,
                    'cosine_similarity': similarity[0][0]
                }
                results.append(output)
    # convert results list to data frame and return
    results_df = pd.DataFrame(results)
    print(results_df.head())
    return results_df


### Example conversation

In [110]:
# run tiling analysis on example conversation before doing all transcripts
# load example conversation data
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/CANDOR/analysis/data/test_tsne_2.csv")
# make sure data is pandas dataframe
df = pd.DataFrame(df)

# select only variables of interest
df_subset = df[["turn_id", "transcript_id", "speaker", "utterance"]]
# preview
df_subset.head()

Unnamed: 0,turn_id,transcript_id,speaker,utterance
0,0,cb056010-c50e-4e80-b639-bb178a2b9330,5eeaae03e1959f18cd331cd0,Yeah. Huh? Yeah. What's up? All right. Yeah. M...
1,1,cb056010-c50e-4e80-b639-bb178a2b9330,5f2850a9d9a5f01279e1bd16,Hello?
2,2,cb056010-c50e-4e80-b639-bb178a2b9330,5eeaae03e1959f18cd331cd0,"Hi. How are, you know?"
3,3,cb056010-c50e-4e80-b639-bb178a2b9330,5f2850a9d9a5f01279e1bd16,"I'm doing great, how are you to see me?"
4,4,cb056010-c50e-4e80-b639-bb178a2b9330,5eeaae03e1959f18cd331cd0,I'm good.


In [112]:
# apply tiling function to example conversation for a few different window sizes
# window size = 2
test_2 = tiled_cosine_similarity(df_subset, 2)

Length utterances = 421 for transcript cb056010-c50e-4e80-b639-bb178a2b9330
                          transcript_id  window_size  A_start_turn  \
0  cb056010-c50e-4e80-b639-bb178a2b9330            2             1   
1  cb056010-c50e-4e80-b639-bb178a2b9330            2             2   
2  cb056010-c50e-4e80-b639-bb178a2b9330            2             3   
3  cb056010-c50e-4e80-b639-bb178a2b9330            2             4   
4  cb056010-c50e-4e80-b639-bb178a2b9330            2             5   

   A_end_turn                                       A_utternaces  \
0           2  Yeah. Huh? Yeah. What's up? All right. Yeah. M...   
1           3                      Hello? Hi. How are, you know?   
2           4  Hi. How are, you know? I'm doing great, how ar...   
3           5  I'm doing great, how are you to see me? I'm good.   
4           6   I'm good. That's good. Were you on long waiting?   

   B_start_turn  B_end_turn  \
0             3           4   
1             4           5   
2

In [113]:
# window size = 5
test_5 = tiled_cosine_similarity(df_subset, 5)

Length utterances = 421 for transcript cb056010-c50e-4e80-b639-bb178a2b9330
                          transcript_id  window_size  A_start_turn  \
0  cb056010-c50e-4e80-b639-bb178a2b9330            5             1   
1  cb056010-c50e-4e80-b639-bb178a2b9330            5             2   
2  cb056010-c50e-4e80-b639-bb178a2b9330            5             3   
3  cb056010-c50e-4e80-b639-bb178a2b9330            5             4   
4  cb056010-c50e-4e80-b639-bb178a2b9330            5             5   

   A_end_turn                                       A_utternaces  \
0           5  Yeah. Huh? Yeah. What's up? All right. Yeah. M...   
1           6  Hello? Hi. How are, you know? I'm doing great,...   
2           7  Hi. How are, you know? I'm doing great, how ar...   
3           8  I'm doing great, how are you to see me? I'm go...   
4           9  I'm good. That's good. Were you on long waitin...   

   B_start_turn  B_end_turn  \
0             6          10   
1             7          11   
2

In [114]:
# window size = 10
test_10 = tiled_cosine_similarity(df_subset, 10)

Length utterances = 421 for transcript cb056010-c50e-4e80-b639-bb178a2b9330
                          transcript_id  window_size  A_start_turn  \
0  cb056010-c50e-4e80-b639-bb178a2b9330           10             1   
1  cb056010-c50e-4e80-b639-bb178a2b9330           10             2   
2  cb056010-c50e-4e80-b639-bb178a2b9330           10             3   
3  cb056010-c50e-4e80-b639-bb178a2b9330           10             4   
4  cb056010-c50e-4e80-b639-bb178a2b9330           10             5   

   A_end_turn                                       A_utternaces  \
0          10  Yeah. Huh? Yeah. What's up? All right. Yeah. M...   
1          11  Hello? Hi. How are, you know? I'm doing great,...   
2          12  Hi. How are, you know? I'm doing great, how ar...   
3          13  I'm doing great, how are you to see me? I'm go...   
4          14  I'm good. That's good. Were you on long waitin...   

   B_start_turn  B_end_turn  \
0            11          20   
1            12          21   
2

In [115]:
# save data
test_2.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/Candor/analysis/output/example_convo_tile_2.csv")
test_5.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/Candor/analysis/output/example_convo_tile_5.csv")
test_10.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/Candor/analysis/output/example_convo_tile_10.csv")

### All conversations

In [9]:
# load all annotated conversations
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/annotated_transcripts_for_tile.csv")
# make sure data is pandas dataframe
df = pd.DataFrame(df)
# preview
df.head()

Unnamed: 0,turn_id,speaker,transcript_id,utterance
0,0,5e6cade3d92ffb26677dbd84,01a4c01c-cf0e-4f37-ab2b-641bb604af30,"charlie, wait, hi, can you hear me"
1,1,5f1b2282a3103e2caff2fd7b,01a4c01c-cf0e-4f37-ab2b-641bb604af30,"Yes, we did. Okay me."
2,2,5e6cade3d92ffb26677dbd84,01a4c01c-cf0e-4f37-ab2b-641bb604af30,"Yeah, I can hold on sorry to like"
3,3,5f1b2282a3103e2caff2fd7b,01a4c01c-cf0e-4f37-ab2b-641bb604af30,that works.
4,4,5e6cade3d92ffb26677dbd84,01a4c01c-cf0e-4f37-ab2b-641bb604af30,go in the bathroom because my boyfriend's play...


In [10]:
# apply tiling function to annotated conversations dataset
full_tile_3 = tiled_cosine_similarity(df, 3)

Length utterances = 596 for transcript 01a4c01c-cf0e-4f37-ab2b-641bb604af30
Length utterances = 232 for transcript 04542d1a-168a-4f90-b7e3-33adb675525f
Length utterances = 589 for transcript 0542c0f0-6c1e-4e54-b3df-afa48b19f9b5
Length utterances = 301 for transcript 06afb9c1-b367-45c7-a9ff-74c47a2cb61c
Length utterances = 683 for transcript 06b83c0a-7bf1-4cf9-9e72-034615d97050
Length utterances = 317 for transcript 07094abd-8b2a-426a-8dca-edead45c2143
Length utterances = 191 for transcript 0a294776-ca94-4114-bc24-2bf3b0a0a813
Length utterances = 140 for transcript 0bd4dd2c-1cf0-46fb-87d3-a88b1de310a6
Length utterances = 413 for transcript 0bfe2cba-bf84-4a8b-887d-f19378aa07d8
Length utterances = 271 for transcript 0e772ef8-9014-48a8-8eb9-762ef55b4fe9
Length utterances = 260 for transcript 0e8d198c-fe6c-400c-a88b-5a3ead8567bc
Length utterances = 546 for transcript 128f15b9-a6e4-4575-8b96-163ff189ee8e
Length utterances = 307 for transcript 12a3510d-960d-4ce2-9933-f473f4ee6800
Length utter

In [11]:
full_tile_3.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/annotated_transcripts_tile_3.csv")

## Topic label embeddings

In [46]:
# home
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/topic_labels_for_getting_embeddings.csv")
df

Unnamed: 0,PID,topic_number,new_topic,current_topic,current_topic_number,prior_topic,prior_topic_number
0,"[False, '55b10b48fdf99b30f80e3993', None]",2,How long have you been doing prolific,How long have you been doing prolific,2,Future Trips,1
1,"[False, '55b10b48fdf99b30f80e3993', None]",3,Job,Job,3,How long have you been doing prolific,2
2,"[False, '55b10b48fdf99b30f80e3993', None]",4,Moving,Moving,4,Job,3
3,"[False, '55b10b48fdf99b30f80e3993', None]",5,Ohio,Ohio,5,Moving,4
4,"[False, '55b10b48fdf99b30f80e3993', None]",6,Outdoors,Outdoors,6,Ohio,5
...,...,...,...,...,...,...,...
5654,"[False, '67059c24fd67124adcd5241e', None]",13,Video Games,Video Games,13,TV shows,12
5655,"[False, '67059c24fd67124adcd5241e', None]",14,Weather / What's your favorite season,Weather / What's your favorite season,14,Video Games,13
5656,"[False, '67059c24fd67124adcd5241e', None]",15,What were you doing during quarantine,What were you doing during quarantine,15,Weather / What's your favorite season,14
5657,"[False, '67059c24fd67124adcd5241e', None]",16,living situation,living situation,16,What were you doing during quarantine,15


In [None]:
def alternating_topics_cosine_similarity(row):
    # extract current message and next message from the row
    current = [row['prior_topic'], row['current_topic']]
    # apply s-bert model to each string to get embeddings
    embeddings = model.encode(current)
    # calculate cosine similarity between question and message
    similarity = model.similarity(embeddings, embeddings)
    return similarity[0][1]

# apply function to each row of processed chat data
df['topic_similarity'] = df.apply(alternating_topics_cosine_similarity, axis = 1)

# save and preview
df.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/topic_label_similarity.csv", index = False)
df.head()

Unnamed: 0,PID,topic_number,new_topic,current_topic,current_topic_number,prior_topic,prior_topic_number,topic_similarity
0,"[False, '55b10b48fdf99b30f80e3993', None]",2,How long have you been doing prolific,How long have you been doing prolific,2,Future Trips,1,tensor(0.1465)
1,"[False, '55b10b48fdf99b30f80e3993', None]",3,Job,Job,3,How long have you been doing prolific,2,tensor(0.1690)
2,"[False, '55b10b48fdf99b30f80e3993', None]",4,Moving,Moving,4,Job,3,tensor(0.3957)
3,"[False, '55b10b48fdf99b30f80e3993', None]",5,Ohio,Ohio,5,Moving,4,tensor(0.2542)
4,"[False, '55b10b48fdf99b30f80e3993', None]",6,Outdoors,Outdoors,6,Ohio,5,tensor(0.3111)


## Cluster label embeddings

In [48]:
# home
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/cluster_labels_for_getting_embeddings.csv")
df

Unnamed: 0,PID,topic_number,cluster_label,current_topic,current_topic_number,prior_topic,prior_topic_number
0,"[False, '55b10b48fdf99b30f80e3993', None]",2,prolific,prolific,2,travel,1
1,"[False, '55b10b48fdf99b30f80e3993', None]",3,work,work,3,prolific,2
2,"[False, '55b10b48fdf99b30f80e3993', None]",4,moving,moving,4,work,3
3,"[False, '55b10b48fdf99b30f80e3993', None]",5,countries and states,countries and states,5,moving,4
4,"[False, '55b10b48fdf99b30f80e3993', None]",6,weather,weather,6,countries and states,5
...,...,...,...,...,...,...,...
5654,"[False, '67059c24fd67124adcd5241e', None]",13,entertainment,entertainment,13,tv shows,12
5655,"[False, '67059c24fd67124adcd5241e', None]",14,weather,weather,14,entertainment,13
5656,"[False, '67059c24fd67124adcd5241e', None]",15,quarantine,quarantine,15,weather,14
5657,"[False, '67059c24fd67124adcd5241e', None]",16,life,life,16,quarantine,15


In [49]:
def alternating_topics_cosine_similarity(row):
    # extract current message and next message from the row
    current = [row['prior_topic'], row['current_topic']]
    # apply s-bert model to each string to get embeddings
    embeddings = model.encode(current)
    # calculate cosine similarity between question and message
    similarity = model.similarity(embeddings, embeddings)
    return similarity[0][1]

# apply function to each row of processed chat data
df['cluster_similarity'] = df.apply(alternating_topics_cosine_similarity, axis = 1)

# save and preview
df.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/cluster_label_similarity.csv", index = False)
df.head()

Unnamed: 0,PID,topic_number,cluster_label,current_topic,current_topic_number,prior_topic,prior_topic_number,cluster_similarity
0,"[False, '55b10b48fdf99b30f80e3993', None]",2,prolific,prolific,2,travel,1,tensor(0.2363)
1,"[False, '55b10b48fdf99b30f80e3993', None]",3,work,work,3,prolific,2,tensor(0.2571)
2,"[False, '55b10b48fdf99b30f80e3993', None]",4,moving,moving,4,work,3,tensor(0.3969)
3,"[False, '55b10b48fdf99b30f80e3993', None]",5,countries and states,countries and states,5,moving,4,tensor(0.1900)
4,"[False, '55b10b48fdf99b30f80e3993', None]",6,weather,weather,6,countries and states,5,tensor(0.1580)


## Topic clustering
Use kmeans to identify topic clusters

In [20]:
# lab
#df = pd.read_csv("/Users/tuo70125/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/topic_labels_all.csv")
# home
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/topic_labels_all.csv")

# preview
df

Unnamed: 0,PID,new_topic,number_of_turns,average_turn_length,topic_number
0,"[False, '55b10b48fdf99b30f80e3993', None]",Future Trips,15,9.333333,1
1,"[False, '55b10b48fdf99b30f80e3993', None]",How long have you been doing prolific,11,9.333333,2
2,"[False, '55b10b48fdf99b30f80e3993', None]",Job,18,9.333333,3
3,"[False, '55b10b48fdf99b30f80e3993', None]",Moving,24,9.333333,4
4,"[False, '55b10b48fdf99b30f80e3993', None]",Ohio,2,9.333333,5
...,...,...,...,...,...
5922,"[False, '67059c24fd67124adcd5241e', None]",Video Games,28,12.000000,13
5923,"[False, '67059c24fd67124adcd5241e', None]",Weather / What's your favorite season,14,12.000000,14
5924,"[False, '67059c24fd67124adcd5241e', None]",What were you doing during quarantine,5,12.000000,15
5925,"[False, '67059c24fd67124adcd5241e', None]",living situation,14,12.000000,16


In [31]:
# keep a data frame that has all the labels (PID, turns, topic number in transcript)
# have a second with formatted topic labels (all lowercased, no duplicates) and original topic labels 

# lowercase all topics
df['new_topic'] = df['new_topic'].str.lower()
df.head()

Unnamed: 0,PID,new_topic,number_of_turns,average_turn_length,topic_number
0,"[False, '55b10b48fdf99b30f80e3993', None]",future trips,15,9.333333,1
1,"[False, '55b10b48fdf99b30f80e3993', None]",how long have you been doing prolific,11,9.333333,2
2,"[False, '55b10b48fdf99b30f80e3993', None]",job,18,9.333333,3
3,"[False, '55b10b48fdf99b30f80e3993', None]",moving,24,9.333333,4
4,"[False, '55b10b48fdf99b30f80e3993', None]",ohio,2,9.333333,5


In [None]:
def get_embeddings(row):
    # extract current message from the row
    current = row['new_topic']
    # apply s-bert model to each string to get embeddings
    embeddings = model.encode(current)
    return embeddings

# save embeddings to output
df['embeddings'] = df.apply(get_embeddings, axis = 1)

df.head()

Unnamed: 0,PID,new_topic,number_of_turns,average_turn_length,topic_number,embeddings
0,"[False, '55b10b48fdf99b30f80e3993', None]",future trips,15,9.333333,1,"[0.034140937, 0.009445701, 0.03935239, 0.04317..."
1,"[False, '55b10b48fdf99b30f80e3993', None]",how long have you been doing prolific,11,9.333333,2,"[0.08297389, 0.0033865287, 0.02990591, 0.05368..."
2,"[False, '55b10b48fdf99b30f80e3993', None]",job,18,9.333333,3,"[-0.15970999, 0.08047501, -0.02434519, -0.0091..."
3,"[False, '55b10b48fdf99b30f80e3993', None]",moving,24,9.333333,4,"[0.035134934, -0.06459744, 0.029326417, 0.0144..."
4,"[False, '55b10b48fdf99b30f80e3993', None]",ohio,2,9.333333,5,"[0.012322004, 0.004526857, 0.011148907, 0.0317..."


In [None]:
# kmeans on the new topic strings
#kmeans_topics = sklearn.cluster.KMeans(n_clusters = 100).fit(df['embeddings'])

topic_array = np.array([np.array(x) for x in df['embeddings'].to_numpy()])
kmeans_topics = sklearn.cluster.KMeans(n_clusters=100).fit(topic_array)

kmeans_labels = kmeans_topics.predict(topic_array)

kmeans_labels # bind this back to df and group based on what's being clustered together

df['clusters'] = kmeans_labels

df.head()


Unnamed: 0,PID,new_topic,number_of_turns,average_turn_length,topic_number,embeddings,clusters
0,"[False, '55b10b48fdf99b30f80e3993', None]",future trips,15,9.333333,1,"[0.034140937, 0.009445701, 0.03935239, 0.04317...",58
1,"[False, '55b10b48fdf99b30f80e3993', None]",how long have you been doing prolific,11,9.333333,2,"[0.08297389, 0.0033865287, 0.02990591, 0.05368...",19
2,"[False, '55b10b48fdf99b30f80e3993', None]",job,18,9.333333,3,"[-0.15970999, 0.08047501, -0.02434519, -0.0091...",9
3,"[False, '55b10b48fdf99b30f80e3993', None]",moving,24,9.333333,4,"[0.035134934, -0.06459744, 0.029326417, 0.0144...",24
4,"[False, '55b10b48fdf99b30f80e3993', None]",ohio,2,9.333333,5,"[0.012322004, 0.004526857, 0.011148907, 0.0317...",20


In [44]:
df = df.sort_values(by = ['clusters'])
df
df.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/clusters.csv")

In [18]:
# dimensionality reduction

# extract embeddings from just this group
df_embeddings = df['embeddings'].tolist()

# make the embeddings a numpy array
x_test = np.array(df_embeddings)
# set up tsne parameters (pay attention to perplexity)
tsne = TSNE(n_components=2, random_state=10, perplexity=7)
# run tsne on embeddings
x_embedded = tsne.fit_transform(x_test)

# add lines to connect data points
df['TSNE1'] = x_embedded[:, 0]
df['TSNE2'] = x_embedded[:, 1]

# sort by turn order to ensure points plotted in correct order
#df = df.sort_values(by='topic_number')

# save example data
df.to_csv('/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/topic_tsne.csv', index = False)
# show data
df.head()

Unnamed: 0,PID,new_topic,number_of_turns,average_turn_length,topic_number,embeddings,TSNE1,TSNE2
0,"[False, '55b10b48fdf99b30f80e3993', None]",Future Trips,15,9.333333,1,"[0.034140937, 0.009445701, 0.03935239, 0.04317...",42.416924,-29.599901
1,"[False, '55b10b48fdf99b30f80e3993', None]",How long have you been doing prolific,11,9.333333,2,"[0.08297389, 0.0033865287, 0.02990591, 0.05368...",-14.385501,101.164436
2,"[False, '55b10b48fdf99b30f80e3993', None]",Job,18,9.333333,3,"[-0.15970999, 0.08047501, -0.02434519, -0.0091...",-78.885742,10.207532
3,"[False, '55b10b48fdf99b30f80e3993', None]",Moving,24,9.333333,4,"[0.035134934, -0.06459744, 0.029326417, 0.0144...",-0.91278,-44.342335
4,"[False, '55b10b48fdf99b30f80e3993', None]",Ohio,2,9.333333,5,"[0.012322004, 0.004526857, 0.011148907, 0.0317...",37.814445,-51.609016


## Archive

In [None]:
# def get_embeddings(row):
#     # extract current message from the row
#     current = row['new_topic']
#     # apply s-bert model to each string to get embeddings
#     embeddings = model.encode(current)
#     return embeddings

# # save embeddings to output
# df['embeddings'] = df.apply(get_embeddings, axis = 1)

# df.head()