## Setup

In [None]:
# install packages as needed
#! pip install sentence-transformers
#! pip install numpy
#! pip install pandas
#! pip install scikit-learn
#! pip install matplotlib
#! pip install numpy==1.24.1

In [3]:
# libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import sklearn
from sklearn import cluster
from sklearn.cluster import KMeans

  from tqdm.autonotebook import tqdm, trange


In [4]:
# load pre-trained sentence transformer model (aka SBERT)
model = SentenceTransformer('all-MiniLM-L6-v2')



## TILE
### 3-utterance tiling for all annotated conversations

In [3]:
# create a function to get tiled cosine similarities between aggregated utterances
def tiled_cosine_similarity(df, window_size):
    # create empty list to store output
    results = []
    # iterate over each group of transcript_id
    for transcript_id, group in df.groupby('transcript_id'):
        # sort by turn_id to maintain conversation order
        group = group.sort_values(by = "turn_id")
        # extract all utterances for this transcript
        utterances = group['utterance'].tolist()
        print(f"Length utterances = {len(utterances)} for transcript {transcript_id}")
        # now move to sliding window approach to get similarity between consecutive windows
        for i in range(len(utterances) - window_size):
            #i = i + 2
            # get start row and end row of window A
            A_start = i # inclusive
            A_end = A_start + window_size # exclusive
            window_A = df['utterance'].iloc[A_start:A_end].str.cat(sep = " ")
            # get start row and end row of window B
            B_start = i + window_size
            B_end = B_start + window_size
            window_B = df['utterance'].iloc[B_start:B_end].str.cat(sep = " ")
            # ensure that both windows are not empty
            if len(window_A) > 0 and len(window_B) > 0:
                # get embeddings for both windows
                embeddings_A = model.encode(window_A)
                embeddings_B = model.encode(window_B)
                # ensure embeddings are 2D arrays
                embeddings_A = embeddings_A.reshape(1, -1)
                embeddings_B = embeddings_B.reshape(1, -1)
                # alert me if embeddings have different shapes
                assert embeddings_A.shape[1] == embeddings_B.shape[1], \
                    f"Dimensionality mismatch: A={embeddings_A.shape[1]}, B={embeddings_B.shape[1]}"
                # get cosine similarity between utterances in windows A and B
                similarity = cosine_similarity(embeddings_A, embeddings_B)
                # save output
                output = {
                    'transcript_id': transcript_id,
                    'window_size': window_size,
                    'A_start_turn': A_start + 1, # account for 0-bounding python
                    'A_end_turn': A_end,
                    'A_utterances': window_A,
                    'B_start_turn': B_start + 1, # account for 0-bounding python
                    'B_end_turn': B_end,
                    'B_utterances': window_B,
                    'cosine_similarity': similarity[0][0]
                }
                results.append(output)
    # convert results list to data frame and return
    results_df = pd.DataFrame(results)
    print(results_df.head())
    return results_df

In [5]:
# load annotated conversations
# hs home
df = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/backbiter_subset_for_tiling.csv")
# lab
# df = pd.read_csv()

# make sure data is pandas dataframe
df = pd.DataFrame(df)
# preview
df.head()

Unnamed: 0,turn_id,speaker,transcript_id,utterance
0,0,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...
1,1,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"Hello, Can you hear me? I can, how are you doing?"
2,2,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Good. How about you?
3,3,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"I am doing well, yeah,"
4,4,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Sweet. So um I guess we could start off by tal...


In [5]:
# apply tiling function to annotated conversations with window size of 3
full_tile_3 = tiled_cosine_similarity(df, 3)

Length utterances = 348 for transcript 01849238-f5f0-487e-bca4-7b4fe0c9625c
Length utterances = 596 for transcript 01a4c01c-cf0e-4f37-ab2b-641bb604af30
Length utterances = 232 for transcript 04542d1a-168a-4f90-b7e3-33adb675525f
Length utterances = 589 for transcript 0542c0f0-6c1e-4e54-b3df-afa48b19f9b5
Length utterances = 301 for transcript 06afb9c1-b367-45c7-a9ff-74c47a2cb61c
Length utterances = 683 for transcript 06b83c0a-7bf1-4cf9-9e72-034615d97050
Length utterances = 317 for transcript 07094abd-8b2a-426a-8dca-edead45c2143
Length utterances = 176 for transcript 0736fa95-99e8-4707-bd64-4552eb79d05a
Length utterances = 191 for transcript 0a294776-ca94-4114-bc24-2bf3b0a0a813
Length utterances = 260 for transcript 0b019c01-a6b7-4753-afa3-f7bf964932c9
Length utterances = 140 for transcript 0bd4dd2c-1cf0-46fb-87d3-a88b1de310a6
Length utterances = 413 for transcript 0bfe2cba-bf84-4a8b-887d-f19378aa07d8
Length utterances = 271 for transcript 0e772ef8-9014-48a8-8eb9-762ef55b4fe9
Length utter

In [6]:
# save tiled data
# hs home
full_tile_3.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/annotated_transcripts_tile_3.csv", index = False)
# lab
# full_tile_3.to_csv()



## TSNE

In [6]:
# using same backbiter data (df) from tile section...
# 1: get s-bert word embeddings per utterance 
# 2: reduce embedding dimensionality for each utterance to plot
df.head()

Unnamed: 0,turn_id,speaker,transcript_id,utterance
0,0,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...
1,1,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"Hello, Can you hear me? I can, how are you doing?"
2,2,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Good. How about you?
3,3,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"I am doing well, yeah,"
4,4,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Sweet. So um I guess we could start off by tal...


In [9]:
# get sbert embeddings for each utterance
def get_embeddings(row):
    # extract current utterance from the row
    current = row['utterance']
    # apply s-bert model to each utterance to get embeddings
    embeddings = model.encode(current)
    return embeddings

# save embeddings 
df['utterance_embedding'] = df.apply(get_embeddings, axis = 1)

# preview 
df.head()

Unnamed: 0,turn_id,speaker,transcript_id,utterance,utterance_embedding
0,0,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...,"[0.060678475, 0.005255911, -0.0042183995, 0.02..."
1,1,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"Hello, Can you hear me? I can, how are you doing?","[-0.020778073, -0.015140475, 0.013267419, -0.0..."
2,2,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Good. How about you?,"[0.039609965, -0.01397195, 0.03664549, 0.02249..."
3,3,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"I am doing well, yeah,","[-0.023786038, -0.08440631, 0.10901276, 0.0035..."
4,4,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Sweet. So um I guess we could start off by tal...,"[0.013385294, -0.042186055, 0.023476379, 0.002..."


In [11]:
# make the utterance embeddings into a 2D numpy array (one row per utterance)
tsne_embeddings = np.stack(df['utterance_embedding'].values)
# initialize t-sne
tsne = TSNE(n_components=2, random_state=42)
# apply t-sne
tsne_results = tsne.fit_transform(tsne_embeddings)
# convert results to a data frame
tsne_df = pd.DataFrame(tsne_results, columns=['tsne_1','tsne_2'])

# preview
tsne_df.head()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,tsne_1,tsne_2
0,-33.187672,80.873192
1,-30.203676,68.123535
2,26.9667,11.603717
3,15.368028,11.502008
4,-16.766781,20.031582


In [12]:
# merge tsne results with original data frame
df_tsne = pd.concat([df, tsne_df], axis=1)
# preview
df_tsne.head()

Unnamed: 0,turn_id,speaker,transcript_id,utterance,utterance_embedding,tsne_1,tsne_2
0,0,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,mm. Mhm. Okay. Okay. Mm hmm. Mhm. Oh. Mhm. Oka...,"[0.060678475, 0.005255911, -0.0042183995, 0.02...",-33.187672,80.873192
1,1,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"Hello, Can you hear me? I can, how are you doing?","[-0.020778073, -0.015140475, 0.013267419, -0.0...",-30.203676,68.123535
2,2,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Good. How about you?,"[0.039609965, -0.01397195, 0.03664549, 0.02249...",26.9667,11.603717
3,3,5de5538f8fde1c4dbc951498,01849238-f5f0-487e-bca4-7b4fe0c9625c,"I am doing well, yeah,","[-0.023786038, -0.08440631, 0.10901276, 0.0035...",15.368028,11.502008
4,4,5ad7c075c25ea0000188486b,01849238-f5f0-487e-bca4-7b4fe0c9625c,Sweet. So um I guess we could start off by tal...,"[0.013385294, -0.042186055, 0.023476379, 0.002...",-16.766781,20.031582


In [13]:
# save tsne data frame
df_tsne.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/annotated_transcripts_tsne.csv", index = False)

In [None]:
# make the embeddings a numpy array
x_test = np.array(df_embeddings)
# set up tsne parameters (pay attention to perplexity)
tsne = TSNE(n_components=2, random_state=10, perplexity=7)
# run tsne on embeddings
x_embedded = tsne.fit_transform(x_test)

# add lines to connect data points
df['TSNE1'] = x_embedded[:, 0]
df['TSNE2'] = x_embedded[:, 1]

# sort by turn order to ensure points plotted in correct order
df = df.sort_values(by='turn_id')

## CLUSTER

In [20]:
# load topic label data to run kmeans clustering and identify clustered labels
cluster = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/processed/all_participant_topic_labels.csv")
# preview
cluster.head()

Unnamed: 0,PID,transcript_id,new_topic,topic_order,number_of_turns,topic_length
0,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,Starting The Call,1,1,85.01
1,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,Greeting,2,2,5.94
2,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,Introductions,3,9,23.79
3,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,Flying,4,20,69.43
4,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,Schooling,5,1,3.92


In [21]:
# keep a data frame that has all the labels (PID, turns, topic number in transcript)
# have a second with formatted topic labels (all lowercased, no duplicates) and original topic labels 

# lowercase all topics
cluster['new_topic'] = cluster['new_topic'].str.lower()
cluster.head()

Unnamed: 0,PID,transcript_id,new_topic,topic_order,number_of_turns,topic_length
0,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,starting the call,1,1,85.01
1,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,greeting,2,2,5.94
2,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,introductions,3,9,23.79
3,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,flying,4,20,69.43
4,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,schooling,5,1,3.92


In [22]:
def get_embeddings(row):
    # extract current topic from the row
    current = row['new_topic']
    # apply s-bert model to each string to get embeddings
    embeddings = model.encode(current)
    return embeddings

# save embeddings to output
cluster['embeddings'] = cluster.apply(get_embeddings, axis = 1)

cluster.head()

Unnamed: 0,PID,transcript_id,new_topic,topic_order,number_of_turns,topic_length,embeddings
0,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,starting the call,1,1,85.01,"[-0.07382959, 0.031588975, -0.07631215, -0.006..."
1,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,greeting,2,2,5.94,"[-0.09222623, 0.1348203, 0.090344675, 0.070808..."
2,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,introductions,3,9,23.79,"[-0.05392585, -0.021351758, -0.03909823, 0.054..."
3,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,flying,4,20,69.43,"[0.057700507, 0.03469498, -0.028418075, 0.0820..."
4,"[False, '66ea5116a642813c3dcf3c6f', None]",01849238-f5f0-487e-bca4-7b4fe0c9625c,schooling,5,1,3.92,"[0.022620993, 0.08201187, 0.026254188, 0.10571..."


In [52]:
# kmeans on the new topic strings

# create new cluster data frames for 100 topics and 50 topics
cluster_100 = cluster
cluster_50 = cluster
cluster_25 = cluster

# 100 clusters
topic_array = np.array([np.array(x) for x in cluster['embeddings'].to_numpy()])
kmeans_topics_100 = sklearn.cluster.KMeans(n_clusters=100).fit(topic_array)
kmeans_labels_100 = kmeans_topics_100.predict(topic_array)
# bind labels back to cluster_100 and group based on what's being clustered together
cluster_100['clusters'] = kmeans_labels_100
#cluster_100.head()

# 50 clusters
topic_array = np.array([np.array(x) for x in cluster['embeddings'].to_numpy()])
kmeans_topics_50 = sklearn.cluster.KMeans(n_clusters=50).fit(topic_array)
kmeans_labels_50 = kmeans_topics_50.predict(topic_array)
# bind labels back to cluster_50 and group based on what's being clustered together
cluster_50['clusters'] = kmeans_labels_50
#cluster_50.head()

# 25 clusters
topic_array = np.array([np.array(x) for x in cluster['embeddings'].to_numpy()])
kmeans_topics_25 = sklearn.cluster.KMeans(n_clusters=25).fit(topic_array)
kmeans_labels_25 = kmeans_topics_25.predict(topic_array)
# bind labels back to cluster_25 and group based on what's being clustered together
cluster_25['clusters'] = kmeans_labels_25
#cluster_25.head()

In [53]:
# save 100 clusters
cluster_100 = cluster_100.sort_values(by = ['clusters'])
cluster_100.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/topic_clusters_100.csv", index = False)

# save 50 clusters
cluster_50 = cluster_50.sort_values(by = ['clusters'])
cluster_50.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/topic_clusters_50.csv", index = False)

# save 25 clusters
cluster_25 = cluster_25.sort_values(by = ['clusters'])
cluster_25.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/topic_clusters_25.csv", index = False)


## TSNE on CLUSTERS

In [55]:
# now I want to get TSNE embeddings on cluster labels that I assigned groups of participant-provided labels
# see how close participant labels are to my clustered labels

# load cluster labels
# 50
cluster_label_50 = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/topic_cluster_labels_50.csv")
# 25
cluster_label_25 = pd.read_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/topic_cluster_labels_25.csv")

# preview
cluster_label_25.head()

Unnamed: 0,clusters,cluster_label
0,0,politics
1,1,location
2,2,opinion
3,3,small talk
4,4,starting the call


In [42]:
# get embeddings on the cluster labels
def get_embeddings(row):
    # extract current label from the row
    current = row['cluster_label']
    # apply s-bert model to each string to get embeddings
    embeddings = model.encode(current)
    return embeddings

# save embeddings to output
cluster_label_50['embeddings'] = cluster_label_50.apply(get_embeddings, axis = 1)
# preview
cluster_label_50.head()


Unnamed: 0,clusters,cluster_label,embeddings
0,0,"places, location, geography","[0.115123995, -0.002396408, 0.012577971, 0.007..."
1,1,"background, daily life, relationships","[-0.014651105, 0.023957988, 0.026657121, 0.065..."
2,2,"school, college, university","[0.02738497, 0.026072375, 0.029432196, 0.02261..."
3,3,"money, expenses, saving","[0.031809524, 0.10850602, -0.027284732, 0.0467..."
4,4,"vaccines, shots","[0.015072287, 0.07646736, 0.0045836503, -0.002..."


In [43]:
# append a variable noting these are cluster labels
cluster_label_50['type'] = 'cluster'
# rename cluster_label to "topic"
cluster_label_50 = cluster_label_50.rename(columns = {'cluster_label':'topic'})
# preview
cluster_label_50.head()

Unnamed: 0,clusters,topic,embeddings,type
0,0,"places, location, geography","[0.115123995, -0.002396408, 0.012577971, 0.007...",cluster
1,1,"background, daily life, relationships","[-0.014651105, 0.023957988, 0.026657121, 0.065...",cluster
2,2,"school, college, university","[0.02738497, 0.026072375, 0.029432196, 0.02261...",cluster
3,3,"money, expenses, saving","[0.031809524, 0.10850602, -0.027284732, 0.0467...",cluster
4,4,"vaccines, shots","[0.015072287, 0.07646736, 0.0045836503, -0.002...",cluster


In [46]:
# now select clusters, new topic, and embeddings from cluster_50
cluster_50_subset = cluster_50[['clusters','new_topic','embeddings']]
# append a variable noting these are participant labels
cluster_50_subset['type'] = 'participant'
# rename new_topic to "topic"
cluster_50_subset = cluster_50_subset.rename(columns = {'new_topic':'topic'})
# preview
cluster_50_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_50_subset['type'] = 'participant'


Unnamed: 0,clusters,topic,embeddings,type
3963,0,where they live,"[0.04877898, 0.093149215, 0.002750613, 0.07424...",participant
16872,0,what texas is actually like for lgbt people,"[0.048293743, -0.027363563, 0.06260574, 0.0918...",participant
16873,0,texan gun culture,"[0.021610662, 0.059049744, -0.047935188, 0.011...",participant
14225,0,soical distance,"[0.085162625, 0.01922674, -0.06977843, -0.0677...",participant
22879,0,colorado residents,"[0.10372697, -0.041449253, 0.036547888, 0.0496...",participant


In [None]:
# concatenate cluster_label_50 and cluster_50_subset
tsne_50 = pd.concat([cluster_label_50, cluster_50_subset], ignore_index=True)
# preview
tsne_50.head(60)

Unnamed: 0,clusters,topic,embeddings,type
0,0,"places, location, geography","[0.115123995, -0.002396408, 0.012577971, 0.007...",cluster
1,1,"background, daily life, relationships","[-0.014651105, 0.023957988, 0.026657121, 0.065...",cluster
2,2,"school, college, university","[0.02738497, 0.026072375, 0.029432196, 0.02261...",cluster
3,3,"money, expenses, saving","[0.031809524, 0.10850602, -0.027284732, 0.0467...",cluster
4,4,"vaccines, shots","[0.015072287, 0.07646736, 0.0045836503, -0.002...",cluster
5,5,"education, classes, teaching","[0.057258613, 0.015636563, -0.008564341, 0.047...",cluster
6,6,"politics, voting, election","[0.01678374, 0.009973231, 0.03967316, -0.00118...",cluster
7,7,"pandemic, illness, quarantine","[0.02846067, 0.016469527, 0.019100389, -0.0011...",cluster
8,8,"goodbyes, farewells","[0.02785908, 0.102045484, 0.08101375, 0.019764...",cluster
9,9,location,"[0.039457366, 0.027397515, -0.005985808, 0.016...",cluster


In [51]:
# now apply tsne to reduce embeddings to 2D space
tsne_50_embeddings = np.stack(tsne_50['embeddings'].values)
tsne_50_results = tsne.fit_transform(tsne_50_embeddings)
tsne_50_df = pd.DataFrame(tsne_50_results, columns = ['tsne_1','tsne_2'])
tsne_50_df = pd.concat([tsne_50, tsne_50_df], axis = 1)

# save
tsne_50_df.to_csv("/Users/helenschmidt/Library/CloudStorage/GoogleDrive-helenschmidt129@gmail.com/My Drive/SANLab/Experiments/Conversation-Structure/data/output/tsne_50.csv", index = False)

# preview
tsne_50_df.head()

Unnamed: 0,clusters,topic,embeddings,type,tsne_1,tsne_2
0,0,"places, location, geography","[0.115123995, -0.002396408, 0.012577971, 0.007...",cluster,-85.771576,12.990308
1,1,"background, daily life, relationships","[-0.014651105, 0.023957988, 0.026657121, 0.065...",cluster,53.7379,1.947843
2,2,"school, college, university","[0.02738497, 0.026072375, 0.029432196, 0.02261...",cluster,-76.254822,-42.545815
3,3,"money, expenses, saving","[0.031809524, 0.10850602, -0.027284732, 0.0467...",cluster,-24.427193,-10.131094
4,4,"vaccines, shots","[0.015072287, 0.07646736, 0.0045836503, -0.002...",cluster,-28.157076,106.668648


# Archive