In [22]:
import pandas as pd
import numpy as np
from features.get_all_DD_features import *
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
og_chat_df = pd.read_csv('./data/raw_data/csopII_conversations_withblanks.csv', encoding='mac-roman')
vect_df = pd.read_csv('./data/vectors/sentence/chats/csopII_conversations_withblanks.csv')

In [24]:
def preprocess_conversation_columns(df):
	# remove all special characters from df
	df.columns = df.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)
	
	# If data is grouped by batch/round, add a conversation num
	if {'batch_num', 'round_num'}.issubset(df.columns):
		df['conversation_num'] = df.groupby(['batch_num', 'round_num']).ngroup()
		df = df[df.columns.tolist()[-1:] + df.columns.tolist()[0:-1]] # make the new column first

	return(df)

chat_df = preprocess_conversation_columns(og_chat_df)

In [25]:
chat_df['message_embedding'] = conv_to_float_arr(vect_df['message_embedding'].to_frame())
chat_df

Unnamed: 0,conversation_num,batch_num,vis_img,int_verb,ort_img,rep_man,soc_pers,team_size,difficulty,score,duration,efficiency,speaker_nickname,message,timestamp,message_embedding
0,rHPaiuXkM3Ss4rEsW_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],586.0,107.0,5.476636,oJLD2kfcwGyXkfCeH,Can we get D to the highest score?,2021-02-26T19:11:21.474Z,"[-0.035490743815898895, -0.04712440446019173, ..."
1,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,6QvNn8wdCyviKCjiv,looks good,2021-02-26T19:22:32.695Z,"[-0.13091498613357544, 0.08132250607013702, -0..."
2,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,6qP7EcqvYQL4gmrAX,I like it,2021-02-26T19:23:14.467Z,"[-0.0895904153585434, 0.025255899876356125, -0..."
3,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,NMkPLqERKzTTMqw5Y,It looks good to me,2021-02-26T19:23:20.650Z,"[-0.10534807294607162, 0.05257665365934372, 0...."
4,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,6QvNn8wdCyviKCjiv,stop changing it,2021-02-26T19:23:52.111Z,"[0.0474730059504509, 0.0789831206202507, 0.066..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4614,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,hNDxbBcuh8b56WEGD,same,2021-09-20T18:12:24.809Z,"[-0.11742983758449554, -0.022955235093832016, ..."
4615,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,roWwgXBwsM2kWSrG2,Lol,2021-09-20T18:12:24.894Z,"[-0.005687426775693893, -0.023897483944892883,..."
4616,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,uAWmy8vGcsY6RGDJe,i agree blue,2021-09-20T18:12:27.368Z,"[-0.07660979777574539, -0.018849719315767288, ..."
4617,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,kQY7q8Py3kgANA29w,this looks good,2021-09-20T18:12:31.012Z,"[-0.08458241075277328, 0.0631999671459198, 0.0..."


In [26]:
# METHOD 1: calculate avg cosine similarity between adjacent chats

lex_cohesion_adj_chats = []
lex_cohesion_cumulative = []
fflow_1 = []
fflow_2 = []
cached_cohesion = 0
pair_index = 1


for num, conv in chat_df.groupby(['conversation_num']):
    
    for i, pair in conv.groupby(conv.index // 2):
        
        # last "pair" has only one element, safeguard against this
        if (len(pair) == 2):
            cos_sim_matrix = cosine_similarity([pair.iloc[0]['message_embedding'], pair.iloc[1]['message_embedding']])
            lex_cohesion_adj_chats.append(cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0])
            cached_cohesion += cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]
            lex_cohesion_cumulative.append(cached_cohesion/pair_index)
            pair_index += 1
            
    
    fflow_1.append(sum(lex_cohesion_adj_chats) / len(conv))
    fflow_2.append(sum(lex_cohesion_cumulative) / len(conv))


In [27]:
final = chat_df[['conversation_num']].drop_duplicates()
final['lex_cohesion_1'] = fflow_1
final['lex_cohesion_2'] = fflow_2
final

Unnamed: 0,conversation_num,lex_cohesion_1,lex_cohesion_2
0,rHPaiuXkM3Ss4rEsW_easy,0.106431,0.106431
1,dArSAcrzmb9bR6Pug_easy,0.425722,0.425722
18,sbtKHuLChG8ge734n_easy,0.425722,0.425722
19,guq9kzMuDCpB4hRhK_easy,0.425722,0.425722
23,hSNqR8mxGD2jv3WeZ_easy,0.425722,0.425722
...,...,...,...
4593,Co9eGqeZztGF8AChp_hard,206.662670,213.328490
4594,tpXdfWtrSytWMjEqw_hard,413.325339,426.656981
4597,3T4H8HbEqGqjgMhnP_hard,413.325339,426.656981
4602,EjpddpbaAphpsiptF_hard,51.732692,53.416626


In [14]:
pd.merge(
    left=og_chat_df[['conversation_num']].drop_duplicates(),
    right=final,
    on=['conversation_num'],
    how="inner"
)

Unnamed: 0,conversation_num,lex_cohesion
0,0,0.166486
1,1,0.318432


In [15]:
# METHOD 2: calculate avg cosine similarity between chat and it's preceding chats FOR EVERY CHAT

lex_cohesion = []
fflow = []
cached_cohesion = 0
pair_index = 1

for num, conv in chat_df.groupby(['batch_num', 'round_num']):

    for i, pair in conv.groupby(conv.index // 2):
        
        # last "pair" has only one element, safeguard against this
        if (len(pair) == 2):
            
            cos_sim_matrix = cosine_similarity([pair.iloc[0]['message_embedding'], pair.iloc[1]['message_embedding']])
            cached_cohesion += cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]
            lex_cohesion.append(cached_cohesion/pair_index)
            pair_index += 1
    
    fflow.append(sum(lex_cohesion) / len(conv))


In [16]:
final = chat_df[['batch_num', 'round_num']].drop_duplicates()
final['lex_cohesion'] = fflow
final

Unnamed: 0,batch_num,round_num,lex_cohesion
0,0,0,0.230703
52,0,2,0.414822
