In [1]:
import pandas as pd
import numpy as np
import re
from features.get_all_DD_features import *
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
og_chat_df = pd.read_csv('./data/raw_data/juries_tiny_for_testing.csv', encoding='mac-roman')
vect_df = pd.read_csv('./data/vectors/sentence/chats/csopII_conversations_withblanks.csv')

In [3]:
def preprocess_conversation_columns(df):
	# remove all special characters from df
	df.columns = df.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)
	
	# If data is grouped by batch/round, add a conversation num
	if {'batch_num', 'round_num'}.issubset(df.columns):
		df['conversation_num'] = df.groupby(['batch_num', 'round_num']).ngroup()
		df = df[df.columns.tolist()[-1:] + df.columns.tolist()[0:-1]] # make the new column first

	return(df)

def preprocess_text(text):
  	# For each individual message: preprocess to remove anything that is not an alphabet or number from the string
	return(re.sub(r"[^a-zA-Z0-9 ]+", '',text).lower())

chat_df = preprocess_conversation_columns(og_chat_df)
chat_df["message"] = chat_df["message"].astype(str).apply(preprocess_text)

In [4]:
chat_df['message_embedding'] = conv_to_float_arr(vect_df['message_embedding'].to_frame())
chat_df

Unnamed: 0,conversation_num,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes,message_embedding
0,0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,hello,1.0,1,0.333333,3,"[-0.035490743815898895, -0.04712440446019173, ..."
1,0,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,hi,1.0,1,0.333333,3,"[-0.13091498613357544, 0.08132250607013702, -0..."
2,0,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,hello,1.0,1,0.333333,3,"[-0.0895904153585434, 0.025255899876356125, -0..."
3,0,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20T18:27:30.410Z,hi,1.0,1,0.333333,3,"[-0.10534807294607162, 0.05257665365934372, 0...."
4,0,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20T18:27:35.506Z,hi,1.0,1,0.333333,3,"[0.0474730059504509, 0.0789831206202507, 0.066..."
...,...,...,...,...,...,...,...,...,...,...,...,...
92,1,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:02:55.111Z,i say asshole under stress,0.6,0,0.000000,5,"[-0.06429783999919891, -0.028240181505680084, ..."
93,1,0,2,5d6feec65f80ae21f5c5f054,conventionalMonkey,2020-04-20T19:03:21.819Z,yes she is the asshole unfortunately husband h...,0.6,0,0.000000,5,"[0.08033685386180878, -0.037223298102617264, -..."
94,1,0,2,5d482ea421c9be351f762255,newPanda,2020-04-20T19:03:36.308Z,i think she is being presumptuous and acting l...,0.6,0,0.000000,5,"[-0.04056394845247269, 8.258870366262272e-05, ..."
95,1,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:03:53.219Z,thas true she inst considering her husband and...,0.6,0,0.000000,5,"[-0.020940907299518585, 0.015346740372478962, ..."


In [26]:
# METHOD 1: calculate lexical cohesion via avg cosine similarity between adjacent chats
# METHOD 2: calculate lexical cohesion via avg cosine similarity between given chat and all preceding chats
# AT THE CONVERSATION LEVEL: take the forward flow of the LAST CHAT --> this represents forward flow for the conversation
# POTENTIAL NEW FEATURE --> tie into mimicry: similarity with the previous chat 

lex_cohesion_adj_chats = []
lex_cohesion_cumulative = []

fflow_1 = []
fflow_2 = []

cached_pairwise_sims = 0

# divide by n - 1
chat_index = 1


for num, conv in chat_df.groupby(['conversation_num']):
    
    for i, pair in conv.groupby(conv.index // 2):
        
        # last "pair" has only one element, safeguard against this
        if (len(pair) == 2):
            cos_sim_matrix = cosine_similarity([pair.iloc[0]['message_embedding'], pair.iloc[1]['message_embedding']])
            cosine_sim = cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]
            cached_pairwise_sims += cosine_sim
            
            lex_cohesion_adj_chats.append(cosine_sim)
            cached_cohesion += cosine_sim
            lex_cohesion_cumulative.append(cached_cohesion/pair_index)
            pair_index += 1
            
    
    fflow_1.append(sum(lex_cohesion_adj_chats) / len(conv))
    fflow_2.append(sum(lex_cohesion_cumulative) / len(conv))


In [None]:
# METHOD 1: calculate lexical cohesion via avg cosine similarity between adjacent chats
# METHOD 2: calculate lexical cohesion via avg cosine similarity between given chat and all preceding chats
# AT THE CONVERSATION LEVEL: take the forward flow of the LAST CHAT --> this represents forward flow for the conversation
# POTENTIAL NEW FEATURE --> tie into mimicry: similarity with the previous chat 

lex_cohesion_adj_chats = []
lex_cohesion_cumulative = []

fflow_1 = []
fflow_2 = []

cached_pairwise_sims = 0

# divide by n - 1
chat_index = 1


for num, conv in chat_df.groupby(['conversation_num']):
    
    for i, pair in conv.groupby(conv.index // 2):
        
        # last "pair" has only one element, safeguard against this
        if (len(pair) == 2):
            cos_sim_matrix = cosine_similarity([pair.iloc[0]['message_embedding'], pair.iloc[1]['message_embedding']])
            cosine_sim = cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]
            cached_pairwise_sims += cosine_sim
            
            lex_cohesion_adj_chats.append(cosine_sim)
            cached_cohesion += cosine_sim
            lex_cohesion_cumulative.append(cached_cohesion/pair_index)
            pair_index += 1
            
    
    fflow_1.append(sum(lex_cohesion_adj_chats) / len(conv))
    fflow_2.append(sum(lex_cohesion_cumulative) / len(conv))


In [5]:
# METHOD 1: COHESION WITH CONVERSATION SO FAR (CHAT LEVEL WITH SUM STATS)

forward_flow = []

for num, conv in chat_df.groupby(['conversation_num'],  sort=False):

    forward_flow.append(0)
    
    cached_embedding = conv.iloc[0]["message_embedding"]
    chat_count = 1
    avg_embedding = cached_embedding / chat_count
    

    for index, row in conv[1:].iterrows():
        
        # determine distance from that and prev average, append to the list
        cos_sim_matrix = cosine_similarity([row['message_embedding'], avg_embedding])
        cosine_sim = cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]

        forward_flow.append(cosine_sim)

        # add to cache, increment count
        cached_embedding += row["message_embedding"]
        chat_count += 1
        
        # calculate new average
        avg = cached_embedding / chat_count
    

chat_df["forward_flow"] = forward_flow


In [6]:
chat_df

Unnamed: 0,conversation_num,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes,message_embedding,forward_flow
0,0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,hello,1.0,1,0.333333,3,"[-1.8343351350631565, 0.5779136352939531, -0.1...",0.000000
1,0,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,hi,1.0,1,0.333333,3,"[-0.13091498613357544, 0.08132250607013702, -0...",0.053008
2,0,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,hello,1.0,1,0.333333,3,"[-0.0895904153585434, 0.025255899876356125, -0...",0.107159
3,0,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20T18:27:30.410Z,hi,1.0,1,0.333333,3,"[-0.10534807294607162, 0.05257665365934372, 0....",0.072078
4,0,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20T18:27:35.506Z,hi,1.0,1,0.333333,3,"[0.0474730059504509, 0.0789831206202507, 0.066...",0.071052
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,1,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:02:55.111Z,i say asshole under stress,0.6,0,0.000000,5,"[-0.06429783999919891, -0.028240181505680084, ...",0.220661
93,1,0,2,5d6feec65f80ae21f5c5f054,conventionalMonkey,2020-04-20T19:03:21.819Z,yes she is the asshole unfortunately husband h...,0.6,0,0.000000,5,"[0.08033685386180878, -0.037223298102617264, -...",0.013486
94,1,0,2,5d482ea421c9be351f762255,newPanda,2020-04-20T19:03:36.308Z,i think she is being presumptuous and acting l...,0.6,0,0.000000,5,"[-0.04056394845247269, 8.258870366262272e-05, ...",0.082777
95,1,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:03:53.219Z,thas true she inst considering her husband and...,0.6,0,0.000000,5,"[-0.020940907299518585, 0.015346740372478962, ...",0.206098


In [7]:
# METHOD 2: PURELY PAIRWISE AKA MIMICRY IMPROVEMENT (CHAT LEVEL WITH SUM STATS)

mimicry = []


for num, conv in chat_df.groupby(['conversation_num'],  sort=False):

    mimicry.append(0)
    prev_embedding = conv.iloc[0]["message_embedding"]
    
    for index, row in conv[1:].iterrows():
        
        # last "pair" has only one element, safeguard against this
        cos_sim_matrix = cosine_similarity([row['message_embedding'], prev_embedding])
        cosine_sim = cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]
        
        mimicry.append(cosine_sim)

        prev_embedding = row["message_embedding"]
            
    

chat_df["mimicry"] = mimicry

In [8]:
chat_df

Unnamed: 0,conversation_num,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes,message_embedding,forward_flow,mimicry
0,0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,hello,1.0,1,0.333333,3,"[-1.8343351350631565, 0.5779136352939531, -0.1...",0.000000,0.000000
1,0,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,hi,1.0,1,0.333333,3,"[-0.13091498613357544, 0.08132250607013702, -0...",0.053008,0.452433
2,0,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,hello,1.0,1,0.333333,3,"[-0.0895904153585434, 0.025255899876356125, -0...",0.107159,0.393936
3,0,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20T18:27:30.410Z,hi,1.0,1,0.333333,3,"[-0.10534807294607162, 0.05257665365934372, 0....",0.072078,0.412303
4,0,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20T18:27:35.506Z,hi,1.0,1,0.333333,3,"[0.0474730059504509, 0.0789831206202507, 0.066...",0.071052,0.195987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,1,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:02:55.111Z,i say asshole under stress,0.6,0,0.000000,5,"[-0.06429783999919891, -0.028240181505680084, ...",0.220661,0.088508
93,1,0,2,5d6feec65f80ae21f5c5f054,conventionalMonkey,2020-04-20T19:03:21.819Z,yes she is the asshole unfortunately husband h...,0.6,0,0.000000,5,"[0.08033685386180878, -0.037223298102617264, -...",0.013486,0.032123
94,1,0,2,5d482ea421c9be351f762255,newPanda,2020-04-20T19:03:36.308Z,i think she is being presumptuous and acting l...,0.6,0,0.000000,5,"[-0.04056394845247269, 8.258870366262272e-05, ...",0.082777,0.042118
95,1,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:03:53.219Z,thas true she inst considering her husband and...,0.6,0,0.000000,5,"[-0.020940907299518585, 0.015346740372478962, ...",0.206098,0.476291


In [52]:
# METHOD 3: CUMULATIVE AVERAGE OF PAIRLY PAIRWISE (CHAT LEVEL WITHOUT SUM STATS)

moving_mimicry = []


for num, conv in chat_df.groupby(['conversation_num'], sort = False):

    moving_mimicry.append(0)
    prev_embedding = conv.iloc[0]["message_embedding"]
    cached_pairwise_sims = 0
    chat_count = 1
    prev_mimicry = 0
    
    for index, row in conv[1:].iterrows():
    
        # find cosine similarity between current pair
        cos_sim_matrix = cosine_similarity([row['message_embedding'], prev_embedding])
        cosine_sim = cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]

        # average this distance with the previous average
        
        moving_mimicry.append((cosine_sim + prev_mimicry)/2)

        # update valyes
        cached_pairwise_sims += cosine_sim
        chat_count += 1
        prev_mimicry = cached_pairwise_sims/chat_count
            
    
chat_df["moving_mimicry"] = moving_mimicry


In [53]:
chat_df

Unnamed: 0,conversation_num,batch_num,vis_img,int_verb,ort_img,rep_man,soc_pers,team_size,difficulty,score,duration,efficiency,speaker_nickname,message,timestamp,message_embedding,mimicry,moving_mimicry
0,rHPaiuXkM3Ss4rEsW_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],586.0,107.0,5.476636,oJLD2kfcwGyXkfCeH,can we get d to the highest score,2021-02-26T19:11:21.474Z,"[-0.035490743815898895, -0.04712440446019173, ...",0.000000,0.000000
1,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,6QvNn8wdCyviKCjiv,looks good,2021-02-26T19:22:32.695Z,"[-0.13091498613357544, 0.08132250607013702, -0...",0.000000,0.000000
2,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,6qP7EcqvYQL4gmrAX,i like it,2021-02-26T19:23:14.467Z,"[-0.0895904153585434, 0.025255899876356125, -0...",0.393936,0.196968
3,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,NMkPLqERKzTTMqw5Y,it looks good to me,2021-02-26T19:23:20.650Z,"[-0.10534807294607162, 0.05257665365934372, 0....",0.412303,0.401426
4,dArSAcrzmb9bR6Pug_easy,1,High,High,High,High,High,5,Easy [Corresponds to 'Hard' in PNAS],559.0,406.0,1.376847,6QvNn8wdCyviKCjiv,stop changing it,2021-02-26T19:23:52.111Z,"[0.0474730059504509, 0.0789831206202507, 0.066...",0.195987,0.280180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4614,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,hNDxbBcuh8b56WEGD,same,2021-09-20T18:12:24.809Z,"[-0.11742983758449554, -0.022955235093832016, ...",0.213298,0.092388
4615,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,roWwgXBwsM2kWSrG2,lol,2021-09-20T18:12:24.894Z,"[-0.005687426775693893, -0.023897483944892883,...",0.293988,0.114192
4616,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,uAWmy8vGcsY6RGDJe,i agree blue,2021-09-20T18:12:27.368Z,"[-0.07660979777574539, -0.018849719315767288, ...",0.145138,0.107166
4617,PP8QdaXDj8e7GBPT3_hard,5,Mixed,Low,Low,Low,High,8,Hard [Corresponds to 'Super Hard' in PNAS],721.0,86.0,8.383721,kQY7q8Py3kgANA29w,this looks good,2021-09-20T18:12:31.012Z,"[-0.08458241075277328, 0.0631999671459198, 0.0...",0.222520,0.114797


In [27]:
final = chat_df[['conversation_num']].drop_duplicates()
final['lex_cohesion_1'] = fflow_1
final['lex_cohesion_2'] = fflow_2
final

Unnamed: 0,conversation_num,lex_cohesion_1,lex_cohesion_2
0,rHPaiuXkM3Ss4rEsW_easy,0.106431,0.106431
1,dArSAcrzmb9bR6Pug_easy,0.425722,0.425722
18,sbtKHuLChG8ge734n_easy,0.425722,0.425722
19,guq9kzMuDCpB4hRhK_easy,0.425722,0.425722
23,hSNqR8mxGD2jv3WeZ_easy,0.425722,0.425722
...,...,...,...
4593,Co9eGqeZztGF8AChp_hard,206.662670,213.328490
4594,tpXdfWtrSytWMjEqw_hard,413.325339,426.656981
4597,3T4H8HbEqGqjgMhnP_hard,413.325339,426.656981
4602,EjpddpbaAphpsiptF_hard,51.732692,53.416626


In [14]:
pd.merge(
    left=og_chat_df[['conversation_num']].drop_duplicates(),
    right=final,
    on=['conversation_num'],
    how="inner"
)

Unnamed: 0,conversation_num,lex_cohesion
0,0,0.166486
1,1,0.318432


In [15]:
# METHOD 2: calculate avg cosine similarity between chat and it's preceding chats FOR EVERY CHAT

lex_cohesion = []
fflow = []
cached_cohesion = 0
pair_index = 1

for num, conv in chat_df.groupby(['batch_num', 'round_num']):

    for i, pair in conv.groupby(conv.index // 2):
        
        # last "pair" has only one element, safeguard against this
        if (len(pair) == 2):
            
            cos_sim_matrix = cosine_similarity([pair.iloc[0]['message_embedding'], pair.iloc[1]['message_embedding']])
            cached_cohesion += cos_sim_matrix[np.triu_indices(len(cos_sim_matrix), k = 1)][0]
            lex_cohesion.append(cached_cohesion/pair_index)
            pair_index += 1
    
    fflow.append(sum(lex_cohesion) / len(conv))


In [16]:
final = chat_df[['batch_num', 'round_num']].drop_duplicates()
final['lex_cohesion'] = fflow
final

Unnamed: 0,batch_num,round_num,lex_cohesion
0,0,0,0.230703
52,0,2,0.414822
