In [1]:
import pandas as pd
import numpy as np

In [2]:
def preprocess_conversation_columns(df):
	"""Sanitize column names and, when possible, add a leading conversation id.

	Every character other than ASCII letters, digits and underscores is
	stripped from the column names. If both 'batch_num' and 'round_num' are
	present afterwards, a 'conversation_num' column holding the group number
	of each (batch_num, round_num) combination is added (or overwritten) and
	placed first.

	Parameters
	----------
	df : pandas.DataFrame
		Raw conversation data. It is not modified; a copy is processed.

	Returns
	-------
	pandas.DataFrame
		A new frame with sanitized column names and, when applicable,
		'conversation_num' as its first column.
	"""
	# Work on a copy so the caller's frame keeps its original column names and
	# does not silently gain a 'conversation_num' column.
	df = df.copy()

	# remove all special characters from the column names
	df.columns = df.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)

	# If data is grouped by batch/round, add a conversation num
	if {'batch_num', 'round_num'}.issubset(df.columns):
		df['conversation_num'] = df.groupby(['batch_num', 'round_num']).ngroup()
		# Move the column to the front by name rather than by "last position":
		# if 'conversation_num' already existed, the assignment above overwrote
		# it in place and the last column would be some unrelated one.
		other_columns = [col for col in df.columns if col != 'conversation_num']
		df = df[['conversation_num'] + other_columns]

	return df

In [3]:
# Load the raw test conversations (column names sanitized by
# preprocess_conversation_columns) and the same-named CSV from ./embeddings,
# whose 'message_embedding' column is attached to test_dd in a later cell.
test_dd = preprocess_conversation_columns(pd.read_csv('./data/raw_data/test_dd.csv'))
test_dd_vect = pd.read_csv('./embeddings/test_dd.csv')

In [4]:
# Load the raw CSOP II conversations (column names sanitized by
# preprocess_conversation_columns) and the same-named CSV from ./embeddings,
# whose 'message_embedding' column is attached to csop in a later cell.
csop = preprocess_conversation_columns(pd.read_csv('./data/raw_data/csopII_conversations_withblanks.csv'))
vect = pd.read_csv('./embeddings/csopII_conversations_withblanks.csv')

In [5]:
from features.get_all_DD_features import *

In [6]:
# Attach each embeddings file's 'message_embedding' column to the matching
# conversation frame. conv_to_float_arr is not defined in this notebook
# (presumably it comes from the star import of features.get_all_DD_features).
# NOTE(review): this assumes the rows of each embeddings CSV line up with the
# rows of the corresponding raw CSV — confirm.
test_dd['message_embedding'] = conv_to_float_arr(test_dd_vect['message_embedding'].to_frame())
csop['message_embedding'] = conv_to_float_arr(vect['message_embedding'].to_frame())

In [7]:
# look at a unit case just for testing
# NOTE: this rebinds `csop` to a single-conversation subset, so every later
# cell that uses `csop` only sees this conversation. Re-run the cells that
# load `csop` and attach its embeddings to get the full frame back.
csop = csop[csop["conversation_num"].isin(["25PJSrbtr8mkHTvcw_easy"])] #  "cb2KMXFqSt2fJqRoa_hard"

In [8]:
# Discursive diversity for the test set; the saved output has one row per
# conversation_num with a discursive_diversity value.
get_DD(test_dd)

Unnamed: 0,conversation_num,discursive_diversity
0,1,0.3870704
1,2,0.8280833
2,3,-2.220446e-16


In [9]:
# Same metric on `csop`, which an earlier cell narrowed to one conversation.
get_DD(csop)

Unnamed: 0,conversation_num,discursive_diversity
0,25PJSrbtr8mkHTvcw_easy,0.726263


In [10]:
# nanp = csop.loc[csop['conversation_num'] == '28LFTFoBTha9WeCxT_easy']
# hi = np.mean(nanp['message_embedding'])
# pairs = get_unique_pairwise_combos(hi)
# np.nanmean(pairs)

In [11]:
# Inspect the raw rows of the conversation under test.
csop.loc[csop['conversation_num'] == '25PJSrbtr8mkHTvcw_easy']

Unnamed: 0,conversation_num,batch_num,vis_img,int_verb,ort_img,rep_man,soc_pers,team_size,difficulty,score,duration,efficiency,speaker_nickname,message,timestamp,message_embedding
727,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,I think the next combination above 500 that wo...,2021-03-19T18:20:28.277Z,"[0.019077956676483154, -0.010986157692968845, ..."
728,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,I think 504 would still be ok?,2021-03-19T18:22:33.265Z,"[-0.025425974279642105, 0.08265340328216553, -..."
729,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,604-100,2021-03-19T18:22:41.278Z,"[-0.03982073813676834, 0.08185562491416931, -0..."
730,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,hF7iWkQvsomPnXY7S,sure,2021-03-19T18:22:50.937Z,"[-0.13950251042842865, -0.03979048505425453, -..."


In [12]:
# Chunk the test conversations; the saved output shows a new chunk_num column.
# NOTE(review): the meaning of the second argument (3600 here, 3 for csop
# below) is not visible in this notebook — confirm against assign_chunk_nums.
test_dd_chunked = assign_chunk_nums(test_dd, 3600)
test_dd_chunked

Unnamed: 0,conversation_num,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes,message_embedding,chunk_num
0,1,A,2020-04-20T18:27:20.125Z,"Hey, folks! What a sunny day, huh?",1,1,0.333333,3,"[-0.05474695563316345, 0.03426985442638397, 0....",0
1,1,B,2020-04-20T18:27:23.764Z,Totally! The weather's awesome. Perfect for a ...,1,1,0.333333,3,"[-0.07623177021741867, -0.03382902592420578, 0...",0
2,1,C,2020-04-20T18:27:27.724Z,Absolutely! The sun's warmth lifts my spirits....,1,1,0.333333,3,"[0.022043755277991295, 0.06103178858757019, 0....",1
3,1,A,2020-04-20T18:27:30.410Z,Your enthusiasm is contagious. We're like a sy...,1,1,0.333333,3,"[-0.00782862026244402, -0.12148767709732056, 0...",1
4,1,B,2020-04-20T18:27:35.506Z,"Exactly! Our thoughts harmonize effortlessly, ...",1,1,0.333333,3,"[0.026491936296224594, -0.0734446570277214, 0....",2
5,1,C,2020-04-20T18:28:09.451Z,"Well said! Our words sway together, a testamen...",1,1,0.333333,3,"[0.02942504733800888, 0.006728546228259802, 0....",2
6,1,A,2020-04-20T18:28:20.136Z,Your eloquence is inspiring! We're instruments...,1,1,0.333333,3,"[-0.0029373159632086754, -0.038919854909181595...",3
7,1,B,2020-04-20T18:28:20.136Z,Beautifully put! Our conversation is a masterp...,1,1,0.333333,3,"[-0.039978738874197006, 0.02529899962246418, 0...",3
8,2,A,2020-04-20T18:27:20.125Z,"Salutations, esteemed companions! Have you eve...",1,1,0.333333,3,"[-0.04252244532108307, -0.045107126235961914, ...",0
9,2,B,2020-04-20T18:27:23.764Z,"Yo, folks! Guess what? Last night's game was i...",1,1,0.333333,3,"[0.013962630182504654, 0.024840209633111954, 0...",0


In [13]:
# Chunk the single CSOP conversation, passing 3 as the second argument.
# In the output saved with this notebook only chunk_num 0 and 1 appear.
csop_chunked = assign_chunk_nums(csop, 3)
csop_chunked

Unnamed: 0,conversation_num,batch_num,vis_img,int_verb,ort_img,rep_man,soc_pers,team_size,difficulty,score,duration,efficiency,speaker_nickname,message,timestamp,message_embedding,chunk_num
727,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,I think the next combination above 500 that wo...,2021-03-19T18:20:28.277Z,"[0.019077956676483154, -0.010986157692968845, ...",0
728,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,I think 504 would still be ok?,2021-03-19T18:22:33.265Z,"[-0.025425974279642105, 0.08265340328216553, -...",0
729,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,604-100,2021-03-19T18:22:41.278Z,"[-0.03982073813676834, 0.08185562491416931, -0...",1
730,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,hF7iWkQvsomPnXY7S,sure,2021-03-19T18:22:50.937Z,"[-0.13950251042842865, -0.03979048505425453, -...",1


In [14]:
# Rows of the conversation under test, taken from the chunked frame; used by
# the manual per-chunk computation in the next cell.
conv = csop_chunked.loc[csop_chunked['conversation_num'] == '25PJSrbtr8mkHTvcw_easy']

In [15]:
# Manual check: for each chunk of `conv`, build the pairwise combinations of
# message embeddings (via get_unique_pairwise_combos) and record the mean
# cosine distance (1 - cosine similarity) over those pairs.
saving = conv.groupby(['chunk_num'])['message_embedding'].apply(get_unique_pairwise_combos).reset_index()
cos_dists_mean_widay_btwu = []

for lst in saving.message_embedding:

    # Cosine distances for this chunk's list of tuples
    cos_dists = []

    # Make sure list isn't empty:
    if lst:
        for tpl in lst:
            try:
                cos_d = 1 - get_cosine_similarity([tpl[0], tpl[1]])
                cos_dists.append(cos_d)
            except ValueError:
                # Occurs when np.nan in tuple; that pair is skipped
                continue

    if cos_dists:
        # Compute mean of cos dists
        cos_dists_mean_widay_btwu.append(np.nanmean(cos_dists, dtype="float64"))
    else:
        # No usable pair (empty list, or every pair was skipped): record NaN
        # explicitly instead of calling np.nanmean on an empty list, which
        # returns NaN only after emitting a "Mean of empty slice" RuntimeWarning.
        cos_dists_mean_widay_btwu.append(np.nan)
cos_dists_mean_widay_btwu

[0.5788580914150436, 0.6963058408938405]

In [16]:
# Rows of the conversation that fall in chunk '0'.
# NOTE(review): chunk_num is compared against a string literal; the saved
# output shows matching rows, so chunk_num appears to hold strings — confirm.
csop_chunked.loc[(csop_chunked['conversation_num'] == '25PJSrbtr8mkHTvcw_easy') & (csop_chunked['chunk_num'] == '0')]

Unnamed: 0,conversation_num,batch_num,vis_img,int_verb,ort_img,rep_man,soc_pers,team_size,difficulty,score,duration,efficiency,speaker_nickname,message,timestamp,message_embedding,chunk_num
727,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,I think the next combination above 500 that wo...,2021-03-19T18:20:28.277Z,"[0.019077956676483154, -0.010986157692968845, ...",0
728,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,I think 504 would still be ok?,2021-03-19T18:22:33.265Z,"[-0.025425974279642105, 0.08265340328216553, -...",0


In [17]:
# Rows of the conversation that fall in chunk '1' (string comparison).
csop_chunked.loc[(csop_chunked['conversation_num'] == '25PJSrbtr8mkHTvcw_easy') & (csop_chunked['chunk_num'] == '1')]

Unnamed: 0,conversation_num,batch_num,vis_img,int_verb,ort_img,rep_man,soc_pers,team_size,difficulty,score,duration,efficiency,speaker_nickname,message,timestamp,message_embedding,chunk_num
729,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,NiBqtamR7LYPByEoz,604-100,2021-03-19T18:22:41.278Z,"[-0.03982073813676834, 0.08185562491416931, -0...",1
730,25PJSrbtr8mkHTvcw_easy,1,Mixed,High,Low,High,Mixed,2,Easy [Corresponds to 'Hard' in PNAS],604.0,559.0,1.080501,hF7iWkQvsomPnXY7S,sure,2021-03-19T18:22:50.937Z,"[-0.13950251042842865, -0.03979048505425453, -...",1


In [18]:
# Rows of the conversation that fall in chunk '2' (string comparison); the
# saved output is empty, i.e. no row carries that chunk number.
csop_chunked.loc[(csop_chunked['conversation_num'] == '25PJSrbtr8mkHTvcw_easy') & (csop_chunked['chunk_num'] == '2')]

Unnamed: 0,conversation_num,batch_num,vis_img,int_verb,ort_img,rep_man,soc_pers,team_size,difficulty,score,duration,efficiency,speaker_nickname,message,timestamp,message_embedding,chunk_num


In [19]:
# Apply get_DD separately to each (conversation_num, chunk_num) group.
csop_chunked.groupby(['conversation_num', 'chunk_num']).apply(get_DD)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,conversation_num,discursive_diversity
conversation_num,chunk_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
25PJSrbtr8mkHTvcw_easy,0,0,25PJSrbtr8mkHTvcw_easy,
25PJSrbtr8mkHTvcw_easy,1,0,25PJSrbtr8mkHTvcw_easy,0.696306


In [20]:
# Same computation as the previous cell, rendered as plain text via to_string().
print(csop_chunked.groupby(['conversation_num', 'chunk_num']).apply(get_DD).to_string())

                                          conversation_num  discursive_diversity
conversation_num       chunk_num                                                
25PJSrbtr8mkHTvcw_easy 0         0  25PJSrbtr8mkHTvcw_easy                   NaN
                       1         0  25PJSrbtr8mkHTvcw_easy              0.696306


In [21]:
# Per-conversation variance_in_DD from the imported helper. The saved output
# is NaN here — possibly because only one of the two chunks above has a
# non-NaN discursive_diversity; TODO confirm.
get_variance_in_DD(csop_chunked)

Unnamed: 0,conversation_num,variance_in_DD
0,25PJSrbtr8mkHTvcw_easy,


In [22]:
# num_chunks = 3
# inter_chunk_range = [ [] for i in range(num_chunks - 1)]

# index = []
# for i in range(num_chunks - 1):
#     index.append("c" + str(i) + "_c" + str(i + 1))
# index
# pd.DataFrame(inter_chunk_range, index=index).T

# Within-person discursive range for the single CSOP conversation, called
# with num_chunks = 3.
get_within_person_disc_range(csop_chunked, num_chunks = 3)

### TODO / OPEN QUESTION - how did we get values for c1_c2 for `25PJSrbtr8mkHTvcw_easy`?
## There are only 2 chunks (chunk_num 0 and 1) for this conversation, yet num_chunks = 3 was requested.

Unnamed: 0_level_0,incongruent_modulation,within_person_disc_range
conversation_num,Unnamed: 1_level_1,Unnamed: 2_level_1
25PJSrbtr8mkHTvcw_easy,0.012617,0.726844


In [23]:
# Same helper on the chunked test set; the saved output has one row per
# conversation_num.
get_within_person_disc_range(test_dd_chunked, num_chunks = 3)


Unnamed: 0_level_0,incongruent_modulation,within_person_disc_range
conversation_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.057138,2.629374
2,0.220298,2.305481
3,0.514338,1.756709
