In [28]:
import pandas as pd
import numpy as np
from scipy import stats
import itertools
import scikit_posthocs as sp
import pingouin as pg

In [29]:
set_comp = pd.read_csv("set_comparison.csv")
val_feats = pd.read_csv("valence_features.csv")
label_feats = pd.read_csv("labels_features.csv")
sublabel_feats = pd.read_csv("sublabels_features.csv")

In [30]:
set_comp_val = set_comp[set_comp["level"] == "valence"]
set_comp_lab = set_comp[set_comp["level"] == "label"]
set_comp_sublab = set_comp[set_comp["level"] == "sublabel"]

### Significance tests on difference in classification performance between datasets

Seperate Test for each Label-Level/Classifier

#### Normality/Equality of Variance

In [35]:
pg.normality(data=set_comp_val, dv="f1", group="Unnamed: 1")

Unnamed: 0_level_0,W,pval,normal
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GLoHBCD,0.971342,0.902998,True
Smoke Stop Forum,0.917287,0.334884,True
Health Coach Dialogue Corpus,0.950494,0.674351,True
Synthetic GPT3 Data,0.930818,0.45599,True
DARN-CT-based Wizard of Oz Dialogues,0.738194,0.002563,False
Optifast Mock-Chatbot,0.942451,0.580591,True


In [37]:
pg.normality(data=set_comp_lab, dv="f1", group="Unnamed: 1")

Unnamed: 0_level_0,W,pval,normal
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GLoHBCD,0.946633,0.628866,True
Instagram Data,0.919511,0.352897,True
Smoke Stop Forum,0.937126,0.5215,True
Synthetic GPT3 Data,0.979513,0.96251,True
Health Coach Dialogue Corpus,0.968239,0.874057,True
DARN-CT-based Wizard of Oz Dialogues,0.966922,0.860929,True
Optifast Mock-Chatbot,0.95605,0.740062,True


In [44]:
pg.homoscedasticity(data=set_comp_lab, dv="f1", group="Unnamed: 1")

Unnamed: 0,W,pval,equal_var
levene,7.111869,8e-06,False


In [38]:
pg.normality(data=set_comp_sublab, dv="f1", group="Unnamed: 1")

Unnamed: 0_level_0,W,pval,normal
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GLoHBCD,0.866804,0.091731,True
Instagram Data,0.865193,0.087831,True
Smoke Stop Forum,0.922762,0.380569,True
Synthetic GPT3 Data,0.861351,0.079158,True
Health Coach Dialogue Corpus,0.897356,0.204877,True
DARN-CT-based Wizard of Oz Dialogues,0.923477,0.386867,True
Optifast Mock-Chatbot,0.936946,0.519557,True


In [45]:
pg.homoscedasticity(data=set_comp_sublab, dv="f1", group="Unnamed: 1")

Unnamed: 0,W,pval,equal_var
levene,6.546796,2.1e-05,False


#### Friedman/Anova

In [57]:
pg.friedman(data=set_comp_val, dv="f1", within="Unnamed: 1", subject="Unnamed: 0")

Unnamed: 0,Source,W,ddof1,Q,p-unc
Friedman,Unnamed: 1,0.756571,5,37.828571,4.084558e-07


In [61]:
"{:.8f}".format(float("4.084558e-07"))

'0.00000041'

In [46]:
pg.friedman(data=set_comp_lab, dv="f1", within="Unnamed: 1", subject="Unnamed: 0")

Unnamed: 0,Source,W,ddof1,Q,p-unc
Friedman,Unnamed: 1,0.918571,6,55.114286,4.395585e-10


In [62]:
"{:.8f}".format(float("4.395585e-10"))

'0.00000000'

In [47]:
pg.friedman(data=set_comp_sublab, dv="f1", within="Unnamed: 1", subject="Unnamed: 0")

Unnamed: 0,Source,W,ddof1,Q,p-unc
Friedman,Unnamed: 1,0.831429,6,49.885714,4.955702e-09


In [63]:
"{:.8f}".format(float("4.955702e-09"))

'0.00000000'

#### Post-Hoc

In [50]:
sp.posthoc_wilcoxon(set_comp_val, val_col="f1", group_col="Unnamed: 1", p_adjust="bonferroni")

Unnamed: 0,GLoHBCD,Smoke Stop Forum,Health Coach Dialogue Corpus,Synthetic GPT3 Data,DARN-CT-based Wizard of Oz Dialogues,Optifast Mock-Chatbot
GLoHBCD,1.0,0.029297,0.029297,0.029297,1.0,1.0
Smoke Stop Forum,0.029297,1.0,1.0,0.029297,1.0,0.029297
Health Coach Dialogue Corpus,0.029297,1.0,1.0,0.029297,0.292969,0.029297
Synthetic GPT3 Data,0.029297,0.029297,0.029297,1.0,0.058594,0.058594
DARN-CT-based Wizard of Oz Dialogues,1.0,1.0,0.292969,0.058594,1.0,1.0
Optifast Mock-Chatbot,1.0,0.029297,0.029297,0.058594,1.0,1.0


In [49]:
sp.posthoc_wilcoxon(set_comp_lab, val_col="f1", group_col="Unnamed: 1", p_adjust="bonferroni").round(4)

Unnamed: 0,GLoHBCD,Instagram Data,Smoke Stop Forum,Synthetic GPT3 Data,Health Coach Dialogue Corpus,DARN-CT-based Wizard of Oz Dialogues,Optifast Mock-Chatbot
GLoHBCD,1.0,0.041,0.041,0.041,0.041,0.041,0.041
Instagram Data,0.041,1.0,0.041,0.041,0.041,0.041,1.0
Smoke Stop Forum,0.041,0.041,1.0,0.041,0.082,0.041,1.0
Synthetic GPT3 Data,0.041,0.041,0.041,1.0,0.041,0.2051,0.041
Health Coach Dialogue Corpus,0.041,0.041,0.082,0.041,1.0,0.041,0.2051
DARN-CT-based Wizard of Oz Dialogues,0.041,0.041,0.041,0.2051,0.041,1.0,0.041
Optifast Mock-Chatbot,0.041,1.0,1.0,0.041,0.2051,0.041,1.0


In [51]:
sp.posthoc_wilcoxon(set_comp_sublab, val_col="f1", group_col="Unnamed: 1", p_adjust="bonferroni").round(4)

Unnamed: 0,GLoHBCD,Instagram Data,Smoke Stop Forum,Synthetic GPT3 Data,Health Coach Dialogue Corpus,DARN-CT-based Wizard of Oz Dialogues,Optifast Mock-Chatbot
GLoHBCD,1.0,0.041,0.041,0.082,0.041,1.0,0.041
Instagram Data,0.041,1.0,0.041,0.041,0.041,0.041,0.041
Smoke Stop Forum,0.041,0.041,1.0,0.082,0.041,1.0,0.041
Synthetic GPT3 Data,0.082,0.041,0.082,1.0,0.2871,1.0,0.041
Health Coach Dialogue Corpus,0.041,0.041,0.041,0.2871,1.0,1.0,0.041
DARN-CT-based Wizard of Oz Dialogues,1.0,0.041,1.0,1.0,1.0,1.0,0.041
Optifast Mock-Chatbot,0.041,0.041,0.041,0.041,0.041,0.041,1.0


### Significance Tests for differences in Classification Performance when using different lexical features for training & classification



#### Normality/Equality of Variance

In [31]:
pg.normality(data=val_feats, dv="f1", group="classif")

Unnamed: 0_level_0,W,pval,normal
classif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NNAV,0.876256,2e-05,False
NNA,0.919008,0.0007,False
All Features,0.938544,0.004664,False
NVA,0.940009,0.00542,False
NN,0.95347,0.022744,False
NFW,0.950282,0.016059,False


In [32]:
pg.normality(data=label_feats, dv="f1", group="classif")

Unnamed: 0_level_0,W,pval,normal
classif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All Features,0.968492,0.074874,True
NNA,0.967852,0.068891,True
NN,0.986665,0.666175,True
NFW,0.964829,0.046524,False
NNAV,0.957255,0.017703,False
NVA,0.979433,0.304032,True


In [33]:
pg.normality(data=sublabel_feats, dv="f1", group="classif")

Unnamed: 0_level_0,W,pval,normal
classif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All Features,0.900177,3.9e-05,False
NFW,0.959425,0.023274,False
NN,0.925858,0.000481,False
NNA,0.92016,0.000268,False
NNAV,0.962782,0.035728,False
NVA,0.972745,0.130275,True


#### Friedman Test

In [73]:
pg.friedman(data=val_feats, dv="f1", within="classif", subject="Unnamed: 0")

Unnamed: 0,Source,W,ddof1,Q,p-unc
Friedman,classif,0.570286,5,28.514286,2.9e-05


In [74]:
pg.friedman(data=label_feats, dv="f1", within="classif", subject="Unnamed: 0")

Unnamed: 0,Source,W,ddof1,Q,p-unc
Friedman,classif,0.976,5,48.8,2.43825e-09


In [75]:
pg.friedman(data=sublabel_feats, dv="f1", within="classif", subject="Unnamed: 0")

Unnamed: 0,Source,W,ddof1,Q,p-unc
Friedman,classif,0.965714,5,48.285714,3.105492e-09


#### Post-hoc

In [78]:
sp.posthoc_wilcoxon(val_feats, val_col="f1", group_col="classif", p_adjust="bonferroni").round(4)

Unnamed: 0,NNAV,NNA,All Features,NVA,NN,NFW
NNAV,1.0,0.0006,0.0,0.0046,0.0451,0.1465
NNA,0.0006,1.0,0.0,0.0001,0.0,1.0
All Features,0.0,0.0,1.0,0.0,0.0,0.0
NVA,0.0046,0.0001,0.0,1.0,1.0,0.1286
NN,0.0451,0.0,0.0,1.0,1.0,0.0251
NFW,0.1465,1.0,0.0,0.1286,0.0251,1.0


In [79]:
sp.posthoc_wilcoxon(label_feats, val_col="f1", group_col="classif", p_adjust="bonferroni").round(4)

Unnamed: 0,All Features,NNA,NN,NFW,NNAV,NVA
All Features,1.0,0.0,0.0,0.0,0.0,0.0
NNA,0.0,1.0,0.8406,0.0001,0.0,0.0
NN,0.0,0.8406,1.0,0.0,0.0,0.0
NFW,0.0,0.0001,0.0,1.0,0.0,0.0
NNAV,0.0,0.0,0.0,0.0,1.0,0.0
NVA,0.0,0.0,0.0,0.0,0.0,1.0


In [80]:
sp.posthoc_wilcoxon(sublabel_feats, val_col="f1", group_col="classif", p_adjust="bonferroni").round(4)

Unnamed: 0,All Features,NFW,NN,NNA,NNAV,NVA
All Features,1.0,0.0,0.0,0.0,0.0,0.0
NFW,0.0,1.0,0.0,0.0,0.0061,0.0
NN,0.0,0.0,1.0,0.0,0.0,0.0
NNA,0.0,0.0,0.0,1.0,0.0,0.0
NNAV,0.0,0.0061,0.0,0.0,1.0,0.0
NVA,0.0,0.0,0.0,0.0,0.0,1.0


### Significance Tests for Differences between conversational contexts 

In [32]:
domain = pd.read_csv("domain_no_gpt.csv")
interaction = pd.read_csv("interaction_type_no_gpt.csv")
conv_mode = pd.read_csv("conv_mode_no_gpt.csv")
synch = pd.read_csv("synchronictiy_no_gpt.csv")

domain = domain[domain["kind"] != "GLoHBCD"]
conv_mode = conv_mode[conv_mode["kind"] != "GLoHBCD"]
interaction = interaction[interaction["kind"] != "GLoHBCD"]
synch = synch[synch["kind"] != "GLoHBCD"]

In [33]:
domain_val = domain[domain["level"] == "valence"]
domain_label = domain[domain["level"] == "label"]
domain_sublabel = domain[domain["level"] == "sublabel"]

#### Testing for Normality + Equality of Variance first

In [34]:
print("Domain Valence Normality:")
pg.normality(domain_val, dv="f1", group="kind", method='shapiro', alpha=0.05)

Domain Valence Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
in-domain,0.942451,0.580591,True
out-of-domain,0.776323,2.5e-05,False


In [35]:
print("Domain Label Normality:")
pg.normality(domain_label, dv="f1", group="kind", method='shapiro', alpha=0.05)

Domain Label Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
in-domain,0.927994,0.141277,True
out-of-domain,0.893005,0.005688,False


In [36]:
print("Domain Sublabel Normality:")
pg.normality(domain_sublabel, dv="f1", group="kind", method='shapiro', alpha=0.05)

Domain Sublabel Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
in-domain,0.899269,0.039959,False
out-of-domain,0.938724,0.084041,True


In [37]:
interaction_val = interaction[interaction["level"] == "valence"]
interaction_label = interaction[interaction["level"] == "label"]
interaction_sublabel = interaction[interaction["level"] == "sublabel"]

In [38]:
print("Interaction Valence Normality:")
pg.normality(interaction_val, dv="f1", group="kind", method='shapiro', alpha=0.05)

Interaction Valence Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
human-chatbot,0.91622,0.083818,True
human-human,0.926213,0.13054,True


In [39]:
pg.homoscedasticity(interaction_val, dv="f1", group="kind")

Unnamed: 0,W,pval,equal_var
levene,5.129573,0.029308,False


In [40]:
print("Interaction Label Normality:")
pg.normality(interaction_label, dv="f1", group="kind", method='shapiro', alpha=0.05)

Interaction Label Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
human-chatbot,0.882667,0.019765,False
human-human,0.957947,0.274258,True


In [41]:
print("Interaction Sublabel Normality:")
pg.normality(interaction_sublabel, dv="f1", group="kind", method='shapiro', alpha=0.05)

Interaction Sublabel Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
human-chatbot,0.899948,0.041145,False
human-human,0.740865,7e-06,False


In [42]:
conv_mode_val = conv_mode[conv_mode["level"] == "valence"]
conv_mode_label = conv_mode[conv_mode["level"] == "label"]
conv_mode_sublabel = conv_mode[conv_mode["level"] == "sublabel"]

In [43]:
print("Conv Mode Valence Normality:")
pg.normality(conv_mode_val, dv="f1", group="kind", method='shapiro', alpha=0.05)

Conv Mode Valence Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
single-party,0.950633,0.17579,True
multi-party,0.917287,0.334884,True


In [44]:
pg.homoscedasticity(conv_mode_val, dv="f1", group="kind")

Unnamed: 0,W,pval,equal_var
levene,6.4294,0.015459,False


In [45]:
print("Conv Mode Label Normality:")
pg.normality(conv_mode_label, dv="f1", group="kind", method='shapiro', alpha=0.05)

Conv Mode Label Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
single-party,0.957085,0.260452,True
multi-party,0.957194,0.489429,True


In [46]:
pg.homoscedasticity(conv_mode_label, dv="f1", group="kind")

Unnamed: 0,W,pval,equal_var
levene,18.671322,7.8e-05,False


In [47]:
print("Conv Mode Sublabel Normality:")
pg.normality(conv_mode_sublabel, dv="f1", group="kind", method='shapiro', alpha=0.05)

Conv Mode Sublabel Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
single-party,0.899279,0.008065,False
multi-party,0.719687,6.8e-05,False


In [48]:
synch_val = synch[synch["level"] == "valence"]
synch_label = synch[synch["level"] == "label"]
synch_sublabel = synch[synch["level"] == "sublabel"]

In [49]:
print("Synchronicity Valence Normality:")
pg.normality(synch_val, dv="f1", group="kind", method='shapiro', alpha=0.05)

Synchronicity Valence Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
synchronous,0.91622,0.083818,True
asynchronous,0.926213,0.13054,True


In [51]:
pg.homoscedasticity(synch_val, dv="f1", group="kind")

Unnamed: 0,W,pval,equal_var
levene,5.129573,0.029308,False


In [52]:
print("Synchronicity Label Normality:")
pg.normality(synch_label, dv="f1", group="kind", method='shapiro', alpha=0.05)

Synchronicity Label Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
synchronous,0.882667,0.019765,False
asynchronous,0.957947,0.274258,True


In [53]:
print("Synchronicity Sublabel Normality:")
pg.normality(synch_sublabel, dv="f1", group="kind", method='shapiro', alpha=0.05)

Synchronicity Sublabel Normality:


Unnamed: 0_level_0,W,pval,normal
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
synchronous,0.899948,0.041145,False
asynchronous,0.740865,7e-06,False


#### Running Significance tests
TTest for normal distributions, Mann-Whitney-U for non-normal

In [54]:
print("Domain Valence:")
print(pg.mwu(np.array(domain_val[domain_val["kind"] == "in-domain"]["f1"]), np.array(domain_val[domain_val["kind"] == "out-of-domain"]["f1"])))
print("Domain Label:")
print(pg.mwu(np.array(domain_label[domain_label["kind"] == "in-domain"]["f1"]), np.array(domain_label[domain_label["kind"] == "out-of-domain"]["f1"])))
print("Domain Subabel:") 
print(pg.mwu(np.array(domain_sublabel[domain_sublabel["kind"] == "in-domain"]["f1"]), np.array(domain_sublabel[domain_sublabel["kind"] == "out-of-domain"]["f1"])))

print("interaction Valence:") 
print(pg.mwu(np.array(interaction_val[interaction_val["kind"] == "human-chatbot"]["f1"]), np.array(interaction_val[interaction_val["kind"] == "human-human"]["f1"])))
print("interaction Label:") 
print(pg.mwu(np.array(interaction_label[interaction_label["kind"] == "human-chatbot"]["f1"]), np.array(interaction_label[interaction_label["kind"] == "human-human"]["f1"])))
print("interaction Subabel:")
print(pg.mwu(np.array(interaction_sublabel[interaction_sublabel["kind"] == "human-chatbot"]["f1"]), np.array(interaction_sublabel[interaction_sublabel["kind"] == "human-human"]["f1"])))

print("conv_mode Valence:") 
print(pg.mwu(np.array(conv_mode_val[conv_mode_val["kind"] == "single-party"]["f1"]), np.array(conv_mode_val[conv_mode_val["kind"] == "multi-party"]["f1"])))
print("conv_mode Label:") 
print(pg.mwu(np.array(conv_mode_label[conv_mode_label["kind"] == "single-party"]["f1"]), np.array(conv_mode_label[conv_mode_label["kind"] == "multi-party"]["f1"])))
print("conv_mode Subabel:") 
print(pg.mwu(np.array(conv_mode_sublabel[conv_mode_sublabel["kind"] == "single-party"]["f1"]), np.array(conv_mode_sublabel[conv_mode_sublabel["kind"] == "multi-party"]["f1"])))

print("synch Valence:") 
print(pg.mwu(np.array(synch_val[synch_val["kind"] == "synchronous"]["f1"]), np.array(synch_val[synch_val["kind"] == "asynchronous"]["f1"])))
print("synch Label:") 
print(pg.mwu(np.array(synch_label[synch_label["kind"] == "synchronous"]["f1"]), np.array(synch_label[synch_label["kind"] == "asynchronous"]["f1"])))
print("synch Subabel:") 
print(pg.mwu(np.array(synch_sublabel[synch_sublabel["kind"] == "synchronous"]["f1"]), np.array(synch_sublabel[synch_sublabel["kind"] == "asynchronous"]["f1"])))

Domain Valence:
     U-val alternative     p-val       RBC      CLES
MWU  271.0   two-sided  0.000163 -0.806667  0.903333
Domain Label:
     U-val alternative         p-val       RBC      CLES
MWU   49.0   two-sided  7.017547e-07  0.836667  0.081667
Domain Subabel:
     U-val alternative         p-val  RBC  CLES
MWU    0.0   two-sided  3.006030e-09  1.0   0.0
interaction Valence:
     U-val alternative     p-val    RBC    CLES
MWU  363.0   two-sided  0.000011 -0.815  0.9075
interaction Label:
     U-val alternative    p-val       RBC      CLES
MWU  395.0   two-sided  0.06128 -0.316667  0.658333
interaction Subabel:
     U-val alternative     p-val   RBC   CLES
MWU  309.0   two-sided  0.866321 -0.03  0.515
conv_mode Valence:
     U-val alternative     p-val   RBC  CLES
MWU  213.0   two-sided  0.050494 -0.42  0.71
conv_mode Label:
     U-val alternative     p-val       RBC      CLES
MWU  475.0   two-sided  0.000549 -0.583333  0.791667
conv_mode Subabel:
     U-val alternative    p-val   